libguile/Makefile.am (snarfcppopts): Remove CFLAGS
[bpt/guile.git] / libguile / read.c
1 /* Copyright (C) 1995-1997, 1999-2001, 2003, 2004, 2006-2012, 2014
2 * Free Software Foundation, Inc.
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
18 */
19
20
21 \f
22
23 #ifdef HAVE_CONFIG_H
24 # include <config.h>
25 #endif
26
27 #include <stdio.h>
28 #include <string.h>
29 #include <unistd.h>
30 #include <unicase.h>
31 #include <unictype.h>
32 #include <c-strcase.h>
33 #include <c-ctype.h>
34
35 #include "libguile/_scm.h"
36 #include "libguile/bytevectors.h"
37 #include "libguile/chars.h"
38 #include "libguile/eval.h"
39 #include "libguile/arrays.h"
40 #include "libguile/bitvectors.h"
41 #include "libguile/keywords.h"
42 #include "libguile/alist.h"
43 #include "libguile/srcprop.h"
44 #include "libguile/hashtab.h"
45 #include "libguile/hash.h"
46 #include "libguile/ports.h"
47 #include "libguile/ports-internal.h"
48 #include "libguile/fports.h"
49 #include "libguile/root.h"
50 #include "libguile/strings.h"
51 #include "libguile/strports.h"
52 #include "libguile/vectors.h"
53 #include "libguile/validate.h"
54 #include "libguile/srfi-4.h"
55 #include "libguile/srfi-13.h"
56
57 #include "libguile/read.h"
58 #include "libguile/private-options.h"
59
60
61 \f
62
63 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
64 SCM_SYMBOL (scm_keyword_prefix, "prefix");
65 SCM_SYMBOL (scm_keyword_postfix, "postfix");
66 SCM_SYMBOL (sym_nil, "nil");
67
68 /* SRFI-105 curly infix expression support */
69 SCM_SYMBOL (sym_nfx, "$nfx$");
70 SCM_SYMBOL (sym_bracket_list, "$bracket-list$");
71 SCM_SYMBOL (sym_bracket_apply, "$bracket-apply$");
72
73 scm_t_option scm_read_opts[] =
74 {
75 { SCM_OPTION_BOOLEAN, "copy", 0,
76 "Copy source code expressions." },
77 { SCM_OPTION_BOOLEAN, "positions", 1,
78 "Record positions of source code expressions." },
79 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
80 "Convert symbols to lower case."},
81 { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
82 "Style of keyword recognition: #f, 'prefix or 'postfix."},
83 { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
84 "Use R6RS variable-length character and string hex escapes."},
85 { SCM_OPTION_BOOLEAN, "square-brackets", 1,
86 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
87 { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
88 "In strings, consume leading whitespace after an escaped end-of-line."},
89 { SCM_OPTION_BOOLEAN, "curly-infix", 0,
90 "Support SRFI-105 curly infix expressions."},
91 { SCM_OPTION_BOOLEAN, "r7rs-symbols", 0,
92 "Support R7RS |...| symbol notation."},
93 { 0, },
94 };
95
96 /* Internal read options structure. This is initialized by 'scm_read'
97 from the global and per-port read options, and a pointer is passed
98 down to all helper functions. */
99
100 enum t_keyword_style
101 {
102 KEYWORD_STYLE_HASH_PREFIX,
103 KEYWORD_STYLE_PREFIX,
104 KEYWORD_STYLE_POSTFIX
105 };
106
107 struct t_read_opts
108 {
109 enum t_keyword_style keyword_style;
110 unsigned int copy_source_p : 1;
111 unsigned int record_positions_p : 1;
112 unsigned int case_insensitive_p : 1;
113 unsigned int r6rs_escapes_p : 1;
114 unsigned int square_brackets_p : 1;
115 unsigned int hungry_eol_escapes_p : 1;
116 unsigned int curly_infix_p : 1;
117 unsigned int neoteric_p : 1;
118 unsigned int r7rs_symbols_p : 1;
119 };
120
121 typedef struct t_read_opts scm_t_read_opts;
122
123
124 /*
125 Give meaningful error messages for errors
126
127 We use the format
128
129 FILE:LINE:COL: MESSAGE
130 This happened in ....
131
132 This is not standard GNU format, but the test-suite likes the real
133 message to be in front.
134
135 */
136
137
138 void
139 scm_i_input_error (char const *function,
140 SCM port, const char *message, SCM arg)
141 {
142 SCM fn = (scm_is_string (SCM_FILENAME(port))
143 ? SCM_FILENAME(port)
144 : scm_from_locale_string ("#<unknown port>"));
145
146 SCM string_port = scm_open_output_string ();
147 SCM string = SCM_EOL;
148 scm_simple_format (string_port,
149 scm_from_locale_string ("~A:~S:~S: ~A"),
150 scm_list_4 (fn,
151 scm_from_long (SCM_LINUM (port) + 1),
152 scm_from_int (SCM_COL (port) + 1),
153 scm_from_locale_string (message)));
154
155 string = scm_get_output_string (string_port);
156 scm_close_output_port (string_port);
157 scm_error_scm (scm_from_latin1_symbol ("read-error"),
158 function? scm_from_locale_string (function) : SCM_BOOL_F,
159 string,
160 arg,
161 SCM_BOOL_F);
162 }
163
164
165 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
166 (SCM setting),
167 "Option interface for the read options. Instead of using\n"
168 "this procedure directly, use the procedures @code{read-enable},\n"
169 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
170 #define FUNC_NAME s_scm_read_options
171 {
172 SCM ans = scm_options (setting,
173 scm_read_opts,
174 FUNC_NAME);
175 if (SCM_COPY_SOURCE_P)
176 SCM_RECORD_POSITIONS_P = 1;
177 return ans;
178 }
179 #undef FUNC_NAME
180
181 /* A fluid referring to an association list mapping extra hash
182 characters to procedures. */
183 static SCM *scm_i_read_hash_procedures;
184
185 static SCM
186 scm_i_read_hash_procedures_ref (void)
187 {
188 return scm_fluid_ref (*scm_i_read_hash_procedures);
189 }
190
191 static void
192 scm_i_read_hash_procedures_set_x (SCM value)
193 {
194 scm_fluid_set_x (*scm_i_read_hash_procedures, value);
195 }
196
197 \f
198 /* Token readers. */
199
200
201 /* Size of the C buffer used to read symbols and numbers. */
202 #define READER_BUFFER_SIZE 128
203
204 /* Number of 32-bit codepoints in the buffer used to read strings. */
205 #define READER_STRING_BUFFER_SIZE 128
206
207 /* The maximum size of Scheme character names. */
208 #define READER_CHAR_NAME_MAX_SIZE 50
209
210 /* The maximum size of reader directive names. */
211 #define READER_DIRECTIVE_NAME_MAX_SIZE 50
212
213
214 /* `isblank' is only in C99. */
215 #define CHAR_IS_BLANK_(_chr) \
216 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
217 || ((_chr) == '\f') || ((_chr) == '\r'))
218
219 #ifdef MSDOS
220 # define CHAR_IS_BLANK(_chr) \
221 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
222 #else
223 # define CHAR_IS_BLANK CHAR_IS_BLANK_
224 #endif
225
226
227 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
228 structure''). */
229 #define CHAR_IS_R5RS_DELIMITER(c) \
230 (CHAR_IS_BLANK (c) \
231 || (c) == ')' || (c) == '(' || (c) == ';' || (c) == '"')
232
233 #define CHAR_IS_DELIMITER(c) \
234 (CHAR_IS_R5RS_DELIMITER (c) \
235 || (((c) == ']' || (c) == '[') && (opts->square_brackets_p \
236 || opts->curly_infix_p)) \
237 || (((c) == '}' || (c) == '{') && opts->curly_infix_p))
238
239 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
240 Structure''. */
241 #define CHAR_IS_EXPONENT_MARKER(_chr) \
242 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
243 || ((_chr) == 'd') || ((_chr) == 'l'))
244
245 /* Read an SCSH block comment. */
246 static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
247 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
248 static SCM scm_read_commented_expression (scm_t_wchar, SCM, scm_t_read_opts *);
249 static SCM scm_read_shebang (scm_t_wchar, SCM, scm_t_read_opts *);
250 static SCM scm_get_hash_procedure (int);
251
252 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
253 result in the pre-allocated buffer BUF. Return zero if the whole token has
254 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
255 bytes actually read. */
256 static int
257 read_token (SCM port, scm_t_read_opts *opts,
258 char *buf, size_t buf_size, size_t *read)
259 {
260 *read = 0;
261
262 while (*read < buf_size)
263 {
264 int chr;
265
266 chr = scm_get_byte_or_eof_unlocked (port);
267
268 if (chr == EOF)
269 return 0;
270 else if (CHAR_IS_DELIMITER (chr))
271 {
272 scm_unget_byte_unlocked (chr, port);
273 return 0;
274 }
275 else
276 {
277 *buf = (char) chr;
278 buf++, (*read)++;
279 }
280 }
281
282 return 1;
283 }
284
285 /* Like `read_token', but return either BUFFER, or a GC-allocated buffer
286 if the token doesn't fit in BUFFER_SIZE bytes. */
287 static char *
288 read_complete_token (SCM port, scm_t_read_opts *opts,
289 char *buffer, size_t buffer_size, size_t *read)
290 {
291 int overflow = 0;
292 size_t bytes_read, overflow_size = 0;
293 char *overflow_buffer = NULL;
294
295 do
296 {
297 overflow = read_token (port, opts, buffer, buffer_size, &bytes_read);
298 if (bytes_read == 0)
299 break;
300 if (overflow || overflow_size != 0)
301 {
302 if (overflow_size == 0)
303 {
304 overflow_buffer = scm_gc_malloc_pointerless (bytes_read, "read");
305 memcpy (overflow_buffer, buffer, bytes_read);
306 overflow_size = bytes_read;
307 }
308 else
309 {
310 char *new_buf =
311 scm_gc_malloc_pointerless (overflow_size + bytes_read, "read");
312
313 memcpy (new_buf, overflow_buffer, overflow_size);
314 memcpy (new_buf + overflow_size, buffer, bytes_read);
315
316 overflow_buffer = new_buf;
317 overflow_size += bytes_read;
318 }
319 }
320 }
321 while (overflow);
322
323 if (overflow_size)
324 *read = overflow_size;
325 else
326 *read = bytes_read;
327
328 return (overflow_size > 0 ? overflow_buffer : buffer);
329 }
330
331 /* Skip whitespace from PORT and return the first non-whitespace character
332 read. Raise an error on end-of-file. */
333 static int
334 flush_ws (SCM port, scm_t_read_opts *opts, const char *eoferr)
335 {
336 scm_t_wchar c;
337 while (1)
338 switch (c = scm_getc_unlocked (port))
339 {
340 case EOF:
341 goteof:
342 if (eoferr)
343 {
344 scm_i_input_error (eoferr,
345 port,
346 "end of file",
347 SCM_EOL);
348 }
349 return c;
350
351 case ';':
352 lp:
353 switch (c = scm_getc_unlocked (port))
354 {
355 case EOF:
356 goto goteof;
357 default:
358 goto lp;
359 case SCM_LINE_INCREMENTORS:
360 break;
361 }
362 break;
363
364 case '#':
365 switch (c = scm_getc_unlocked (port))
366 {
367 case EOF:
368 eoferr = "read_sharp";
369 goto goteof;
370 case '!':
371 scm_read_shebang (c, port, opts);
372 break;
373 case ';':
374 scm_read_commented_expression (c, port, opts);
375 break;
376 case '|':
377 if (scm_is_false (scm_get_hash_procedure (c)))
378 {
379 scm_read_r6rs_block_comment (c, port);
380 break;
381 }
382 /* fall through */
383 default:
384 scm_ungetc_unlocked (c, port);
385 return '#';
386 }
387 break;
388
389 case SCM_LINE_INCREMENTORS:
390 case SCM_SINGLE_SPACES:
391 case '\t':
392 break;
393
394 default:
395 return c;
396 }
397
398 return 0;
399 }
400
401
402 \f
403 /* Token readers. */
404
405 static SCM scm_read_expression (SCM port, scm_t_read_opts *opts);
406 static SCM scm_read_sharp (int chr, SCM port, scm_t_read_opts *opts,
407 long line, int column);
408
409
410 static SCM
411 maybe_annotate_source (SCM x, SCM port, scm_t_read_opts *opts,
412 long line, int column)
413 {
414 /* This condition can be caused by a user calling
415 set-port-column!. */
416 if (line < 0 || column < 0)
417 return x;
418
419 if (opts->record_positions_p)
420 scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
421 return x;
422 }
423
424 static SCM
425 scm_read_sexp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
426 #define FUNC_NAME "scm_i_lreadparen"
427 {
428 int c;
429 SCM tmp, tl, ans = SCM_EOL;
430 const int curly_list_p = (chr == '{') && opts->curly_infix_p;
431 const int terminating_char = ((chr == '{') ? '}'
432 : ((chr == '[') ? ']'
433 : ')'));
434
435 /* Need to capture line and column numbers here. */
436 long line = SCM_LINUM (port);
437 int column = SCM_COL (port) - 1;
438
439 c = flush_ws (port, opts, FUNC_NAME);
440 if (terminating_char == c)
441 return SCM_EOL;
442
443 scm_ungetc_unlocked (c, port);
444 tmp = scm_read_expression (port, opts);
445
446 /* Note that it is possible for scm_read_expression to return
447 scm_sym_dot, but not as part of a dotted pair: as in #{.}#. So
448 check that it's a real dot by checking `c'. */
449 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
450 {
451 ans = scm_read_expression (port, opts);
452 if (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
453 scm_i_input_error (FUNC_NAME, port, "missing close paren",
454 SCM_EOL);
455 return ans;
456 }
457
458 /* Build the head of the list structure. */
459 ans = tl = scm_cons (tmp, SCM_EOL);
460
461 while (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
462 {
463 SCM new_tail;
464
465 if (c == ')' || (c == ']' && opts->square_brackets_p)
466 || ((c == '}' || c == ']') && opts->curly_infix_p))
467 scm_i_input_error (FUNC_NAME, port,
468 "in pair: mismatched close paren: ~A",
469 scm_list_1 (SCM_MAKE_CHAR (c)));
470
471 scm_ungetc_unlocked (c, port);
472 tmp = scm_read_expression (port, opts);
473
474 /* See above note about scm_sym_dot. */
475 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
476 {
477 SCM_SETCDR (tl, scm_read_expression (port, opts));
478
479 c = flush_ws (port, opts, FUNC_NAME);
480 if (terminating_char != c)
481 scm_i_input_error (FUNC_NAME, port,
482 "in pair: missing close paren", SCM_EOL);
483 break;
484 }
485
486 new_tail = scm_cons (tmp, SCM_EOL);
487 SCM_SETCDR (tl, new_tail);
488 tl = new_tail;
489 }
490
491 if (curly_list_p)
492 {
493 /* In addition to finding the length, 'scm_ilength' checks for
494 improper or circular lists, in which case it returns -1. */
495 int len = scm_ilength (ans);
496
497 /* The (len == 0) case is handled above */
498 if (len == 1)
499 /* Return directly to avoid re-annotating the element's source
500 location with the position of the outer brace. Also, it
501 might not be possible to annotate the element. */
502 return scm_car (ans); /* {e} => e */
503 else if (len == 2)
504 ; /* Leave the list unchanged: {e1 e2} => (e1 e2) */
505 else if (len >= 3 && (len & 1))
506 {
507 /* It's a proper list whose length is odd and at least 3. If
508 the elements at odd indices (the infix operator positions)
509 are all 'equal?', then it's a simple curly-infix list.
510 Otherwise it's a mixed curly-infix list. */
511 SCM op = scm_cadr (ans);
512
513 /* Check to see if the elements at odd indices are 'equal?' */
514 for (tl = scm_cdddr (ans); ; tl = scm_cddr (tl))
515 {
516 if (scm_is_null (tl))
517 {
518 /* Convert simple curly-infix list to prefix:
519 {a <op> b <op> ...} => (<op> a b ...) */
520 tl = ans;
521 while (scm_is_pair (scm_cdr (tl)))
522 {
523 tmp = scm_cddr (tl);
524 SCM_SETCDR (tl, tmp);
525 tl = tmp;
526 }
527 ans = scm_cons (op, ans);
528 break;
529 }
530 else if (scm_is_false (scm_equal_p (op, scm_car (tl))))
531 {
532 /* Mixed curly-infix list: {e ...} => ($nfx$ e ...) */
533 ans = scm_cons (sym_nfx, ans);
534 break;
535 }
536 }
537 }
538 else
539 /* Mixed curly-infix (possibly improper) list:
540 {e . tail} => ($nfx$ e . tail) */
541 ans = scm_cons (sym_nfx, ans);
542 }
543
544 return maybe_annotate_source (ans, port, opts, line, column);
545 }
546 #undef FUNC_NAME
547
548
549 /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
550 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
551 found. */
552 #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
553 do \
554 { \
555 scm_t_wchar a; \
556 size_t i = 0; \
557 c = 0; \
558 while (i < ndigits) \
559 { \
560 a = scm_getc_unlocked (port); \
561 if (a == EOF) \
562 goto str_eof; \
563 if (terminator \
564 && (a == (scm_t_wchar) terminator) \
565 && (i > 0)) \
566 break; \
567 if ('0' <= a && a <= '9') \
568 a -= '0'; \
569 else if ('A' <= a && a <= 'F') \
570 a = a - 'A' + 10; \
571 else if ('a' <= a && a <= 'f') \
572 a = a - 'a' + 10; \
573 else \
574 { \
575 c = a; \
576 goto bad_escaped; \
577 } \
578 c = c * 16 + a; \
579 i ++; \
580 } \
581 } while (0)
582
583 static void
584 skip_intraline_whitespace (SCM port)
585 {
586 scm_t_wchar c;
587
588 do
589 {
590 c = scm_getc_unlocked (port);
591 if (c == EOF)
592 return;
593 }
594 while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
595
596 scm_ungetc_unlocked (c, port);
597 }
598
599 /* Read either a double-quoted string or an R7RS-style symbol delimited
600 by vertical lines, depending on the value of 'chr' ('"' or '|').
601 Regardless, the result is always returned as a string. */
602 static SCM
603 scm_read_string_like_syntax (int chr, SCM port, scm_t_read_opts *opts)
604 #define FUNC_NAME "scm_lreadr"
605 {
606 /* For strings smaller than C_STR, this function creates only one Scheme
607 object (the string returned). */
608
609 SCM str = SCM_EOL;
610 size_t c_str_len = 0;
611 scm_t_wchar c, c_str[READER_STRING_BUFFER_SIZE];
612
613 /* Need to capture line and column numbers here. */
614 long line = SCM_LINUM (port);
615 int column = SCM_COL (port) - 1;
616
617 while (chr != (c = scm_getc_unlocked (port)))
618 {
619 if (c == EOF)
620 {
621 str_eof:
622 scm_i_input_error (FUNC_NAME, port,
623 (chr == '|'
624 ? "end of file in symbol"
625 : "end of file in string constant"),
626 SCM_EOL);
627 }
628
629 if (c_str_len + 1 >= READER_STRING_BUFFER_SIZE)
630 {
631 str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
632 c_str_len = 0;
633 }
634
635 if (c == '\\')
636 {
637 switch (c = scm_getc_unlocked (port))
638 {
639 case EOF:
640 goto str_eof;
641 case '|':
642 case '\\':
643 case '(': /* Accept "\(" for use at the beginning of lines
644 in multiline strings to avoid confusing emacs
645 lisp modes. */
646 break;
647 case '\n':
648 if (opts->hungry_eol_escapes_p)
649 skip_intraline_whitespace (port);
650 continue;
651 case '0':
652 c = '\0';
653 break;
654 case 'f':
655 c = '\f';
656 break;
657 case 'n':
658 c = '\n';
659 break;
660 case 'r':
661 c = '\r';
662 break;
663 case 't':
664 c = '\t';
665 break;
666 case 'a':
667 c = '\007';
668 break;
669 case 'v':
670 c = '\v';
671 break;
672 case 'b':
673 c = '\010';
674 break;
675 case 'x':
676 if (opts->r6rs_escapes_p || chr == '|')
677 SCM_READ_HEX_ESCAPE (10, ';');
678 else
679 SCM_READ_HEX_ESCAPE (2, '\0');
680 break;
681 case 'u':
682 if (!opts->r6rs_escapes_p)
683 {
684 SCM_READ_HEX_ESCAPE (4, '\0');
685 break;
686 }
687 case 'U':
688 if (!opts->r6rs_escapes_p)
689 {
690 SCM_READ_HEX_ESCAPE (6, '\0');
691 break;
692 }
693 default:
694 if (c == chr)
695 break;
696 bad_escaped:
697 scm_i_input_error (FUNC_NAME, port,
698 "illegal character in escape sequence: ~S",
699 scm_list_1 (SCM_MAKE_CHAR (c)));
700 }
701 }
702
703 c_str[c_str_len++] = c;
704 }
705
706 if (scm_is_null (str))
707 /* Fast path: we got a string that fits in C_STR. */
708 str = scm_from_utf32_stringn (c_str, c_str_len);
709 else
710 {
711 if (c_str_len > 0)
712 str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
713
714 str = scm_string_concatenate_reverse (str, SCM_UNDEFINED, SCM_UNDEFINED);
715 }
716
717 return maybe_annotate_source (str, port, opts, line, column);
718 }
719 #undef FUNC_NAME
720
721 static SCM
722 scm_read_string (int chr, SCM port, scm_t_read_opts *opts)
723 {
724 return scm_read_string_like_syntax (chr, port, opts);
725 }
726
727 static SCM
728 scm_read_r7rs_symbol (int chr, SCM port, scm_t_read_opts *opts)
729 {
730 return scm_string_to_symbol (scm_read_string_like_syntax (chr, port, opts));
731 }
732
733 static SCM
734 scm_read_number (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
735 {
736 SCM result, str = SCM_EOL;
737 char local_buffer[READER_BUFFER_SIZE], *buffer;
738 size_t bytes_read;
739
740 /* Need to capture line and column numbers here. */
741 long line = SCM_LINUM (port);
742 int column = SCM_COL (port) - 1;
743
744 scm_ungetc_unlocked (chr, port);
745 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
746 &bytes_read);
747
748 str = scm_from_port_stringn (buffer, bytes_read, port);
749
750 result = scm_string_to_number (str, SCM_UNDEFINED);
751 if (scm_is_false (result))
752 {
753 /* Return a symbol instead of a number */
754 if (opts->case_insensitive_p)
755 str = scm_string_downcase_x (str);
756 result = scm_string_to_symbol (str);
757 }
758 else if (SCM_NIMP (result))
759 result = maybe_annotate_source (result, port, opts, line, column);
760
761 SCM_COL (port) += scm_i_string_length (str);
762 return result;
763 }
764
765 static SCM
766 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
767 {
768 SCM result;
769 int ends_with_colon = 0;
770 size_t bytes_read;
771 int postfix = (opts->keyword_style == KEYWORD_STYLE_POSTFIX);
772 char local_buffer[READER_BUFFER_SIZE], *buffer;
773 SCM str;
774
775 scm_ungetc_unlocked (chr, port);
776 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
777 &bytes_read);
778 if (bytes_read > 0)
779 ends_with_colon = buffer[bytes_read - 1] == ':';
780
781 if (postfix && ends_with_colon && (bytes_read > 1))
782 {
783 str = scm_from_port_stringn (buffer, bytes_read - 1, port);
784
785 if (opts->case_insensitive_p)
786 str = scm_string_downcase_x (str);
787 result = scm_symbol_to_keyword (scm_string_to_symbol (str));
788 }
789 else
790 {
791 str = scm_from_port_stringn (buffer, bytes_read, port);
792
793 if (opts->case_insensitive_p)
794 str = scm_string_downcase_x (str);
795 result = scm_string_to_symbol (str);
796 }
797
798 SCM_COL (port) += scm_i_string_length (str);
799 return result;
800 }
801
802 static SCM
803 scm_read_number_and_radix (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
804 #define FUNC_NAME "scm_lreadr"
805 {
806 SCM result;
807 size_t read;
808 char local_buffer[READER_BUFFER_SIZE], *buffer;
809 unsigned int radix;
810 SCM str;
811
812 switch (chr)
813 {
814 case 'B':
815 case 'b':
816 radix = 2;
817 break;
818
819 case 'o':
820 case 'O':
821 radix = 8;
822 break;
823
824 case 'd':
825 case 'D':
826 radix = 10;
827 break;
828
829 case 'x':
830 case 'X':
831 radix = 16;
832 break;
833
834 default:
835 scm_ungetc_unlocked (chr, port);
836 scm_ungetc_unlocked ('#', port);
837 radix = 10;
838 }
839
840 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
841 &read);
842
843 str = scm_from_port_stringn (buffer, read, port);
844
845 result = scm_string_to_number (str, scm_from_uint (radix));
846
847 SCM_COL (port) += scm_i_string_length (str);
848
849 if (scm_is_true (result))
850 return result;
851
852 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
853
854 return SCM_BOOL_F;
855 }
856 #undef FUNC_NAME
857
858 static SCM
859 scm_read_quote (int chr, SCM port, scm_t_read_opts *opts)
860 {
861 SCM p;
862 long line = SCM_LINUM (port);
863 int column = SCM_COL (port) - 1;
864
865 switch (chr)
866 {
867 case '`':
868 p = scm_sym_quasiquote;
869 break;
870
871 case '\'':
872 p = scm_sym_quote;
873 break;
874
875 case ',':
876 {
877 scm_t_wchar c;
878
879 c = scm_getc_unlocked (port);
880 if ('@' == c)
881 p = scm_sym_uq_splicing;
882 else
883 {
884 scm_ungetc_unlocked (c, port);
885 p = scm_sym_unquote;
886 }
887 break;
888 }
889
890 default:
891 fprintf (stderr, "%s: unhandled quote character (%i)\n",
892 "scm_read_quote", chr);
893 abort ();
894 }
895
896 p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
897 return maybe_annotate_source (p, port, opts, line, column);
898 }
899
900 SCM_SYMBOL (sym_syntax, "syntax");
901 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
902 SCM_SYMBOL (sym_unsyntax, "unsyntax");
903 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
904
905 static SCM
906 scm_read_syntax (int chr, SCM port, scm_t_read_opts *opts)
907 {
908 SCM p;
909 long line = SCM_LINUM (port);
910 int column = SCM_COL (port) - 1;
911
912 switch (chr)
913 {
914 case '`':
915 p = sym_quasisyntax;
916 break;
917
918 case '\'':
919 p = sym_syntax;
920 break;
921
922 case ',':
923 {
924 int c;
925
926 c = scm_getc_unlocked (port);
927 if ('@' == c)
928 p = sym_unsyntax_splicing;
929 else
930 {
931 scm_ungetc_unlocked (c, port);
932 p = sym_unsyntax;
933 }
934 break;
935 }
936
937 default:
938 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
939 "scm_read_syntax", chr);
940 abort ();
941 }
942
943 p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
944 return maybe_annotate_source (p, port, opts, line, column);
945 }
946
947 static SCM
948 scm_read_nil (int chr, SCM port, scm_t_read_opts *opts)
949 {
950 SCM id = scm_read_mixed_case_symbol (chr, port, opts);
951
952 if (!scm_is_eq (id, sym_nil))
953 scm_i_input_error ("scm_read_nil", port,
954 "unexpected input while reading #nil: ~a",
955 scm_list_1 (id));
956
957 return SCM_ELISP_NIL;
958 }
959
960 static SCM
961 scm_read_semicolon_comment (int chr, SCM port)
962 {
963 int c;
964
965 /* We use the get_byte here because there is no need to get the
966 locale correct with comment input. This presumes that newline
967 always represents itself no matter what the encoding is. */
968 for (c = scm_get_byte_or_eof_unlocked (port);
969 (c != EOF) && (c != '\n');
970 c = scm_get_byte_or_eof_unlocked (port));
971
972 return SCM_UNSPECIFIED;
973 }
974
975 /* If the EXPECTED_CHARS are the next ones available from PORT, then
976 consume them and return 1. Otherwise leave the port position where
977 it was and return 0. EXPECTED_CHARS should be all lowercase, and
978 will be matched case-insensitively against the characters read from
979 PORT. */
980 static int
981 try_read_ci_chars (SCM port, const char *expected_chars)
982 {
983 int num_chars_wanted = strlen (expected_chars);
984 int num_chars_read = 0;
985 char *chars_read = alloca (num_chars_wanted);
986 int c;
987
988 while (num_chars_read < num_chars_wanted)
989 {
990 c = scm_getc_unlocked (port);
991 if (c == EOF)
992 break;
993 else if (c_tolower (c) != expected_chars[num_chars_read])
994 {
995 scm_ungetc_unlocked (c, port);
996 break;
997 }
998 else
999 chars_read[num_chars_read++] = c;
1000 }
1001
1002 if (num_chars_read == num_chars_wanted)
1003 return 1;
1004 else
1005 {
1006 while (num_chars_read > 0)
1007 scm_ungetc_unlocked (chars_read[--num_chars_read], port);
1008 return 0;
1009 }
1010 }
1011
1012 \f
1013 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
1014
1015 static SCM
1016 scm_read_boolean (int chr, SCM port)
1017 {
1018 switch (chr)
1019 {
1020 case 't':
1021 case 'T':
1022 try_read_ci_chars (port, "rue");
1023 return SCM_BOOL_T;
1024
1025 case 'f':
1026 case 'F':
1027 try_read_ci_chars (port, "alse");
1028 return SCM_BOOL_F;
1029 }
1030
1031 return SCM_UNSPECIFIED;
1032 }
1033
1034 static SCM
1035 scm_read_character (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
1036 #define FUNC_NAME "scm_lreadr"
1037 {
1038 char buffer[READER_CHAR_NAME_MAX_SIZE];
1039 SCM charname;
1040 size_t charname_len, bytes_read;
1041 scm_t_wchar cp;
1042 int overflow;
1043 scm_t_port_internal *pti;
1044
1045 overflow = read_token (port, opts, buffer, READER_CHAR_NAME_MAX_SIZE,
1046 &bytes_read);
1047 if (overflow)
1048 scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
1049
1050 if (bytes_read == 0)
1051 {
1052 chr = scm_getc_unlocked (port);
1053 if (chr == EOF)
1054 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
1055 "while reading character", SCM_EOL);
1056
1057 /* CHR must be a token delimiter, like a whitespace. */
1058 return (SCM_MAKE_CHAR (chr));
1059 }
1060
1061 pti = SCM_PORT_GET_INTERNAL (port);
1062
1063 /* Simple ASCII characters can be processed immediately. Also, simple
1064 ISO-8859-1 characters can be processed immediately if the encoding for this
1065 port is ISO-8859-1. */
1066 if (bytes_read == 1 &&
1067 ((unsigned char) buffer[0] <= 127
1068 || pti->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1))
1069 {
1070 SCM_COL (port) += 1;
1071 return SCM_MAKE_CHAR (buffer[0]);
1072 }
1073
1074 /* Otherwise, convert the buffer into a proper scheme string for
1075 processing. */
1076 charname = scm_from_port_stringn (buffer, bytes_read, port);
1077 charname_len = scm_i_string_length (charname);
1078 SCM_COL (port) += charname_len;
1079 cp = scm_i_string_ref (charname, 0);
1080 if (charname_len == 1)
1081 return SCM_MAKE_CHAR (cp);
1082
1083 /* Ignore dotted circles, which may be used to keep combining characters from
1084 combining with the backslash in #\charname. */
1085 if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
1086 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
1087
1088 if (cp >= '0' && cp < '8')
1089 {
1090 /* Dirk:FIXME:: This type of character syntax is not R5RS
1091 * compliant. Further, it should be verified that the constant
1092 * does only consist of octal digits. */
1093 SCM p = scm_string_to_number (charname, scm_from_uint (8));
1094 if (SCM_I_INUMP (p))
1095 {
1096 scm_t_wchar c = scm_to_uint32 (p);
1097 if (SCM_IS_UNICODE_CHAR (c))
1098 return SCM_MAKE_CHAR (c);
1099 else
1100 scm_i_input_error (FUNC_NAME, port,
1101 "out-of-range octal character escape: ~a",
1102 scm_list_1 (charname));
1103 }
1104 }
1105
1106 if (cp == 'x' && (charname_len > 1))
1107 {
1108 SCM p;
1109
1110 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1111 p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
1112 scm_from_uint (16));
1113 if (SCM_I_INUMP (p))
1114 {
1115 scm_t_wchar c = scm_to_uint32 (p);
1116 if (SCM_IS_UNICODE_CHAR (c))
1117 return SCM_MAKE_CHAR (c);
1118 else
1119 scm_i_input_error (FUNC_NAME, port,
1120 "out-of-range hex character escape: ~a",
1121 scm_list_1 (charname));
1122 }
1123 }
1124
1125 /* The names of characters should never have non-Latin1
1126 characters. */
1127 if (scm_i_is_narrow_string (charname)
1128 || scm_i_try_narrow_string (charname))
1129 { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
1130 charname_len);
1131 if (scm_is_true (ch))
1132 return ch;
1133 }
1134
1135 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
1136 scm_list_1 (charname));
1137
1138 return SCM_UNSPECIFIED;
1139 }
1140 #undef FUNC_NAME
1141
1142 static SCM
1143 scm_read_keyword (int chr, SCM port, scm_t_read_opts *opts)
1144 {
1145 SCM symbol;
1146
1147 /* Read the symbol that comprises the keyword. Doing this instead of
1148 invoking a specific symbol reader function allows `scm_read_keyword ()'
1149 to adapt to the delimiters currently valid of symbols.
1150
1151 XXX: This implementation allows sloppy syntaxes like `#: key'. */
1152 symbol = scm_read_expression (port, opts);
1153 if (!scm_is_symbol (symbol))
1154 scm_i_input_error ("scm_read_keyword", port,
1155 "keyword prefix `~a' not followed by a symbol: ~s",
1156 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1157
1158 return (scm_symbol_to_keyword (symbol));
1159 }
1160
1161 static SCM
1162 scm_read_vector (int chr, SCM port, scm_t_read_opts *opts,
1163 long line, int column)
1164 {
1165 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1166 guarantee that it's going to do what we want. After all, this is an
1167 implementation detail of `scm_read_vector ()', not a desirable
1168 property. */
1169 return maybe_annotate_source (scm_vector (scm_read_sexp (chr, port, opts)),
1170 port, opts, line, column);
1171 }
1172
1173 /* Helper used by scm_read_array */
1174 static int
1175 read_decimal_integer (SCM port, int c, ssize_t *resp)
1176 {
1177 ssize_t sign = 1;
1178 ssize_t res = 0;
1179 int got_it = 0;
1180
1181 if (c == '-')
1182 {
1183 sign = -1;
1184 c = scm_getc_unlocked (port);
1185 }
1186
1187 while ('0' <= c && c <= '9')
1188 {
1189 if (((SSIZE_MAX - (c-'0')) / 10) <= res)
1190 scm_i_input_error ("read_decimal_integer", port,
1191 "number too large", SCM_EOL);
1192 res = 10*res + c-'0';
1193 got_it = 1;
1194 c = scm_getc_unlocked (port);
1195 }
1196
1197 if (got_it)
1198 *resp = sign * res;
1199 return c;
1200 }
1201
1202 /* Read an array. This function can also read vectors and uniform
1203 vectors. Also, the conflict between '#f' and '#f32' and '#f64' is
1204 handled here.
1205
1206 C is the first character read after the '#'. */
1207 static SCM
1208 scm_read_array (int c, SCM port, scm_t_read_opts *opts, long line, int column)
1209 {
1210 ssize_t rank;
1211 scm_t_wchar tag_buf[8];
1212 int tag_len;
1213
1214 SCM tag, shape = SCM_BOOL_F, elements, array;
1215
1216 /* XXX - shortcut for ordinary vectors. Shouldn't be necessary but
1217 the array code can not deal with zero-length dimensions yet, and
1218 we want to allow zero-length vectors, of course. */
1219 if (c == '(')
1220 return scm_read_vector (c, port, opts, line, column);
1221
1222 /* Disambiguate between '#f' and uniform floating point vectors. */
1223 if (c == 'f')
1224 {
1225 c = scm_getc_unlocked (port);
1226 if (c != '3' && c != '6')
1227 {
1228 if (c == 'a' && try_read_ci_chars (port, "lse"))
1229 return SCM_BOOL_F;
1230 else if (c != EOF)
1231 scm_ungetc_unlocked (c, port);
1232 return SCM_BOOL_F;
1233 }
1234 rank = 1;
1235 tag_buf[0] = 'f';
1236 tag_len = 1;
1237 goto continue_reading_tag;
1238 }
1239
1240 /* Read rank. */
1241 rank = 1;
1242 c = read_decimal_integer (port, c, &rank);
1243 if (rank < 0)
1244 scm_i_input_error (NULL, port, "array rank must be non-negative",
1245 SCM_EOL);
1246
1247 /* Read tag. */
1248 tag_len = 0;
1249 continue_reading_tag:
1250 while (c != EOF && c != '(' && c != '@' && c != ':'
1251 && tag_len < sizeof tag_buf / sizeof tag_buf[0])
1252 {
1253 tag_buf[tag_len++] = c;
1254 c = scm_getc_unlocked (port);
1255 }
1256 if (tag_len == 0)
1257 tag = SCM_BOOL_T;
1258 else
1259 {
1260 tag = scm_string_to_symbol (scm_from_utf32_stringn (tag_buf, tag_len));
1261 if (tag_len == sizeof tag_buf / sizeof tag_buf[0])
1262 scm_i_input_error (NULL, port, "invalid array tag, starting with: ~a",
1263 scm_list_1 (tag));
1264 }
1265
1266 /* Read shape. */
1267 if (c == '@' || c == ':')
1268 {
1269 shape = SCM_EOL;
1270
1271 do
1272 {
1273 ssize_t lbnd = 0, len = 0;
1274 SCM s;
1275
1276 if (c == '@')
1277 {
1278 c = scm_getc_unlocked (port);
1279 c = read_decimal_integer (port, c, &lbnd);
1280 }
1281
1282 s = scm_from_ssize_t (lbnd);
1283
1284 if (c == ':')
1285 {
1286 c = scm_getc_unlocked (port);
1287 c = read_decimal_integer (port, c, &len);
1288 if (len < 0)
1289 scm_i_input_error (NULL, port,
1290 "array length must be non-negative",
1291 SCM_EOL);
1292
1293 s = scm_list_2 (s, scm_from_ssize_t (lbnd+len-1));
1294 }
1295
1296 shape = scm_cons (s, shape);
1297 } while (c == '@' || c == ':');
1298
1299 shape = scm_reverse_x (shape, SCM_EOL);
1300 }
1301
1302 /* Read nested lists of elements. */
1303 if (c != '(')
1304 scm_i_input_error (NULL, port,
1305 "missing '(' in vector or array literal",
1306 SCM_EOL);
1307 elements = scm_read_sexp (c, port, opts);
1308
1309 if (scm_is_false (shape))
1310 shape = scm_from_ssize_t (rank);
1311 else if (scm_ilength (shape) != rank)
1312 scm_i_input_error
1313 (NULL, port,
1314 "the number of shape specifications must match the array rank",
1315 SCM_EOL);
1316
1317 /* Handle special print syntax of rank zero arrays; see
1318 scm_i_print_array for a rationale. */
1319 if (rank == 0)
1320 {
1321 if (!scm_is_pair (elements))
1322 scm_i_input_error (NULL, port,
1323 "too few elements in array literal, need 1",
1324 SCM_EOL);
1325 if (!scm_is_null (SCM_CDR (elements)))
1326 scm_i_input_error (NULL, port,
1327 "too many elements in array literal, want 1",
1328 SCM_EOL);
1329 elements = SCM_CAR (elements);
1330 }
1331
1332 /* Construct array, annotate with source location, and return. */
1333 array = scm_list_to_typed_array (tag, shape, elements);
1334 return maybe_annotate_source (array, port, opts, line, column);
1335 }
1336
1337 static SCM
1338 scm_read_srfi4_vector (int chr, SCM port, scm_t_read_opts *opts,
1339 long line, int column)
1340 {
1341 return scm_read_array (chr, port, opts, line, column);
1342 }
1343
1344 static SCM
1345 scm_read_bytevector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1346 long line, int column)
1347 {
1348 chr = scm_getc_unlocked (port);
1349 if (chr != 'u')
1350 goto syntax;
1351
1352 chr = scm_getc_unlocked (port);
1353 if (chr != '8')
1354 goto syntax;
1355
1356 chr = scm_getc_unlocked (port);
1357 if (chr != '(')
1358 goto syntax;
1359
1360 return maybe_annotate_source
1361 (scm_u8_list_to_bytevector (scm_read_sexp (chr, port, opts)),
1362 port, opts, line, column);
1363
1364 syntax:
1365 scm_i_input_error ("read_bytevector", port,
1366 "invalid bytevector prefix",
1367 SCM_MAKE_CHAR (chr));
1368 return SCM_UNSPECIFIED;
1369 }
1370
1371 static SCM
1372 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1373 long line, int column)
1374 {
1375 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1376 terribly inefficient but who cares? */
1377 SCM s_bits = SCM_EOL;
1378
1379 for (chr = scm_getc_unlocked (port);
1380 (chr != EOF) && ((chr == '0') || (chr == '1'));
1381 chr = scm_getc_unlocked (port))
1382 {
1383 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1384 }
1385
1386 if (chr != EOF)
1387 scm_ungetc_unlocked (chr, port);
1388
1389 return maybe_annotate_source
1390 (scm_bitvector (scm_reverse_x (s_bits, SCM_EOL)),
1391 port, opts, line, column);
1392 }
1393
1394 static SCM
1395 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1396 {
1397 int bang_seen = 0;
1398
1399 for (;;)
1400 {
1401 int c = scm_getc_unlocked (port);
1402
1403 if (c == EOF)
1404 scm_i_input_error ("skip_block_comment", port,
1405 "unterminated `#! ... !#' comment", SCM_EOL);
1406
1407 if (c == '!')
1408 bang_seen = 1;
1409 else if (c == '#' && bang_seen)
1410 break;
1411 else
1412 bang_seen = 0;
1413 }
1414
1415 return SCM_UNSPECIFIED;
1416 }
1417
1418 static void set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts,
1419 int value);
1420 static void set_port_square_brackets_p (SCM port, scm_t_read_opts *opts,
1421 int value);
1422 static void set_port_curly_infix_p (SCM port, scm_t_read_opts *opts,
1423 int value);
1424
1425 static SCM
1426 scm_read_shebang (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
1427 {
1428 char name[READER_DIRECTIVE_NAME_MAX_SIZE + 1];
1429 int c;
1430 int i = 0;
1431
1432 while (i <= READER_DIRECTIVE_NAME_MAX_SIZE)
1433 {
1434 c = scm_getc_unlocked (port);
1435 if (c == EOF)
1436 scm_i_input_error ("skip_block_comment", port,
1437 "unterminated `#! ... !#' comment", SCM_EOL);
1438 else if (('a' <= c && c <= 'z') || ('0' <= c && c <= '9') || c == '-')
1439 name[i++] = c;
1440 else if (CHAR_IS_DELIMITER (c))
1441 {
1442 scm_ungetc_unlocked (c, port);
1443 name[i] = '\0';
1444 if (0 == strcmp ("r6rs", name))
1445 ; /* Silently ignore */
1446 else if (0 == strcmp ("fold-case", name))
1447 set_port_case_insensitive_p (port, opts, 1);
1448 else if (0 == strcmp ("no-fold-case", name))
1449 set_port_case_insensitive_p (port, opts, 0);
1450 else if (0 == strcmp ("curly-infix", name))
1451 set_port_curly_infix_p (port, opts, 1);
1452 else if (0 == strcmp ("curly-infix-and-bracket-lists", name))
1453 {
1454 set_port_curly_infix_p (port, opts, 1);
1455 set_port_square_brackets_p (port, opts, 0);
1456 }
1457 else
1458 break;
1459
1460 return SCM_UNSPECIFIED;
1461 }
1462 else
1463 {
1464 scm_ungetc_unlocked (c, port);
1465 break;
1466 }
1467 }
1468 while (i > 0)
1469 scm_ungetc_unlocked (name[--i], port);
1470 return scm_read_scsh_block_comment (chr, port);
1471 }
1472
1473 static SCM
1474 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1475 {
1476 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1477 nested. So care must be taken. */
1478 int nesting_level = 1;
1479
1480 int a = scm_getc_unlocked (port);
1481
1482 if (a == EOF)
1483 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1484 "unterminated `#| ... |#' comment", SCM_EOL);
1485
1486 while (nesting_level > 0)
1487 {
1488 int b = scm_getc_unlocked (port);
1489
1490 if (b == EOF)
1491 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1492 "unterminated `#| ... |#' comment", SCM_EOL);
1493
1494 if (a == '|' && b == '#')
1495 {
1496 nesting_level--;
1497 b = EOF;
1498 }
1499 else if (a == '#' && b == '|')
1500 {
1501 nesting_level++;
1502 b = EOF;
1503 }
1504
1505 a = b;
1506 }
1507
1508 return SCM_UNSPECIFIED;
1509 }
1510
1511 static SCM
1512 scm_read_commented_expression (scm_t_wchar chr, SCM port,
1513 scm_t_read_opts *opts)
1514 {
1515 scm_t_wchar c;
1516
1517 c = flush_ws (port, opts, (char *) NULL);
1518 if (EOF == c)
1519 scm_i_input_error ("read_commented_expression", port,
1520 "no expression after #; comment", SCM_EOL);
1521 scm_ungetc_unlocked (c, port);
1522 scm_read_expression (port, opts);
1523 return SCM_UNSPECIFIED;
1524 }
1525
1526 static SCM
1527 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1528 {
1529 /* Guile's extended symbol read syntax looks like this:
1530
1531 #{This is all a symbol name}#
1532
1533 So here, CHR is expected to be `{'. */
1534 int saw_brace = 0;
1535 size_t len = 0;
1536 SCM buf = scm_i_make_string (1024, NULL, 0);
1537
1538 buf = scm_i_string_start_writing (buf);
1539
1540 while ((chr = scm_getc_unlocked (port)) != EOF)
1541 {
1542 if (saw_brace)
1543 {
1544 if (chr == '#')
1545 {
1546 break;
1547 }
1548 else
1549 {
1550 saw_brace = 0;
1551 scm_i_string_set_x (buf, len++, '}');
1552 }
1553 }
1554
1555 if (chr == '}')
1556 saw_brace = 1;
1557 else if (chr == '\\')
1558 {
1559 /* It used to be that print.c would print extended-read-syntax
1560 symbols with backslashes before "non-standard" chars, but
1561 this routine wouldn't do anything with those escapes.
1562 Bummer. What we've done is to change print.c to output
1563 R6RS hex escapes for those characters, relying on the fact
1564 that the extended read syntax would never put a `\' before
1565 an `x'. For now, we just ignore other instances of
1566 backslash in the string. */
1567 switch ((chr = scm_getc_unlocked (port)))
1568 {
1569 case EOF:
1570 goto done;
1571 case 'x':
1572 {
1573 scm_t_wchar c;
1574
1575 SCM_READ_HEX_ESCAPE (10, ';');
1576 scm_i_string_set_x (buf, len++, c);
1577 break;
1578
1579 str_eof:
1580 chr = EOF;
1581 goto done;
1582
1583 bad_escaped:
1584 scm_i_string_stop_writing ();
1585 scm_i_input_error ("scm_read_extended_symbol", port,
1586 "illegal character in escape sequence: ~S",
1587 scm_list_1 (SCM_MAKE_CHAR (c)));
1588 break;
1589 }
1590 default:
1591 scm_i_string_set_x (buf, len++, chr);
1592 break;
1593 }
1594 }
1595 else
1596 scm_i_string_set_x (buf, len++, chr);
1597
1598 if (len >= scm_i_string_length (buf) - 2)
1599 {
1600 SCM addy;
1601
1602 scm_i_string_stop_writing ();
1603 addy = scm_i_make_string (1024, NULL, 0);
1604 buf = scm_string_append (scm_list_2 (buf, addy));
1605 len = 0;
1606 buf = scm_i_string_start_writing (buf);
1607 }
1608 }
1609
1610 done:
1611 scm_i_string_stop_writing ();
1612 if (chr == EOF)
1613 scm_i_input_error ("scm_read_extended_symbol", port,
1614 "end of file while reading symbol", SCM_EOL);
1615
1616 return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1617 }
1618
1619
1620 \f
1621 /* Top-level token readers, i.e., dispatchers. */
1622
1623 static SCM
1624 scm_read_sharp_extension (int chr, SCM port, scm_t_read_opts *opts)
1625 {
1626 SCM proc;
1627
1628 proc = scm_get_hash_procedure (chr);
1629 if (scm_is_true (scm_procedure_p (proc)))
1630 {
1631 long line = SCM_LINUM (port);
1632 int column = SCM_COL (port) - 2;
1633 SCM got;
1634
1635 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1636
1637 if (opts->record_positions_p && SCM_NIMP (got)
1638 && !scm_i_has_source_properties (got))
1639 scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1640
1641 return got;
1642 }
1643
1644 return SCM_UNSPECIFIED;
1645 }
1646
1647 /* The reader for the sharp `#' character. It basically dispatches reads
1648 among the above token readers. */
1649 static SCM
1650 scm_read_sharp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1651 long line, int column)
1652 #define FUNC_NAME "scm_lreadr"
1653 {
1654 SCM result;
1655
1656 chr = scm_getc_unlocked (port);
1657
1658 result = scm_read_sharp_extension (chr, port, opts);
1659 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1660 return result;
1661
1662 switch (chr)
1663 {
1664 case '\\':
1665 return (scm_read_character (chr, port, opts));
1666 case '(':
1667 return (scm_read_vector (chr, port, opts, line, column));
1668 case 's':
1669 case 'u':
1670 case 'f':
1671 case 'c':
1672 /* This one may return either a boolean or an SRFI-4 vector. */
1673 return (scm_read_srfi4_vector (chr, port, opts, line, column));
1674 case 'v':
1675 return (scm_read_bytevector (chr, port, opts, line, column));
1676 case '*':
1677 return (scm_read_guile_bit_vector (chr, port, opts, line, column));
1678 case 't':
1679 case 'T':
1680 case 'F':
1681 return (scm_read_boolean (chr, port));
1682 case ':':
1683 return (scm_read_keyword (chr, port, opts));
1684 case '0': case '1': case '2': case '3': case '4':
1685 case '5': case '6': case '7': case '8': case '9':
1686 case '@':
1687 return (scm_read_array (chr, port, opts, line, column));
1688
1689 case 'i':
1690 case 'e':
1691 case 'b':
1692 case 'B':
1693 case 'o':
1694 case 'O':
1695 case 'd':
1696 case 'D':
1697 case 'x':
1698 case 'X':
1699 case 'I':
1700 case 'E':
1701 return (scm_read_number_and_radix (chr, port, opts));
1702 case '{':
1703 return (scm_read_extended_symbol (chr, port));
1704 case '!':
1705 return (scm_read_shebang (chr, port, opts));
1706 case ';':
1707 return (scm_read_commented_expression (chr, port, opts));
1708 case '`':
1709 case '\'':
1710 case ',':
1711 return (scm_read_syntax (chr, port, opts));
1712 case 'n':
1713 return (scm_read_nil (chr, port, opts));
1714 default:
1715 result = scm_read_sharp_extension (chr, port, opts);
1716 if (scm_is_eq (result, SCM_UNSPECIFIED))
1717 {
1718 /* To remain compatible with 1.8 and earlier, the following
1719 characters have lower precedence than `read-hash-extend'
1720 characters. */
1721 switch (chr)
1722 {
1723 case '|':
1724 return scm_read_r6rs_block_comment (chr, port);
1725 default:
1726 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1727 scm_list_1 (SCM_MAKE_CHAR (chr)));
1728 }
1729 }
1730 else
1731 return result;
1732 }
1733
1734 return SCM_UNSPECIFIED;
1735 }
1736 #undef FUNC_NAME
1737
1738 static SCM
1739 read_inner_expression (SCM port, scm_t_read_opts *opts)
1740 #define FUNC_NAME "read_inner_expression"
1741 {
1742 while (1)
1743 {
1744 scm_t_wchar chr;
1745
1746 chr = scm_getc_unlocked (port);
1747
1748 switch (chr)
1749 {
1750 case SCM_WHITE_SPACES:
1751 case SCM_LINE_INCREMENTORS:
1752 break;
1753 case ';':
1754 (void) scm_read_semicolon_comment (chr, port);
1755 break;
1756 case '{':
1757 if (opts->curly_infix_p)
1758 {
1759 if (opts->neoteric_p)
1760 return scm_read_sexp (chr, port, opts);
1761 else
1762 {
1763 SCM expr;
1764
1765 /* Enable neoteric expressions within curly braces */
1766 opts->neoteric_p = 1;
1767 expr = scm_read_sexp (chr, port, opts);
1768 opts->neoteric_p = 0;
1769 return expr;
1770 }
1771 }
1772 else
1773 return scm_read_mixed_case_symbol (chr, port, opts);
1774 case '[':
1775 if (opts->square_brackets_p)
1776 return scm_read_sexp (chr, port, opts);
1777 else if (opts->curly_infix_p)
1778 {
1779 /* The syntax of neoteric expressions requires that '[' be
1780 a delimiter when curly-infix is enabled, so it cannot
1781 be part of an unescaped symbol. We might as well do
1782 something useful with it, so we adopt Kawa's convention:
1783 [...] => ($bracket-list$ ...) */
1784 long line = SCM_LINUM (port);
1785 int column = SCM_COL (port) - 1;
1786 return maybe_annotate_source
1787 (scm_cons (sym_bracket_list, scm_read_sexp (chr, port, opts)),
1788 port, opts, line, column);
1789 }
1790 else
1791 return scm_read_mixed_case_symbol (chr, port, opts);
1792 case '(':
1793 return (scm_read_sexp (chr, port, opts));
1794 case '"':
1795 return (scm_read_string (chr, port, opts));
1796 case '|':
1797 if (opts->r7rs_symbols_p)
1798 return scm_read_r7rs_symbol (chr, port, opts);
1799 else
1800 return scm_read_mixed_case_symbol (chr, port, opts);
1801 case '\'':
1802 case '`':
1803 case ',':
1804 return (scm_read_quote (chr, port, opts));
1805 case '#':
1806 {
1807 long line = SCM_LINUM (port);
1808 int column = SCM_COL (port) - 1;
1809 SCM result = scm_read_sharp (chr, port, opts, line, column);
1810 if (scm_is_eq (result, SCM_UNSPECIFIED))
1811 /* We read a comment or some such. */
1812 break;
1813 else
1814 return result;
1815 }
1816 case ')':
1817 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1818 break;
1819 case '}':
1820 if (opts->curly_infix_p)
1821 scm_i_input_error (FUNC_NAME, port, "unexpected \"}\"", SCM_EOL);
1822 else
1823 return scm_read_mixed_case_symbol (chr, port, opts);
1824 case ']':
1825 if (opts->square_brackets_p)
1826 scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1827 /* otherwise fall through */
1828 case EOF:
1829 return SCM_EOF_VAL;
1830 case ':':
1831 if (opts->keyword_style == KEYWORD_STYLE_PREFIX)
1832 return scm_symbol_to_keyword (scm_read_expression (port, opts));
1833 /* Fall through. */
1834
1835 default:
1836 {
1837 if (((chr >= '0') && (chr <= '9'))
1838 || (strchr ("+-.", chr)))
1839 return (scm_read_number (chr, port, opts));
1840 else
1841 return (scm_read_mixed_case_symbol (chr, port, opts));
1842 }
1843 }
1844 }
1845 }
1846 #undef FUNC_NAME
1847
1848 static SCM
1849 scm_read_expression (SCM port, scm_t_read_opts *opts)
1850 #define FUNC_NAME "scm_read_expression"
1851 {
1852 if (!opts->neoteric_p)
1853 return read_inner_expression (port, opts);
1854 else
1855 {
1856 long line = 0;
1857 int column = 0;
1858 SCM expr;
1859
1860 if (opts->record_positions_p)
1861 {
1862 /* We need to get the position of the first non-whitespace
1863 character in order to correctly annotate neoteric
1864 expressions. For example, for the expression 'f(x)', the
1865 first call to 'read_inner_expression' reads the 'f' (which
1866 cannot be annotated), and then we later read the '(x)' and
1867 use it to construct the new list (f x). */
1868 int c = flush_ws (port, opts, (char *) NULL);
1869 if (c == EOF)
1870 return SCM_EOF_VAL;
1871 scm_ungetc_unlocked (c, port);
1872 line = SCM_LINUM (port);
1873 column = SCM_COL (port);
1874 }
1875
1876 expr = read_inner_expression (port, opts);
1877
1878 /* 'expr' is the first component of the neoteric expression. Now
1879 we loop, and as long as the next character is '(', '[', or '{',
1880 (without any intervening whitespace), we use it to construct a
1881 new expression. For example, f{n - 1}(x) => ((f (- n 1)) x). */
1882 for (;;)
1883 {
1884 int chr = scm_getc_unlocked (port);
1885
1886 if (chr == '(')
1887 /* e(...) => (e ...) */
1888 expr = scm_cons (expr, scm_read_sexp (chr, port, opts));
1889 else if (chr == '[')
1890 /* e[...] => ($bracket-apply$ e ...) */
1891 expr = scm_cons (sym_bracket_apply,
1892 scm_cons (expr,
1893 scm_read_sexp (chr, port, opts)));
1894 else if (chr == '{')
1895 {
1896 SCM arg = scm_read_sexp (chr, port, opts);
1897
1898 if (scm_is_null (arg))
1899 expr = scm_list_1 (expr); /* e{} => (e) */
1900 else
1901 expr = scm_list_2 (expr, arg); /* e{...} => (e {...}) */
1902 }
1903 else
1904 {
1905 if (chr != EOF)
1906 scm_ungetc_unlocked (chr, port);
1907 break;
1908 }
1909 maybe_annotate_source (expr, port, opts, line, column);
1910 }
1911 return expr;
1912 }
1913 }
1914 #undef FUNC_NAME
1915
1916 \f
1917 /* Actual reader. */
1918
1919 static void init_read_options (SCM port, scm_t_read_opts *opts);
1920
1921 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1922 (SCM port),
1923 "Read an s-expression from the input port @var{port}, or from\n"
1924 "the current input port if @var{port} is not specified.\n"
1925 "Any whitespace before the next token is discarded.")
1926 #define FUNC_NAME s_scm_read
1927 {
1928 scm_t_read_opts opts;
1929 int c;
1930
1931 if (SCM_UNBNDP (port))
1932 port = scm_current_input_port ();
1933 SCM_VALIDATE_OPINPORT (1, port);
1934
1935 init_read_options (port, &opts);
1936
1937 c = flush_ws (port, &opts, (char *) NULL);
1938 if (EOF == c)
1939 return SCM_EOF_VAL;
1940 scm_ungetc_unlocked (c, port);
1941
1942 return (scm_read_expression (port, &opts));
1943 }
1944 #undef FUNC_NAME
1945
1946
1947 \f
1948
1949 /* Manipulate the read-hash-procedures alist. This could be written in
1950 Scheme, but maybe it will also be used by C code during initialisation. */
1951 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1952 (SCM chr, SCM proc),
1953 "Install the procedure @var{proc} for reading expressions\n"
1954 "starting with the character sequence @code{#} and @var{chr}.\n"
1955 "@var{proc} will be called with two arguments: the character\n"
1956 "@var{chr} and the port to read further data from. The object\n"
1957 "returned will be the return value of @code{read}. \n"
1958 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1959 )
1960 #define FUNC_NAME s_scm_read_hash_extend
1961 {
1962 SCM this;
1963 SCM prev;
1964
1965 SCM_VALIDATE_CHAR (1, chr);
1966 SCM_ASSERT (scm_is_false (proc)
1967 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1968 proc, SCM_ARG2, FUNC_NAME);
1969
1970 /* Check if chr is already in the alist. */
1971 this = scm_i_read_hash_procedures_ref ();
1972 prev = SCM_BOOL_F;
1973 while (1)
1974 {
1975 if (scm_is_null (this))
1976 {
1977 /* not found, so add it to the beginning. */
1978 if (scm_is_true (proc))
1979 {
1980 SCM new = scm_cons (scm_cons (chr, proc),
1981 scm_i_read_hash_procedures_ref ());
1982 scm_i_read_hash_procedures_set_x (new);
1983 }
1984 break;
1985 }
1986 if (scm_is_eq (chr, SCM_CAAR (this)))
1987 {
1988 /* already in the alist. */
1989 if (scm_is_false (proc))
1990 {
1991 /* remove it. */
1992 if (scm_is_false (prev))
1993 {
1994 SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1995 scm_i_read_hash_procedures_set_x (rest);
1996 }
1997 else
1998 scm_set_cdr_x (prev, SCM_CDR (this));
1999 }
2000 else
2001 {
2002 /* replace it. */
2003 scm_set_cdr_x (SCM_CAR (this), proc);
2004 }
2005 break;
2006 }
2007 prev = this;
2008 this = SCM_CDR (this);
2009 }
2010
2011 return SCM_UNSPECIFIED;
2012 }
2013 #undef FUNC_NAME
2014
2015 /* Recover the read-hash procedure corresponding to char c. */
2016 static SCM
2017 scm_get_hash_procedure (int c)
2018 {
2019 SCM rest = scm_i_read_hash_procedures_ref ();
2020
2021 while (1)
2022 {
2023 if (scm_is_null (rest))
2024 return SCM_BOOL_F;
2025
2026 if (SCM_CHAR (SCM_CAAR (rest)) == c)
2027 return SCM_CDAR (rest);
2028
2029 rest = SCM_CDR (rest);
2030 }
2031 }
2032
2033 static int
2034 is_encoding_char (char c)
2035 {
2036 if (c >= 'a' && c <= 'z') return 1;
2037 if (c >= 'A' && c <= 'Z') return 1;
2038 if (c >= '0' && c <= '9') return 1;
2039 return strchr ("_-.:/,+=()", c) != NULL;
2040 }
2041
2042 /* Maximum size of an encoding name. This is a bit more than the
2043 longest name listed at
2044 <http://www.iana.org/assignments/character-sets> ("ISO-2022-JP-2", 13
2045 characters.) */
2046 #define ENCODING_NAME_MAX_SIZE 20
2047
2048 /* Number of bytes at the beginning or end of a file that are scanned
2049 for a "coding:" declaration. */
2050 #define SCM_ENCODING_SEARCH_SIZE (500 + ENCODING_NAME_MAX_SIZE)
2051
2052
2053 /* Search the SCM_ENCODING_SEARCH_SIZE bytes of a file for an Emacs-like
2054 coding declaration. Returns either NULL or a string whose storage
2055 has been allocated with `scm_gc_malloc'. */
2056 char *
2057 scm_i_scan_for_encoding (SCM port)
2058 {
2059 scm_t_port *pt;
2060 char header[SCM_ENCODING_SEARCH_SIZE+1];
2061 size_t bytes_read, encoding_length, i;
2062 char *encoding = NULL;
2063 char *pos, *encoding_start;
2064 int in_comment;
2065
2066 pt = SCM_PTAB_ENTRY (port);
2067
2068 if (pt->rw_active == SCM_PORT_WRITE)
2069 scm_flush_unlocked (port);
2070
2071 if (pt->rw_random)
2072 pt->rw_active = SCM_PORT_READ;
2073
2074 if (pt->read_pos == pt->read_end)
2075 {
2076 /* We can use the read buffer, and thus avoid a seek. */
2077 if (scm_fill_input_unlocked (port) == EOF)
2078 return NULL;
2079
2080 bytes_read = pt->read_end - pt->read_pos;
2081 if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
2082 bytes_read = SCM_ENCODING_SEARCH_SIZE;
2083
2084 if (bytes_read <= 1)
2085 /* An unbuffered port -- don't scan. */
2086 return NULL;
2087
2088 memcpy (header, pt->read_pos, bytes_read);
2089 header[bytes_read] = '\0';
2090 }
2091 else
2092 {
2093 /* Try to read some bytes and then seek back. Not all ports
2094 support seeking back; and indeed some file ports (like
2095 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
2096 check performed by SCM_FPORT_FDES---but fail to seek
2097 backwards. Hence this block comes second. We prefer to use
2098 the read buffer in-place. */
2099 if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
2100 return NULL;
2101
2102 bytes_read = scm_c_read_unlocked (port, header, SCM_ENCODING_SEARCH_SIZE);
2103 header[bytes_read] = '\0';
2104 scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
2105 }
2106
2107 /* search past "coding[:=]" */
2108 pos = header;
2109 while (1)
2110 {
2111 if ((pos = strstr(pos, "coding")) == NULL)
2112 return NULL;
2113
2114 pos += strlen ("coding");
2115 if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
2116 (*pos == ':' || *pos == '='))
2117 {
2118 pos ++;
2119 break;
2120 }
2121 }
2122
2123 /* skip spaces */
2124 while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
2125 (*pos == ' ' || *pos == '\t'))
2126 pos ++;
2127
2128 if (pos - header >= SCM_ENCODING_SEARCH_SIZE - ENCODING_NAME_MAX_SIZE)
2129 /* We found the "coding:" string, but there is probably not enough
2130 room to store an encoding name in its entirety, so ignore it.
2131 This makes sure we do not end up returning a truncated encoding
2132 name. */
2133 return NULL;
2134
2135 /* grab the next token */
2136 encoding_start = pos;
2137 i = 0;
2138 while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
2139 && encoding_start + i - header < bytes_read
2140 && is_encoding_char (encoding_start[i]))
2141 i++;
2142
2143 encoding_length = i;
2144 if (encoding_length == 0)
2145 return NULL;
2146
2147 encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
2148
2149 /* push backwards to make sure we were in a comment */
2150 in_comment = 0;
2151 pos = encoding_start;
2152 while (pos >= header)
2153 {
2154 if (*pos == ';')
2155 {
2156 in_comment = 1;
2157 break;
2158 }
2159 else if (*pos == '\n' || pos == header)
2160 {
2161 /* This wasn't in a semicolon comment. Check for a
2162 hash-bang comment. */
2163 char *beg = strstr (header, "#!");
2164 char *end = strstr (header, "!#");
2165 if (beg < encoding_start && encoding_start + encoding_length <= end)
2166 in_comment = 1;
2167 break;
2168 }
2169 else
2170 {
2171 pos --;
2172 continue;
2173 }
2174 }
2175 if (!in_comment)
2176 /* This wasn't in a comment */
2177 return NULL;
2178
2179 return encoding;
2180 }
2181
2182 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
2183 (SCM port),
2184 "Scans the port for an Emacs-like character coding declaration\n"
2185 "near the top of the contents of a port with random-accessible contents.\n"
2186 "The coding declaration is of the form\n"
2187 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
2188 "\n"
2189 "Returns a string containing the character encoding of the file\n"
2190 "if a declaration was found, or @code{#f} otherwise.\n")
2191 #define FUNC_NAME s_scm_file_encoding
2192 {
2193 char *enc;
2194 SCM s_enc;
2195
2196 SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
2197
2198 enc = scm_i_scan_for_encoding (port);
2199 if (enc == NULL)
2200 return SCM_BOOL_F;
2201 else
2202 {
2203 s_enc = scm_string_upcase (scm_from_locale_string (enc));
2204 return s_enc;
2205 }
2206
2207 return SCM_BOOL_F;
2208 }
2209 #undef FUNC_NAME
2210
2211 \f
2212 /* Per-port read options.
2213
2214 We store per-port read options in the 'port-read-options' port
2215 property, which is stored in the internal port structure. The value
2216 stored is a single integer that contains a two-bit field for each
2217 read option.
2218
2219 If a bit field contains READ_OPTION_INHERIT (3), that indicates that
2220 the applicable value should be inherited from the corresponding
2221 global read option. Otherwise, the bit field contains the value of
2222 the read option. For boolean read options that have been set
2223 per-port, the possible values are 0 or 1. If the 'keyword_style'
2224 read option has been set per-port, its possible values are those in
2225 'enum t_keyword_style'. */
2226
2227 /* Key to read options in port properties. */
2228 SCM_SYMBOL (sym_port_read_options, "port-read-options");
2229
2230 /* Offsets of bit fields for each per-port override */
2231 #define READ_OPTION_COPY_SOURCE_P 0
2232 #define READ_OPTION_RECORD_POSITIONS_P 2
2233 #define READ_OPTION_CASE_INSENSITIVE_P 4
2234 #define READ_OPTION_KEYWORD_STYLE 6
2235 #define READ_OPTION_R6RS_ESCAPES_P 8
2236 #define READ_OPTION_SQUARE_BRACKETS_P 10
2237 #define READ_OPTION_HUNGRY_EOL_ESCAPES_P 12
2238 #define READ_OPTION_CURLY_INFIX_P 14
2239 #define READ_OPTION_R7RS_SYMBOLS_P 16
2240
2241 /* The total width in bits of the per-port overrides */
2242 #define READ_OPTIONS_NUM_BITS 18
2243
2244 #define READ_OPTIONS_INHERIT_ALL ((1UL << READ_OPTIONS_NUM_BITS) - 1)
2245 #define READ_OPTIONS_MAX_VALUE READ_OPTIONS_INHERIT_ALL
2246
2247 #define READ_OPTION_MASK 3
2248 #define READ_OPTION_INHERIT 3
2249
2250 static void
2251 set_port_read_option (SCM port, int option, int new_value)
2252 {
2253 SCM scm_read_options;
2254 unsigned int read_options;
2255
2256 new_value &= READ_OPTION_MASK;
2257
2258 scm_dynwind_begin (0);
2259 scm_dynwind_lock_port (port);
2260
2261 scm_read_options = scm_i_port_property (port, sym_port_read_options);
2262 if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2263 read_options = scm_to_uint (scm_read_options);
2264 else
2265 read_options = READ_OPTIONS_INHERIT_ALL;
2266 read_options &= ~(READ_OPTION_MASK << option);
2267 read_options |= new_value << option;
2268 scm_read_options = scm_from_uint (read_options);
2269 scm_i_set_port_property_x (port, sym_port_read_options, scm_read_options);
2270
2271 scm_dynwind_end ();
2272 }
2273
2274 /* Set OPTS and PORT's case-insensitivity according to VALUE. */
2275 static void
2276 set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts, int value)
2277 {
2278 value = !!value;
2279 opts->case_insensitive_p = value;
2280 set_port_read_option (port, READ_OPTION_CASE_INSENSITIVE_P, value);
2281 }
2282
2283 /* Set OPTS and PORT's square_brackets_p option according to VALUE. */
2284 static void
2285 set_port_square_brackets_p (SCM port, scm_t_read_opts *opts, int value)
2286 {
2287 value = !!value;
2288 opts->square_brackets_p = value;
2289 set_port_read_option (port, READ_OPTION_SQUARE_BRACKETS_P, value);
2290 }
2291
2292 /* Set OPTS and PORT's curly_infix_p option according to VALUE. */
2293 static void
2294 set_port_curly_infix_p (SCM port, scm_t_read_opts *opts, int value)
2295 {
2296 value = !!value;
2297 opts->curly_infix_p = value;
2298 set_port_read_option (port, READ_OPTION_CURLY_INFIX_P, value);
2299 }
2300
2301 /* Initialize OPTS based on PORT's read options and the global read
2302 options. */
2303 static void
2304 init_read_options (SCM port, scm_t_read_opts *opts)
2305 {
2306 SCM val, scm_read_options;
2307 unsigned int read_options, x;
2308
2309 scm_read_options = scm_i_port_property (port, sym_port_read_options);
2310
2311 if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2312 read_options = scm_to_uint (scm_read_options);
2313 else
2314 read_options = READ_OPTIONS_INHERIT_ALL;
2315
2316 x = READ_OPTION_MASK & (read_options >> READ_OPTION_KEYWORD_STYLE);
2317 if (x == READ_OPTION_INHERIT)
2318 {
2319 val = SCM_PACK (SCM_KEYWORD_STYLE);
2320 if (scm_is_eq (val, scm_keyword_prefix))
2321 x = KEYWORD_STYLE_PREFIX;
2322 else if (scm_is_eq (val, scm_keyword_postfix))
2323 x = KEYWORD_STYLE_POSTFIX;
2324 else
2325 x = KEYWORD_STYLE_HASH_PREFIX;
2326 }
2327 opts->keyword_style = x;
2328
2329 #define RESOLVE_BOOLEAN_OPTION(NAME, name) \
2330 do \
2331 { \
2332 x = READ_OPTION_MASK & (read_options >> READ_OPTION_ ## NAME); \
2333 if (x == READ_OPTION_INHERIT) \
2334 x = !!SCM_ ## NAME; \
2335 opts->name = x; \
2336 } \
2337 while (0)
2338
2339 RESOLVE_BOOLEAN_OPTION (COPY_SOURCE_P, copy_source_p);
2340 RESOLVE_BOOLEAN_OPTION (RECORD_POSITIONS_P, record_positions_p);
2341 RESOLVE_BOOLEAN_OPTION (CASE_INSENSITIVE_P, case_insensitive_p);
2342 RESOLVE_BOOLEAN_OPTION (R6RS_ESCAPES_P, r6rs_escapes_p);
2343 RESOLVE_BOOLEAN_OPTION (SQUARE_BRACKETS_P, square_brackets_p);
2344 RESOLVE_BOOLEAN_OPTION (HUNGRY_EOL_ESCAPES_P, hungry_eol_escapes_p);
2345 RESOLVE_BOOLEAN_OPTION (CURLY_INFIX_P, curly_infix_p);
2346 RESOLVE_BOOLEAN_OPTION (R7RS_SYMBOLS_P, r7rs_symbols_p);
2347
2348 #undef RESOLVE_BOOLEAN_OPTION
2349
2350 opts->neoteric_p = 0;
2351 }
2352
2353 void
2354 scm_init_read ()
2355 {
2356 SCM read_hash_procs;
2357
2358 read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
2359
2360 scm_i_read_hash_procedures =
2361 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
2362
2363 scm_init_opts (scm_read_options, scm_read_opts);
2364 #include "libguile/read.x"
2365 }
2366
2367 /*
2368 Local Variables:
2369 c-file-style: "gnu"
2370 End:
2371 */