Implement #!fold-case and #!no-fold-case reader directives.
[bpt/guile.git] / libguile / read.c
1 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
2 * 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
18 */
19
20
21 \f
22
23 #ifdef HAVE_CONFIG_H
24 # include <config.h>
25 #endif
26
27 #include <stdio.h>
28 #include <ctype.h>
29 #include <string.h>
30 #include <unistd.h>
31 #include <unicase.h>
32 #include <unictype.h>
33
34 #include "libguile/_scm.h"
35 #include "libguile/bytevectors.h"
36 #include "libguile/chars.h"
37 #include "libguile/eval.h"
38 #include "libguile/arrays.h"
39 #include "libguile/bitvectors.h"
40 #include "libguile/keywords.h"
41 #include "libguile/alist.h"
42 #include "libguile/srcprop.h"
43 #include "libguile/hashtab.h"
44 #include "libguile/hash.h"
45 #include "libguile/ports.h"
46 #include "libguile/fports.h"
47 #include "libguile/root.h"
48 #include "libguile/strings.h"
49 #include "libguile/strports.h"
50 #include "libguile/vectors.h"
51 #include "libguile/validate.h"
52 #include "libguile/srfi-4.h"
53 #include "libguile/srfi-13.h"
54
55 #include "libguile/read.h"
56 #include "libguile/private-options.h"
57
58
59 \f
60
61 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
62 SCM_SYMBOL (scm_keyword_prefix, "prefix");
63 SCM_SYMBOL (scm_keyword_postfix, "postfix");
64 SCM_SYMBOL (sym_nil, "nil");
65
66 scm_t_option scm_read_opts[] = {
67 { SCM_OPTION_BOOLEAN, "copy", 0,
68 "Copy source code expressions." },
69 { SCM_OPTION_BOOLEAN, "positions", 1,
70 "Record positions of source code expressions." },
71 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
72 "Convert symbols to lower case."},
73 { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
74 "Style of keyword recognition: #f, 'prefix or 'postfix."},
75 { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
76 "Use R6RS variable-length character and string hex escapes."},
77 { SCM_OPTION_BOOLEAN, "square-brackets", 1,
78 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
79 { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
80 "In strings, consume leading whitespace after an escaped end-of-line."},
81 { 0, },
82 };
83
84 /* Internal read options structure. This is initialized by 'scm_read'
85 from the global and per-port read options, and a pointer is passed
86 down to all helper functions. */
87
88 enum t_keyword_style
89 {
90 KEYWORD_STYLE_HASH_PREFIX,
91 KEYWORD_STYLE_PREFIX,
92 KEYWORD_STYLE_POSTFIX
93 };
94
95 struct t_read_opts
96 {
97 enum t_keyword_style keyword_style;
98 unsigned int copy_source_p : 1;
99 unsigned int record_positions_p : 1;
100 unsigned int case_insensitive_p : 1;
101 unsigned int r6rs_escapes_p : 1;
102 unsigned int square_brackets_p : 1;
103 unsigned int hungry_eol_escapes_p : 1;
104 };
105
106 typedef struct t_read_opts scm_t_read_opts;
107
108
109 /*
110 Give meaningful error messages for errors
111
112 We use the format
113
114 FILE:LINE:COL: MESSAGE
115 This happened in ....
116
117 This is not standard GNU format, but the test-suite likes the real
118 message to be in front.
119
120 */
121
122
123 void
124 scm_i_input_error (char const *function,
125 SCM port, const char *message, SCM arg)
126 {
127 SCM fn = (scm_is_string (SCM_FILENAME(port))
128 ? SCM_FILENAME(port)
129 : scm_from_locale_string ("#<unknown port>"));
130
131 SCM string_port = scm_open_output_string ();
132 SCM string = SCM_EOL;
133 scm_simple_format (string_port,
134 scm_from_locale_string ("~A:~S:~S: ~A"),
135 scm_list_4 (fn,
136 scm_from_long (SCM_LINUM (port) + 1),
137 scm_from_int (SCM_COL (port) + 1),
138 scm_from_locale_string (message)));
139
140 string = scm_get_output_string (string_port);
141 scm_close_output_port (string_port);
142 scm_error_scm (scm_from_latin1_symbol ("read-error"),
143 function? scm_from_locale_string (function) : SCM_BOOL_F,
144 string,
145 arg,
146 SCM_BOOL_F);
147 }
148
149
150 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
151 (SCM setting),
152 "Option interface for the read options. Instead of using\n"
153 "this procedure directly, use the procedures @code{read-enable},\n"
154 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
155 #define FUNC_NAME s_scm_read_options
156 {
157 SCM ans = scm_options (setting,
158 scm_read_opts,
159 FUNC_NAME);
160 if (SCM_COPY_SOURCE_P)
161 SCM_RECORD_POSITIONS_P = 1;
162 return ans;
163 }
164 #undef FUNC_NAME
165
166 /* A fluid referring to an association list mapping extra hash
167 characters to procedures. */
168 static SCM *scm_i_read_hash_procedures;
169
170 static SCM
171 scm_i_read_hash_procedures_ref (void)
172 {
173 return scm_fluid_ref (*scm_i_read_hash_procedures);
174 }
175
176 static void
177 scm_i_read_hash_procedures_set_x (SCM value)
178 {
179 scm_fluid_set_x (*scm_i_read_hash_procedures, value);
180 }
181
182 \f
183 /* Token readers. */
184
185
186 /* Size of the C buffer used to read symbols and numbers. */
187 #define READER_BUFFER_SIZE 128
188
189 /* Number of 32-bit codepoints in the buffer used to read strings. */
190 #define READER_STRING_BUFFER_SIZE 128
191
192 /* The maximum size of Scheme character names. */
193 #define READER_CHAR_NAME_MAX_SIZE 50
194
195 /* The maximum size of reader directive names. */
196 #define READER_DIRECTIVE_NAME_MAX_SIZE 50
197
198
199 /* `isblank' is only in C99. */
200 #define CHAR_IS_BLANK_(_chr) \
201 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
202 || ((_chr) == '\f') || ((_chr) == '\r'))
203
204 #ifdef MSDOS
205 # define CHAR_IS_BLANK(_chr) \
206 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
207 #else
208 # define CHAR_IS_BLANK CHAR_IS_BLANK_
209 #endif
210
211
212 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
213 structure''). */
214 #define CHAR_IS_R5RS_DELIMITER(c) \
215 (CHAR_IS_BLANK (c) \
216 || (c) == ')' || (c) == '(' || (c) == ';' || (c) == '"')
217
218 #define CHAR_IS_DELIMITER(c) \
219 (CHAR_IS_R5RS_DELIMITER (c) \
220 || (((c) == ']' || (c) == '[') && opts->square_brackets_p))
221
222 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
223 Structure''. */
224 #define CHAR_IS_EXPONENT_MARKER(_chr) \
225 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
226 || ((_chr) == 'd') || ((_chr) == 'l'))
227
228 /* Read an SCSH block comment. */
229 static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
230 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
231 static SCM scm_read_commented_expression (scm_t_wchar, SCM, scm_t_read_opts *);
232 static SCM scm_read_shebang (scm_t_wchar, SCM, scm_t_read_opts *);
233 static SCM scm_get_hash_procedure (int);
234
235 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
236 result in the pre-allocated buffer BUF. Return zero if the whole token has
237 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
238 bytes actually read. */
239 static int
240 read_token (SCM port, scm_t_read_opts *opts,
241 char *buf, size_t buf_size, size_t *read)
242 {
243 *read = 0;
244
245 while (*read < buf_size)
246 {
247 int chr;
248
249 chr = scm_get_byte_or_eof (port);
250
251 if (chr == EOF)
252 return 0;
253 else if (CHAR_IS_DELIMITER (chr))
254 {
255 scm_unget_byte (chr, port);
256 return 0;
257 }
258 else
259 {
260 *buf = (char) chr;
261 buf++, (*read)++;
262 }
263 }
264
265 return 1;
266 }
267
268 /* Like `read_token', but return either BUFFER, or a GC-allocated buffer
269 if the token doesn't fit in BUFFER_SIZE bytes. */
270 static char *
271 read_complete_token (SCM port, scm_t_read_opts *opts,
272 char *buffer, size_t buffer_size, size_t *read)
273 {
274 int overflow = 0;
275 size_t bytes_read, overflow_size = 0;
276 char *overflow_buffer = NULL;
277
278 do
279 {
280 overflow = read_token (port, opts, buffer, buffer_size, &bytes_read);
281 if (bytes_read == 0)
282 break;
283 if (overflow || overflow_size != 0)
284 {
285 if (overflow_size == 0)
286 {
287 overflow_buffer = scm_gc_malloc_pointerless (bytes_read, "read");
288 memcpy (overflow_buffer, buffer, bytes_read);
289 overflow_size = bytes_read;
290 }
291 else
292 {
293 char *new_buf =
294 scm_gc_malloc_pointerless (overflow_size + bytes_read, "read");
295
296 memcpy (new_buf, overflow_buffer, overflow_size);
297 memcpy (new_buf + overflow_size, buffer, bytes_read);
298
299 overflow_buffer = new_buf;
300 overflow_size += bytes_read;
301 }
302 }
303 }
304 while (overflow);
305
306 if (overflow_size)
307 *read = overflow_size;
308 else
309 *read = bytes_read;
310
311 return (overflow_size > 0 ? overflow_buffer : buffer);
312 }
313
314 /* Skip whitespace from PORT and return the first non-whitespace character
315 read. Raise an error on end-of-file. */
316 static int
317 flush_ws (SCM port, scm_t_read_opts *opts, const char *eoferr)
318 {
319 scm_t_wchar c;
320 while (1)
321 switch (c = scm_getc (port))
322 {
323 case EOF:
324 goteof:
325 if (eoferr)
326 {
327 scm_i_input_error (eoferr,
328 port,
329 "end of file",
330 SCM_EOL);
331 }
332 return c;
333
334 case ';':
335 lp:
336 switch (c = scm_getc (port))
337 {
338 case EOF:
339 goto goteof;
340 default:
341 goto lp;
342 case SCM_LINE_INCREMENTORS:
343 break;
344 }
345 break;
346
347 case '#':
348 switch (c = scm_getc (port))
349 {
350 case EOF:
351 eoferr = "read_sharp";
352 goto goteof;
353 case '!':
354 scm_read_shebang (c, port, opts);
355 break;
356 case ';':
357 scm_read_commented_expression (c, port, opts);
358 break;
359 case '|':
360 if (scm_is_false (scm_get_hash_procedure (c)))
361 {
362 scm_read_r6rs_block_comment (c, port);
363 break;
364 }
365 /* fall through */
366 default:
367 scm_ungetc (c, port);
368 return '#';
369 }
370 break;
371
372 case SCM_LINE_INCREMENTORS:
373 case SCM_SINGLE_SPACES:
374 case '\t':
375 break;
376
377 default:
378 return c;
379 }
380
381 return 0;
382 }
383
384
385 \f
386 /* Token readers. */
387
388 static SCM scm_read_expression (SCM port, scm_t_read_opts *opts);
389 static SCM scm_read_sharp (int chr, SCM port, scm_t_read_opts *opts,
390 long line, int column);
391
392
393 static SCM
394 maybe_annotate_source (SCM x, SCM port, scm_t_read_opts *opts,
395 long line, int column)
396 {
397 if (opts->record_positions_p)
398 scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
399 return x;
400 }
401
402 static SCM
403 scm_read_sexp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
404 #define FUNC_NAME "scm_i_lreadparen"
405 {
406 int c;
407 SCM tmp, tl, ans = SCM_EOL;
408 const int terminating_char = ((chr == '[') ? ']' : ')');
409
410 /* Need to capture line and column numbers here. */
411 long line = SCM_LINUM (port);
412 int column = SCM_COL (port) - 1;
413
414 c = flush_ws (port, opts, FUNC_NAME);
415 if (terminating_char == c)
416 return SCM_EOL;
417
418 scm_ungetc (c, port);
419 tmp = scm_read_expression (port, opts);
420
421 /* Note that it is possible for scm_read_expression to return
422 scm_sym_dot, but not as part of a dotted pair: as in #{.}#. So
423 check that it's a real dot by checking `c'. */
424 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
425 {
426 ans = scm_read_expression (port, opts);
427 if (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
428 scm_i_input_error (FUNC_NAME, port, "missing close paren",
429 SCM_EOL);
430 return ans;
431 }
432
433 /* Build the head of the list structure. */
434 ans = tl = scm_cons (tmp, SCM_EOL);
435
436 while (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
437 {
438 SCM new_tail;
439
440 if (c == ')' || (c == ']' && opts->square_brackets_p))
441 scm_i_input_error (FUNC_NAME, port,
442 "in pair: mismatched close paren: ~A",
443 scm_list_1 (SCM_MAKE_CHAR (c)));
444
445 scm_ungetc (c, port);
446 tmp = scm_read_expression (port, opts);
447
448 /* See above note about scm_sym_dot. */
449 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
450 {
451 SCM_SETCDR (tl, scm_read_expression (port, opts));
452
453 c = flush_ws (port, opts, FUNC_NAME);
454 if (terminating_char != c)
455 scm_i_input_error (FUNC_NAME, port,
456 "in pair: missing close paren", SCM_EOL);
457 goto exit;
458 }
459
460 new_tail = scm_cons (tmp, SCM_EOL);
461 SCM_SETCDR (tl, new_tail);
462 tl = new_tail;
463 }
464
465 exit:
466 return maybe_annotate_source (ans, port, opts, line, column);
467 }
468 #undef FUNC_NAME
469
470
471 /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
472 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
473 found. */
474 #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
475 do \
476 { \
477 scm_t_wchar a; \
478 size_t i = 0; \
479 c = 0; \
480 while (i < ndigits) \
481 { \
482 a = scm_getc (port); \
483 if (a == EOF) \
484 goto str_eof; \
485 if (terminator \
486 && (a == (scm_t_wchar) terminator) \
487 && (i > 0)) \
488 break; \
489 if ('0' <= a && a <= '9') \
490 a -= '0'; \
491 else if ('A' <= a && a <= 'F') \
492 a = a - 'A' + 10; \
493 else if ('a' <= a && a <= 'f') \
494 a = a - 'a' + 10; \
495 else \
496 { \
497 c = a; \
498 goto bad_escaped; \
499 } \
500 c = c * 16 + a; \
501 i ++; \
502 } \
503 } while (0)
504
505 static void
506 skip_intraline_whitespace (SCM port)
507 {
508 scm_t_wchar c;
509
510 do
511 {
512 c = scm_getc (port);
513 if (c == EOF)
514 return;
515 }
516 while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
517
518 scm_ungetc (c, port);
519 }
520
521 static SCM
522 scm_read_string (int chr, SCM port, scm_t_read_opts *opts)
523 #define FUNC_NAME "scm_lreadr"
524 {
525 /* For strings smaller than C_STR, this function creates only one Scheme
526 object (the string returned). */
527
528 SCM str = SCM_EOL;
529 size_t c_str_len = 0;
530 scm_t_wchar c, c_str[READER_STRING_BUFFER_SIZE];
531
532 /* Need to capture line and column numbers here. */
533 long line = SCM_LINUM (port);
534 int column = SCM_COL (port) - 1;
535
536 while ('"' != (c = scm_getc (port)))
537 {
538 if (c == EOF)
539 {
540 str_eof:
541 scm_i_input_error (FUNC_NAME, port,
542 "end of file in string constant", SCM_EOL);
543 }
544
545 if (c_str_len + 1 >= READER_STRING_BUFFER_SIZE)
546 {
547 str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
548 c_str_len = 0;
549 }
550
551 if (c == '\\')
552 {
553 switch (c = scm_getc (port))
554 {
555 case EOF:
556 goto str_eof;
557 case '"':
558 case '\\':
559 break;
560 case '\n':
561 if (opts->hungry_eol_escapes_p)
562 skip_intraline_whitespace (port);
563 continue;
564 case '0':
565 c = '\0';
566 break;
567 case 'f':
568 c = '\f';
569 break;
570 case 'n':
571 c = '\n';
572 break;
573 case 'r':
574 c = '\r';
575 break;
576 case 't':
577 c = '\t';
578 break;
579 case 'a':
580 c = '\007';
581 break;
582 case 'v':
583 c = '\v';
584 break;
585 case 'b':
586 c = '\010';
587 break;
588 case 'x':
589 if (opts->r6rs_escapes_p)
590 SCM_READ_HEX_ESCAPE (10, ';');
591 else
592 SCM_READ_HEX_ESCAPE (2, '\0');
593 break;
594 case 'u':
595 if (!opts->r6rs_escapes_p)
596 {
597 SCM_READ_HEX_ESCAPE (4, '\0');
598 break;
599 }
600 case 'U':
601 if (!opts->r6rs_escapes_p)
602 {
603 SCM_READ_HEX_ESCAPE (6, '\0');
604 break;
605 }
606 default:
607 bad_escaped:
608 scm_i_input_error (FUNC_NAME, port,
609 "illegal character in escape sequence: ~S",
610 scm_list_1 (SCM_MAKE_CHAR (c)));
611 }
612 }
613
614 c_str[c_str_len++] = c;
615 }
616
617 if (scm_is_null (str))
618 /* Fast path: we got a string that fits in C_STR. */
619 str = scm_from_utf32_stringn (c_str, c_str_len);
620 else
621 {
622 if (c_str_len > 0)
623 str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
624
625 str = scm_string_concatenate_reverse (str, SCM_UNDEFINED, SCM_UNDEFINED);
626 }
627
628 return maybe_annotate_source (str, port, opts, line, column);
629 }
630 #undef FUNC_NAME
631
632
633 static SCM
634 scm_read_number (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
635 {
636 SCM result, str = SCM_EOL;
637 char local_buffer[READER_BUFFER_SIZE], *buffer;
638 size_t bytes_read;
639 scm_t_port *pt = SCM_PTAB_ENTRY (port);
640
641 /* Need to capture line and column numbers here. */
642 long line = SCM_LINUM (port);
643 int column = SCM_COL (port) - 1;
644
645 scm_ungetc (chr, port);
646 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
647 &bytes_read);
648
649 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
650
651 result = scm_string_to_number (str, SCM_UNDEFINED);
652 if (scm_is_false (result))
653 {
654 /* Return a symbol instead of a number */
655 if (opts->case_insensitive_p)
656 str = scm_string_downcase_x (str);
657 result = scm_string_to_symbol (str);
658 }
659 else if (SCM_NIMP (result))
660 result = maybe_annotate_source (result, port, opts, line, column);
661
662 SCM_COL (port) += scm_i_string_length (str);
663 return result;
664 }
665
666 static SCM
667 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
668 {
669 SCM result;
670 int ends_with_colon = 0;
671 size_t bytes_read;
672 int postfix = (opts->keyword_style == KEYWORD_STYLE_POSTFIX);
673 char local_buffer[READER_BUFFER_SIZE], *buffer;
674 scm_t_port *pt = SCM_PTAB_ENTRY (port);
675 SCM str;
676
677 scm_ungetc (chr, port);
678 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
679 &bytes_read);
680 if (bytes_read > 0)
681 ends_with_colon = buffer[bytes_read - 1] == ':';
682
683 if (postfix && ends_with_colon && (bytes_read > 1))
684 {
685 str = scm_from_stringn (buffer, bytes_read - 1,
686 pt->encoding, pt->ilseq_handler);
687
688 if (opts->case_insensitive_p)
689 str = scm_string_downcase_x (str);
690 result = scm_symbol_to_keyword (scm_string_to_symbol (str));
691 }
692 else
693 {
694 str = scm_from_stringn (buffer, bytes_read,
695 pt->encoding, pt->ilseq_handler);
696
697 if (opts->case_insensitive_p)
698 str = scm_string_downcase_x (str);
699 result = scm_string_to_symbol (str);
700 }
701
702 SCM_COL (port) += scm_i_string_length (str);
703 return result;
704 }
705
706 static SCM
707 scm_read_number_and_radix (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
708 #define FUNC_NAME "scm_lreadr"
709 {
710 SCM result;
711 size_t read;
712 char local_buffer[READER_BUFFER_SIZE], *buffer;
713 unsigned int radix;
714 SCM str;
715 scm_t_port *pt;
716
717 switch (chr)
718 {
719 case 'B':
720 case 'b':
721 radix = 2;
722 break;
723
724 case 'o':
725 case 'O':
726 radix = 8;
727 break;
728
729 case 'd':
730 case 'D':
731 radix = 10;
732 break;
733
734 case 'x':
735 case 'X':
736 radix = 16;
737 break;
738
739 default:
740 scm_ungetc (chr, port);
741 scm_ungetc ('#', port);
742 radix = 10;
743 }
744
745 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
746 &read);
747
748 pt = SCM_PTAB_ENTRY (port);
749 str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
750
751 result = scm_string_to_number (str, scm_from_uint (radix));
752
753 SCM_COL (port) += scm_i_string_length (str);
754
755 if (scm_is_true (result))
756 return result;
757
758 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
759
760 return SCM_BOOL_F;
761 }
762 #undef FUNC_NAME
763
764 static SCM
765 scm_read_quote (int chr, SCM port, scm_t_read_opts *opts)
766 {
767 SCM p;
768 long line = SCM_LINUM (port);
769 int column = SCM_COL (port) - 1;
770
771 switch (chr)
772 {
773 case '`':
774 p = scm_sym_quasiquote;
775 break;
776
777 case '\'':
778 p = scm_sym_quote;
779 break;
780
781 case ',':
782 {
783 scm_t_wchar c;
784
785 c = scm_getc (port);
786 if ('@' == c)
787 p = scm_sym_uq_splicing;
788 else
789 {
790 scm_ungetc (c, port);
791 p = scm_sym_unquote;
792 }
793 break;
794 }
795
796 default:
797 fprintf (stderr, "%s: unhandled quote character (%i)\n",
798 "scm_read_quote", chr);
799 abort ();
800 }
801
802 p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
803 return maybe_annotate_source (p, port, opts, line, column);
804 }
805
806 SCM_SYMBOL (sym_syntax, "syntax");
807 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
808 SCM_SYMBOL (sym_unsyntax, "unsyntax");
809 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
810
811 static SCM
812 scm_read_syntax (int chr, SCM port, scm_t_read_opts *opts)
813 {
814 SCM p;
815 long line = SCM_LINUM (port);
816 int column = SCM_COL (port) - 1;
817
818 switch (chr)
819 {
820 case '`':
821 p = sym_quasisyntax;
822 break;
823
824 case '\'':
825 p = sym_syntax;
826 break;
827
828 case ',':
829 {
830 int c;
831
832 c = scm_getc (port);
833 if ('@' == c)
834 p = sym_unsyntax_splicing;
835 else
836 {
837 scm_ungetc (c, port);
838 p = sym_unsyntax;
839 }
840 break;
841 }
842
843 default:
844 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
845 "scm_read_syntax", chr);
846 abort ();
847 }
848
849 p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
850 return maybe_annotate_source (p, port, opts, line, column);
851 }
852
853 static SCM
854 scm_read_nil (int chr, SCM port, scm_t_read_opts *opts)
855 {
856 SCM id = scm_read_mixed_case_symbol (chr, port, opts);
857
858 if (!scm_is_eq (id, sym_nil))
859 scm_i_input_error ("scm_read_nil", port,
860 "unexpected input while reading #nil: ~a",
861 scm_list_1 (id));
862
863 return SCM_ELISP_NIL;
864 }
865
866 static SCM
867 scm_read_semicolon_comment (int chr, SCM port)
868 {
869 int c;
870
871 /* We use the get_byte here because there is no need to get the
872 locale correct with comment input. This presumes that newline
873 always represents itself no matter what the encoding is. */
874 for (c = scm_get_byte_or_eof (port);
875 (c != EOF) && (c != '\n');
876 c = scm_get_byte_or_eof (port));
877
878 return SCM_UNSPECIFIED;
879 }
880
881 \f
882 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
883
884 static SCM
885 scm_read_boolean (int chr, SCM port)
886 {
887 switch (chr)
888 {
889 case 't':
890 case 'T':
891 return SCM_BOOL_T;
892
893 case 'f':
894 case 'F':
895 return SCM_BOOL_F;
896 }
897
898 return SCM_UNSPECIFIED;
899 }
900
901 static SCM
902 scm_read_character (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
903 #define FUNC_NAME "scm_lreadr"
904 {
905 char buffer[READER_CHAR_NAME_MAX_SIZE];
906 SCM charname;
907 size_t charname_len, bytes_read;
908 scm_t_wchar cp;
909 int overflow;
910 scm_t_port *pt;
911
912 overflow = read_token (port, opts, buffer, READER_CHAR_NAME_MAX_SIZE,
913 &bytes_read);
914 if (overflow)
915 scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
916
917 if (bytes_read == 0)
918 {
919 chr = scm_getc (port);
920 if (chr == EOF)
921 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
922 "while reading character", SCM_EOL);
923
924 /* CHR must be a token delimiter, like a whitespace. */
925 return (SCM_MAKE_CHAR (chr));
926 }
927
928 pt = SCM_PTAB_ENTRY (port);
929
930 /* Simple ASCII characters can be processed immediately. Also, simple
931 ISO-8859-1 characters can be processed immediately if the encoding for this
932 port is ISO-8859-1. */
933 if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
934 {
935 SCM_COL (port) += 1;
936 return SCM_MAKE_CHAR (buffer[0]);
937 }
938
939 /* Otherwise, convert the buffer into a proper scheme string for
940 processing. */
941 charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
942 pt->ilseq_handler);
943 charname_len = scm_i_string_length (charname);
944 SCM_COL (port) += charname_len;
945 cp = scm_i_string_ref (charname, 0);
946 if (charname_len == 1)
947 return SCM_MAKE_CHAR (cp);
948
949 /* Ignore dotted circles, which may be used to keep combining characters from
950 combining with the backslash in #\charname. */
951 if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
952 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
953
954 if (cp >= '0' && cp < '8')
955 {
956 /* Dirk:FIXME:: This type of character syntax is not R5RS
957 * compliant. Further, it should be verified that the constant
958 * does only consist of octal digits. */
959 SCM p = scm_string_to_number (charname, scm_from_uint (8));
960 if (SCM_I_INUMP (p))
961 {
962 scm_t_wchar c = scm_to_uint32 (p);
963 if (SCM_IS_UNICODE_CHAR (c))
964 return SCM_MAKE_CHAR (c);
965 else
966 scm_i_input_error (FUNC_NAME, port,
967 "out-of-range octal character escape: ~a",
968 scm_list_1 (charname));
969 }
970 }
971
972 if (cp == 'x' && (charname_len > 1))
973 {
974 SCM p;
975
976 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
977 p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
978 scm_from_uint (16));
979 if (SCM_I_INUMP (p))
980 {
981 scm_t_wchar c = scm_to_uint32 (p);
982 if (SCM_IS_UNICODE_CHAR (c))
983 return SCM_MAKE_CHAR (c);
984 else
985 scm_i_input_error (FUNC_NAME, port,
986 "out-of-range hex character escape: ~a",
987 scm_list_1 (charname));
988 }
989 }
990
991 /* The names of characters should never have non-Latin1
992 characters. */
993 if (scm_i_is_narrow_string (charname)
994 || scm_i_try_narrow_string (charname))
995 { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
996 charname_len);
997 if (scm_is_true (ch))
998 return ch;
999 }
1000
1001 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
1002 scm_list_1 (charname));
1003
1004 return SCM_UNSPECIFIED;
1005 }
1006 #undef FUNC_NAME
1007
1008 static SCM
1009 scm_read_keyword (int chr, SCM port, scm_t_read_opts *opts)
1010 {
1011 SCM symbol;
1012
1013 /* Read the symbol that comprises the keyword. Doing this instead of
1014 invoking a specific symbol reader function allows `scm_read_keyword ()'
1015 to adapt to the delimiters currently valid of symbols.
1016
1017 XXX: This implementation allows sloppy syntaxes like `#: key'. */
1018 symbol = scm_read_expression (port, opts);
1019 if (!scm_is_symbol (symbol))
1020 scm_i_input_error ("scm_read_keyword", port,
1021 "keyword prefix `~a' not followed by a symbol: ~s",
1022 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1023
1024 return (scm_symbol_to_keyword (symbol));
1025 }
1026
1027 static SCM
1028 scm_read_vector (int chr, SCM port, scm_t_read_opts *opts,
1029 long line, int column)
1030 {
1031 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1032 guarantee that it's going to do what we want. After all, this is an
1033 implementation detail of `scm_read_vector ()', not a desirable
1034 property. */
1035 return maybe_annotate_source (scm_vector (scm_read_sexp (chr, port, opts)),
1036 port, opts, line, column);
1037 }
1038
1039 /* Helper used by scm_read_array */
1040 static int
1041 read_decimal_integer (SCM port, int c, ssize_t *resp)
1042 {
1043 ssize_t sign = 1;
1044 ssize_t res = 0;
1045 int got_it = 0;
1046
1047 if (c == '-')
1048 {
1049 sign = -1;
1050 c = scm_getc (port);
1051 }
1052
1053 while ('0' <= c && c <= '9')
1054 {
1055 res = 10*res + c-'0';
1056 got_it = 1;
1057 c = scm_getc (port);
1058 }
1059
1060 if (got_it)
1061 *resp = sign * res;
1062 return c;
1063 }
1064
1065 /* Read an array. This function can also read vectors and uniform
1066 vectors. Also, the conflict between '#f' and '#f32' and '#f64' is
1067 handled here.
1068
1069 C is the first character read after the '#'.
1070 */
1071 static SCM
1072 scm_read_array (int c, SCM port, scm_t_read_opts *opts, long line, int column)
1073 {
1074 ssize_t rank;
1075 scm_t_wchar tag_buf[8];
1076 int tag_len;
1077
1078 SCM tag, shape = SCM_BOOL_F, elements, array;
1079
1080 /* XXX - shortcut for ordinary vectors. Shouldn't be necessary but
1081 the array code can not deal with zero-length dimensions yet, and
1082 we want to allow zero-length vectors, of course.
1083 */
1084 if (c == '(')
1085 return scm_read_vector (c, port, opts, line, column);
1086
1087 /* Disambiguate between '#f' and uniform floating point vectors.
1088 */
1089 if (c == 'f')
1090 {
1091 c = scm_getc (port);
1092 if (c != '3' && c != '6')
1093 {
1094 if (c != EOF)
1095 scm_ungetc (c, port);
1096 return SCM_BOOL_F;
1097 }
1098 rank = 1;
1099 tag_buf[0] = 'f';
1100 tag_len = 1;
1101 goto continue_reading_tag;
1102 }
1103
1104 /* Read rank. */
1105 rank = 1;
1106 c = read_decimal_integer (port, c, &rank);
1107 if (rank < 0)
1108 scm_i_input_error (NULL, port, "array rank must be non-negative",
1109 SCM_EOL);
1110
1111 /* Read tag. */
1112 tag_len = 0;
1113 continue_reading_tag:
1114 while (c != EOF && c != '(' && c != '@' && c != ':'
1115 && tag_len < sizeof tag_buf / sizeof tag_buf[0])
1116 {
1117 tag_buf[tag_len++] = c;
1118 c = scm_getc (port);
1119 }
1120 if (tag_len == 0)
1121 tag = SCM_BOOL_T;
1122 else
1123 {
1124 tag = scm_string_to_symbol (scm_from_utf32_stringn (tag_buf, tag_len));
1125 if (tag_len == sizeof tag_buf / sizeof tag_buf[0])
1126 scm_i_input_error (NULL, port, "invalid array tag, starting with: ~a",
1127 scm_list_1 (tag));
1128 }
1129
1130 /* Read shape. */
1131 if (c == '@' || c == ':')
1132 {
1133 shape = SCM_EOL;
1134
1135 do
1136 {
1137 ssize_t lbnd = 0, len = 0;
1138 SCM s;
1139
1140 if (c == '@')
1141 {
1142 c = scm_getc (port);
1143 c = read_decimal_integer (port, c, &lbnd);
1144 }
1145
1146 s = scm_from_ssize_t (lbnd);
1147
1148 if (c == ':')
1149 {
1150 c = scm_getc (port);
1151 c = read_decimal_integer (port, c, &len);
1152 if (len < 0)
1153 scm_i_input_error (NULL, port,
1154 "array length must be non-negative",
1155 SCM_EOL);
1156
1157 s = scm_list_2 (s, scm_from_ssize_t (lbnd+len-1));
1158 }
1159
1160 shape = scm_cons (s, shape);
1161 } while (c == '@' || c == ':');
1162
1163 shape = scm_reverse_x (shape, SCM_EOL);
1164 }
1165
1166 /* Read nested lists of elements. */
1167 if (c != '(')
1168 scm_i_input_error (NULL, port,
1169 "missing '(' in vector or array literal",
1170 SCM_EOL);
1171 elements = scm_read_sexp (c, port, opts);
1172
1173 if (scm_is_false (shape))
1174 shape = scm_from_ssize_t (rank);
1175 else if (scm_ilength (shape) != rank)
1176 scm_i_input_error
1177 (NULL, port,
1178 "the number of shape specifications must match the array rank",
1179 SCM_EOL);
1180
1181 /* Handle special print syntax of rank zero arrays; see
1182 scm_i_print_array for a rationale. */
1183 if (rank == 0)
1184 {
1185 if (!scm_is_pair (elements))
1186 scm_i_input_error (NULL, port,
1187 "too few elements in array literal, need 1",
1188 SCM_EOL);
1189 if (!scm_is_null (SCM_CDR (elements)))
1190 scm_i_input_error (NULL, port,
1191 "too many elements in array literal, want 1",
1192 SCM_EOL);
1193 elements = SCM_CAR (elements);
1194 }
1195
1196 /* Construct array, annotate with source location, and return. */
1197 array = scm_list_to_typed_array (tag, shape, elements);
1198 return maybe_annotate_source (array, port, opts, line, column);
1199 }
1200
1201 static SCM
1202 scm_read_srfi4_vector (int chr, SCM port, scm_t_read_opts *opts,
1203 long line, int column)
1204 {
1205 return scm_read_array (chr, port, opts, line, column);
1206 }
1207
1208 static SCM
1209 scm_read_bytevector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1210 long line, int column)
1211 {
1212 chr = scm_getc (port);
1213 if (chr != 'u')
1214 goto syntax;
1215
1216 chr = scm_getc (port);
1217 if (chr != '8')
1218 goto syntax;
1219
1220 chr = scm_getc (port);
1221 if (chr != '(')
1222 goto syntax;
1223
1224 return maybe_annotate_source
1225 (scm_u8_list_to_bytevector (scm_read_sexp (chr, port, opts)),
1226 port, opts, line, column);
1227
1228 syntax:
1229 scm_i_input_error ("read_bytevector", port,
1230 "invalid bytevector prefix",
1231 SCM_MAKE_CHAR (chr));
1232 return SCM_UNSPECIFIED;
1233 }
1234
1235 static SCM
1236 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1237 long line, int column)
1238 {
1239 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1240 terribly inefficient but who cares? */
1241 SCM s_bits = SCM_EOL;
1242
1243 for (chr = scm_getc (port);
1244 (chr != EOF) && ((chr == '0') || (chr == '1'));
1245 chr = scm_getc (port))
1246 {
1247 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1248 }
1249
1250 if (chr != EOF)
1251 scm_ungetc (chr, port);
1252
1253 return maybe_annotate_source
1254 (scm_bitvector (scm_reverse_x (s_bits, SCM_EOL)),
1255 port, opts, line, column);
1256 }
1257
1258 static SCM
1259 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1260 {
1261 int bang_seen = 0;
1262
1263 for (;;)
1264 {
1265 int c = scm_getc (port);
1266
1267 if (c == EOF)
1268 scm_i_input_error ("skip_block_comment", port,
1269 "unterminated `#! ... !#' comment", SCM_EOL);
1270
1271 if (c == '!')
1272 bang_seen = 1;
1273 else if (c == '#' && bang_seen)
1274 break;
1275 else
1276 bang_seen = 0;
1277 }
1278
1279 return SCM_UNSPECIFIED;
1280 }
1281
1282 static void set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts,
1283 int value);
1284
1285 static SCM
1286 scm_read_shebang (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
1287 {
1288 char name[READER_DIRECTIVE_NAME_MAX_SIZE + 1];
1289 int c;
1290 int i = 0;
1291
1292 while (i <= READER_DIRECTIVE_NAME_MAX_SIZE)
1293 {
1294 c = scm_getc (port);
1295 if (c == EOF)
1296 scm_i_input_error ("skip_block_comment", port,
1297 "unterminated `#! ... !#' comment", SCM_EOL);
1298 else if (('a' <= c && c <= 'z') || ('0' <= c && c <= '9') || c == '-')
1299 name[i++] = c;
1300 else if (CHAR_IS_DELIMITER (c))
1301 {
1302 scm_ungetc (c, port);
1303 name[i] = '\0';
1304 if (0 == strcmp ("r6rs", name))
1305 ; /* Silently ignore */
1306 else if (0 == strcmp ("fold-case", name))
1307 set_port_case_insensitive_p (port, opts, 1);
1308 else if (0 == strcmp ("no-fold-case", name))
1309 set_port_case_insensitive_p (port, opts, 0);
1310 else
1311 break;
1312
1313 return SCM_UNSPECIFIED;
1314 }
1315 }
1316 while (i > 0)
1317 scm_ungetc (name[--i], port);
1318 return scm_read_scsh_block_comment (chr, port);
1319 }
1320
1321 static SCM
1322 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1323 {
1324 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1325 nested. So care must be taken. */
1326 int nesting_level = 1;
1327
1328 int a = scm_getc (port);
1329
1330 if (a == EOF)
1331 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1332 "unterminated `#| ... |#' comment", SCM_EOL);
1333
1334 while (nesting_level > 0)
1335 {
1336 int b = scm_getc (port);
1337
1338 if (b == EOF)
1339 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1340 "unterminated `#| ... |#' comment", SCM_EOL);
1341
1342 if (a == '|' && b == '#')
1343 {
1344 nesting_level--;
1345 b = EOF;
1346 }
1347 else if (a == '#' && b == '|')
1348 {
1349 nesting_level++;
1350 b = EOF;
1351 }
1352
1353 a = b;
1354 }
1355
1356 return SCM_UNSPECIFIED;
1357 }
1358
1359 static SCM
1360 scm_read_commented_expression (scm_t_wchar chr, SCM port,
1361 scm_t_read_opts *opts)
1362 {
1363 scm_t_wchar c;
1364
1365 c = flush_ws (port, opts, (char *) NULL);
1366 if (EOF == c)
1367 scm_i_input_error ("read_commented_expression", port,
1368 "no expression after #; comment", SCM_EOL);
1369 scm_ungetc (c, port);
1370 scm_read_expression (port, opts);
1371 return SCM_UNSPECIFIED;
1372 }
1373
1374 static SCM
1375 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1376 {
1377 /* Guile's extended symbol read syntax looks like this:
1378
1379 #{This is all a symbol name}#
1380
1381 So here, CHR is expected to be `{'. */
1382 int saw_brace = 0;
1383 size_t len = 0;
1384 SCM buf = scm_i_make_string (1024, NULL, 0);
1385
1386 buf = scm_i_string_start_writing (buf);
1387
1388 while ((chr = scm_getc (port)) != EOF)
1389 {
1390 if (saw_brace)
1391 {
1392 if (chr == '#')
1393 {
1394 break;
1395 }
1396 else
1397 {
1398 saw_brace = 0;
1399 scm_i_string_set_x (buf, len++, '}');
1400 }
1401 }
1402
1403 if (chr == '}')
1404 saw_brace = 1;
1405 else if (chr == '\\')
1406 {
1407 /* It used to be that print.c would print extended-read-syntax
1408 symbols with backslashes before "non-standard" chars, but
1409 this routine wouldn't do anything with those escapes.
1410 Bummer. What we've done is to change print.c to output
1411 R6RS hex escapes for those characters, relying on the fact
1412 that the extended read syntax would never put a `\' before
1413 an `x'. For now, we just ignore other instances of
1414 backslash in the string. */
1415 switch ((chr = scm_getc (port)))
1416 {
1417 case EOF:
1418 goto done;
1419 case 'x':
1420 {
1421 scm_t_wchar c;
1422
1423 SCM_READ_HEX_ESCAPE (10, ';');
1424 scm_i_string_set_x (buf, len++, c);
1425 break;
1426
1427 str_eof:
1428 chr = EOF;
1429 goto done;
1430
1431 bad_escaped:
1432 scm_i_string_stop_writing ();
1433 scm_i_input_error ("scm_read_extended_symbol", port,
1434 "illegal character in escape sequence: ~S",
1435 scm_list_1 (SCM_MAKE_CHAR (c)));
1436 break;
1437 }
1438 default:
1439 scm_i_string_set_x (buf, len++, chr);
1440 break;
1441 }
1442 }
1443 else
1444 scm_i_string_set_x (buf, len++, chr);
1445
1446 if (len >= scm_i_string_length (buf) - 2)
1447 {
1448 SCM addy;
1449
1450 scm_i_string_stop_writing ();
1451 addy = scm_i_make_string (1024, NULL, 0);
1452 buf = scm_string_append (scm_list_2 (buf, addy));
1453 len = 0;
1454 buf = scm_i_string_start_writing (buf);
1455 }
1456 }
1457
1458 done:
1459 scm_i_string_stop_writing ();
1460 if (chr == EOF)
1461 scm_i_input_error ("scm_read_extended_symbol", port,
1462 "end of file while reading symbol", SCM_EOL);
1463
1464 return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1465 }
1466
1467
1468 \f
1469 /* Top-level token readers, i.e., dispatchers. */
1470
1471 static SCM
1472 scm_read_sharp_extension (int chr, SCM port, scm_t_read_opts *opts)
1473 {
1474 SCM proc;
1475
1476 proc = scm_get_hash_procedure (chr);
1477 if (scm_is_true (scm_procedure_p (proc)))
1478 {
1479 long line = SCM_LINUM (port);
1480 int column = SCM_COL (port) - 2;
1481 SCM got;
1482
1483 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1484
1485 if (opts->record_positions_p && SCM_NIMP (got)
1486 && !scm_i_has_source_properties (got))
1487 scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1488
1489 return got;
1490 }
1491
1492 return SCM_UNSPECIFIED;
1493 }
1494
1495 /* The reader for the sharp `#' character. It basically dispatches reads
1496 among the above token readers. */
1497 static SCM
1498 scm_read_sharp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1499 long line, int column)
1500 #define FUNC_NAME "scm_lreadr"
1501 {
1502 SCM result;
1503
1504 chr = scm_getc (port);
1505
1506 result = scm_read_sharp_extension (chr, port, opts);
1507 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1508 return result;
1509
1510 switch (chr)
1511 {
1512 case '\\':
1513 return (scm_read_character (chr, port, opts));
1514 case '(':
1515 return (scm_read_vector (chr, port, opts, line, column));
1516 case 's':
1517 case 'u':
1518 case 'f':
1519 case 'c':
1520 /* This one may return either a boolean or an SRFI-4 vector. */
1521 return (scm_read_srfi4_vector (chr, port, opts, line, column));
1522 case 'v':
1523 return (scm_read_bytevector (chr, port, opts, line, column));
1524 case '*':
1525 return (scm_read_guile_bit_vector (chr, port, opts, line, column));
1526 case 't':
1527 case 'T':
1528 case 'F':
1529 return (scm_read_boolean (chr, port));
1530 case ':':
1531 return (scm_read_keyword (chr, port, opts));
1532 case '0': case '1': case '2': case '3': case '4':
1533 case '5': case '6': case '7': case '8': case '9':
1534 case '@':
1535 #if SCM_ENABLE_DEPRECATED
1536 /* See below for 'i' and 'e'. */
1537 case 'a':
1538 case 'y':
1539 case 'h':
1540 case 'l':
1541 #endif
1542 return (scm_read_array (chr, port, opts, line, column));
1543
1544 case 'i':
1545 case 'e':
1546 #if SCM_ENABLE_DEPRECATED
1547 {
1548 /* When next char is '(', it really is an old-style
1549 uniform array. */
1550 scm_t_wchar next_c = scm_getc (port);
1551 if (next_c != EOF)
1552 scm_ungetc (next_c, port);
1553 if (next_c == '(')
1554 return scm_read_array (chr, port, opts, line, column);
1555 /* Fall through. */
1556 }
1557 #endif
1558 case 'b':
1559 case 'B':
1560 case 'o':
1561 case 'O':
1562 case 'd':
1563 case 'D':
1564 case 'x':
1565 case 'X':
1566 case 'I':
1567 case 'E':
1568 return (scm_read_number_and_radix (chr, port, opts));
1569 case '{':
1570 return (scm_read_extended_symbol (chr, port));
1571 case '!':
1572 return (scm_read_shebang (chr, port, opts));
1573 case ';':
1574 return (scm_read_commented_expression (chr, port, opts));
1575 case '`':
1576 case '\'':
1577 case ',':
1578 return (scm_read_syntax (chr, port, opts));
1579 case 'n':
1580 return (scm_read_nil (chr, port, opts));
1581 default:
1582 result = scm_read_sharp_extension (chr, port, opts);
1583 if (scm_is_eq (result, SCM_UNSPECIFIED))
1584 {
1585 /* To remain compatible with 1.8 and earlier, the following
1586 characters have lower precedence than `read-hash-extend'
1587 characters. */
1588 switch (chr)
1589 {
1590 case '|':
1591 return scm_read_r6rs_block_comment (chr, port);
1592 default:
1593 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1594 scm_list_1 (SCM_MAKE_CHAR (chr)));
1595 }
1596 }
1597 else
1598 return result;
1599 }
1600
1601 return SCM_UNSPECIFIED;
1602 }
1603 #undef FUNC_NAME
1604
1605 static SCM
1606 scm_read_expression (SCM port, scm_t_read_opts *opts)
1607 #define FUNC_NAME "scm_read_expression"
1608 {
1609 while (1)
1610 {
1611 scm_t_wchar chr;
1612
1613 chr = scm_getc (port);
1614
1615 switch (chr)
1616 {
1617 case SCM_WHITE_SPACES:
1618 case SCM_LINE_INCREMENTORS:
1619 break;
1620 case ';':
1621 (void) scm_read_semicolon_comment (chr, port);
1622 break;
1623 case '[':
1624 if (!opts->square_brackets_p)
1625 return (scm_read_mixed_case_symbol (chr, port, opts));
1626 /* otherwise fall through */
1627 case '(':
1628 return (scm_read_sexp (chr, port, opts));
1629 case '"':
1630 return (scm_read_string (chr, port, opts));
1631 case '\'':
1632 case '`':
1633 case ',':
1634 return (scm_read_quote (chr, port, opts));
1635 case '#':
1636 {
1637 long line = SCM_LINUM (port);
1638 int column = SCM_COL (port) - 1;
1639 SCM result = scm_read_sharp (chr, port, opts, line, column);
1640 if (scm_is_eq (result, SCM_UNSPECIFIED))
1641 /* We read a comment or some such. */
1642 break;
1643 else
1644 return result;
1645 }
1646 case ')':
1647 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1648 break;
1649 case ']':
1650 if (opts->square_brackets_p)
1651 scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1652 /* otherwise fall through */
1653 case EOF:
1654 return SCM_EOF_VAL;
1655 case ':':
1656 if (opts->keyword_style == KEYWORD_STYLE_PREFIX)
1657 return scm_symbol_to_keyword (scm_read_expression (port, opts));
1658 /* Fall through. */
1659
1660 default:
1661 {
1662 if (((chr >= '0') && (chr <= '9'))
1663 || (strchr ("+-.", chr)))
1664 return (scm_read_number (chr, port, opts));
1665 else
1666 return (scm_read_mixed_case_symbol (chr, port, opts));
1667 }
1668 }
1669 }
1670 }
1671 #undef FUNC_NAME
1672
1673 \f
1674 /* Actual reader. */
1675
1676 static void init_read_options (SCM port, scm_t_read_opts *opts);
1677
1678 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1679 (SCM port),
1680 "Read an s-expression from the input port @var{port}, or from\n"
1681 "the current input port if @var{port} is not specified.\n"
1682 "Any whitespace before the next token is discarded.")
1683 #define FUNC_NAME s_scm_read
1684 {
1685 scm_t_read_opts opts;
1686 int c;
1687
1688 if (SCM_UNBNDP (port))
1689 port = scm_current_input_port ();
1690 SCM_VALIDATE_OPINPORT (1, port);
1691
1692 init_read_options (port, &opts);
1693
1694 c = flush_ws (port, &opts, (char *) NULL);
1695 if (EOF == c)
1696 return SCM_EOF_VAL;
1697 scm_ungetc (c, port);
1698
1699 return (scm_read_expression (port, &opts));
1700 }
1701 #undef FUNC_NAME
1702
1703
1704 \f
1705
1706 /* Manipulate the read-hash-procedures alist. This could be written in
1707 Scheme, but maybe it will also be used by C code during initialisation. */
1708 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1709 (SCM chr, SCM proc),
1710 "Install the procedure @var{proc} for reading expressions\n"
1711 "starting with the character sequence @code{#} and @var{chr}.\n"
1712 "@var{proc} will be called with two arguments: the character\n"
1713 "@var{chr} and the port to read further data from. The object\n"
1714 "returned will be the return value of @code{read}. \n"
1715 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1716 )
1717 #define FUNC_NAME s_scm_read_hash_extend
1718 {
1719 SCM this;
1720 SCM prev;
1721
1722 SCM_VALIDATE_CHAR (1, chr);
1723 SCM_ASSERT (scm_is_false (proc)
1724 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1725 proc, SCM_ARG2, FUNC_NAME);
1726
1727 /* Check if chr is already in the alist. */
1728 this = scm_i_read_hash_procedures_ref ();
1729 prev = SCM_BOOL_F;
1730 while (1)
1731 {
1732 if (scm_is_null (this))
1733 {
1734 /* not found, so add it to the beginning. */
1735 if (scm_is_true (proc))
1736 {
1737 SCM new = scm_cons (scm_cons (chr, proc),
1738 scm_i_read_hash_procedures_ref ());
1739 scm_i_read_hash_procedures_set_x (new);
1740 }
1741 break;
1742 }
1743 if (scm_is_eq (chr, SCM_CAAR (this)))
1744 {
1745 /* already in the alist. */
1746 if (scm_is_false (proc))
1747 {
1748 /* remove it. */
1749 if (scm_is_false (prev))
1750 {
1751 SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1752 scm_i_read_hash_procedures_set_x (rest);
1753 }
1754 else
1755 scm_set_cdr_x (prev, SCM_CDR (this));
1756 }
1757 else
1758 {
1759 /* replace it. */
1760 scm_set_cdr_x (SCM_CAR (this), proc);
1761 }
1762 break;
1763 }
1764 prev = this;
1765 this = SCM_CDR (this);
1766 }
1767
1768 return SCM_UNSPECIFIED;
1769 }
1770 #undef FUNC_NAME
1771
1772 /* Recover the read-hash procedure corresponding to char c. */
1773 static SCM
1774 scm_get_hash_procedure (int c)
1775 {
1776 SCM rest = scm_i_read_hash_procedures_ref ();
1777
1778 while (1)
1779 {
1780 if (scm_is_null (rest))
1781 return SCM_BOOL_F;
1782
1783 if (SCM_CHAR (SCM_CAAR (rest)) == c)
1784 return SCM_CDAR (rest);
1785
1786 rest = SCM_CDR (rest);
1787 }
1788 }
1789
1790 #define SCM_ENCODING_SEARCH_SIZE (500)
1791
1792 /* Search the first few hundred characters of a file for an Emacs-like coding
1793 declaration. Returns either NULL or a string whose storage has been
1794 allocated with `scm_gc_malloc ()'. */
1795 char *
1796 scm_i_scan_for_encoding (SCM port)
1797 {
1798 scm_t_port *pt;
1799 char header[SCM_ENCODING_SEARCH_SIZE+1];
1800 size_t bytes_read, encoding_length, i;
1801 char *encoding = NULL;
1802 int utf8_bom = 0;
1803 char *pos, *encoding_start;
1804 int in_comment;
1805
1806 pt = SCM_PTAB_ENTRY (port);
1807
1808 if (pt->rw_active == SCM_PORT_WRITE)
1809 scm_flush (port);
1810
1811 if (pt->rw_random)
1812 pt->rw_active = SCM_PORT_READ;
1813
1814 if (pt->read_pos == pt->read_end)
1815 {
1816 /* We can use the read buffer, and thus avoid a seek. */
1817 if (scm_fill_input (port) == EOF)
1818 return NULL;
1819
1820 bytes_read = pt->read_end - pt->read_pos;
1821 if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
1822 bytes_read = SCM_ENCODING_SEARCH_SIZE;
1823
1824 if (bytes_read <= 1)
1825 /* An unbuffered port -- don't scan. */
1826 return NULL;
1827
1828 memcpy (header, pt->read_pos, bytes_read);
1829 header[bytes_read] = '\0';
1830 }
1831 else
1832 {
1833 /* Try to read some bytes and then seek back. Not all ports
1834 support seeking back; and indeed some file ports (like
1835 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
1836 check performed by SCM_FPORT_FDES---but fail to seek
1837 backwards. Hence this block comes second. We prefer to use
1838 the read buffer in-place. */
1839 if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1840 return NULL;
1841
1842 bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1843 header[bytes_read] = '\0';
1844 scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1845 }
1846
1847 if (bytes_read > 3
1848 && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1849 utf8_bom = 1;
1850
1851 /* search past "coding[:=]" */
1852 pos = header;
1853 while (1)
1854 {
1855 if ((pos = strstr(pos, "coding")) == NULL)
1856 return NULL;
1857
1858 pos += strlen("coding");
1859 if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1860 (*pos == ':' || *pos == '='))
1861 {
1862 pos ++;
1863 break;
1864 }
1865 }
1866
1867 /* skip spaces */
1868 while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1869 (*pos == ' ' || *pos == '\t'))
1870 pos ++;
1871
1872 /* grab the next token */
1873 encoding_start = pos;
1874 i = 0;
1875 while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
1876 && encoding_start + i - header < bytes_read
1877 && (isalnum ((int) encoding_start[i])
1878 || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
1879 i++;
1880
1881 encoding_length = i;
1882 if (encoding_length == 0)
1883 return NULL;
1884
1885 encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
1886 for (i = 0; i < encoding_length; i++)
1887 encoding[i] = toupper ((int) encoding[i]);
1888
1889 /* push backwards to make sure we were in a comment */
1890 in_comment = 0;
1891 pos = encoding_start;
1892 while (pos >= header)
1893 {
1894 if (*pos == ';')
1895 {
1896 in_comment = 1;
1897 break;
1898 }
1899 else if (*pos == '\n' || pos == header)
1900 {
1901 /* This wasn't in a semicolon comment. Check for a
1902 hash-bang comment. */
1903 char *beg = strstr (header, "#!");
1904 char *end = strstr (header, "!#");
1905 if (beg < encoding_start && encoding_start + encoding_length <= end)
1906 in_comment = 1;
1907 break;
1908 }
1909 else
1910 {
1911 pos --;
1912 continue;
1913 }
1914 }
1915 if (!in_comment)
1916 /* This wasn't in a comment */
1917 return NULL;
1918
1919 if (utf8_bom && strcmp(encoding, "UTF-8"))
1920 scm_misc_error (NULL,
1921 "the port input declares the encoding ~s but is encoded as UTF-8",
1922 scm_list_1 (scm_from_locale_string (encoding)));
1923
1924 return encoding;
1925 }
1926
1927 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1928 (SCM port),
1929 "Scans the port for an Emacs-like character coding declaration\n"
1930 "near the top of the contents of a port with random-accessible contents.\n"
1931 "The coding declaration is of the form\n"
1932 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1933 "\n"
1934 "Returns a string containing the character encoding of the file\n"
1935 "if a declaration was found, or @code{#f} otherwise.\n")
1936 #define FUNC_NAME s_scm_file_encoding
1937 {
1938 char *enc;
1939 SCM s_enc;
1940
1941 SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
1942
1943 enc = scm_i_scan_for_encoding (port);
1944 if (enc == NULL)
1945 return SCM_BOOL_F;
1946 else
1947 {
1948 s_enc = scm_from_locale_string (enc);
1949 return s_enc;
1950 }
1951
1952 return SCM_BOOL_F;
1953 }
1954 #undef FUNC_NAME
1955
1956 \f
1957 /* Per-port read options.
1958
1959 We store per-port read options in the 'port-read-options' key of the
1960 port's alist, which is stored in 'scm_i_port_weak_hash'. The value
1961 stored in the alist is a single integer that contains a two-bit field
1962 for each read option.
1963
1964 If a bit field contains READ_OPTION_INHERIT (3), that indicates that
1965 the applicable value should be inherited from the corresponding
1966 global read option. Otherwise, the bit field contains the value of
1967 the read option. For boolean read options that have been set
1968 per-port, the possible values are 0 or 1. If the 'keyword_style'
1969 read option has been set per-port, its possible values are those in
1970 'enum t_keyword_style'. */
1971
1972 /* Key to read options in per-port alists. */
1973 SCM_SYMBOL (sym_port_read_options, "port-read-options");
1974
1975 /* Offsets of bit fields for each per-port override */
1976 #define READ_OPTION_COPY_SOURCE_P 0
1977 #define READ_OPTION_RECORD_POSITIONS_P 2
1978 #define READ_OPTION_CASE_INSENSITIVE_P 4
1979 #define READ_OPTION_KEYWORD_STYLE 6
1980 #define READ_OPTION_R6RS_ESCAPES_P 8
1981 #define READ_OPTION_SQUARE_BRACKETS_P 10
1982 #define READ_OPTION_HUNGRY_EOL_ESCAPES_P 12
1983
1984 #define READ_OPTIONS_NUM_BITS 14
1985
1986 #define READ_OPTIONS_INHERIT_ALL ((1UL << READ_OPTIONS_NUM_BITS) - 1)
1987 #define READ_OPTIONS_MAX_VALUE READ_OPTIONS_INHERIT_ALL
1988
1989 #define READ_OPTION_MASK 3
1990 #define READ_OPTION_INHERIT 3
1991
1992 static void
1993 set_port_read_option (SCM port, int option, int new_value)
1994 {
1995 SCM alist, scm_read_options;
1996 unsigned int read_options;
1997
1998 new_value &= READ_OPTION_MASK;
1999 scm_i_scm_pthread_mutex_lock (&scm_i_port_table_mutex);
2000 alist = scm_hashq_ref (scm_i_port_weak_hash, port, SCM_BOOL_F);
2001 scm_read_options = scm_assq_ref (alist, sym_port_read_options);
2002 if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2003 read_options = scm_to_uint (scm_read_options);
2004 else
2005 read_options = READ_OPTIONS_INHERIT_ALL;
2006 read_options &= ~(READ_OPTION_MASK << option);
2007 read_options |= new_value << option;
2008 scm_read_options = scm_from_uint (read_options);
2009 alist = scm_assq_set_x (alist, sym_port_read_options, scm_read_options);
2010 scm_hashq_set_x (scm_i_port_weak_hash, port, alist);
2011 scm_i_pthread_mutex_unlock (&scm_i_port_table_mutex);
2012 }
2013
2014 /* Set OPTS and PORT's case-insensitivity according to VALUE. */
2015 static void
2016 set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts, int value)
2017 {
2018 value = !!value;
2019 opts->case_insensitive_p = value;
2020 set_port_read_option (port, READ_OPTION_CASE_INSENSITIVE_P, value);
2021 }
2022
2023 /* Initialize OPTS based on PORT's read options and the global read
2024 options. */
2025 static void
2026 init_read_options (SCM port, scm_t_read_opts *opts)
2027 {
2028 SCM alist, val, scm_read_options;
2029 unsigned int read_options, x;
2030
2031 scm_i_scm_pthread_mutex_lock (&scm_i_port_table_mutex);
2032 alist = scm_hashq_ref (scm_i_port_weak_hash, port, SCM_BOOL_F);
2033 scm_read_options = scm_assq_ref (alist, sym_port_read_options);
2034 scm_i_pthread_mutex_unlock (&scm_i_port_table_mutex);
2035
2036 if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2037 read_options = scm_to_uint (scm_read_options);
2038 else
2039 read_options = READ_OPTIONS_INHERIT_ALL;
2040
2041 x = READ_OPTION_MASK & (read_options >> READ_OPTION_KEYWORD_STYLE);
2042 if (x == READ_OPTION_INHERIT)
2043 {
2044 val = SCM_PACK (SCM_KEYWORD_STYLE);
2045 if (scm_is_eq (val, scm_keyword_prefix))
2046 x = KEYWORD_STYLE_PREFIX;
2047 else if (scm_is_eq (val, scm_keyword_postfix))
2048 x = KEYWORD_STYLE_POSTFIX;
2049 else
2050 x = KEYWORD_STYLE_HASH_PREFIX;
2051 }
2052 opts->keyword_style = x;
2053
2054 #define RESOLVE_BOOLEAN_OPTION(NAME, name) \
2055 do \
2056 { \
2057 x = READ_OPTION_MASK & (read_options >> READ_OPTION_ ## NAME); \
2058 if (x == READ_OPTION_INHERIT) \
2059 x = !!SCM_ ## NAME; \
2060 opts->name = x; \
2061 } \
2062 while (0)
2063
2064 RESOLVE_BOOLEAN_OPTION (COPY_SOURCE_P, copy_source_p);
2065 RESOLVE_BOOLEAN_OPTION (RECORD_POSITIONS_P, record_positions_p);
2066 RESOLVE_BOOLEAN_OPTION (CASE_INSENSITIVE_P, case_insensitive_p);
2067 RESOLVE_BOOLEAN_OPTION (R6RS_ESCAPES_P, r6rs_escapes_p);
2068 RESOLVE_BOOLEAN_OPTION (SQUARE_BRACKETS_P, square_brackets_p);
2069 RESOLVE_BOOLEAN_OPTION (HUNGRY_EOL_ESCAPES_P, hungry_eol_escapes_p);
2070
2071 #undef RESOLVE_BOOLEAN_OPTION
2072 }
2073
2074 void
2075 scm_init_read ()
2076 {
2077 SCM read_hash_procs;
2078
2079 read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
2080
2081 scm_i_read_hash_procedures =
2082 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
2083
2084 scm_init_opts (scm_read_options, scm_read_opts);
2085 #include "libguile/read.x"
2086 }
2087
2088 /*
2089 Local Variables:
2090 c-file-style: "gnu"
2091 End:
2092 */