Implement per-port read options.
[bpt/guile.git] / libguile / read.c
1 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
2 * 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
18 */
19
20
21 \f
22
23 #ifdef HAVE_CONFIG_H
24 # include <config.h>
25 #endif
26
27 #include <stdio.h>
28 #include <ctype.h>
29 #include <string.h>
30 #include <unistd.h>
31 #include <unicase.h>
32 #include <unictype.h>
33
34 #include "libguile/_scm.h"
35 #include "libguile/bytevectors.h"
36 #include "libguile/chars.h"
37 #include "libguile/eval.h"
38 #include "libguile/arrays.h"
39 #include "libguile/bitvectors.h"
40 #include "libguile/keywords.h"
41 #include "libguile/alist.h"
42 #include "libguile/srcprop.h"
43 #include "libguile/hashtab.h"
44 #include "libguile/hash.h"
45 #include "libguile/ports.h"
46 #include "libguile/fports.h"
47 #include "libguile/root.h"
48 #include "libguile/strings.h"
49 #include "libguile/strports.h"
50 #include "libguile/vectors.h"
51 #include "libguile/validate.h"
52 #include "libguile/srfi-4.h"
53 #include "libguile/srfi-13.h"
54
55 #include "libguile/read.h"
56 #include "libguile/private-options.h"
57
58
59 \f
60
61 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
62 SCM_SYMBOL (scm_keyword_prefix, "prefix");
63 SCM_SYMBOL (scm_keyword_postfix, "postfix");
64 SCM_SYMBOL (sym_nil, "nil");
65
66 scm_t_option scm_read_opts[] = {
67 { SCM_OPTION_BOOLEAN, "copy", 0,
68 "Copy source code expressions." },
69 { SCM_OPTION_BOOLEAN, "positions", 1,
70 "Record positions of source code expressions." },
71 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
72 "Convert symbols to lower case."},
73 { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
74 "Style of keyword recognition: #f, 'prefix or 'postfix."},
75 { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
76 "Use R6RS variable-length character and string hex escapes."},
77 { SCM_OPTION_BOOLEAN, "square-brackets", 1,
78 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
79 { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
80 "In strings, consume leading whitespace after an escaped end-of-line."},
81 { 0, },
82 };
83
84 /* Internal read options structure. This is initialized by 'scm_read'
85 from the global and per-port read options, and a pointer is passed
86 down to all helper functions. */
87
88 enum t_keyword_style
89 {
90 KEYWORD_STYLE_HASH_PREFIX,
91 KEYWORD_STYLE_PREFIX,
92 KEYWORD_STYLE_POSTFIX
93 };
94
95 struct t_read_opts
96 {
97 enum t_keyword_style keyword_style;
98 unsigned int copy_source_p : 1;
99 unsigned int record_positions_p : 1;
100 unsigned int case_insensitive_p : 1;
101 unsigned int r6rs_escapes_p : 1;
102 unsigned int square_brackets_p : 1;
103 unsigned int hungry_eol_escapes_p : 1;
104 };
105
106 typedef struct t_read_opts scm_t_read_opts;
107
108
109 /*
110 Give meaningful error messages for errors
111
112 We use the format
113
114 FILE:LINE:COL: MESSAGE
115 This happened in ....
116
117 This is not standard GNU format, but the test-suite likes the real
118 message to be in front.
119
120 */
121
122
123 void
124 scm_i_input_error (char const *function,
125 SCM port, const char *message, SCM arg)
126 {
127 SCM fn = (scm_is_string (SCM_FILENAME(port))
128 ? SCM_FILENAME(port)
129 : scm_from_locale_string ("#<unknown port>"));
130
131 SCM string_port = scm_open_output_string ();
132 SCM string = SCM_EOL;
133 scm_simple_format (string_port,
134 scm_from_locale_string ("~A:~S:~S: ~A"),
135 scm_list_4 (fn,
136 scm_from_long (SCM_LINUM (port) + 1),
137 scm_from_int (SCM_COL (port) + 1),
138 scm_from_locale_string (message)));
139
140 string = scm_get_output_string (string_port);
141 scm_close_output_port (string_port);
142 scm_error_scm (scm_from_latin1_symbol ("read-error"),
143 function? scm_from_locale_string (function) : SCM_BOOL_F,
144 string,
145 arg,
146 SCM_BOOL_F);
147 }
148
149
150 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
151 (SCM setting),
152 "Option interface for the read options. Instead of using\n"
153 "this procedure directly, use the procedures @code{read-enable},\n"
154 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
155 #define FUNC_NAME s_scm_read_options
156 {
157 SCM ans = scm_options (setting,
158 scm_read_opts,
159 FUNC_NAME);
160 if (SCM_COPY_SOURCE_P)
161 SCM_RECORD_POSITIONS_P = 1;
162 return ans;
163 }
164 #undef FUNC_NAME
165
166 /* A fluid referring to an association list mapping extra hash
167 characters to procedures. */
168 static SCM *scm_i_read_hash_procedures;
169
170 static SCM
171 scm_i_read_hash_procedures_ref (void)
172 {
173 return scm_fluid_ref (*scm_i_read_hash_procedures);
174 }
175
176 static void
177 scm_i_read_hash_procedures_set_x (SCM value)
178 {
179 scm_fluid_set_x (*scm_i_read_hash_procedures, value);
180 }
181
182 \f
183 /* Token readers. */
184
185
186 /* Size of the C buffer used to read symbols and numbers. */
187 #define READER_BUFFER_SIZE 128
188
189 /* Number of 32-bit codepoints in the buffer used to read strings. */
190 #define READER_STRING_BUFFER_SIZE 128
191
192 /* The maximum size of Scheme character names. */
193 #define READER_CHAR_NAME_MAX_SIZE 50
194
195 /* The maximum size of reader directive names. */
196 #define READER_DIRECTIVE_NAME_MAX_SIZE 50
197
198
199 /* `isblank' is only in C99. */
200 #define CHAR_IS_BLANK_(_chr) \
201 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
202 || ((_chr) == '\f') || ((_chr) == '\r'))
203
204 #ifdef MSDOS
205 # define CHAR_IS_BLANK(_chr) \
206 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
207 #else
208 # define CHAR_IS_BLANK CHAR_IS_BLANK_
209 #endif
210
211
212 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
213 structure''). */
214 #define CHAR_IS_R5RS_DELIMITER(c) \
215 (CHAR_IS_BLANK (c) \
216 || (c) == ')' || (c) == '(' || (c) == ';' || (c) == '"')
217
218 #define CHAR_IS_DELIMITER(c) \
219 (CHAR_IS_R5RS_DELIMITER (c) \
220 || (((c) == ']' || (c) == '[') && opts->square_brackets_p))
221
222 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
223 Structure''. */
224 #define CHAR_IS_EXPONENT_MARKER(_chr) \
225 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
226 || ((_chr) == 'd') || ((_chr) == 'l'))
227
228 /* Read an SCSH block comment. */
229 static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
230 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
231 static SCM scm_read_commented_expression (scm_t_wchar, SCM, scm_t_read_opts *);
232 static SCM scm_read_shebang (scm_t_wchar, SCM, scm_t_read_opts *);
233 static SCM scm_get_hash_procedure (int);
234
235 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
236 result in the pre-allocated buffer BUF. Return zero if the whole token has
237 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
238 bytes actually read. */
239 static int
240 read_token (SCM port, scm_t_read_opts *opts,
241 char *buf, size_t buf_size, size_t *read)
242 {
243 *read = 0;
244
245 while (*read < buf_size)
246 {
247 int chr;
248
249 chr = scm_get_byte_or_eof (port);
250
251 if (chr == EOF)
252 return 0;
253 else if (CHAR_IS_DELIMITER (chr))
254 {
255 scm_unget_byte (chr, port);
256 return 0;
257 }
258 else
259 {
260 *buf = (char) chr;
261 buf++, (*read)++;
262 }
263 }
264
265 return 1;
266 }
267
268 /* Like `read_token', but return either BUFFER, or a GC-allocated buffer
269 if the token doesn't fit in BUFFER_SIZE bytes. */
270 static char *
271 read_complete_token (SCM port, scm_t_read_opts *opts,
272 char *buffer, size_t buffer_size, size_t *read)
273 {
274 int overflow = 0;
275 size_t bytes_read, overflow_size = 0;
276 char *overflow_buffer = NULL;
277
278 do
279 {
280 overflow = read_token (port, opts, buffer, buffer_size, &bytes_read);
281 if (bytes_read == 0)
282 break;
283 if (overflow || overflow_size != 0)
284 {
285 if (overflow_size == 0)
286 {
287 overflow_buffer = scm_gc_malloc_pointerless (bytes_read, "read");
288 memcpy (overflow_buffer, buffer, bytes_read);
289 overflow_size = bytes_read;
290 }
291 else
292 {
293 char *new_buf =
294 scm_gc_malloc_pointerless (overflow_size + bytes_read, "read");
295
296 memcpy (new_buf, overflow_buffer, overflow_size);
297 memcpy (new_buf + overflow_size, buffer, bytes_read);
298
299 overflow_buffer = new_buf;
300 overflow_size += bytes_read;
301 }
302 }
303 }
304 while (overflow);
305
306 if (overflow_size)
307 *read = overflow_size;
308 else
309 *read = bytes_read;
310
311 return (overflow_size > 0 ? overflow_buffer : buffer);
312 }
313
314 /* Skip whitespace from PORT and return the first non-whitespace character
315 read. Raise an error on end-of-file. */
316 static int
317 flush_ws (SCM port, scm_t_read_opts *opts, const char *eoferr)
318 {
319 scm_t_wchar c;
320 while (1)
321 switch (c = scm_getc (port))
322 {
323 case EOF:
324 goteof:
325 if (eoferr)
326 {
327 scm_i_input_error (eoferr,
328 port,
329 "end of file",
330 SCM_EOL);
331 }
332 return c;
333
334 case ';':
335 lp:
336 switch (c = scm_getc (port))
337 {
338 case EOF:
339 goto goteof;
340 default:
341 goto lp;
342 case SCM_LINE_INCREMENTORS:
343 break;
344 }
345 break;
346
347 case '#':
348 switch (c = scm_getc (port))
349 {
350 case EOF:
351 eoferr = "read_sharp";
352 goto goteof;
353 case '!':
354 scm_read_shebang (c, port, opts);
355 break;
356 case ';':
357 scm_read_commented_expression (c, port, opts);
358 break;
359 case '|':
360 if (scm_is_false (scm_get_hash_procedure (c)))
361 {
362 scm_read_r6rs_block_comment (c, port);
363 break;
364 }
365 /* fall through */
366 default:
367 scm_ungetc (c, port);
368 return '#';
369 }
370 break;
371
372 case SCM_LINE_INCREMENTORS:
373 case SCM_SINGLE_SPACES:
374 case '\t':
375 break;
376
377 default:
378 return c;
379 }
380
381 return 0;
382 }
383
384
385 \f
386 /* Token readers. */
387
388 static SCM scm_read_expression (SCM port, scm_t_read_opts *opts);
389 static SCM scm_read_sharp (int chr, SCM port, scm_t_read_opts *opts,
390 long line, int column);
391
392
393 static SCM
394 maybe_annotate_source (SCM x, SCM port, scm_t_read_opts *opts,
395 long line, int column)
396 {
397 if (opts->record_positions_p)
398 scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
399 return x;
400 }
401
402 static SCM
403 scm_read_sexp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
404 #define FUNC_NAME "scm_i_lreadparen"
405 {
406 int c;
407 SCM tmp, tl, ans = SCM_EOL;
408 const int terminating_char = ((chr == '[') ? ']' : ')');
409
410 /* Need to capture line and column numbers here. */
411 long line = SCM_LINUM (port);
412 int column = SCM_COL (port) - 1;
413
414 c = flush_ws (port, opts, FUNC_NAME);
415 if (terminating_char == c)
416 return SCM_EOL;
417
418 scm_ungetc (c, port);
419 tmp = scm_read_expression (port, opts);
420
421 /* Note that it is possible for scm_read_expression to return
422 scm_sym_dot, but not as part of a dotted pair: as in #{.}#. So
423 check that it's a real dot by checking `c'. */
424 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
425 {
426 ans = scm_read_expression (port, opts);
427 if (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
428 scm_i_input_error (FUNC_NAME, port, "missing close paren",
429 SCM_EOL);
430 return ans;
431 }
432
433 /* Build the head of the list structure. */
434 ans = tl = scm_cons (tmp, SCM_EOL);
435
436 while (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
437 {
438 SCM new_tail;
439
440 if (c == ')' || (c == ']' && opts->square_brackets_p))
441 scm_i_input_error (FUNC_NAME, port,
442 "in pair: mismatched close paren: ~A",
443 scm_list_1 (SCM_MAKE_CHAR (c)));
444
445 scm_ungetc (c, port);
446 tmp = scm_read_expression (port, opts);
447
448 /* See above note about scm_sym_dot. */
449 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
450 {
451 SCM_SETCDR (tl, scm_read_expression (port, opts));
452
453 c = flush_ws (port, opts, FUNC_NAME);
454 if (terminating_char != c)
455 scm_i_input_error (FUNC_NAME, port,
456 "in pair: missing close paren", SCM_EOL);
457 goto exit;
458 }
459
460 new_tail = scm_cons (tmp, SCM_EOL);
461 SCM_SETCDR (tl, new_tail);
462 tl = new_tail;
463 }
464
465 exit:
466 return maybe_annotate_source (ans, port, opts, line, column);
467 }
468 #undef FUNC_NAME
469
470
471 /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
472 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
473 found. */
474 #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
475 do \
476 { \
477 scm_t_wchar a; \
478 size_t i = 0; \
479 c = 0; \
480 while (i < ndigits) \
481 { \
482 a = scm_getc (port); \
483 if (a == EOF) \
484 goto str_eof; \
485 if (terminator \
486 && (a == (scm_t_wchar) terminator) \
487 && (i > 0)) \
488 break; \
489 if ('0' <= a && a <= '9') \
490 a -= '0'; \
491 else if ('A' <= a && a <= 'F') \
492 a = a - 'A' + 10; \
493 else if ('a' <= a && a <= 'f') \
494 a = a - 'a' + 10; \
495 else \
496 { \
497 c = a; \
498 goto bad_escaped; \
499 } \
500 c = c * 16 + a; \
501 i ++; \
502 } \
503 } while (0)
504
505 static void
506 skip_intraline_whitespace (SCM port)
507 {
508 scm_t_wchar c;
509
510 do
511 {
512 c = scm_getc (port);
513 if (c == EOF)
514 return;
515 }
516 while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
517
518 scm_ungetc (c, port);
519 }
520
521 static SCM
522 scm_read_string (int chr, SCM port, scm_t_read_opts *opts)
523 #define FUNC_NAME "scm_lreadr"
524 {
525 /* For strings smaller than C_STR, this function creates only one Scheme
526 object (the string returned). */
527
528 SCM str = SCM_EOL;
529 size_t c_str_len = 0;
530 scm_t_wchar c, c_str[READER_STRING_BUFFER_SIZE];
531
532 /* Need to capture line and column numbers here. */
533 long line = SCM_LINUM (port);
534 int column = SCM_COL (port) - 1;
535
536 while ('"' != (c = scm_getc (port)))
537 {
538 if (c == EOF)
539 {
540 str_eof:
541 scm_i_input_error (FUNC_NAME, port,
542 "end of file in string constant", SCM_EOL);
543 }
544
545 if (c_str_len + 1 >= READER_STRING_BUFFER_SIZE)
546 {
547 str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
548 c_str_len = 0;
549 }
550
551 if (c == '\\')
552 {
553 switch (c = scm_getc (port))
554 {
555 case EOF:
556 goto str_eof;
557 case '"':
558 case '\\':
559 break;
560 case '\n':
561 if (opts->hungry_eol_escapes_p)
562 skip_intraline_whitespace (port);
563 continue;
564 case '0':
565 c = '\0';
566 break;
567 case 'f':
568 c = '\f';
569 break;
570 case 'n':
571 c = '\n';
572 break;
573 case 'r':
574 c = '\r';
575 break;
576 case 't':
577 c = '\t';
578 break;
579 case 'a':
580 c = '\007';
581 break;
582 case 'v':
583 c = '\v';
584 break;
585 case 'b':
586 c = '\010';
587 break;
588 case 'x':
589 if (opts->r6rs_escapes_p)
590 SCM_READ_HEX_ESCAPE (10, ';');
591 else
592 SCM_READ_HEX_ESCAPE (2, '\0');
593 break;
594 case 'u':
595 if (!opts->r6rs_escapes_p)
596 {
597 SCM_READ_HEX_ESCAPE (4, '\0');
598 break;
599 }
600 case 'U':
601 if (!opts->r6rs_escapes_p)
602 {
603 SCM_READ_HEX_ESCAPE (6, '\0');
604 break;
605 }
606 default:
607 bad_escaped:
608 scm_i_input_error (FUNC_NAME, port,
609 "illegal character in escape sequence: ~S",
610 scm_list_1 (SCM_MAKE_CHAR (c)));
611 }
612 }
613
614 c_str[c_str_len++] = c;
615 }
616
617 if (scm_is_null (str))
618 /* Fast path: we got a string that fits in C_STR. */
619 str = scm_from_utf32_stringn (c_str, c_str_len);
620 else
621 {
622 if (c_str_len > 0)
623 str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
624
625 str = scm_string_concatenate_reverse (str, SCM_UNDEFINED, SCM_UNDEFINED);
626 }
627
628 return maybe_annotate_source (str, port, opts, line, column);
629 }
630 #undef FUNC_NAME
631
632
633 static SCM
634 scm_read_number (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
635 {
636 SCM result, str = SCM_EOL;
637 char local_buffer[READER_BUFFER_SIZE], *buffer;
638 size_t bytes_read;
639 scm_t_port *pt = SCM_PTAB_ENTRY (port);
640
641 /* Need to capture line and column numbers here. */
642 long line = SCM_LINUM (port);
643 int column = SCM_COL (port) - 1;
644
645 scm_ungetc (chr, port);
646 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
647 &bytes_read);
648
649 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
650
651 result = scm_string_to_number (str, SCM_UNDEFINED);
652 if (scm_is_false (result))
653 {
654 /* Return a symbol instead of a number */
655 if (opts->case_insensitive_p)
656 str = scm_string_downcase_x (str);
657 result = scm_string_to_symbol (str);
658 }
659 else if (SCM_NIMP (result))
660 result = maybe_annotate_source (result, port, opts, line, column);
661
662 SCM_COL (port) += scm_i_string_length (str);
663 return result;
664 }
665
666 static SCM
667 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
668 {
669 SCM result;
670 int ends_with_colon = 0;
671 size_t bytes_read;
672 int postfix = (opts->keyword_style == KEYWORD_STYLE_POSTFIX);
673 char local_buffer[READER_BUFFER_SIZE], *buffer;
674 scm_t_port *pt = SCM_PTAB_ENTRY (port);
675 SCM str;
676
677 scm_ungetc (chr, port);
678 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
679 &bytes_read);
680 if (bytes_read > 0)
681 ends_with_colon = buffer[bytes_read - 1] == ':';
682
683 if (postfix && ends_with_colon && (bytes_read > 1))
684 {
685 str = scm_from_stringn (buffer, bytes_read - 1,
686 pt->encoding, pt->ilseq_handler);
687
688 if (opts->case_insensitive_p)
689 str = scm_string_downcase_x (str);
690 result = scm_symbol_to_keyword (scm_string_to_symbol (str));
691 }
692 else
693 {
694 str = scm_from_stringn (buffer, bytes_read,
695 pt->encoding, pt->ilseq_handler);
696
697 if (opts->case_insensitive_p)
698 str = scm_string_downcase_x (str);
699 result = scm_string_to_symbol (str);
700 }
701
702 SCM_COL (port) += scm_i_string_length (str);
703 return result;
704 }
705
706 static SCM
707 scm_read_number_and_radix (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
708 #define FUNC_NAME "scm_lreadr"
709 {
710 SCM result;
711 size_t read;
712 char local_buffer[READER_BUFFER_SIZE], *buffer;
713 unsigned int radix;
714 SCM str;
715 scm_t_port *pt;
716
717 switch (chr)
718 {
719 case 'B':
720 case 'b':
721 radix = 2;
722 break;
723
724 case 'o':
725 case 'O':
726 radix = 8;
727 break;
728
729 case 'd':
730 case 'D':
731 radix = 10;
732 break;
733
734 case 'x':
735 case 'X':
736 radix = 16;
737 break;
738
739 default:
740 scm_ungetc (chr, port);
741 scm_ungetc ('#', port);
742 radix = 10;
743 }
744
745 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
746 &read);
747
748 pt = SCM_PTAB_ENTRY (port);
749 str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
750
751 result = scm_string_to_number (str, scm_from_uint (radix));
752
753 SCM_COL (port) += scm_i_string_length (str);
754
755 if (scm_is_true (result))
756 return result;
757
758 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
759
760 return SCM_BOOL_F;
761 }
762 #undef FUNC_NAME
763
764 static SCM
765 scm_read_quote (int chr, SCM port, scm_t_read_opts *opts)
766 {
767 SCM p;
768 long line = SCM_LINUM (port);
769 int column = SCM_COL (port) - 1;
770
771 switch (chr)
772 {
773 case '`':
774 p = scm_sym_quasiquote;
775 break;
776
777 case '\'':
778 p = scm_sym_quote;
779 break;
780
781 case ',':
782 {
783 scm_t_wchar c;
784
785 c = scm_getc (port);
786 if ('@' == c)
787 p = scm_sym_uq_splicing;
788 else
789 {
790 scm_ungetc (c, port);
791 p = scm_sym_unquote;
792 }
793 break;
794 }
795
796 default:
797 fprintf (stderr, "%s: unhandled quote character (%i)\n",
798 "scm_read_quote", chr);
799 abort ();
800 }
801
802 p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
803 return maybe_annotate_source (p, port, opts, line, column);
804 }
805
806 SCM_SYMBOL (sym_syntax, "syntax");
807 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
808 SCM_SYMBOL (sym_unsyntax, "unsyntax");
809 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
810
811 static SCM
812 scm_read_syntax (int chr, SCM port, scm_t_read_opts *opts)
813 {
814 SCM p;
815 long line = SCM_LINUM (port);
816 int column = SCM_COL (port) - 1;
817
818 switch (chr)
819 {
820 case '`':
821 p = sym_quasisyntax;
822 break;
823
824 case '\'':
825 p = sym_syntax;
826 break;
827
828 case ',':
829 {
830 int c;
831
832 c = scm_getc (port);
833 if ('@' == c)
834 p = sym_unsyntax_splicing;
835 else
836 {
837 scm_ungetc (c, port);
838 p = sym_unsyntax;
839 }
840 break;
841 }
842
843 default:
844 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
845 "scm_read_syntax", chr);
846 abort ();
847 }
848
849 p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
850 return maybe_annotate_source (p, port, opts, line, column);
851 }
852
853 static SCM
854 scm_read_nil (int chr, SCM port, scm_t_read_opts *opts)
855 {
856 SCM id = scm_read_mixed_case_symbol (chr, port, opts);
857
858 if (!scm_is_eq (id, sym_nil))
859 scm_i_input_error ("scm_read_nil", port,
860 "unexpected input while reading #nil: ~a",
861 scm_list_1 (id));
862
863 return SCM_ELISP_NIL;
864 }
865
866 static SCM
867 scm_read_semicolon_comment (int chr, SCM port)
868 {
869 int c;
870
871 /* We use the get_byte here because there is no need to get the
872 locale correct with comment input. This presumes that newline
873 always represents itself no matter what the encoding is. */
874 for (c = scm_get_byte_or_eof (port);
875 (c != EOF) && (c != '\n');
876 c = scm_get_byte_or_eof (port));
877
878 return SCM_UNSPECIFIED;
879 }
880
881 \f
882 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
883
884 static SCM
885 scm_read_boolean (int chr, SCM port)
886 {
887 switch (chr)
888 {
889 case 't':
890 case 'T':
891 return SCM_BOOL_T;
892
893 case 'f':
894 case 'F':
895 return SCM_BOOL_F;
896 }
897
898 return SCM_UNSPECIFIED;
899 }
900
901 static SCM
902 scm_read_character (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
903 #define FUNC_NAME "scm_lreadr"
904 {
905 char buffer[READER_CHAR_NAME_MAX_SIZE];
906 SCM charname;
907 size_t charname_len, bytes_read;
908 scm_t_wchar cp;
909 int overflow;
910 scm_t_port *pt;
911
912 overflow = read_token (port, opts, buffer, READER_CHAR_NAME_MAX_SIZE,
913 &bytes_read);
914 if (overflow)
915 scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
916
917 if (bytes_read == 0)
918 {
919 chr = scm_getc (port);
920 if (chr == EOF)
921 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
922 "while reading character", SCM_EOL);
923
924 /* CHR must be a token delimiter, like a whitespace. */
925 return (SCM_MAKE_CHAR (chr));
926 }
927
928 pt = SCM_PTAB_ENTRY (port);
929
930 /* Simple ASCII characters can be processed immediately. Also, simple
931 ISO-8859-1 characters can be processed immediately if the encoding for this
932 port is ISO-8859-1. */
933 if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
934 {
935 SCM_COL (port) += 1;
936 return SCM_MAKE_CHAR (buffer[0]);
937 }
938
939 /* Otherwise, convert the buffer into a proper scheme string for
940 processing. */
941 charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
942 pt->ilseq_handler);
943 charname_len = scm_i_string_length (charname);
944 SCM_COL (port) += charname_len;
945 cp = scm_i_string_ref (charname, 0);
946 if (charname_len == 1)
947 return SCM_MAKE_CHAR (cp);
948
949 /* Ignore dotted circles, which may be used to keep combining characters from
950 combining with the backslash in #\charname. */
951 if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
952 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
953
954 if (cp >= '0' && cp < '8')
955 {
956 /* Dirk:FIXME:: This type of character syntax is not R5RS
957 * compliant. Further, it should be verified that the constant
958 * does only consist of octal digits. */
959 SCM p = scm_string_to_number (charname, scm_from_uint (8));
960 if (SCM_I_INUMP (p))
961 {
962 scm_t_wchar c = scm_to_uint32 (p);
963 if (SCM_IS_UNICODE_CHAR (c))
964 return SCM_MAKE_CHAR (c);
965 else
966 scm_i_input_error (FUNC_NAME, port,
967 "out-of-range octal character escape: ~a",
968 scm_list_1 (charname));
969 }
970 }
971
972 if (cp == 'x' && (charname_len > 1))
973 {
974 SCM p;
975
976 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
977 p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
978 scm_from_uint (16));
979 if (SCM_I_INUMP (p))
980 {
981 scm_t_wchar c = scm_to_uint32 (p);
982 if (SCM_IS_UNICODE_CHAR (c))
983 return SCM_MAKE_CHAR (c);
984 else
985 scm_i_input_error (FUNC_NAME, port,
986 "out-of-range hex character escape: ~a",
987 scm_list_1 (charname));
988 }
989 }
990
991 /* The names of characters should never have non-Latin1
992 characters. */
993 if (scm_i_is_narrow_string (charname)
994 || scm_i_try_narrow_string (charname))
995 { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
996 charname_len);
997 if (scm_is_true (ch))
998 return ch;
999 }
1000
1001 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
1002 scm_list_1 (charname));
1003
1004 return SCM_UNSPECIFIED;
1005 }
1006 #undef FUNC_NAME
1007
1008 static SCM
1009 scm_read_keyword (int chr, SCM port, scm_t_read_opts *opts)
1010 {
1011 SCM symbol;
1012
1013 /* Read the symbol that comprises the keyword. Doing this instead of
1014 invoking a specific symbol reader function allows `scm_read_keyword ()'
1015 to adapt to the delimiters currently valid of symbols.
1016
1017 XXX: This implementation allows sloppy syntaxes like `#: key'. */
1018 symbol = scm_read_expression (port, opts);
1019 if (!scm_is_symbol (symbol))
1020 scm_i_input_error ("scm_read_keyword", port,
1021 "keyword prefix `~a' not followed by a symbol: ~s",
1022 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1023
1024 return (scm_symbol_to_keyword (symbol));
1025 }
1026
1027 static SCM
1028 scm_read_vector (int chr, SCM port, scm_t_read_opts *opts,
1029 long line, int column)
1030 {
1031 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1032 guarantee that it's going to do what we want. After all, this is an
1033 implementation detail of `scm_read_vector ()', not a desirable
1034 property. */
1035 return maybe_annotate_source (scm_vector (scm_read_sexp (chr, port, opts)),
1036 port, opts, line, column);
1037 }
1038
1039 /* Helper used by scm_read_array */
1040 static int
1041 read_decimal_integer (SCM port, int c, ssize_t *resp)
1042 {
1043 ssize_t sign = 1;
1044 ssize_t res = 0;
1045 int got_it = 0;
1046
1047 if (c == '-')
1048 {
1049 sign = -1;
1050 c = scm_getc (port);
1051 }
1052
1053 while ('0' <= c && c <= '9')
1054 {
1055 res = 10*res + c-'0';
1056 got_it = 1;
1057 c = scm_getc (port);
1058 }
1059
1060 if (got_it)
1061 *resp = sign * res;
1062 return c;
1063 }
1064
1065 /* Read an array. This function can also read vectors and uniform
1066 vectors. Also, the conflict between '#f' and '#f32' and '#f64' is
1067 handled here.
1068
1069 C is the first character read after the '#'.
1070 */
1071 static SCM
1072 scm_read_array (int c, SCM port, scm_t_read_opts *opts, long line, int column)
1073 {
1074 ssize_t rank;
1075 scm_t_wchar tag_buf[8];
1076 int tag_len;
1077
1078 SCM tag, shape = SCM_BOOL_F, elements, array;
1079
1080 /* XXX - shortcut for ordinary vectors. Shouldn't be necessary but
1081 the array code can not deal with zero-length dimensions yet, and
1082 we want to allow zero-length vectors, of course.
1083 */
1084 if (c == '(')
1085 return scm_read_vector (c, port, opts, line, column);
1086
1087 /* Disambiguate between '#f' and uniform floating point vectors.
1088 */
1089 if (c == 'f')
1090 {
1091 c = scm_getc (port);
1092 if (c != '3' && c != '6')
1093 {
1094 if (c != EOF)
1095 scm_ungetc (c, port);
1096 return SCM_BOOL_F;
1097 }
1098 rank = 1;
1099 tag_buf[0] = 'f';
1100 tag_len = 1;
1101 goto continue_reading_tag;
1102 }
1103
1104 /* Read rank. */
1105 rank = 1;
1106 c = read_decimal_integer (port, c, &rank);
1107 if (rank < 0)
1108 scm_i_input_error (NULL, port, "array rank must be non-negative",
1109 SCM_EOL);
1110
1111 /* Read tag. */
1112 tag_len = 0;
1113 continue_reading_tag:
1114 while (c != EOF && c != '(' && c != '@' && c != ':'
1115 && tag_len < sizeof tag_buf / sizeof tag_buf[0])
1116 {
1117 tag_buf[tag_len++] = c;
1118 c = scm_getc (port);
1119 }
1120 if (tag_len == 0)
1121 tag = SCM_BOOL_T;
1122 else
1123 {
1124 tag = scm_string_to_symbol (scm_from_utf32_stringn (tag_buf, tag_len));
1125 if (tag_len == sizeof tag_buf / sizeof tag_buf[0])
1126 scm_i_input_error (NULL, port, "invalid array tag, starting with: ~a",
1127 scm_list_1 (tag));
1128 }
1129
1130 /* Read shape. */
1131 if (c == '@' || c == ':')
1132 {
1133 shape = SCM_EOL;
1134
1135 do
1136 {
1137 ssize_t lbnd = 0, len = 0;
1138 SCM s;
1139
1140 if (c == '@')
1141 {
1142 c = scm_getc (port);
1143 c = read_decimal_integer (port, c, &lbnd);
1144 }
1145
1146 s = scm_from_ssize_t (lbnd);
1147
1148 if (c == ':')
1149 {
1150 c = scm_getc (port);
1151 c = read_decimal_integer (port, c, &len);
1152 if (len < 0)
1153 scm_i_input_error (NULL, port,
1154 "array length must be non-negative",
1155 SCM_EOL);
1156
1157 s = scm_list_2 (s, scm_from_ssize_t (lbnd+len-1));
1158 }
1159
1160 shape = scm_cons (s, shape);
1161 } while (c == '@' || c == ':');
1162
1163 shape = scm_reverse_x (shape, SCM_EOL);
1164 }
1165
1166 /* Read nested lists of elements. */
1167 if (c != '(')
1168 scm_i_input_error (NULL, port,
1169 "missing '(' in vector or array literal",
1170 SCM_EOL);
1171 elements = scm_read_sexp (c, port, opts);
1172
1173 if (scm_is_false (shape))
1174 shape = scm_from_ssize_t (rank);
1175 else if (scm_ilength (shape) != rank)
1176 scm_i_input_error
1177 (NULL, port,
1178 "the number of shape specifications must match the array rank",
1179 SCM_EOL);
1180
1181 /* Handle special print syntax of rank zero arrays; see
1182 scm_i_print_array for a rationale. */
1183 if (rank == 0)
1184 {
1185 if (!scm_is_pair (elements))
1186 scm_i_input_error (NULL, port,
1187 "too few elements in array literal, need 1",
1188 SCM_EOL);
1189 if (!scm_is_null (SCM_CDR (elements)))
1190 scm_i_input_error (NULL, port,
1191 "too many elements in array literal, want 1",
1192 SCM_EOL);
1193 elements = SCM_CAR (elements);
1194 }
1195
1196 /* Construct array, annotate with source location, and return. */
1197 array = scm_list_to_typed_array (tag, shape, elements);
1198 return maybe_annotate_source (array, port, opts, line, column);
1199 }
1200
1201 static SCM
1202 scm_read_srfi4_vector (int chr, SCM port, scm_t_read_opts *opts,
1203 long line, int column)
1204 {
1205 return scm_read_array (chr, port, opts, line, column);
1206 }
1207
1208 static SCM
1209 scm_read_bytevector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1210 long line, int column)
1211 {
1212 chr = scm_getc (port);
1213 if (chr != 'u')
1214 goto syntax;
1215
1216 chr = scm_getc (port);
1217 if (chr != '8')
1218 goto syntax;
1219
1220 chr = scm_getc (port);
1221 if (chr != '(')
1222 goto syntax;
1223
1224 return maybe_annotate_source
1225 (scm_u8_list_to_bytevector (scm_read_sexp (chr, port, opts)),
1226 port, opts, line, column);
1227
1228 syntax:
1229 scm_i_input_error ("read_bytevector", port,
1230 "invalid bytevector prefix",
1231 SCM_MAKE_CHAR (chr));
1232 return SCM_UNSPECIFIED;
1233 }
1234
1235 static SCM
1236 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1237 long line, int column)
1238 {
1239 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1240 terribly inefficient but who cares? */
1241 SCM s_bits = SCM_EOL;
1242
1243 for (chr = scm_getc (port);
1244 (chr != EOF) && ((chr == '0') || (chr == '1'));
1245 chr = scm_getc (port))
1246 {
1247 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1248 }
1249
1250 if (chr != EOF)
1251 scm_ungetc (chr, port);
1252
1253 return maybe_annotate_source
1254 (scm_bitvector (scm_reverse_x (s_bits, SCM_EOL)),
1255 port, opts, line, column);
1256 }
1257
1258 static SCM
1259 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1260 {
1261 int bang_seen = 0;
1262
1263 for (;;)
1264 {
1265 int c = scm_getc (port);
1266
1267 if (c == EOF)
1268 scm_i_input_error ("skip_block_comment", port,
1269 "unterminated `#! ... !#' comment", SCM_EOL);
1270
1271 if (c == '!')
1272 bang_seen = 1;
1273 else if (c == '#' && bang_seen)
1274 break;
1275 else
1276 bang_seen = 0;
1277 }
1278
1279 return SCM_UNSPECIFIED;
1280 }
1281
1282 static SCM
1283 scm_read_shebang (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
1284 {
1285 char name[READER_DIRECTIVE_NAME_MAX_SIZE + 1];
1286 int c;
1287 int i = 0;
1288
1289 while (i <= READER_DIRECTIVE_NAME_MAX_SIZE)
1290 {
1291 c = scm_getc (port);
1292 if (c == EOF)
1293 scm_i_input_error ("skip_block_comment", port,
1294 "unterminated `#! ... !#' comment", SCM_EOL);
1295 else if (('a' <= c && c <= 'z') || ('0' <= c && c <= '9') || c == '-')
1296 name[i++] = c;
1297 else if (CHAR_IS_DELIMITER (c))
1298 {
1299 scm_ungetc (c, port);
1300 name[i] = '\0';
1301 if (0 == strcmp ("r6rs", name))
1302 ; /* Silently ignore */
1303 else
1304 break;
1305
1306 return SCM_UNSPECIFIED;
1307 }
1308 }
1309 while (i > 0)
1310 scm_ungetc (name[--i], port);
1311 return scm_read_scsh_block_comment (chr, port);
1312 }
1313
1314 static SCM
1315 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1316 {
1317 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1318 nested. So care must be taken. */
1319 int nesting_level = 1;
1320
1321 int a = scm_getc (port);
1322
1323 if (a == EOF)
1324 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1325 "unterminated `#| ... |#' comment", SCM_EOL);
1326
1327 while (nesting_level > 0)
1328 {
1329 int b = scm_getc (port);
1330
1331 if (b == EOF)
1332 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1333 "unterminated `#| ... |#' comment", SCM_EOL);
1334
1335 if (a == '|' && b == '#')
1336 {
1337 nesting_level--;
1338 b = EOF;
1339 }
1340 else if (a == '#' && b == '|')
1341 {
1342 nesting_level++;
1343 b = EOF;
1344 }
1345
1346 a = b;
1347 }
1348
1349 return SCM_UNSPECIFIED;
1350 }
1351
1352 static SCM
1353 scm_read_commented_expression (scm_t_wchar chr, SCM port,
1354 scm_t_read_opts *opts)
1355 {
1356 scm_t_wchar c;
1357
1358 c = flush_ws (port, opts, (char *) NULL);
1359 if (EOF == c)
1360 scm_i_input_error ("read_commented_expression", port,
1361 "no expression after #; comment", SCM_EOL);
1362 scm_ungetc (c, port);
1363 scm_read_expression (port, opts);
1364 return SCM_UNSPECIFIED;
1365 }
1366
1367 static SCM
1368 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1369 {
1370 /* Guile's extended symbol read syntax looks like this:
1371
1372 #{This is all a symbol name}#
1373
1374 So here, CHR is expected to be `{'. */
1375 int saw_brace = 0;
1376 size_t len = 0;
1377 SCM buf = scm_i_make_string (1024, NULL, 0);
1378
1379 buf = scm_i_string_start_writing (buf);
1380
1381 while ((chr = scm_getc (port)) != EOF)
1382 {
1383 if (saw_brace)
1384 {
1385 if (chr == '#')
1386 {
1387 break;
1388 }
1389 else
1390 {
1391 saw_brace = 0;
1392 scm_i_string_set_x (buf, len++, '}');
1393 }
1394 }
1395
1396 if (chr == '}')
1397 saw_brace = 1;
1398 else if (chr == '\\')
1399 {
1400 /* It used to be that print.c would print extended-read-syntax
1401 symbols with backslashes before "non-standard" chars, but
1402 this routine wouldn't do anything with those escapes.
1403 Bummer. What we've done is to change print.c to output
1404 R6RS hex escapes for those characters, relying on the fact
1405 that the extended read syntax would never put a `\' before
1406 an `x'. For now, we just ignore other instances of
1407 backslash in the string. */
1408 switch ((chr = scm_getc (port)))
1409 {
1410 case EOF:
1411 goto done;
1412 case 'x':
1413 {
1414 scm_t_wchar c;
1415
1416 SCM_READ_HEX_ESCAPE (10, ';');
1417 scm_i_string_set_x (buf, len++, c);
1418 break;
1419
1420 str_eof:
1421 chr = EOF;
1422 goto done;
1423
1424 bad_escaped:
1425 scm_i_string_stop_writing ();
1426 scm_i_input_error ("scm_read_extended_symbol", port,
1427 "illegal character in escape sequence: ~S",
1428 scm_list_1 (SCM_MAKE_CHAR (c)));
1429 break;
1430 }
1431 default:
1432 scm_i_string_set_x (buf, len++, chr);
1433 break;
1434 }
1435 }
1436 else
1437 scm_i_string_set_x (buf, len++, chr);
1438
1439 if (len >= scm_i_string_length (buf) - 2)
1440 {
1441 SCM addy;
1442
1443 scm_i_string_stop_writing ();
1444 addy = scm_i_make_string (1024, NULL, 0);
1445 buf = scm_string_append (scm_list_2 (buf, addy));
1446 len = 0;
1447 buf = scm_i_string_start_writing (buf);
1448 }
1449 }
1450
1451 done:
1452 scm_i_string_stop_writing ();
1453 if (chr == EOF)
1454 scm_i_input_error ("scm_read_extended_symbol", port,
1455 "end of file while reading symbol", SCM_EOL);
1456
1457 return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1458 }
1459
1460
1461 \f
1462 /* Top-level token readers, i.e., dispatchers. */
1463
1464 static SCM
1465 scm_read_sharp_extension (int chr, SCM port, scm_t_read_opts *opts)
1466 {
1467 SCM proc;
1468
1469 proc = scm_get_hash_procedure (chr);
1470 if (scm_is_true (scm_procedure_p (proc)))
1471 {
1472 long line = SCM_LINUM (port);
1473 int column = SCM_COL (port) - 2;
1474 SCM got;
1475
1476 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1477
1478 if (opts->record_positions_p && SCM_NIMP (got)
1479 && !scm_i_has_source_properties (got))
1480 scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1481
1482 return got;
1483 }
1484
1485 return SCM_UNSPECIFIED;
1486 }
1487
1488 /* The reader for the sharp `#' character. It basically dispatches reads
1489 among the above token readers. */
1490 static SCM
1491 scm_read_sharp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1492 long line, int column)
1493 #define FUNC_NAME "scm_lreadr"
1494 {
1495 SCM result;
1496
1497 chr = scm_getc (port);
1498
1499 result = scm_read_sharp_extension (chr, port, opts);
1500 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1501 return result;
1502
1503 switch (chr)
1504 {
1505 case '\\':
1506 return (scm_read_character (chr, port, opts));
1507 case '(':
1508 return (scm_read_vector (chr, port, opts, line, column));
1509 case 's':
1510 case 'u':
1511 case 'f':
1512 case 'c':
1513 /* This one may return either a boolean or an SRFI-4 vector. */
1514 return (scm_read_srfi4_vector (chr, port, opts, line, column));
1515 case 'v':
1516 return (scm_read_bytevector (chr, port, opts, line, column));
1517 case '*':
1518 return (scm_read_guile_bit_vector (chr, port, opts, line, column));
1519 case 't':
1520 case 'T':
1521 case 'F':
1522 return (scm_read_boolean (chr, port));
1523 case ':':
1524 return (scm_read_keyword (chr, port, opts));
1525 case '0': case '1': case '2': case '3': case '4':
1526 case '5': case '6': case '7': case '8': case '9':
1527 case '@':
1528 #if SCM_ENABLE_DEPRECATED
1529 /* See below for 'i' and 'e'. */
1530 case 'a':
1531 case 'y':
1532 case 'h':
1533 case 'l':
1534 #endif
1535 return (scm_read_array (chr, port, opts, line, column));
1536
1537 case 'i':
1538 case 'e':
1539 #if SCM_ENABLE_DEPRECATED
1540 {
1541 /* When next char is '(', it really is an old-style
1542 uniform array. */
1543 scm_t_wchar next_c = scm_getc (port);
1544 if (next_c != EOF)
1545 scm_ungetc (next_c, port);
1546 if (next_c == '(')
1547 return scm_read_array (chr, port, opts, line, column);
1548 /* Fall through. */
1549 }
1550 #endif
1551 case 'b':
1552 case 'B':
1553 case 'o':
1554 case 'O':
1555 case 'd':
1556 case 'D':
1557 case 'x':
1558 case 'X':
1559 case 'I':
1560 case 'E':
1561 return (scm_read_number_and_radix (chr, port, opts));
1562 case '{':
1563 return (scm_read_extended_symbol (chr, port));
1564 case '!':
1565 return (scm_read_shebang (chr, port, opts));
1566 case ';':
1567 return (scm_read_commented_expression (chr, port, opts));
1568 case '`':
1569 case '\'':
1570 case ',':
1571 return (scm_read_syntax (chr, port, opts));
1572 case 'n':
1573 return (scm_read_nil (chr, port, opts));
1574 default:
1575 result = scm_read_sharp_extension (chr, port, opts);
1576 if (scm_is_eq (result, SCM_UNSPECIFIED))
1577 {
1578 /* To remain compatible with 1.8 and earlier, the following
1579 characters have lower precedence than `read-hash-extend'
1580 characters. */
1581 switch (chr)
1582 {
1583 case '|':
1584 return scm_read_r6rs_block_comment (chr, port);
1585 default:
1586 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1587 scm_list_1 (SCM_MAKE_CHAR (chr)));
1588 }
1589 }
1590 else
1591 return result;
1592 }
1593
1594 return SCM_UNSPECIFIED;
1595 }
1596 #undef FUNC_NAME
1597
1598 static SCM
1599 scm_read_expression (SCM port, scm_t_read_opts *opts)
1600 #define FUNC_NAME "scm_read_expression"
1601 {
1602 while (1)
1603 {
1604 scm_t_wchar chr;
1605
1606 chr = scm_getc (port);
1607
1608 switch (chr)
1609 {
1610 case SCM_WHITE_SPACES:
1611 case SCM_LINE_INCREMENTORS:
1612 break;
1613 case ';':
1614 (void) scm_read_semicolon_comment (chr, port);
1615 break;
1616 case '[':
1617 if (!opts->square_brackets_p)
1618 return (scm_read_mixed_case_symbol (chr, port, opts));
1619 /* otherwise fall through */
1620 case '(':
1621 return (scm_read_sexp (chr, port, opts));
1622 case '"':
1623 return (scm_read_string (chr, port, opts));
1624 case '\'':
1625 case '`':
1626 case ',':
1627 return (scm_read_quote (chr, port, opts));
1628 case '#':
1629 {
1630 long line = SCM_LINUM (port);
1631 int column = SCM_COL (port) - 1;
1632 SCM result = scm_read_sharp (chr, port, opts, line, column);
1633 if (scm_is_eq (result, SCM_UNSPECIFIED))
1634 /* We read a comment or some such. */
1635 break;
1636 else
1637 return result;
1638 }
1639 case ')':
1640 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1641 break;
1642 case ']':
1643 if (opts->square_brackets_p)
1644 scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1645 /* otherwise fall through */
1646 case EOF:
1647 return SCM_EOF_VAL;
1648 case ':':
1649 if (opts->keyword_style == KEYWORD_STYLE_PREFIX)
1650 return scm_symbol_to_keyword (scm_read_expression (port, opts));
1651 /* Fall through. */
1652
1653 default:
1654 {
1655 if (((chr >= '0') && (chr <= '9'))
1656 || (strchr ("+-.", chr)))
1657 return (scm_read_number (chr, port, opts));
1658 else
1659 return (scm_read_mixed_case_symbol (chr, port, opts));
1660 }
1661 }
1662 }
1663 }
1664 #undef FUNC_NAME
1665
1666 \f
1667 /* Actual reader. */
1668
1669 static void init_read_options (SCM port, scm_t_read_opts *opts);
1670
1671 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1672 (SCM port),
1673 "Read an s-expression from the input port @var{port}, or from\n"
1674 "the current input port if @var{port} is not specified.\n"
1675 "Any whitespace before the next token is discarded.")
1676 #define FUNC_NAME s_scm_read
1677 {
1678 scm_t_read_opts opts;
1679 int c;
1680
1681 if (SCM_UNBNDP (port))
1682 port = scm_current_input_port ();
1683 SCM_VALIDATE_OPINPORT (1, port);
1684
1685 init_read_options (port, &opts);
1686
1687 c = flush_ws (port, &opts, (char *) NULL);
1688 if (EOF == c)
1689 return SCM_EOF_VAL;
1690 scm_ungetc (c, port);
1691
1692 return (scm_read_expression (port, &opts));
1693 }
1694 #undef FUNC_NAME
1695
1696
1697 \f
1698
1699 /* Manipulate the read-hash-procedures alist. This could be written in
1700 Scheme, but maybe it will also be used by C code during initialisation. */
1701 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1702 (SCM chr, SCM proc),
1703 "Install the procedure @var{proc} for reading expressions\n"
1704 "starting with the character sequence @code{#} and @var{chr}.\n"
1705 "@var{proc} will be called with two arguments: the character\n"
1706 "@var{chr} and the port to read further data from. The object\n"
1707 "returned will be the return value of @code{read}. \n"
1708 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1709 )
1710 #define FUNC_NAME s_scm_read_hash_extend
1711 {
1712 SCM this;
1713 SCM prev;
1714
1715 SCM_VALIDATE_CHAR (1, chr);
1716 SCM_ASSERT (scm_is_false (proc)
1717 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1718 proc, SCM_ARG2, FUNC_NAME);
1719
1720 /* Check if chr is already in the alist. */
1721 this = scm_i_read_hash_procedures_ref ();
1722 prev = SCM_BOOL_F;
1723 while (1)
1724 {
1725 if (scm_is_null (this))
1726 {
1727 /* not found, so add it to the beginning. */
1728 if (scm_is_true (proc))
1729 {
1730 SCM new = scm_cons (scm_cons (chr, proc),
1731 scm_i_read_hash_procedures_ref ());
1732 scm_i_read_hash_procedures_set_x (new);
1733 }
1734 break;
1735 }
1736 if (scm_is_eq (chr, SCM_CAAR (this)))
1737 {
1738 /* already in the alist. */
1739 if (scm_is_false (proc))
1740 {
1741 /* remove it. */
1742 if (scm_is_false (prev))
1743 {
1744 SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1745 scm_i_read_hash_procedures_set_x (rest);
1746 }
1747 else
1748 scm_set_cdr_x (prev, SCM_CDR (this));
1749 }
1750 else
1751 {
1752 /* replace it. */
1753 scm_set_cdr_x (SCM_CAR (this), proc);
1754 }
1755 break;
1756 }
1757 prev = this;
1758 this = SCM_CDR (this);
1759 }
1760
1761 return SCM_UNSPECIFIED;
1762 }
1763 #undef FUNC_NAME
1764
1765 /* Recover the read-hash procedure corresponding to char c. */
1766 static SCM
1767 scm_get_hash_procedure (int c)
1768 {
1769 SCM rest = scm_i_read_hash_procedures_ref ();
1770
1771 while (1)
1772 {
1773 if (scm_is_null (rest))
1774 return SCM_BOOL_F;
1775
1776 if (SCM_CHAR (SCM_CAAR (rest)) == c)
1777 return SCM_CDAR (rest);
1778
1779 rest = SCM_CDR (rest);
1780 }
1781 }
1782
1783 #define SCM_ENCODING_SEARCH_SIZE (500)
1784
1785 /* Search the first few hundred characters of a file for an Emacs-like coding
1786 declaration. Returns either NULL or a string whose storage has been
1787 allocated with `scm_gc_malloc ()'. */
1788 char *
1789 scm_i_scan_for_encoding (SCM port)
1790 {
1791 scm_t_port *pt;
1792 char header[SCM_ENCODING_SEARCH_SIZE+1];
1793 size_t bytes_read, encoding_length, i;
1794 char *encoding = NULL;
1795 int utf8_bom = 0;
1796 char *pos, *encoding_start;
1797 int in_comment;
1798
1799 pt = SCM_PTAB_ENTRY (port);
1800
1801 if (pt->rw_active == SCM_PORT_WRITE)
1802 scm_flush (port);
1803
1804 if (pt->rw_random)
1805 pt->rw_active = SCM_PORT_READ;
1806
1807 if (pt->read_pos == pt->read_end)
1808 {
1809 /* We can use the read buffer, and thus avoid a seek. */
1810 if (scm_fill_input (port) == EOF)
1811 return NULL;
1812
1813 bytes_read = pt->read_end - pt->read_pos;
1814 if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
1815 bytes_read = SCM_ENCODING_SEARCH_SIZE;
1816
1817 if (bytes_read <= 1)
1818 /* An unbuffered port -- don't scan. */
1819 return NULL;
1820
1821 memcpy (header, pt->read_pos, bytes_read);
1822 header[bytes_read] = '\0';
1823 }
1824 else
1825 {
1826 /* Try to read some bytes and then seek back. Not all ports
1827 support seeking back; and indeed some file ports (like
1828 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
1829 check performed by SCM_FPORT_FDES---but fail to seek
1830 backwards. Hence this block comes second. We prefer to use
1831 the read buffer in-place. */
1832 if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1833 return NULL;
1834
1835 bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1836 header[bytes_read] = '\0';
1837 scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1838 }
1839
1840 if (bytes_read > 3
1841 && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1842 utf8_bom = 1;
1843
1844 /* search past "coding[:=]" */
1845 pos = header;
1846 while (1)
1847 {
1848 if ((pos = strstr(pos, "coding")) == NULL)
1849 return NULL;
1850
1851 pos += strlen("coding");
1852 if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1853 (*pos == ':' || *pos == '='))
1854 {
1855 pos ++;
1856 break;
1857 }
1858 }
1859
1860 /* skip spaces */
1861 while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1862 (*pos == ' ' || *pos == '\t'))
1863 pos ++;
1864
1865 /* grab the next token */
1866 encoding_start = pos;
1867 i = 0;
1868 while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
1869 && encoding_start + i - header < bytes_read
1870 && (isalnum ((int) encoding_start[i])
1871 || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
1872 i++;
1873
1874 encoding_length = i;
1875 if (encoding_length == 0)
1876 return NULL;
1877
1878 encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
1879 for (i = 0; i < encoding_length; i++)
1880 encoding[i] = toupper ((int) encoding[i]);
1881
1882 /* push backwards to make sure we were in a comment */
1883 in_comment = 0;
1884 pos = encoding_start;
1885 while (pos >= header)
1886 {
1887 if (*pos == ';')
1888 {
1889 in_comment = 1;
1890 break;
1891 }
1892 else if (*pos == '\n' || pos == header)
1893 {
1894 /* This wasn't in a semicolon comment. Check for a
1895 hash-bang comment. */
1896 char *beg = strstr (header, "#!");
1897 char *end = strstr (header, "!#");
1898 if (beg < encoding_start && encoding_start + encoding_length <= end)
1899 in_comment = 1;
1900 break;
1901 }
1902 else
1903 {
1904 pos --;
1905 continue;
1906 }
1907 }
1908 if (!in_comment)
1909 /* This wasn't in a comment */
1910 return NULL;
1911
1912 if (utf8_bom && strcmp(encoding, "UTF-8"))
1913 scm_misc_error (NULL,
1914 "the port input declares the encoding ~s but is encoded as UTF-8",
1915 scm_list_1 (scm_from_locale_string (encoding)));
1916
1917 return encoding;
1918 }
1919
1920 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1921 (SCM port),
1922 "Scans the port for an Emacs-like character coding declaration\n"
1923 "near the top of the contents of a port with random-accessible contents.\n"
1924 "The coding declaration is of the form\n"
1925 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1926 "\n"
1927 "Returns a string containing the character encoding of the file\n"
1928 "if a declaration was found, or @code{#f} otherwise.\n")
1929 #define FUNC_NAME s_scm_file_encoding
1930 {
1931 char *enc;
1932 SCM s_enc;
1933
1934 SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
1935
1936 enc = scm_i_scan_for_encoding (port);
1937 if (enc == NULL)
1938 return SCM_BOOL_F;
1939 else
1940 {
1941 s_enc = scm_from_locale_string (enc);
1942 return s_enc;
1943 }
1944
1945 return SCM_BOOL_F;
1946 }
1947 #undef FUNC_NAME
1948
1949 \f
1950 /* Per-port read options.
1951
1952 We store per-port read options in the 'port-read-options' key of the
1953 port's alist, which is stored in 'scm_i_port_weak_hash'. The value
1954 stored in the alist is a single integer that contains a two-bit field
1955 for each read option.
1956
1957 If a bit field contains READ_OPTION_INHERIT (3), that indicates that
1958 the applicable value should be inherited from the corresponding
1959 global read option. Otherwise, the bit field contains the value of
1960 the read option. For boolean read options that have been set
1961 per-port, the possible values are 0 or 1. If the 'keyword_style'
1962 read option has been set per-port, its possible values are those in
1963 'enum t_keyword_style'. */
1964
1965 /* Key to read options in per-port alists. */
1966 SCM_SYMBOL (sym_port_read_options, "port-read-options");
1967
1968 /* Offsets of bit fields for each per-port override */
1969 #define READ_OPTION_COPY_SOURCE_P 0
1970 #define READ_OPTION_RECORD_POSITIONS_P 2
1971 #define READ_OPTION_CASE_INSENSITIVE_P 4
1972 #define READ_OPTION_KEYWORD_STYLE 6
1973 #define READ_OPTION_R6RS_ESCAPES_P 8
1974 #define READ_OPTION_SQUARE_BRACKETS_P 10
1975 #define READ_OPTION_HUNGRY_EOL_ESCAPES_P 12
1976
1977 #define READ_OPTIONS_NUM_BITS 14
1978
1979 #define READ_OPTIONS_INHERIT_ALL ((1UL << READ_OPTIONS_NUM_BITS) - 1)
1980 #define READ_OPTIONS_MAX_VALUE READ_OPTIONS_INHERIT_ALL
1981
1982 #define READ_OPTION_MASK 3
1983 #define READ_OPTION_INHERIT 3
1984
1985 static void
1986 set_port_read_option (SCM port, int option, int new_value)
1987 {
1988 SCM alist, scm_read_options;
1989 unsigned int read_options;
1990
1991 new_value &= READ_OPTION_MASK;
1992 scm_i_scm_pthread_mutex_lock (&scm_i_port_table_mutex);
1993 alist = scm_hashq_ref (scm_i_port_weak_hash, port, SCM_BOOL_F);
1994 scm_read_options = scm_assq_ref (alist, sym_port_read_options);
1995 if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
1996 read_options = scm_to_uint (scm_read_options);
1997 else
1998 read_options = READ_OPTIONS_INHERIT_ALL;
1999 read_options &= ~(READ_OPTION_MASK << option);
2000 read_options |= new_value << option;
2001 scm_read_options = scm_from_uint (read_options);
2002 alist = scm_assq_set_x (alist, sym_port_read_options, scm_read_options);
2003 scm_hashq_set_x (scm_i_port_weak_hash, port, alist);
2004 scm_i_pthread_mutex_unlock (&scm_i_port_table_mutex);
2005 }
2006
2007 /* Initialize OPTS based on PORT's read options and the global read
2008 options. */
2009 static void
2010 init_read_options (SCM port, scm_t_read_opts *opts)
2011 {
2012 SCM alist, val, scm_read_options;
2013 unsigned int read_options, x;
2014
2015 scm_i_scm_pthread_mutex_lock (&scm_i_port_table_mutex);
2016 alist = scm_hashq_ref (scm_i_port_weak_hash, port, SCM_BOOL_F);
2017 scm_read_options = scm_assq_ref (alist, sym_port_read_options);
2018 scm_i_pthread_mutex_unlock (&scm_i_port_table_mutex);
2019
2020 if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2021 read_options = scm_to_uint (scm_read_options);
2022 else
2023 read_options = READ_OPTIONS_INHERIT_ALL;
2024
2025 x = READ_OPTION_MASK & (read_options >> READ_OPTION_KEYWORD_STYLE);
2026 if (x == READ_OPTION_INHERIT)
2027 {
2028 val = SCM_PACK (SCM_KEYWORD_STYLE);
2029 if (scm_is_eq (val, scm_keyword_prefix))
2030 x = KEYWORD_STYLE_PREFIX;
2031 else if (scm_is_eq (val, scm_keyword_postfix))
2032 x = KEYWORD_STYLE_POSTFIX;
2033 else
2034 x = KEYWORD_STYLE_HASH_PREFIX;
2035 }
2036 opts->keyword_style = x;
2037
2038 #define RESOLVE_BOOLEAN_OPTION(NAME, name) \
2039 do \
2040 { \
2041 x = READ_OPTION_MASK & (read_options >> READ_OPTION_ ## NAME); \
2042 if (x == READ_OPTION_INHERIT) \
2043 x = !!SCM_ ## NAME; \
2044 opts->name = x; \
2045 } \
2046 while (0)
2047
2048 RESOLVE_BOOLEAN_OPTION (COPY_SOURCE_P, copy_source_p);
2049 RESOLVE_BOOLEAN_OPTION (RECORD_POSITIONS_P, record_positions_p);
2050 RESOLVE_BOOLEAN_OPTION (CASE_INSENSITIVE_P, case_insensitive_p);
2051 RESOLVE_BOOLEAN_OPTION (R6RS_ESCAPES_P, r6rs_escapes_p);
2052 RESOLVE_BOOLEAN_OPTION (SQUARE_BRACKETS_P, square_brackets_p);
2053 RESOLVE_BOOLEAN_OPTION (HUNGRY_EOL_ESCAPES_P, hungry_eol_escapes_p);
2054
2055 #undef RESOLVE_BOOLEAN_OPTION
2056 }
2057
2058 void
2059 scm_init_read ()
2060 {
2061 SCM read_hash_procs;
2062
2063 read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
2064
2065 scm_i_read_hash_procedures =
2066 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
2067
2068 scm_init_opts (scm_read_options, scm_read_opts);
2069 #include "libguile/read.x"
2070 }
2071
2072 /*
2073 Local Variables:
2074 c-file-style: "gnu"
2075 End:
2076 */