Add source properties to more datum types in scm_read_sharp_extension.
[bpt/guile.git] / libguile / read.c
1 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
2 * 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
18 */
19
20
21 \f
22
23 #ifdef HAVE_CONFIG_H
24 # include <config.h>
25 #endif
26
27 #include <stdio.h>
28 #include <ctype.h>
29 #include <string.h>
30 #include <unistd.h>
31 #include <unicase.h>
32 #include <unictype.h>
33
34 #include "libguile/_scm.h"
35 #include "libguile/bytevectors.h"
36 #include "libguile/chars.h"
37 #include "libguile/eval.h"
38 #include "libguile/arrays.h"
39 #include "libguile/bitvectors.h"
40 #include "libguile/keywords.h"
41 #include "libguile/alist.h"
42 #include "libguile/srcprop.h"
43 #include "libguile/hashtab.h"
44 #include "libguile/hash.h"
45 #include "libguile/ports.h"
46 #include "libguile/fports.h"
47 #include "libguile/root.h"
48 #include "libguile/strings.h"
49 #include "libguile/strports.h"
50 #include "libguile/vectors.h"
51 #include "libguile/validate.h"
52 #include "libguile/srfi-4.h"
53 #include "libguile/srfi-13.h"
54
55 #include "libguile/read.h"
56 #include "libguile/private-options.h"
57
58
59 \f
60
61 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
62 SCM_SYMBOL (scm_keyword_prefix, "prefix");
63 SCM_SYMBOL (scm_keyword_postfix, "postfix");
64 SCM_SYMBOL (sym_nil, "nil");
65
66 scm_t_option scm_read_opts[] = {
67 { SCM_OPTION_BOOLEAN, "copy", 0,
68 "Copy source code expressions." },
69 { SCM_OPTION_BOOLEAN, "positions", 1,
70 "Record positions of source code expressions." },
71 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
72 "Convert symbols to lower case."},
73 { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
74 "Style of keyword recognition: #f, 'prefix or 'postfix."},
75 { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
76 "Use R6RS variable-length character and string hex escapes."},
77 { SCM_OPTION_BOOLEAN, "square-brackets", 1,
78 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
79 { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
80 "In strings, consume leading whitespace after an escaped end-of-line."},
81 { 0, },
82 };
83
84 /* Internal read options structure. This is initialized by 'scm_read'
85 from the global read options, and a pointer is passed down to all
86 helper functions. */
87 enum t_keyword_style {
88 KEYWORD_STYLE_HASH_PREFIX,
89 KEYWORD_STYLE_PREFIX,
90 KEYWORD_STYLE_POSTFIX
91 };
92
93 struct t_read_opts {
94 enum t_keyword_style keyword_style;
95 unsigned int copy_source_p : 1;
96 unsigned int record_positions_p : 1;
97 unsigned int case_insensitive_p : 1;
98 unsigned int r6rs_escapes_p : 1;
99 unsigned int square_brackets_p : 1;
100 unsigned int hungry_eol_escapes_p : 1;
101 };
102
103 typedef struct t_read_opts scm_t_read_opts;
104
105 /* Initialize OPTS from the global read options. */
106 static void
107 init_read_options (scm_t_read_opts *opts)
108 {
109 SCM val;
110 int x;
111
112 val = SCM_PACK (SCM_KEYWORD_STYLE);
113 if (scm_is_eq (val, scm_keyword_prefix))
114 x = KEYWORD_STYLE_PREFIX;
115 else if (scm_is_eq (val, scm_keyword_postfix))
116 x = KEYWORD_STYLE_POSTFIX;
117 else
118 x = KEYWORD_STYLE_HASH_PREFIX;
119 opts->keyword_style = x;
120
121 #define RESOLVE_BOOLEAN_OPTION(NAME, name) \
122 (opts->name = !!SCM_ ## NAME)
123
124 RESOLVE_BOOLEAN_OPTION (COPY_SOURCE_P, copy_source_p);
125 RESOLVE_BOOLEAN_OPTION (RECORD_POSITIONS_P, record_positions_p);
126 RESOLVE_BOOLEAN_OPTION (CASE_INSENSITIVE_P, case_insensitive_p);
127 RESOLVE_BOOLEAN_OPTION (R6RS_ESCAPES_P, r6rs_escapes_p);
128 RESOLVE_BOOLEAN_OPTION (SQUARE_BRACKETS_P, square_brackets_p);
129 RESOLVE_BOOLEAN_OPTION (HUNGRY_EOL_ESCAPES_P, hungry_eol_escapes_p);
130
131 #undef RESOLVE_BOOLEAN_OPTION
132 }
133
134
135 /*
136 Give meaningful error messages for errors
137
138 We use the format
139
140 FILE:LINE:COL: MESSAGE
141 This happened in ....
142
143 This is not standard GNU format, but the test-suite likes the real
144 message to be in front.
145
146 */
147
148
149 void
150 scm_i_input_error (char const *function,
151 SCM port, const char *message, SCM arg)
152 {
153 SCM fn = (scm_is_string (SCM_FILENAME(port))
154 ? SCM_FILENAME(port)
155 : scm_from_locale_string ("#<unknown port>"));
156
157 SCM string_port = scm_open_output_string ();
158 SCM string = SCM_EOL;
159 scm_simple_format (string_port,
160 scm_from_locale_string ("~A:~S:~S: ~A"),
161 scm_list_4 (fn,
162 scm_from_long (SCM_LINUM (port) + 1),
163 scm_from_int (SCM_COL (port) + 1),
164 scm_from_locale_string (message)));
165
166 string = scm_get_output_string (string_port);
167 scm_close_output_port (string_port);
168 scm_error_scm (scm_from_latin1_symbol ("read-error"),
169 function? scm_from_locale_string (function) : SCM_BOOL_F,
170 string,
171 arg,
172 SCM_BOOL_F);
173 }
174
175
176 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
177 (SCM setting),
178 "Option interface for the read options. Instead of using\n"
179 "this procedure directly, use the procedures @code{read-enable},\n"
180 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
181 #define FUNC_NAME s_scm_read_options
182 {
183 SCM ans = scm_options (setting,
184 scm_read_opts,
185 FUNC_NAME);
186 if (SCM_COPY_SOURCE_P)
187 SCM_RECORD_POSITIONS_P = 1;
188 return ans;
189 }
190 #undef FUNC_NAME
191
192 /* A fluid referring to an association list mapping extra hash
193 characters to procedures. */
194 static SCM *scm_i_read_hash_procedures;
195
196 static SCM
197 scm_i_read_hash_procedures_ref (void)
198 {
199 return scm_fluid_ref (*scm_i_read_hash_procedures);
200 }
201
202 static void
203 scm_i_read_hash_procedures_set_x (SCM value)
204 {
205 scm_fluid_set_x (*scm_i_read_hash_procedures, value);
206 }
207
208 \f
209 /* Token readers. */
210
211
212 /* Size of the C buffer used to read symbols and numbers. */
213 #define READER_BUFFER_SIZE 128
214
215 /* Number of 32-bit codepoints in the buffer used to read strings. */
216 #define READER_STRING_BUFFER_SIZE 128
217
218 /* The maximum size of Scheme character names. */
219 #define READER_CHAR_NAME_MAX_SIZE 50
220
221
222 /* `isblank' is only in C99. */
223 #define CHAR_IS_BLANK_(_chr) \
224 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
225 || ((_chr) == '\f') || ((_chr) == '\r'))
226
227 #ifdef MSDOS
228 # define CHAR_IS_BLANK(_chr) \
229 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
230 #else
231 # define CHAR_IS_BLANK CHAR_IS_BLANK_
232 #endif
233
234
235 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
236 structure''). */
237 #define CHAR_IS_R5RS_DELIMITER(c) \
238 (CHAR_IS_BLANK (c) \
239 || (c) == ')' || (c) == '(' || (c) == ';' || (c) == '"')
240
241 #define CHAR_IS_DELIMITER(c) \
242 (CHAR_IS_R5RS_DELIMITER (c) \
243 || (((c) == ']' || (c) == '[') && opts->square_brackets_p))
244
245 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
246 Structure''. */
247 #define CHAR_IS_EXPONENT_MARKER(_chr) \
248 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
249 || ((_chr) == 'd') || ((_chr) == 'l'))
250
251 /* Read an SCSH block comment. */
252 static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
253 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
254 static SCM scm_read_commented_expression (scm_t_wchar, SCM, scm_t_read_opts *);
255 static SCM scm_read_shebang (scm_t_wchar, SCM, scm_t_read_opts *);
256 static SCM scm_get_hash_procedure (int);
257
258 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
259 result in the pre-allocated buffer BUF. Return zero if the whole token has
260 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
261 bytes actually read. */
262 static int
263 read_token (SCM port, scm_t_read_opts *opts,
264 char *buf, size_t buf_size, size_t *read)
265 {
266 *read = 0;
267
268 while (*read < buf_size)
269 {
270 int chr;
271
272 chr = scm_get_byte_or_eof (port);
273
274 if (chr == EOF)
275 return 0;
276 else if (CHAR_IS_DELIMITER (chr))
277 {
278 scm_unget_byte (chr, port);
279 return 0;
280 }
281 else
282 {
283 *buf = (char) chr;
284 buf++, (*read)++;
285 }
286 }
287
288 return 1;
289 }
290
291 /* Like `read_token', but return either BUFFER, or a GC-allocated buffer
292 if the token doesn't fit in BUFFER_SIZE bytes. */
293 static char *
294 read_complete_token (SCM port, scm_t_read_opts *opts,
295 char *buffer, size_t buffer_size, size_t *read)
296 {
297 int overflow = 0;
298 size_t bytes_read, overflow_size = 0;
299 char *overflow_buffer = NULL;
300
301 do
302 {
303 overflow = read_token (port, opts, buffer, buffer_size, &bytes_read);
304 if (bytes_read == 0)
305 break;
306 if (overflow || overflow_size != 0)
307 {
308 if (overflow_size == 0)
309 {
310 overflow_buffer = scm_gc_malloc_pointerless (bytes_read, "read");
311 memcpy (overflow_buffer, buffer, bytes_read);
312 overflow_size = bytes_read;
313 }
314 else
315 {
316 char *new_buf =
317 scm_gc_malloc_pointerless (overflow_size + bytes_read, "read");
318
319 memcpy (new_buf, overflow_buffer, overflow_size);
320 memcpy (new_buf + overflow_size, buffer, bytes_read);
321
322 overflow_buffer = new_buf;
323 overflow_size += bytes_read;
324 }
325 }
326 }
327 while (overflow);
328
329 if (overflow_size)
330 *read = overflow_size;
331 else
332 *read = bytes_read;
333
334 return (overflow_size > 0 ? overflow_buffer : buffer);
335 }
336
337 /* Skip whitespace from PORT and return the first non-whitespace character
338 read. Raise an error on end-of-file. */
339 static int
340 flush_ws (SCM port, scm_t_read_opts *opts, const char *eoferr)
341 {
342 scm_t_wchar c;
343 while (1)
344 switch (c = scm_getc (port))
345 {
346 case EOF:
347 goteof:
348 if (eoferr)
349 {
350 scm_i_input_error (eoferr,
351 port,
352 "end of file",
353 SCM_EOL);
354 }
355 return c;
356
357 case ';':
358 lp:
359 switch (c = scm_getc (port))
360 {
361 case EOF:
362 goto goteof;
363 default:
364 goto lp;
365 case SCM_LINE_INCREMENTORS:
366 break;
367 }
368 break;
369
370 case '#':
371 switch (c = scm_getc (port))
372 {
373 case EOF:
374 eoferr = "read_sharp";
375 goto goteof;
376 case '!':
377 scm_read_shebang (c, port, opts);
378 break;
379 case ';':
380 scm_read_commented_expression (c, port, opts);
381 break;
382 case '|':
383 if (scm_is_false (scm_get_hash_procedure (c)))
384 {
385 scm_read_r6rs_block_comment (c, port);
386 break;
387 }
388 /* fall through */
389 default:
390 scm_ungetc (c, port);
391 return '#';
392 }
393 break;
394
395 case SCM_LINE_INCREMENTORS:
396 case SCM_SINGLE_SPACES:
397 case '\t':
398 break;
399
400 default:
401 return c;
402 }
403
404 return 0;
405 }
406
407
408 \f
409 /* Token readers. */
410
411 static SCM scm_read_expression (SCM port, scm_t_read_opts *opts);
412 static SCM scm_read_sharp (int chr, SCM port, scm_t_read_opts *opts,
413 long line, int column);
414
415
416 static SCM
417 maybe_annotate_source (SCM x, SCM port, scm_t_read_opts *opts,
418 long line, int column)
419 {
420 if (opts->record_positions_p)
421 scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
422 return x;
423 }
424
425 static SCM
426 scm_read_sexp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
427 #define FUNC_NAME "scm_i_lreadparen"
428 {
429 int c;
430 SCM tmp, tl, ans = SCM_EOL;
431 const int terminating_char = ((chr == '[') ? ']' : ')');
432
433 /* Need to capture line and column numbers here. */
434 long line = SCM_LINUM (port);
435 int column = SCM_COL (port) - 1;
436
437 c = flush_ws (port, opts, FUNC_NAME);
438 if (terminating_char == c)
439 return SCM_EOL;
440
441 scm_ungetc (c, port);
442 tmp = scm_read_expression (port, opts);
443
444 /* Note that it is possible for scm_read_expression to return
445 scm_sym_dot, but not as part of a dotted pair: as in #{.}#. So
446 check that it's a real dot by checking `c'. */
447 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
448 {
449 ans = scm_read_expression (port, opts);
450 if (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
451 scm_i_input_error (FUNC_NAME, port, "missing close paren",
452 SCM_EOL);
453 return ans;
454 }
455
456 /* Build the head of the list structure. */
457 ans = tl = scm_cons (tmp, SCM_EOL);
458
459 while (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
460 {
461 SCM new_tail;
462
463 if (c == ')' || (c == ']' && opts->square_brackets_p))
464 scm_i_input_error (FUNC_NAME, port,
465 "in pair: mismatched close paren: ~A",
466 scm_list_1 (SCM_MAKE_CHAR (c)));
467
468 scm_ungetc (c, port);
469 tmp = scm_read_expression (port, opts);
470
471 /* See above note about scm_sym_dot. */
472 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
473 {
474 SCM_SETCDR (tl, scm_read_expression (port, opts));
475
476 c = flush_ws (port, opts, FUNC_NAME);
477 if (terminating_char != c)
478 scm_i_input_error (FUNC_NAME, port,
479 "in pair: missing close paren", SCM_EOL);
480 goto exit;
481 }
482
483 new_tail = scm_cons (tmp, SCM_EOL);
484 SCM_SETCDR (tl, new_tail);
485 tl = new_tail;
486 }
487
488 exit:
489 return maybe_annotate_source (ans, port, opts, line, column);
490 }
491 #undef FUNC_NAME
492
493
494 /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
495 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
496 found. */
497 #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
498 do \
499 { \
500 scm_t_wchar a; \
501 size_t i = 0; \
502 c = 0; \
503 while (i < ndigits) \
504 { \
505 a = scm_getc (port); \
506 if (a == EOF) \
507 goto str_eof; \
508 if (terminator \
509 && (a == (scm_t_wchar) terminator) \
510 && (i > 0)) \
511 break; \
512 if ('0' <= a && a <= '9') \
513 a -= '0'; \
514 else if ('A' <= a && a <= 'F') \
515 a = a - 'A' + 10; \
516 else if ('a' <= a && a <= 'f') \
517 a = a - 'a' + 10; \
518 else \
519 { \
520 c = a; \
521 goto bad_escaped; \
522 } \
523 c = c * 16 + a; \
524 i ++; \
525 } \
526 } while (0)
527
528 static void
529 skip_intraline_whitespace (SCM port)
530 {
531 scm_t_wchar c;
532
533 do
534 {
535 c = scm_getc (port);
536 if (c == EOF)
537 return;
538 }
539 while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
540
541 scm_ungetc (c, port);
542 }
543
544 static SCM
545 scm_read_string (int chr, SCM port, scm_t_read_opts *opts)
546 #define FUNC_NAME "scm_lreadr"
547 {
548 /* For strings smaller than C_STR, this function creates only one Scheme
549 object (the string returned). */
550
551 SCM str = SCM_EOL;
552 size_t c_str_len = 0;
553 scm_t_wchar c, c_str[READER_STRING_BUFFER_SIZE];
554
555 /* Need to capture line and column numbers here. */
556 long line = SCM_LINUM (port);
557 int column = SCM_COL (port) - 1;
558
559 while ('"' != (c = scm_getc (port)))
560 {
561 if (c == EOF)
562 {
563 str_eof:
564 scm_i_input_error (FUNC_NAME, port,
565 "end of file in string constant", SCM_EOL);
566 }
567
568 if (c_str_len + 1 >= READER_STRING_BUFFER_SIZE)
569 {
570 str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
571 c_str_len = 0;
572 }
573
574 if (c == '\\')
575 {
576 switch (c = scm_getc (port))
577 {
578 case EOF:
579 goto str_eof;
580 case '"':
581 case '\\':
582 break;
583 case '\n':
584 if (opts->hungry_eol_escapes_p)
585 skip_intraline_whitespace (port);
586 continue;
587 case '0':
588 c = '\0';
589 break;
590 case 'f':
591 c = '\f';
592 break;
593 case 'n':
594 c = '\n';
595 break;
596 case 'r':
597 c = '\r';
598 break;
599 case 't':
600 c = '\t';
601 break;
602 case 'a':
603 c = '\007';
604 break;
605 case 'v':
606 c = '\v';
607 break;
608 case 'b':
609 c = '\010';
610 break;
611 case 'x':
612 if (opts->r6rs_escapes_p)
613 SCM_READ_HEX_ESCAPE (10, ';');
614 else
615 SCM_READ_HEX_ESCAPE (2, '\0');
616 break;
617 case 'u':
618 if (!opts->r6rs_escapes_p)
619 {
620 SCM_READ_HEX_ESCAPE (4, '\0');
621 break;
622 }
623 case 'U':
624 if (!opts->r6rs_escapes_p)
625 {
626 SCM_READ_HEX_ESCAPE (6, '\0');
627 break;
628 }
629 default:
630 bad_escaped:
631 scm_i_input_error (FUNC_NAME, port,
632 "illegal character in escape sequence: ~S",
633 scm_list_1 (SCM_MAKE_CHAR (c)));
634 }
635 }
636
637 c_str[c_str_len++] = c;
638 }
639
640 if (scm_is_null (str))
641 /* Fast path: we got a string that fits in C_STR. */
642 str = scm_from_utf32_stringn (c_str, c_str_len);
643 else
644 {
645 if (c_str_len > 0)
646 str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
647
648 str = scm_string_concatenate_reverse (str, SCM_UNDEFINED, SCM_UNDEFINED);
649 }
650
651 return maybe_annotate_source (str, port, opts, line, column);
652 }
653 #undef FUNC_NAME
654
655
656 static SCM
657 scm_read_number (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
658 {
659 SCM result, str = SCM_EOL;
660 char local_buffer[READER_BUFFER_SIZE], *buffer;
661 size_t bytes_read;
662 scm_t_port *pt = SCM_PTAB_ENTRY (port);
663
664 /* Need to capture line and column numbers here. */
665 long line = SCM_LINUM (port);
666 int column = SCM_COL (port) - 1;
667
668 scm_ungetc (chr, port);
669 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
670 &bytes_read);
671
672 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
673
674 result = scm_string_to_number (str, SCM_UNDEFINED);
675 if (scm_is_false (result))
676 {
677 /* Return a symbol instead of a number */
678 if (opts->case_insensitive_p)
679 str = scm_string_downcase_x (str);
680 result = scm_string_to_symbol (str);
681 }
682 else if (SCM_NIMP (result))
683 result = maybe_annotate_source (result, port, opts, line, column);
684
685 SCM_COL (port) += scm_i_string_length (str);
686 return result;
687 }
688
689 static SCM
690 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
691 {
692 SCM result;
693 int ends_with_colon = 0;
694 size_t bytes_read;
695 int postfix = (opts->keyword_style == KEYWORD_STYLE_POSTFIX);
696 char local_buffer[READER_BUFFER_SIZE], *buffer;
697 scm_t_port *pt = SCM_PTAB_ENTRY (port);
698 SCM str;
699
700 scm_ungetc (chr, port);
701 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
702 &bytes_read);
703 if (bytes_read > 0)
704 ends_with_colon = buffer[bytes_read - 1] == ':';
705
706 if (postfix && ends_with_colon && (bytes_read > 1))
707 {
708 str = scm_from_stringn (buffer, bytes_read - 1,
709 pt->encoding, pt->ilseq_handler);
710
711 if (opts->case_insensitive_p)
712 str = scm_string_downcase_x (str);
713 result = scm_symbol_to_keyword (scm_string_to_symbol (str));
714 }
715 else
716 {
717 str = scm_from_stringn (buffer, bytes_read,
718 pt->encoding, pt->ilseq_handler);
719
720 if (opts->case_insensitive_p)
721 str = scm_string_downcase_x (str);
722 result = scm_string_to_symbol (str);
723 }
724
725 SCM_COL (port) += scm_i_string_length (str);
726 return result;
727 }
728
729 static SCM
730 scm_read_number_and_radix (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
731 #define FUNC_NAME "scm_lreadr"
732 {
733 SCM result;
734 size_t read;
735 char local_buffer[READER_BUFFER_SIZE], *buffer;
736 unsigned int radix;
737 SCM str;
738 scm_t_port *pt;
739
740 switch (chr)
741 {
742 case 'B':
743 case 'b':
744 radix = 2;
745 break;
746
747 case 'o':
748 case 'O':
749 radix = 8;
750 break;
751
752 case 'd':
753 case 'D':
754 radix = 10;
755 break;
756
757 case 'x':
758 case 'X':
759 radix = 16;
760 break;
761
762 default:
763 scm_ungetc (chr, port);
764 scm_ungetc ('#', port);
765 radix = 10;
766 }
767
768 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
769 &read);
770
771 pt = SCM_PTAB_ENTRY (port);
772 str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
773
774 result = scm_string_to_number (str, scm_from_uint (radix));
775
776 SCM_COL (port) += scm_i_string_length (str);
777
778 if (scm_is_true (result))
779 return result;
780
781 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
782
783 return SCM_BOOL_F;
784 }
785 #undef FUNC_NAME
786
787 static SCM
788 scm_read_quote (int chr, SCM port, scm_t_read_opts *opts)
789 {
790 SCM p;
791 long line = SCM_LINUM (port);
792 int column = SCM_COL (port) - 1;
793
794 switch (chr)
795 {
796 case '`':
797 p = scm_sym_quasiquote;
798 break;
799
800 case '\'':
801 p = scm_sym_quote;
802 break;
803
804 case ',':
805 {
806 scm_t_wchar c;
807
808 c = scm_getc (port);
809 if ('@' == c)
810 p = scm_sym_uq_splicing;
811 else
812 {
813 scm_ungetc (c, port);
814 p = scm_sym_unquote;
815 }
816 break;
817 }
818
819 default:
820 fprintf (stderr, "%s: unhandled quote character (%i)\n",
821 "scm_read_quote", chr);
822 abort ();
823 }
824
825 p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
826 return maybe_annotate_source (p, port, opts, line, column);
827 }
828
829 SCM_SYMBOL (sym_syntax, "syntax");
830 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
831 SCM_SYMBOL (sym_unsyntax, "unsyntax");
832 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
833
834 static SCM
835 scm_read_syntax (int chr, SCM port, scm_t_read_opts *opts)
836 {
837 SCM p;
838 long line = SCM_LINUM (port);
839 int column = SCM_COL (port) - 1;
840
841 switch (chr)
842 {
843 case '`':
844 p = sym_quasisyntax;
845 break;
846
847 case '\'':
848 p = sym_syntax;
849 break;
850
851 case ',':
852 {
853 int c;
854
855 c = scm_getc (port);
856 if ('@' == c)
857 p = sym_unsyntax_splicing;
858 else
859 {
860 scm_ungetc (c, port);
861 p = sym_unsyntax;
862 }
863 break;
864 }
865
866 default:
867 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
868 "scm_read_syntax", chr);
869 abort ();
870 }
871
872 p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
873 return maybe_annotate_source (p, port, opts, line, column);
874 }
875
876 static SCM
877 scm_read_nil (int chr, SCM port, scm_t_read_opts *opts)
878 {
879 SCM id = scm_read_mixed_case_symbol (chr, port, opts);
880
881 if (!scm_is_eq (id, sym_nil))
882 scm_i_input_error ("scm_read_nil", port,
883 "unexpected input while reading #nil: ~a",
884 scm_list_1 (id));
885
886 return SCM_ELISP_NIL;
887 }
888
889 static SCM
890 scm_read_semicolon_comment (int chr, SCM port)
891 {
892 int c;
893
894 /* We use the get_byte here because there is no need to get the
895 locale correct with comment input. This presumes that newline
896 always represents itself no matter what the encoding is. */
897 for (c = scm_get_byte_or_eof (port);
898 (c != EOF) && (c != '\n');
899 c = scm_get_byte_or_eof (port));
900
901 return SCM_UNSPECIFIED;
902 }
903
904 \f
905 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
906
907 static SCM
908 scm_read_boolean (int chr, SCM port)
909 {
910 switch (chr)
911 {
912 case 't':
913 case 'T':
914 return SCM_BOOL_T;
915
916 case 'f':
917 case 'F':
918 return SCM_BOOL_F;
919 }
920
921 return SCM_UNSPECIFIED;
922 }
923
924 static SCM
925 scm_read_character (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
926 #define FUNC_NAME "scm_lreadr"
927 {
928 char buffer[READER_CHAR_NAME_MAX_SIZE];
929 SCM charname;
930 size_t charname_len, bytes_read;
931 scm_t_wchar cp;
932 int overflow;
933 scm_t_port *pt;
934
935 overflow = read_token (port, opts, buffer, READER_CHAR_NAME_MAX_SIZE,
936 &bytes_read);
937 if (overflow)
938 scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
939
940 if (bytes_read == 0)
941 {
942 chr = scm_getc (port);
943 if (chr == EOF)
944 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
945 "while reading character", SCM_EOL);
946
947 /* CHR must be a token delimiter, like a whitespace. */
948 return (SCM_MAKE_CHAR (chr));
949 }
950
951 pt = SCM_PTAB_ENTRY (port);
952
953 /* Simple ASCII characters can be processed immediately. Also, simple
954 ISO-8859-1 characters can be processed immediately if the encoding for this
955 port is ISO-8859-1. */
956 if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
957 {
958 SCM_COL (port) += 1;
959 return SCM_MAKE_CHAR (buffer[0]);
960 }
961
962 /* Otherwise, convert the buffer into a proper scheme string for
963 processing. */
964 charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
965 pt->ilseq_handler);
966 charname_len = scm_i_string_length (charname);
967 SCM_COL (port) += charname_len;
968 cp = scm_i_string_ref (charname, 0);
969 if (charname_len == 1)
970 return SCM_MAKE_CHAR (cp);
971
972 /* Ignore dotted circles, which may be used to keep combining characters from
973 combining with the backslash in #\charname. */
974 if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
975 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
976
977 if (cp >= '0' && cp < '8')
978 {
979 /* Dirk:FIXME:: This type of character syntax is not R5RS
980 * compliant. Further, it should be verified that the constant
981 * does only consist of octal digits. */
982 SCM p = scm_string_to_number (charname, scm_from_uint (8));
983 if (SCM_I_INUMP (p))
984 {
985 scm_t_wchar c = scm_to_uint32 (p);
986 if (SCM_IS_UNICODE_CHAR (c))
987 return SCM_MAKE_CHAR (c);
988 else
989 scm_i_input_error (FUNC_NAME, port,
990 "out-of-range octal character escape: ~a",
991 scm_list_1 (charname));
992 }
993 }
994
995 if (cp == 'x' && (charname_len > 1))
996 {
997 SCM p;
998
999 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1000 p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
1001 scm_from_uint (16));
1002 if (SCM_I_INUMP (p))
1003 {
1004 scm_t_wchar c = scm_to_uint32 (p);
1005 if (SCM_IS_UNICODE_CHAR (c))
1006 return SCM_MAKE_CHAR (c);
1007 else
1008 scm_i_input_error (FUNC_NAME, port,
1009 "out-of-range hex character escape: ~a",
1010 scm_list_1 (charname));
1011 }
1012 }
1013
1014 /* The names of characters should never have non-Latin1
1015 characters. */
1016 if (scm_i_is_narrow_string (charname)
1017 || scm_i_try_narrow_string (charname))
1018 { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
1019 charname_len);
1020 if (scm_is_true (ch))
1021 return ch;
1022 }
1023
1024 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
1025 scm_list_1 (charname));
1026
1027 return SCM_UNSPECIFIED;
1028 }
1029 #undef FUNC_NAME
1030
1031 static SCM
1032 scm_read_keyword (int chr, SCM port, scm_t_read_opts *opts)
1033 {
1034 SCM symbol;
1035
1036 /* Read the symbol that comprises the keyword. Doing this instead of
1037 invoking a specific symbol reader function allows `scm_read_keyword ()'
1038 to adapt to the delimiters currently valid of symbols.
1039
1040 XXX: This implementation allows sloppy syntaxes like `#: key'. */
1041 symbol = scm_read_expression (port, opts);
1042 if (!scm_is_symbol (symbol))
1043 scm_i_input_error ("scm_read_keyword", port,
1044 "keyword prefix `~a' not followed by a symbol: ~s",
1045 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1046
1047 return (scm_symbol_to_keyword (symbol));
1048 }
1049
1050 static SCM
1051 scm_read_vector (int chr, SCM port, scm_t_read_opts *opts,
1052 long line, int column)
1053 {
1054 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1055 guarantee that it's going to do what we want. After all, this is an
1056 implementation detail of `scm_read_vector ()', not a desirable
1057 property. */
1058 return maybe_annotate_source (scm_vector (scm_read_sexp (chr, port, opts)),
1059 port, opts, line, column);
1060 }
1061
1062 /* Helper used by scm_read_array */
1063 static int
1064 read_decimal_integer (SCM port, int c, ssize_t *resp)
1065 {
1066 ssize_t sign = 1;
1067 ssize_t res = 0;
1068 int got_it = 0;
1069
1070 if (c == '-')
1071 {
1072 sign = -1;
1073 c = scm_getc (port);
1074 }
1075
1076 while ('0' <= c && c <= '9')
1077 {
1078 res = 10*res + c-'0';
1079 got_it = 1;
1080 c = scm_getc (port);
1081 }
1082
1083 if (got_it)
1084 *resp = sign * res;
1085 return c;
1086 }
1087
1088 /* Read an array. This function can also read vectors and uniform
1089 vectors. Also, the conflict between '#f' and '#f32' and '#f64' is
1090 handled here.
1091
1092 C is the first character read after the '#'.
1093 */
1094 static SCM
1095 scm_read_array (int c, SCM port, scm_t_read_opts *opts, long line, int column)
1096 {
1097 ssize_t rank;
1098 scm_t_wchar tag_buf[8];
1099 int tag_len;
1100
1101 SCM tag, shape = SCM_BOOL_F, elements, array;
1102
1103 /* XXX - shortcut for ordinary vectors. Shouldn't be necessary but
1104 the array code can not deal with zero-length dimensions yet, and
1105 we want to allow zero-length vectors, of course.
1106 */
1107 if (c == '(')
1108 return scm_read_vector (c, port, opts, line, column);
1109
1110 /* Disambiguate between '#f' and uniform floating point vectors.
1111 */
1112 if (c == 'f')
1113 {
1114 c = scm_getc (port);
1115 if (c != '3' && c != '6')
1116 {
1117 if (c != EOF)
1118 scm_ungetc (c, port);
1119 return SCM_BOOL_F;
1120 }
1121 rank = 1;
1122 tag_buf[0] = 'f';
1123 tag_len = 1;
1124 goto continue_reading_tag;
1125 }
1126
1127 /* Read rank. */
1128 rank = 1;
1129 c = read_decimal_integer (port, c, &rank);
1130 if (rank < 0)
1131 scm_i_input_error (NULL, port, "array rank must be non-negative",
1132 SCM_EOL);
1133
1134 /* Read tag. */
1135 tag_len = 0;
1136 continue_reading_tag:
1137 while (c != EOF && c != '(' && c != '@' && c != ':'
1138 && tag_len < sizeof tag_buf / sizeof tag_buf[0])
1139 {
1140 tag_buf[tag_len++] = c;
1141 c = scm_getc (port);
1142 }
1143 if (tag_len == 0)
1144 tag = SCM_BOOL_T;
1145 else
1146 {
1147 tag = scm_string_to_symbol (scm_from_utf32_stringn (tag_buf, tag_len));
1148 if (tag_len == sizeof tag_buf / sizeof tag_buf[0])
1149 scm_i_input_error (NULL, port, "invalid array tag, starting with: ~a",
1150 scm_list_1 (tag));
1151 }
1152
1153 /* Read shape. */
1154 if (c == '@' || c == ':')
1155 {
1156 shape = SCM_EOL;
1157
1158 do
1159 {
1160 ssize_t lbnd = 0, len = 0;
1161 SCM s;
1162
1163 if (c == '@')
1164 {
1165 c = scm_getc (port);
1166 c = read_decimal_integer (port, c, &lbnd);
1167 }
1168
1169 s = scm_from_ssize_t (lbnd);
1170
1171 if (c == ':')
1172 {
1173 c = scm_getc (port);
1174 c = read_decimal_integer (port, c, &len);
1175 if (len < 0)
1176 scm_i_input_error (NULL, port,
1177 "array length must be non-negative",
1178 SCM_EOL);
1179
1180 s = scm_list_2 (s, scm_from_ssize_t (lbnd+len-1));
1181 }
1182
1183 shape = scm_cons (s, shape);
1184 } while (c == '@' || c == ':');
1185
1186 shape = scm_reverse_x (shape, SCM_EOL);
1187 }
1188
1189 /* Read nested lists of elements. */
1190 if (c != '(')
1191 scm_i_input_error (NULL, port,
1192 "missing '(' in vector or array literal",
1193 SCM_EOL);
1194 elements = scm_read_sexp (c, port, opts);
1195
1196 if (scm_is_false (shape))
1197 shape = scm_from_ssize_t (rank);
1198 else if (scm_ilength (shape) != rank)
1199 scm_i_input_error
1200 (NULL, port,
1201 "the number of shape specifications must match the array rank",
1202 SCM_EOL);
1203
1204 /* Handle special print syntax of rank zero arrays; see
1205 scm_i_print_array for a rationale. */
1206 if (rank == 0)
1207 {
1208 if (!scm_is_pair (elements))
1209 scm_i_input_error (NULL, port,
1210 "too few elements in array literal, need 1",
1211 SCM_EOL);
1212 if (!scm_is_null (SCM_CDR (elements)))
1213 scm_i_input_error (NULL, port,
1214 "too many elements in array literal, want 1",
1215 SCM_EOL);
1216 elements = SCM_CAR (elements);
1217 }
1218
1219 /* Construct array, annotate with source location, and return. */
1220 array = scm_list_to_typed_array (tag, shape, elements);
1221 return maybe_annotate_source (array, port, opts, line, column);
1222 }
1223
1224 static SCM
1225 scm_read_srfi4_vector (int chr, SCM port, scm_t_read_opts *opts,
1226 long line, int column)
1227 {
1228 return scm_read_array (chr, port, opts, line, column);
1229 }
1230
1231 static SCM
1232 scm_read_bytevector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1233 long line, int column)
1234 {
1235 chr = scm_getc (port);
1236 if (chr != 'u')
1237 goto syntax;
1238
1239 chr = scm_getc (port);
1240 if (chr != '8')
1241 goto syntax;
1242
1243 chr = scm_getc (port);
1244 if (chr != '(')
1245 goto syntax;
1246
1247 return maybe_annotate_source
1248 (scm_u8_list_to_bytevector (scm_read_sexp (chr, port, opts)),
1249 port, opts, line, column);
1250
1251 syntax:
1252 scm_i_input_error ("read_bytevector", port,
1253 "invalid bytevector prefix",
1254 SCM_MAKE_CHAR (chr));
1255 return SCM_UNSPECIFIED;
1256 }
1257
1258 static SCM
1259 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1260 long line, int column)
1261 {
1262 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1263 terribly inefficient but who cares? */
1264 SCM s_bits = SCM_EOL;
1265
1266 for (chr = scm_getc (port);
1267 (chr != EOF) && ((chr == '0') || (chr == '1'));
1268 chr = scm_getc (port))
1269 {
1270 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1271 }
1272
1273 if (chr != EOF)
1274 scm_ungetc (chr, port);
1275
1276 return maybe_annotate_source
1277 (scm_bitvector (scm_reverse_x (s_bits, SCM_EOL)),
1278 port, opts, line, column);
1279 }
1280
1281 static SCM
1282 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1283 {
1284 int bang_seen = 0;
1285
1286 for (;;)
1287 {
1288 int c = scm_getc (port);
1289
1290 if (c == EOF)
1291 scm_i_input_error ("skip_block_comment", port,
1292 "unterminated `#! ... !#' comment", SCM_EOL);
1293
1294 if (c == '!')
1295 bang_seen = 1;
1296 else if (c == '#' && bang_seen)
1297 break;
1298 else
1299 bang_seen = 0;
1300 }
1301
1302 return SCM_UNSPECIFIED;
1303 }
1304
1305 static SCM
1306 scm_read_shebang (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
1307 {
1308 int c = 0;
1309 if ((c = scm_get_byte_or_eof (port)) != 'r')
1310 {
1311 scm_ungetc (c, port);
1312 return scm_read_scsh_block_comment (chr, port);
1313 }
1314 if ((c = scm_get_byte_or_eof (port)) != '6')
1315 {
1316 scm_ungetc (c, port);
1317 scm_ungetc ('r', port);
1318 return scm_read_scsh_block_comment (chr, port);
1319 }
1320 if ((c = scm_get_byte_or_eof (port)) != 'r')
1321 {
1322 scm_ungetc (c, port);
1323 scm_ungetc ('6', port);
1324 scm_ungetc ('r', port);
1325 return scm_read_scsh_block_comment (chr, port);
1326 }
1327 if ((c = scm_get_byte_or_eof (port)) != 's')
1328 {
1329 scm_ungetc (c, port);
1330 scm_ungetc ('r', port);
1331 scm_ungetc ('6', port);
1332 scm_ungetc ('r', port);
1333 return scm_read_scsh_block_comment (chr, port);
1334 }
1335
1336 return SCM_UNSPECIFIED;
1337 }
1338
1339 static SCM
1340 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1341 {
1342 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1343 nested. So care must be taken. */
1344 int nesting_level = 1;
1345
1346 int a = scm_getc (port);
1347
1348 if (a == EOF)
1349 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1350 "unterminated `#| ... |#' comment", SCM_EOL);
1351
1352 while (nesting_level > 0)
1353 {
1354 int b = scm_getc (port);
1355
1356 if (b == EOF)
1357 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1358 "unterminated `#| ... |#' comment", SCM_EOL);
1359
1360 if (a == '|' && b == '#')
1361 {
1362 nesting_level--;
1363 b = EOF;
1364 }
1365 else if (a == '#' && b == '|')
1366 {
1367 nesting_level++;
1368 b = EOF;
1369 }
1370
1371 a = b;
1372 }
1373
1374 return SCM_UNSPECIFIED;
1375 }
1376
1377 static SCM
1378 scm_read_commented_expression (scm_t_wchar chr, SCM port,
1379 scm_t_read_opts *opts)
1380 {
1381 scm_t_wchar c;
1382
1383 c = flush_ws (port, opts, (char *) NULL);
1384 if (EOF == c)
1385 scm_i_input_error ("read_commented_expression", port,
1386 "no expression after #; comment", SCM_EOL);
1387 scm_ungetc (c, port);
1388 scm_read_expression (port, opts);
1389 return SCM_UNSPECIFIED;
1390 }
1391
1392 static SCM
1393 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1394 {
1395 /* Guile's extended symbol read syntax looks like this:
1396
1397 #{This is all a symbol name}#
1398
1399 So here, CHR is expected to be `{'. */
1400 int saw_brace = 0;
1401 size_t len = 0;
1402 SCM buf = scm_i_make_string (1024, NULL, 0);
1403
1404 buf = scm_i_string_start_writing (buf);
1405
1406 while ((chr = scm_getc (port)) != EOF)
1407 {
1408 if (saw_brace)
1409 {
1410 if (chr == '#')
1411 {
1412 break;
1413 }
1414 else
1415 {
1416 saw_brace = 0;
1417 scm_i_string_set_x (buf, len++, '}');
1418 }
1419 }
1420
1421 if (chr == '}')
1422 saw_brace = 1;
1423 else if (chr == '\\')
1424 {
1425 /* It used to be that print.c would print extended-read-syntax
1426 symbols with backslashes before "non-standard" chars, but
1427 this routine wouldn't do anything with those escapes.
1428 Bummer. What we've done is to change print.c to output
1429 R6RS hex escapes for those characters, relying on the fact
1430 that the extended read syntax would never put a `\' before
1431 an `x'. For now, we just ignore other instances of
1432 backslash in the string. */
1433 switch ((chr = scm_getc (port)))
1434 {
1435 case EOF:
1436 goto done;
1437 case 'x':
1438 {
1439 scm_t_wchar c;
1440
1441 SCM_READ_HEX_ESCAPE (10, ';');
1442 scm_i_string_set_x (buf, len++, c);
1443 break;
1444
1445 str_eof:
1446 chr = EOF;
1447 goto done;
1448
1449 bad_escaped:
1450 scm_i_string_stop_writing ();
1451 scm_i_input_error ("scm_read_extended_symbol", port,
1452 "illegal character in escape sequence: ~S",
1453 scm_list_1 (SCM_MAKE_CHAR (c)));
1454 break;
1455 }
1456 default:
1457 scm_i_string_set_x (buf, len++, chr);
1458 break;
1459 }
1460 }
1461 else
1462 scm_i_string_set_x (buf, len++, chr);
1463
1464 if (len >= scm_i_string_length (buf) - 2)
1465 {
1466 SCM addy;
1467
1468 scm_i_string_stop_writing ();
1469 addy = scm_i_make_string (1024, NULL, 0);
1470 buf = scm_string_append (scm_list_2 (buf, addy));
1471 len = 0;
1472 buf = scm_i_string_start_writing (buf);
1473 }
1474 }
1475
1476 done:
1477 scm_i_string_stop_writing ();
1478 if (chr == EOF)
1479 scm_i_input_error ("scm_read_extended_symbol", port,
1480 "end of file while reading symbol", SCM_EOL);
1481
1482 return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1483 }
1484
1485
1486 \f
1487 /* Top-level token readers, i.e., dispatchers. */
1488
1489 static SCM
1490 scm_read_sharp_extension (int chr, SCM port, scm_t_read_opts *opts)
1491 {
1492 SCM proc;
1493
1494 proc = scm_get_hash_procedure (chr);
1495 if (scm_is_true (scm_procedure_p (proc)))
1496 {
1497 long line = SCM_LINUM (port);
1498 int column = SCM_COL (port) - 2;
1499 SCM got;
1500
1501 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1502
1503 if (opts->record_positions_p && SCM_NIMP (got)
1504 && !scm_i_has_source_properties (got))
1505 scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1506
1507 return got;
1508 }
1509
1510 return SCM_UNSPECIFIED;
1511 }
1512
1513 /* The reader for the sharp `#' character. It basically dispatches reads
1514 among the above token readers. */
1515 static SCM
1516 scm_read_sharp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1517 long line, int column)
1518 #define FUNC_NAME "scm_lreadr"
1519 {
1520 SCM result;
1521
1522 chr = scm_getc (port);
1523
1524 result = scm_read_sharp_extension (chr, port, opts);
1525 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1526 return result;
1527
1528 switch (chr)
1529 {
1530 case '\\':
1531 return (scm_read_character (chr, port, opts));
1532 case '(':
1533 return (scm_read_vector (chr, port, opts, line, column));
1534 case 's':
1535 case 'u':
1536 case 'f':
1537 case 'c':
1538 /* This one may return either a boolean or an SRFI-4 vector. */
1539 return (scm_read_srfi4_vector (chr, port, opts, line, column));
1540 case 'v':
1541 return (scm_read_bytevector (chr, port, opts, line, column));
1542 case '*':
1543 return (scm_read_guile_bit_vector (chr, port, opts, line, column));
1544 case 't':
1545 case 'T':
1546 case 'F':
1547 return (scm_read_boolean (chr, port));
1548 case ':':
1549 return (scm_read_keyword (chr, port, opts));
1550 case '0': case '1': case '2': case '3': case '4':
1551 case '5': case '6': case '7': case '8': case '9':
1552 case '@':
1553 #if SCM_ENABLE_DEPRECATED
1554 /* See below for 'i' and 'e'. */
1555 case 'a':
1556 case 'y':
1557 case 'h':
1558 case 'l':
1559 #endif
1560 return (scm_read_array (chr, port, opts, line, column));
1561
1562 case 'i':
1563 case 'e':
1564 #if SCM_ENABLE_DEPRECATED
1565 {
1566 /* When next char is '(', it really is an old-style
1567 uniform array. */
1568 scm_t_wchar next_c = scm_getc (port);
1569 if (next_c != EOF)
1570 scm_ungetc (next_c, port);
1571 if (next_c == '(')
1572 return scm_read_array (chr, port, opts, line, column);
1573 /* Fall through. */
1574 }
1575 #endif
1576 case 'b':
1577 case 'B':
1578 case 'o':
1579 case 'O':
1580 case 'd':
1581 case 'D':
1582 case 'x':
1583 case 'X':
1584 case 'I':
1585 case 'E':
1586 return (scm_read_number_and_radix (chr, port, opts));
1587 case '{':
1588 return (scm_read_extended_symbol (chr, port));
1589 case '!':
1590 return (scm_read_shebang (chr, port, opts));
1591 case ';':
1592 return (scm_read_commented_expression (chr, port, opts));
1593 case '`':
1594 case '\'':
1595 case ',':
1596 return (scm_read_syntax (chr, port, opts));
1597 case 'n':
1598 return (scm_read_nil (chr, port, opts));
1599 default:
1600 result = scm_read_sharp_extension (chr, port, opts);
1601 if (scm_is_eq (result, SCM_UNSPECIFIED))
1602 {
1603 /* To remain compatible with 1.8 and earlier, the following
1604 characters have lower precedence than `read-hash-extend'
1605 characters. */
1606 switch (chr)
1607 {
1608 case '|':
1609 return scm_read_r6rs_block_comment (chr, port);
1610 default:
1611 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1612 scm_list_1 (SCM_MAKE_CHAR (chr)));
1613 }
1614 }
1615 else
1616 return result;
1617 }
1618
1619 return SCM_UNSPECIFIED;
1620 }
1621 #undef FUNC_NAME
1622
1623 static SCM
1624 scm_read_expression (SCM port, scm_t_read_opts *opts)
1625 #define FUNC_NAME "scm_read_expression"
1626 {
1627 while (1)
1628 {
1629 scm_t_wchar chr;
1630
1631 chr = scm_getc (port);
1632
1633 switch (chr)
1634 {
1635 case SCM_WHITE_SPACES:
1636 case SCM_LINE_INCREMENTORS:
1637 break;
1638 case ';':
1639 (void) scm_read_semicolon_comment (chr, port);
1640 break;
1641 case '[':
1642 if (!opts->square_brackets_p)
1643 return (scm_read_mixed_case_symbol (chr, port, opts));
1644 /* otherwise fall through */
1645 case '(':
1646 return (scm_read_sexp (chr, port, opts));
1647 case '"':
1648 return (scm_read_string (chr, port, opts));
1649 case '\'':
1650 case '`':
1651 case ',':
1652 return (scm_read_quote (chr, port, opts));
1653 case '#':
1654 {
1655 long line = SCM_LINUM (port);
1656 int column = SCM_COL (port) - 1;
1657 SCM result = scm_read_sharp (chr, port, opts, line, column);
1658 if (scm_is_eq (result, SCM_UNSPECIFIED))
1659 /* We read a comment or some such. */
1660 break;
1661 else
1662 return result;
1663 }
1664 case ')':
1665 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1666 break;
1667 case ']':
1668 if (opts->square_brackets_p)
1669 scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1670 /* otherwise fall through */
1671 case EOF:
1672 return SCM_EOF_VAL;
1673 case ':':
1674 if (opts->keyword_style == KEYWORD_STYLE_PREFIX)
1675 return scm_symbol_to_keyword (scm_read_expression (port, opts));
1676 /* Fall through. */
1677
1678 default:
1679 {
1680 if (((chr >= '0') && (chr <= '9'))
1681 || (strchr ("+-.", chr)))
1682 return (scm_read_number (chr, port, opts));
1683 else
1684 return (scm_read_mixed_case_symbol (chr, port, opts));
1685 }
1686 }
1687 }
1688 }
1689 #undef FUNC_NAME
1690
1691 \f
1692 /* Actual reader. */
1693
1694 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1695 (SCM port),
1696 "Read an s-expression from the input port @var{port}, or from\n"
1697 "the current input port if @var{port} is not specified.\n"
1698 "Any whitespace before the next token is discarded.")
1699 #define FUNC_NAME s_scm_read
1700 {
1701 scm_t_read_opts opts;
1702 int c;
1703
1704 if (SCM_UNBNDP (port))
1705 port = scm_current_input_port ();
1706 SCM_VALIDATE_OPINPORT (1, port);
1707
1708 init_read_options (&opts);
1709
1710 c = flush_ws (port, &opts, (char *) NULL);
1711 if (EOF == c)
1712 return SCM_EOF_VAL;
1713 scm_ungetc (c, port);
1714
1715 return (scm_read_expression (port, &opts));
1716 }
1717 #undef FUNC_NAME
1718
1719
1720 \f
1721
1722 /* Manipulate the read-hash-procedures alist. This could be written in
1723 Scheme, but maybe it will also be used by C code during initialisation. */
1724 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1725 (SCM chr, SCM proc),
1726 "Install the procedure @var{proc} for reading expressions\n"
1727 "starting with the character sequence @code{#} and @var{chr}.\n"
1728 "@var{proc} will be called with two arguments: the character\n"
1729 "@var{chr} and the port to read further data from. The object\n"
1730 "returned will be the return value of @code{read}. \n"
1731 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1732 )
1733 #define FUNC_NAME s_scm_read_hash_extend
1734 {
1735 SCM this;
1736 SCM prev;
1737
1738 SCM_VALIDATE_CHAR (1, chr);
1739 SCM_ASSERT (scm_is_false (proc)
1740 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1741 proc, SCM_ARG2, FUNC_NAME);
1742
1743 /* Check if chr is already in the alist. */
1744 this = scm_i_read_hash_procedures_ref ();
1745 prev = SCM_BOOL_F;
1746 while (1)
1747 {
1748 if (scm_is_null (this))
1749 {
1750 /* not found, so add it to the beginning. */
1751 if (scm_is_true (proc))
1752 {
1753 SCM new = scm_cons (scm_cons (chr, proc),
1754 scm_i_read_hash_procedures_ref ());
1755 scm_i_read_hash_procedures_set_x (new);
1756 }
1757 break;
1758 }
1759 if (scm_is_eq (chr, SCM_CAAR (this)))
1760 {
1761 /* already in the alist. */
1762 if (scm_is_false (proc))
1763 {
1764 /* remove it. */
1765 if (scm_is_false (prev))
1766 {
1767 SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1768 scm_i_read_hash_procedures_set_x (rest);
1769 }
1770 else
1771 scm_set_cdr_x (prev, SCM_CDR (this));
1772 }
1773 else
1774 {
1775 /* replace it. */
1776 scm_set_cdr_x (SCM_CAR (this), proc);
1777 }
1778 break;
1779 }
1780 prev = this;
1781 this = SCM_CDR (this);
1782 }
1783
1784 return SCM_UNSPECIFIED;
1785 }
1786 #undef FUNC_NAME
1787
1788 /* Recover the read-hash procedure corresponding to char c. */
1789 static SCM
1790 scm_get_hash_procedure (int c)
1791 {
1792 SCM rest = scm_i_read_hash_procedures_ref ();
1793
1794 while (1)
1795 {
1796 if (scm_is_null (rest))
1797 return SCM_BOOL_F;
1798
1799 if (SCM_CHAR (SCM_CAAR (rest)) == c)
1800 return SCM_CDAR (rest);
1801
1802 rest = SCM_CDR (rest);
1803 }
1804 }
1805
1806 #define SCM_ENCODING_SEARCH_SIZE (500)
1807
1808 /* Search the first few hundred characters of a file for an Emacs-like coding
1809 declaration. Returns either NULL or a string whose storage has been
1810 allocated with `scm_gc_malloc ()'. */
1811 char *
1812 scm_i_scan_for_encoding (SCM port)
1813 {
1814 scm_t_port *pt;
1815 char header[SCM_ENCODING_SEARCH_SIZE+1];
1816 size_t bytes_read, encoding_length, i;
1817 char *encoding = NULL;
1818 int utf8_bom = 0;
1819 char *pos, *encoding_start;
1820 int in_comment;
1821
1822 pt = SCM_PTAB_ENTRY (port);
1823
1824 if (pt->rw_active == SCM_PORT_WRITE)
1825 scm_flush (port);
1826
1827 if (pt->rw_random)
1828 pt->rw_active = SCM_PORT_READ;
1829
1830 if (pt->read_pos == pt->read_end)
1831 {
1832 /* We can use the read buffer, and thus avoid a seek. */
1833 if (scm_fill_input (port) == EOF)
1834 return NULL;
1835
1836 bytes_read = pt->read_end - pt->read_pos;
1837 if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
1838 bytes_read = SCM_ENCODING_SEARCH_SIZE;
1839
1840 if (bytes_read <= 1)
1841 /* An unbuffered port -- don't scan. */
1842 return NULL;
1843
1844 memcpy (header, pt->read_pos, bytes_read);
1845 header[bytes_read] = '\0';
1846 }
1847 else
1848 {
1849 /* Try to read some bytes and then seek back. Not all ports
1850 support seeking back; and indeed some file ports (like
1851 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
1852 check performed by SCM_FPORT_FDES---but fail to seek
1853 backwards. Hence this block comes second. We prefer to use
1854 the read buffer in-place. */
1855 if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1856 return NULL;
1857
1858 bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1859 header[bytes_read] = '\0';
1860 scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1861 }
1862
1863 if (bytes_read > 3
1864 && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1865 utf8_bom = 1;
1866
1867 /* search past "coding[:=]" */
1868 pos = header;
1869 while (1)
1870 {
1871 if ((pos = strstr(pos, "coding")) == NULL)
1872 return NULL;
1873
1874 pos += strlen("coding");
1875 if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1876 (*pos == ':' || *pos == '='))
1877 {
1878 pos ++;
1879 break;
1880 }
1881 }
1882
1883 /* skip spaces */
1884 while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1885 (*pos == ' ' || *pos == '\t'))
1886 pos ++;
1887
1888 /* grab the next token */
1889 encoding_start = pos;
1890 i = 0;
1891 while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
1892 && encoding_start + i - header < bytes_read
1893 && (isalnum ((int) encoding_start[i])
1894 || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
1895 i++;
1896
1897 encoding_length = i;
1898 if (encoding_length == 0)
1899 return NULL;
1900
1901 encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
1902 for (i = 0; i < encoding_length; i++)
1903 encoding[i] = toupper ((int) encoding[i]);
1904
1905 /* push backwards to make sure we were in a comment */
1906 in_comment = 0;
1907 pos = encoding_start;
1908 while (pos >= header)
1909 {
1910 if (*pos == ';')
1911 {
1912 in_comment = 1;
1913 break;
1914 }
1915 else if (*pos == '\n' || pos == header)
1916 {
1917 /* This wasn't in a semicolon comment. Check for a
1918 hash-bang comment. */
1919 char *beg = strstr (header, "#!");
1920 char *end = strstr (header, "!#");
1921 if (beg < encoding_start && encoding_start + encoding_length <= end)
1922 in_comment = 1;
1923 break;
1924 }
1925 else
1926 {
1927 pos --;
1928 continue;
1929 }
1930 }
1931 if (!in_comment)
1932 /* This wasn't in a comment */
1933 return NULL;
1934
1935 if (utf8_bom && strcmp(encoding, "UTF-8"))
1936 scm_misc_error (NULL,
1937 "the port input declares the encoding ~s but is encoded as UTF-8",
1938 scm_list_1 (scm_from_locale_string (encoding)));
1939
1940 return encoding;
1941 }
1942
1943 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1944 (SCM port),
1945 "Scans the port for an Emacs-like character coding declaration\n"
1946 "near the top of the contents of a port with random-accessible contents.\n"
1947 "The coding declaration is of the form\n"
1948 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1949 "\n"
1950 "Returns a string containing the character encoding of the file\n"
1951 "if a declaration was found, or @code{#f} otherwise.\n")
1952 #define FUNC_NAME s_scm_file_encoding
1953 {
1954 char *enc;
1955 SCM s_enc;
1956
1957 SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
1958
1959 enc = scm_i_scan_for_encoding (port);
1960 if (enc == NULL)
1961 return SCM_BOOL_F;
1962 else
1963 {
1964 s_enc = scm_from_locale_string (enc);
1965 return s_enc;
1966 }
1967
1968 return SCM_BOOL_F;
1969 }
1970 #undef FUNC_NAME
1971
1972 void
1973 scm_init_read ()
1974 {
1975 SCM read_hash_procs;
1976
1977 read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
1978
1979 scm_i_read_hash_procedures =
1980 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
1981
1982 scm_init_opts (scm_read_options, scm_read_opts);
1983 #include "libguile/read.x"
1984 }
1985
1986 /*
1987 Local Variables:
1988 c-file-style: "gnu"
1989 End:
1990 */