Generalize scm_read_shebang to handle other reader directives.
[bpt/guile.git] / libguile / read.c
1 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
2 * 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
18 */
19
20
21 \f
22
23 #ifdef HAVE_CONFIG_H
24 # include <config.h>
25 #endif
26
27 #include <stdio.h>
28 #include <ctype.h>
29 #include <string.h>
30 #include <unistd.h>
31 #include <unicase.h>
32 #include <unictype.h>
33
34 #include "libguile/_scm.h"
35 #include "libguile/bytevectors.h"
36 #include "libguile/chars.h"
37 #include "libguile/eval.h"
38 #include "libguile/arrays.h"
39 #include "libguile/bitvectors.h"
40 #include "libguile/keywords.h"
41 #include "libguile/alist.h"
42 #include "libguile/srcprop.h"
43 #include "libguile/hashtab.h"
44 #include "libguile/hash.h"
45 #include "libguile/ports.h"
46 #include "libguile/fports.h"
47 #include "libguile/root.h"
48 #include "libguile/strings.h"
49 #include "libguile/strports.h"
50 #include "libguile/vectors.h"
51 #include "libguile/validate.h"
52 #include "libguile/srfi-4.h"
53 #include "libguile/srfi-13.h"
54
55 #include "libguile/read.h"
56 #include "libguile/private-options.h"
57
58
59 \f
60
61 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
62 SCM_SYMBOL (scm_keyword_prefix, "prefix");
63 SCM_SYMBOL (scm_keyword_postfix, "postfix");
64 SCM_SYMBOL (sym_nil, "nil");
65
66 scm_t_option scm_read_opts[] = {
67 { SCM_OPTION_BOOLEAN, "copy", 0,
68 "Copy source code expressions." },
69 { SCM_OPTION_BOOLEAN, "positions", 1,
70 "Record positions of source code expressions." },
71 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
72 "Convert symbols to lower case."},
73 { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
74 "Style of keyword recognition: #f, 'prefix or 'postfix."},
75 { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
76 "Use R6RS variable-length character and string hex escapes."},
77 { SCM_OPTION_BOOLEAN, "square-brackets", 1,
78 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
79 { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
80 "In strings, consume leading whitespace after an escaped end-of-line."},
81 { 0, },
82 };
83
84 /* Internal read options structure. This is initialized by 'scm_read'
85 from the global read options, and a pointer is passed down to all
86 helper functions. */
87 enum t_keyword_style {
88 KEYWORD_STYLE_HASH_PREFIX,
89 KEYWORD_STYLE_PREFIX,
90 KEYWORD_STYLE_POSTFIX
91 };
92
93 struct t_read_opts {
94 enum t_keyword_style keyword_style;
95 unsigned int copy_source_p : 1;
96 unsigned int record_positions_p : 1;
97 unsigned int case_insensitive_p : 1;
98 unsigned int r6rs_escapes_p : 1;
99 unsigned int square_brackets_p : 1;
100 unsigned int hungry_eol_escapes_p : 1;
101 };
102
103 typedef struct t_read_opts scm_t_read_opts;
104
105 /* Initialize OPTS from the global read options. */
106 static void
107 init_read_options (scm_t_read_opts *opts)
108 {
109 SCM val;
110 int x;
111
112 val = SCM_PACK (SCM_KEYWORD_STYLE);
113 if (scm_is_eq (val, scm_keyword_prefix))
114 x = KEYWORD_STYLE_PREFIX;
115 else if (scm_is_eq (val, scm_keyword_postfix))
116 x = KEYWORD_STYLE_POSTFIX;
117 else
118 x = KEYWORD_STYLE_HASH_PREFIX;
119 opts->keyword_style = x;
120
121 #define RESOLVE_BOOLEAN_OPTION(NAME, name) \
122 (opts->name = !!SCM_ ## NAME)
123
124 RESOLVE_BOOLEAN_OPTION (COPY_SOURCE_P, copy_source_p);
125 RESOLVE_BOOLEAN_OPTION (RECORD_POSITIONS_P, record_positions_p);
126 RESOLVE_BOOLEAN_OPTION (CASE_INSENSITIVE_P, case_insensitive_p);
127 RESOLVE_BOOLEAN_OPTION (R6RS_ESCAPES_P, r6rs_escapes_p);
128 RESOLVE_BOOLEAN_OPTION (SQUARE_BRACKETS_P, square_brackets_p);
129 RESOLVE_BOOLEAN_OPTION (HUNGRY_EOL_ESCAPES_P, hungry_eol_escapes_p);
130
131 #undef RESOLVE_BOOLEAN_OPTION
132 }
133
134
135 /*
136 Give meaningful error messages for errors
137
138 We use the format
139
140 FILE:LINE:COL: MESSAGE
141 This happened in ....
142
143 This is not standard GNU format, but the test-suite likes the real
144 message to be in front.
145
146 */
147
148
149 void
150 scm_i_input_error (char const *function,
151 SCM port, const char *message, SCM arg)
152 {
153 SCM fn = (scm_is_string (SCM_FILENAME(port))
154 ? SCM_FILENAME(port)
155 : scm_from_locale_string ("#<unknown port>"));
156
157 SCM string_port = scm_open_output_string ();
158 SCM string = SCM_EOL;
159 scm_simple_format (string_port,
160 scm_from_locale_string ("~A:~S:~S: ~A"),
161 scm_list_4 (fn,
162 scm_from_long (SCM_LINUM (port) + 1),
163 scm_from_int (SCM_COL (port) + 1),
164 scm_from_locale_string (message)));
165
166 string = scm_get_output_string (string_port);
167 scm_close_output_port (string_port);
168 scm_error_scm (scm_from_latin1_symbol ("read-error"),
169 function? scm_from_locale_string (function) : SCM_BOOL_F,
170 string,
171 arg,
172 SCM_BOOL_F);
173 }
174
175
176 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
177 (SCM setting),
178 "Option interface for the read options. Instead of using\n"
179 "this procedure directly, use the procedures @code{read-enable},\n"
180 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
181 #define FUNC_NAME s_scm_read_options
182 {
183 SCM ans = scm_options (setting,
184 scm_read_opts,
185 FUNC_NAME);
186 if (SCM_COPY_SOURCE_P)
187 SCM_RECORD_POSITIONS_P = 1;
188 return ans;
189 }
190 #undef FUNC_NAME
191
192 /* A fluid referring to an association list mapping extra hash
193 characters to procedures. */
194 static SCM *scm_i_read_hash_procedures;
195
196 static SCM
197 scm_i_read_hash_procedures_ref (void)
198 {
199 return scm_fluid_ref (*scm_i_read_hash_procedures);
200 }
201
202 static void
203 scm_i_read_hash_procedures_set_x (SCM value)
204 {
205 scm_fluid_set_x (*scm_i_read_hash_procedures, value);
206 }
207
208 \f
209 /* Token readers. */
210
211
212 /* Size of the C buffer used to read symbols and numbers. */
213 #define READER_BUFFER_SIZE 128
214
215 /* Number of 32-bit codepoints in the buffer used to read strings. */
216 #define READER_STRING_BUFFER_SIZE 128
217
218 /* The maximum size of Scheme character names. */
219 #define READER_CHAR_NAME_MAX_SIZE 50
220
221 /* The maximum size of reader directive names. */
222 #define READER_DIRECTIVE_NAME_MAX_SIZE 50
223
224
225 /* `isblank' is only in C99. */
226 #define CHAR_IS_BLANK_(_chr) \
227 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
228 || ((_chr) == '\f') || ((_chr) == '\r'))
229
230 #ifdef MSDOS
231 # define CHAR_IS_BLANK(_chr) \
232 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
233 #else
234 # define CHAR_IS_BLANK CHAR_IS_BLANK_
235 #endif
236
237
238 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
239 structure''). */
240 #define CHAR_IS_R5RS_DELIMITER(c) \
241 (CHAR_IS_BLANK (c) \
242 || (c) == ')' || (c) == '(' || (c) == ';' || (c) == '"')
243
244 #define CHAR_IS_DELIMITER(c) \
245 (CHAR_IS_R5RS_DELIMITER (c) \
246 || (((c) == ']' || (c) == '[') && opts->square_brackets_p))
247
248 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
249 Structure''. */
250 #define CHAR_IS_EXPONENT_MARKER(_chr) \
251 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
252 || ((_chr) == 'd') || ((_chr) == 'l'))
253
254 /* Read an SCSH block comment. */
255 static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
256 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
257 static SCM scm_read_commented_expression (scm_t_wchar, SCM, scm_t_read_opts *);
258 static SCM scm_read_shebang (scm_t_wchar, SCM, scm_t_read_opts *);
259 static SCM scm_get_hash_procedure (int);
260
261 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
262 result in the pre-allocated buffer BUF. Return zero if the whole token has
263 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
264 bytes actually read. */
265 static int
266 read_token (SCM port, scm_t_read_opts *opts,
267 char *buf, size_t buf_size, size_t *read)
268 {
269 *read = 0;
270
271 while (*read < buf_size)
272 {
273 int chr;
274
275 chr = scm_get_byte_or_eof (port);
276
277 if (chr == EOF)
278 return 0;
279 else if (CHAR_IS_DELIMITER (chr))
280 {
281 scm_unget_byte (chr, port);
282 return 0;
283 }
284 else
285 {
286 *buf = (char) chr;
287 buf++, (*read)++;
288 }
289 }
290
291 return 1;
292 }
293
294 /* Like `read_token', but return either BUFFER, or a GC-allocated buffer
295 if the token doesn't fit in BUFFER_SIZE bytes. */
296 static char *
297 read_complete_token (SCM port, scm_t_read_opts *opts,
298 char *buffer, size_t buffer_size, size_t *read)
299 {
300 int overflow = 0;
301 size_t bytes_read, overflow_size = 0;
302 char *overflow_buffer = NULL;
303
304 do
305 {
306 overflow = read_token (port, opts, buffer, buffer_size, &bytes_read);
307 if (bytes_read == 0)
308 break;
309 if (overflow || overflow_size != 0)
310 {
311 if (overflow_size == 0)
312 {
313 overflow_buffer = scm_gc_malloc_pointerless (bytes_read, "read");
314 memcpy (overflow_buffer, buffer, bytes_read);
315 overflow_size = bytes_read;
316 }
317 else
318 {
319 char *new_buf =
320 scm_gc_malloc_pointerless (overflow_size + bytes_read, "read");
321
322 memcpy (new_buf, overflow_buffer, overflow_size);
323 memcpy (new_buf + overflow_size, buffer, bytes_read);
324
325 overflow_buffer = new_buf;
326 overflow_size += bytes_read;
327 }
328 }
329 }
330 while (overflow);
331
332 if (overflow_size)
333 *read = overflow_size;
334 else
335 *read = bytes_read;
336
337 return (overflow_size > 0 ? overflow_buffer : buffer);
338 }
339
340 /* Skip whitespace from PORT and return the first non-whitespace character
341 read. Raise an error on end-of-file. */
342 static int
343 flush_ws (SCM port, scm_t_read_opts *opts, const char *eoferr)
344 {
345 scm_t_wchar c;
346 while (1)
347 switch (c = scm_getc (port))
348 {
349 case EOF:
350 goteof:
351 if (eoferr)
352 {
353 scm_i_input_error (eoferr,
354 port,
355 "end of file",
356 SCM_EOL);
357 }
358 return c;
359
360 case ';':
361 lp:
362 switch (c = scm_getc (port))
363 {
364 case EOF:
365 goto goteof;
366 default:
367 goto lp;
368 case SCM_LINE_INCREMENTORS:
369 break;
370 }
371 break;
372
373 case '#':
374 switch (c = scm_getc (port))
375 {
376 case EOF:
377 eoferr = "read_sharp";
378 goto goteof;
379 case '!':
380 scm_read_shebang (c, port, opts);
381 break;
382 case ';':
383 scm_read_commented_expression (c, port, opts);
384 break;
385 case '|':
386 if (scm_is_false (scm_get_hash_procedure (c)))
387 {
388 scm_read_r6rs_block_comment (c, port);
389 break;
390 }
391 /* fall through */
392 default:
393 scm_ungetc (c, port);
394 return '#';
395 }
396 break;
397
398 case SCM_LINE_INCREMENTORS:
399 case SCM_SINGLE_SPACES:
400 case '\t':
401 break;
402
403 default:
404 return c;
405 }
406
407 return 0;
408 }
409
410
411 \f
412 /* Token readers. */
413
414 static SCM scm_read_expression (SCM port, scm_t_read_opts *opts);
415 static SCM scm_read_sharp (int chr, SCM port, scm_t_read_opts *opts,
416 long line, int column);
417
418
419 static SCM
420 maybe_annotate_source (SCM x, SCM port, scm_t_read_opts *opts,
421 long line, int column)
422 {
423 if (opts->record_positions_p)
424 scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
425 return x;
426 }
427
428 static SCM
429 scm_read_sexp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
430 #define FUNC_NAME "scm_i_lreadparen"
431 {
432 int c;
433 SCM tmp, tl, ans = SCM_EOL;
434 const int terminating_char = ((chr == '[') ? ']' : ')');
435
436 /* Need to capture line and column numbers here. */
437 long line = SCM_LINUM (port);
438 int column = SCM_COL (port) - 1;
439
440 c = flush_ws (port, opts, FUNC_NAME);
441 if (terminating_char == c)
442 return SCM_EOL;
443
444 scm_ungetc (c, port);
445 tmp = scm_read_expression (port, opts);
446
447 /* Note that it is possible for scm_read_expression to return
448 scm_sym_dot, but not as part of a dotted pair: as in #{.}#. So
449 check that it's a real dot by checking `c'. */
450 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
451 {
452 ans = scm_read_expression (port, opts);
453 if (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
454 scm_i_input_error (FUNC_NAME, port, "missing close paren",
455 SCM_EOL);
456 return ans;
457 }
458
459 /* Build the head of the list structure. */
460 ans = tl = scm_cons (tmp, SCM_EOL);
461
462 while (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
463 {
464 SCM new_tail;
465
466 if (c == ')' || (c == ']' && opts->square_brackets_p))
467 scm_i_input_error (FUNC_NAME, port,
468 "in pair: mismatched close paren: ~A",
469 scm_list_1 (SCM_MAKE_CHAR (c)));
470
471 scm_ungetc (c, port);
472 tmp = scm_read_expression (port, opts);
473
474 /* See above note about scm_sym_dot. */
475 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
476 {
477 SCM_SETCDR (tl, scm_read_expression (port, opts));
478
479 c = flush_ws (port, opts, FUNC_NAME);
480 if (terminating_char != c)
481 scm_i_input_error (FUNC_NAME, port,
482 "in pair: missing close paren", SCM_EOL);
483 goto exit;
484 }
485
486 new_tail = scm_cons (tmp, SCM_EOL);
487 SCM_SETCDR (tl, new_tail);
488 tl = new_tail;
489 }
490
491 exit:
492 return maybe_annotate_source (ans, port, opts, line, column);
493 }
494 #undef FUNC_NAME
495
496
497 /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
498 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
499 found. */
500 #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
501 do \
502 { \
503 scm_t_wchar a; \
504 size_t i = 0; \
505 c = 0; \
506 while (i < ndigits) \
507 { \
508 a = scm_getc (port); \
509 if (a == EOF) \
510 goto str_eof; \
511 if (terminator \
512 && (a == (scm_t_wchar) terminator) \
513 && (i > 0)) \
514 break; \
515 if ('0' <= a && a <= '9') \
516 a -= '0'; \
517 else if ('A' <= a && a <= 'F') \
518 a = a - 'A' + 10; \
519 else if ('a' <= a && a <= 'f') \
520 a = a - 'a' + 10; \
521 else \
522 { \
523 c = a; \
524 goto bad_escaped; \
525 } \
526 c = c * 16 + a; \
527 i ++; \
528 } \
529 } while (0)
530
531 static void
532 skip_intraline_whitespace (SCM port)
533 {
534 scm_t_wchar c;
535
536 do
537 {
538 c = scm_getc (port);
539 if (c == EOF)
540 return;
541 }
542 while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
543
544 scm_ungetc (c, port);
545 }
546
547 static SCM
548 scm_read_string (int chr, SCM port, scm_t_read_opts *opts)
549 #define FUNC_NAME "scm_lreadr"
550 {
551 /* For strings smaller than C_STR, this function creates only one Scheme
552 object (the string returned). */
553
554 SCM str = SCM_EOL;
555 size_t c_str_len = 0;
556 scm_t_wchar c, c_str[READER_STRING_BUFFER_SIZE];
557
558 /* Need to capture line and column numbers here. */
559 long line = SCM_LINUM (port);
560 int column = SCM_COL (port) - 1;
561
562 while ('"' != (c = scm_getc (port)))
563 {
564 if (c == EOF)
565 {
566 str_eof:
567 scm_i_input_error (FUNC_NAME, port,
568 "end of file in string constant", SCM_EOL);
569 }
570
571 if (c_str_len + 1 >= READER_STRING_BUFFER_SIZE)
572 {
573 str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
574 c_str_len = 0;
575 }
576
577 if (c == '\\')
578 {
579 switch (c = scm_getc (port))
580 {
581 case EOF:
582 goto str_eof;
583 case '"':
584 case '\\':
585 break;
586 case '\n':
587 if (opts->hungry_eol_escapes_p)
588 skip_intraline_whitespace (port);
589 continue;
590 case '0':
591 c = '\0';
592 break;
593 case 'f':
594 c = '\f';
595 break;
596 case 'n':
597 c = '\n';
598 break;
599 case 'r':
600 c = '\r';
601 break;
602 case 't':
603 c = '\t';
604 break;
605 case 'a':
606 c = '\007';
607 break;
608 case 'v':
609 c = '\v';
610 break;
611 case 'b':
612 c = '\010';
613 break;
614 case 'x':
615 if (opts->r6rs_escapes_p)
616 SCM_READ_HEX_ESCAPE (10, ';');
617 else
618 SCM_READ_HEX_ESCAPE (2, '\0');
619 break;
620 case 'u':
621 if (!opts->r6rs_escapes_p)
622 {
623 SCM_READ_HEX_ESCAPE (4, '\0');
624 break;
625 }
626 case 'U':
627 if (!opts->r6rs_escapes_p)
628 {
629 SCM_READ_HEX_ESCAPE (6, '\0');
630 break;
631 }
632 default:
633 bad_escaped:
634 scm_i_input_error (FUNC_NAME, port,
635 "illegal character in escape sequence: ~S",
636 scm_list_1 (SCM_MAKE_CHAR (c)));
637 }
638 }
639
640 c_str[c_str_len++] = c;
641 }
642
643 if (scm_is_null (str))
644 /* Fast path: we got a string that fits in C_STR. */
645 str = scm_from_utf32_stringn (c_str, c_str_len);
646 else
647 {
648 if (c_str_len > 0)
649 str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
650
651 str = scm_string_concatenate_reverse (str, SCM_UNDEFINED, SCM_UNDEFINED);
652 }
653
654 return maybe_annotate_source (str, port, opts, line, column);
655 }
656 #undef FUNC_NAME
657
658
659 static SCM
660 scm_read_number (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
661 {
662 SCM result, str = SCM_EOL;
663 char local_buffer[READER_BUFFER_SIZE], *buffer;
664 size_t bytes_read;
665 scm_t_port *pt = SCM_PTAB_ENTRY (port);
666
667 /* Need to capture line and column numbers here. */
668 long line = SCM_LINUM (port);
669 int column = SCM_COL (port) - 1;
670
671 scm_ungetc (chr, port);
672 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
673 &bytes_read);
674
675 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
676
677 result = scm_string_to_number (str, SCM_UNDEFINED);
678 if (scm_is_false (result))
679 {
680 /* Return a symbol instead of a number */
681 if (opts->case_insensitive_p)
682 str = scm_string_downcase_x (str);
683 result = scm_string_to_symbol (str);
684 }
685 else if (SCM_NIMP (result))
686 result = maybe_annotate_source (result, port, opts, line, column);
687
688 SCM_COL (port) += scm_i_string_length (str);
689 return result;
690 }
691
692 static SCM
693 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
694 {
695 SCM result;
696 int ends_with_colon = 0;
697 size_t bytes_read;
698 int postfix = (opts->keyword_style == KEYWORD_STYLE_POSTFIX);
699 char local_buffer[READER_BUFFER_SIZE], *buffer;
700 scm_t_port *pt = SCM_PTAB_ENTRY (port);
701 SCM str;
702
703 scm_ungetc (chr, port);
704 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
705 &bytes_read);
706 if (bytes_read > 0)
707 ends_with_colon = buffer[bytes_read - 1] == ':';
708
709 if (postfix && ends_with_colon && (bytes_read > 1))
710 {
711 str = scm_from_stringn (buffer, bytes_read - 1,
712 pt->encoding, pt->ilseq_handler);
713
714 if (opts->case_insensitive_p)
715 str = scm_string_downcase_x (str);
716 result = scm_symbol_to_keyword (scm_string_to_symbol (str));
717 }
718 else
719 {
720 str = scm_from_stringn (buffer, bytes_read,
721 pt->encoding, pt->ilseq_handler);
722
723 if (opts->case_insensitive_p)
724 str = scm_string_downcase_x (str);
725 result = scm_string_to_symbol (str);
726 }
727
728 SCM_COL (port) += scm_i_string_length (str);
729 return result;
730 }
731
732 static SCM
733 scm_read_number_and_radix (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
734 #define FUNC_NAME "scm_lreadr"
735 {
736 SCM result;
737 size_t read;
738 char local_buffer[READER_BUFFER_SIZE], *buffer;
739 unsigned int radix;
740 SCM str;
741 scm_t_port *pt;
742
743 switch (chr)
744 {
745 case 'B':
746 case 'b':
747 radix = 2;
748 break;
749
750 case 'o':
751 case 'O':
752 radix = 8;
753 break;
754
755 case 'd':
756 case 'D':
757 radix = 10;
758 break;
759
760 case 'x':
761 case 'X':
762 radix = 16;
763 break;
764
765 default:
766 scm_ungetc (chr, port);
767 scm_ungetc ('#', port);
768 radix = 10;
769 }
770
771 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
772 &read);
773
774 pt = SCM_PTAB_ENTRY (port);
775 str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
776
777 result = scm_string_to_number (str, scm_from_uint (radix));
778
779 SCM_COL (port) += scm_i_string_length (str);
780
781 if (scm_is_true (result))
782 return result;
783
784 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
785
786 return SCM_BOOL_F;
787 }
788 #undef FUNC_NAME
789
790 static SCM
791 scm_read_quote (int chr, SCM port, scm_t_read_opts *opts)
792 {
793 SCM p;
794 long line = SCM_LINUM (port);
795 int column = SCM_COL (port) - 1;
796
797 switch (chr)
798 {
799 case '`':
800 p = scm_sym_quasiquote;
801 break;
802
803 case '\'':
804 p = scm_sym_quote;
805 break;
806
807 case ',':
808 {
809 scm_t_wchar c;
810
811 c = scm_getc (port);
812 if ('@' == c)
813 p = scm_sym_uq_splicing;
814 else
815 {
816 scm_ungetc (c, port);
817 p = scm_sym_unquote;
818 }
819 break;
820 }
821
822 default:
823 fprintf (stderr, "%s: unhandled quote character (%i)\n",
824 "scm_read_quote", chr);
825 abort ();
826 }
827
828 p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
829 return maybe_annotate_source (p, port, opts, line, column);
830 }
831
832 SCM_SYMBOL (sym_syntax, "syntax");
833 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
834 SCM_SYMBOL (sym_unsyntax, "unsyntax");
835 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
836
837 static SCM
838 scm_read_syntax (int chr, SCM port, scm_t_read_opts *opts)
839 {
840 SCM p;
841 long line = SCM_LINUM (port);
842 int column = SCM_COL (port) - 1;
843
844 switch (chr)
845 {
846 case '`':
847 p = sym_quasisyntax;
848 break;
849
850 case '\'':
851 p = sym_syntax;
852 break;
853
854 case ',':
855 {
856 int c;
857
858 c = scm_getc (port);
859 if ('@' == c)
860 p = sym_unsyntax_splicing;
861 else
862 {
863 scm_ungetc (c, port);
864 p = sym_unsyntax;
865 }
866 break;
867 }
868
869 default:
870 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
871 "scm_read_syntax", chr);
872 abort ();
873 }
874
875 p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
876 return maybe_annotate_source (p, port, opts, line, column);
877 }
878
879 static SCM
880 scm_read_nil (int chr, SCM port, scm_t_read_opts *opts)
881 {
882 SCM id = scm_read_mixed_case_symbol (chr, port, opts);
883
884 if (!scm_is_eq (id, sym_nil))
885 scm_i_input_error ("scm_read_nil", port,
886 "unexpected input while reading #nil: ~a",
887 scm_list_1 (id));
888
889 return SCM_ELISP_NIL;
890 }
891
892 static SCM
893 scm_read_semicolon_comment (int chr, SCM port)
894 {
895 int c;
896
897 /* We use the get_byte here because there is no need to get the
898 locale correct with comment input. This presumes that newline
899 always represents itself no matter what the encoding is. */
900 for (c = scm_get_byte_or_eof (port);
901 (c != EOF) && (c != '\n');
902 c = scm_get_byte_or_eof (port));
903
904 return SCM_UNSPECIFIED;
905 }
906
907 \f
908 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
909
910 static SCM
911 scm_read_boolean (int chr, SCM port)
912 {
913 switch (chr)
914 {
915 case 't':
916 case 'T':
917 return SCM_BOOL_T;
918
919 case 'f':
920 case 'F':
921 return SCM_BOOL_F;
922 }
923
924 return SCM_UNSPECIFIED;
925 }
926
927 static SCM
928 scm_read_character (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
929 #define FUNC_NAME "scm_lreadr"
930 {
931 char buffer[READER_CHAR_NAME_MAX_SIZE];
932 SCM charname;
933 size_t charname_len, bytes_read;
934 scm_t_wchar cp;
935 int overflow;
936 scm_t_port *pt;
937
938 overflow = read_token (port, opts, buffer, READER_CHAR_NAME_MAX_SIZE,
939 &bytes_read);
940 if (overflow)
941 scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
942
943 if (bytes_read == 0)
944 {
945 chr = scm_getc (port);
946 if (chr == EOF)
947 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
948 "while reading character", SCM_EOL);
949
950 /* CHR must be a token delimiter, like a whitespace. */
951 return (SCM_MAKE_CHAR (chr));
952 }
953
954 pt = SCM_PTAB_ENTRY (port);
955
956 /* Simple ASCII characters can be processed immediately. Also, simple
957 ISO-8859-1 characters can be processed immediately if the encoding for this
958 port is ISO-8859-1. */
959 if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
960 {
961 SCM_COL (port) += 1;
962 return SCM_MAKE_CHAR (buffer[0]);
963 }
964
965 /* Otherwise, convert the buffer into a proper scheme string for
966 processing. */
967 charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
968 pt->ilseq_handler);
969 charname_len = scm_i_string_length (charname);
970 SCM_COL (port) += charname_len;
971 cp = scm_i_string_ref (charname, 0);
972 if (charname_len == 1)
973 return SCM_MAKE_CHAR (cp);
974
975 /* Ignore dotted circles, which may be used to keep combining characters from
976 combining with the backslash in #\charname. */
977 if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
978 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
979
980 if (cp >= '0' && cp < '8')
981 {
982 /* Dirk:FIXME:: This type of character syntax is not R5RS
983 * compliant. Further, it should be verified that the constant
984 * does only consist of octal digits. */
985 SCM p = scm_string_to_number (charname, scm_from_uint (8));
986 if (SCM_I_INUMP (p))
987 {
988 scm_t_wchar c = scm_to_uint32 (p);
989 if (SCM_IS_UNICODE_CHAR (c))
990 return SCM_MAKE_CHAR (c);
991 else
992 scm_i_input_error (FUNC_NAME, port,
993 "out-of-range octal character escape: ~a",
994 scm_list_1 (charname));
995 }
996 }
997
998 if (cp == 'x' && (charname_len > 1))
999 {
1000 SCM p;
1001
1002 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1003 p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
1004 scm_from_uint (16));
1005 if (SCM_I_INUMP (p))
1006 {
1007 scm_t_wchar c = scm_to_uint32 (p);
1008 if (SCM_IS_UNICODE_CHAR (c))
1009 return SCM_MAKE_CHAR (c);
1010 else
1011 scm_i_input_error (FUNC_NAME, port,
1012 "out-of-range hex character escape: ~a",
1013 scm_list_1 (charname));
1014 }
1015 }
1016
1017 /* The names of characters should never have non-Latin1
1018 characters. */
1019 if (scm_i_is_narrow_string (charname)
1020 || scm_i_try_narrow_string (charname))
1021 { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
1022 charname_len);
1023 if (scm_is_true (ch))
1024 return ch;
1025 }
1026
1027 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
1028 scm_list_1 (charname));
1029
1030 return SCM_UNSPECIFIED;
1031 }
1032 #undef FUNC_NAME
1033
1034 static SCM
1035 scm_read_keyword (int chr, SCM port, scm_t_read_opts *opts)
1036 {
1037 SCM symbol;
1038
1039 /* Read the symbol that comprises the keyword. Doing this instead of
1040 invoking a specific symbol reader function allows `scm_read_keyword ()'
1041 to adapt to the delimiters currently valid of symbols.
1042
1043 XXX: This implementation allows sloppy syntaxes like `#: key'. */
1044 symbol = scm_read_expression (port, opts);
1045 if (!scm_is_symbol (symbol))
1046 scm_i_input_error ("scm_read_keyword", port,
1047 "keyword prefix `~a' not followed by a symbol: ~s",
1048 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1049
1050 return (scm_symbol_to_keyword (symbol));
1051 }
1052
1053 static SCM
1054 scm_read_vector (int chr, SCM port, scm_t_read_opts *opts,
1055 long line, int column)
1056 {
1057 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1058 guarantee that it's going to do what we want. After all, this is an
1059 implementation detail of `scm_read_vector ()', not a desirable
1060 property. */
1061 return maybe_annotate_source (scm_vector (scm_read_sexp (chr, port, opts)),
1062 port, opts, line, column);
1063 }
1064
1065 /* Helper used by scm_read_array */
1066 static int
1067 read_decimal_integer (SCM port, int c, ssize_t *resp)
1068 {
1069 ssize_t sign = 1;
1070 ssize_t res = 0;
1071 int got_it = 0;
1072
1073 if (c == '-')
1074 {
1075 sign = -1;
1076 c = scm_getc (port);
1077 }
1078
1079 while ('0' <= c && c <= '9')
1080 {
1081 res = 10*res + c-'0';
1082 got_it = 1;
1083 c = scm_getc (port);
1084 }
1085
1086 if (got_it)
1087 *resp = sign * res;
1088 return c;
1089 }
1090
1091 /* Read an array. This function can also read vectors and uniform
1092 vectors. Also, the conflict between '#f' and '#f32' and '#f64' is
1093 handled here.
1094
1095 C is the first character read after the '#'.
1096 */
1097 static SCM
1098 scm_read_array (int c, SCM port, scm_t_read_opts *opts, long line, int column)
1099 {
1100 ssize_t rank;
1101 scm_t_wchar tag_buf[8];
1102 int tag_len;
1103
1104 SCM tag, shape = SCM_BOOL_F, elements, array;
1105
1106 /* XXX - shortcut for ordinary vectors. Shouldn't be necessary but
1107 the array code can not deal with zero-length dimensions yet, and
1108 we want to allow zero-length vectors, of course.
1109 */
1110 if (c == '(')
1111 return scm_read_vector (c, port, opts, line, column);
1112
1113 /* Disambiguate between '#f' and uniform floating point vectors.
1114 */
1115 if (c == 'f')
1116 {
1117 c = scm_getc (port);
1118 if (c != '3' && c != '6')
1119 {
1120 if (c != EOF)
1121 scm_ungetc (c, port);
1122 return SCM_BOOL_F;
1123 }
1124 rank = 1;
1125 tag_buf[0] = 'f';
1126 tag_len = 1;
1127 goto continue_reading_tag;
1128 }
1129
1130 /* Read rank. */
1131 rank = 1;
1132 c = read_decimal_integer (port, c, &rank);
1133 if (rank < 0)
1134 scm_i_input_error (NULL, port, "array rank must be non-negative",
1135 SCM_EOL);
1136
1137 /* Read tag. */
1138 tag_len = 0;
1139 continue_reading_tag:
1140 while (c != EOF && c != '(' && c != '@' && c != ':'
1141 && tag_len < sizeof tag_buf / sizeof tag_buf[0])
1142 {
1143 tag_buf[tag_len++] = c;
1144 c = scm_getc (port);
1145 }
1146 if (tag_len == 0)
1147 tag = SCM_BOOL_T;
1148 else
1149 {
1150 tag = scm_string_to_symbol (scm_from_utf32_stringn (tag_buf, tag_len));
1151 if (tag_len == sizeof tag_buf / sizeof tag_buf[0])
1152 scm_i_input_error (NULL, port, "invalid array tag, starting with: ~a",
1153 scm_list_1 (tag));
1154 }
1155
1156 /* Read shape. */
1157 if (c == '@' || c == ':')
1158 {
1159 shape = SCM_EOL;
1160
1161 do
1162 {
1163 ssize_t lbnd = 0, len = 0;
1164 SCM s;
1165
1166 if (c == '@')
1167 {
1168 c = scm_getc (port);
1169 c = read_decimal_integer (port, c, &lbnd);
1170 }
1171
1172 s = scm_from_ssize_t (lbnd);
1173
1174 if (c == ':')
1175 {
1176 c = scm_getc (port);
1177 c = read_decimal_integer (port, c, &len);
1178 if (len < 0)
1179 scm_i_input_error (NULL, port,
1180 "array length must be non-negative",
1181 SCM_EOL);
1182
1183 s = scm_list_2 (s, scm_from_ssize_t (lbnd+len-1));
1184 }
1185
1186 shape = scm_cons (s, shape);
1187 } while (c == '@' || c == ':');
1188
1189 shape = scm_reverse_x (shape, SCM_EOL);
1190 }
1191
1192 /* Read nested lists of elements. */
1193 if (c != '(')
1194 scm_i_input_error (NULL, port,
1195 "missing '(' in vector or array literal",
1196 SCM_EOL);
1197 elements = scm_read_sexp (c, port, opts);
1198
1199 if (scm_is_false (shape))
1200 shape = scm_from_ssize_t (rank);
1201 else if (scm_ilength (shape) != rank)
1202 scm_i_input_error
1203 (NULL, port,
1204 "the number of shape specifications must match the array rank",
1205 SCM_EOL);
1206
1207 /* Handle special print syntax of rank zero arrays; see
1208 scm_i_print_array for a rationale. */
1209 if (rank == 0)
1210 {
1211 if (!scm_is_pair (elements))
1212 scm_i_input_error (NULL, port,
1213 "too few elements in array literal, need 1",
1214 SCM_EOL);
1215 if (!scm_is_null (SCM_CDR (elements)))
1216 scm_i_input_error (NULL, port,
1217 "too many elements in array literal, want 1",
1218 SCM_EOL);
1219 elements = SCM_CAR (elements);
1220 }
1221
1222 /* Construct array, annotate with source location, and return. */
1223 array = scm_list_to_typed_array (tag, shape, elements);
1224 return maybe_annotate_source (array, port, opts, line, column);
1225 }
1226
1227 static SCM
1228 scm_read_srfi4_vector (int chr, SCM port, scm_t_read_opts *opts,
1229 long line, int column)
1230 {
1231 return scm_read_array (chr, port, opts, line, column);
1232 }
1233
1234 static SCM
1235 scm_read_bytevector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1236 long line, int column)
1237 {
1238 chr = scm_getc (port);
1239 if (chr != 'u')
1240 goto syntax;
1241
1242 chr = scm_getc (port);
1243 if (chr != '8')
1244 goto syntax;
1245
1246 chr = scm_getc (port);
1247 if (chr != '(')
1248 goto syntax;
1249
1250 return maybe_annotate_source
1251 (scm_u8_list_to_bytevector (scm_read_sexp (chr, port, opts)),
1252 port, opts, line, column);
1253
1254 syntax:
1255 scm_i_input_error ("read_bytevector", port,
1256 "invalid bytevector prefix",
1257 SCM_MAKE_CHAR (chr));
1258 return SCM_UNSPECIFIED;
1259 }
1260
1261 static SCM
1262 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1263 long line, int column)
1264 {
1265 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1266 terribly inefficient but who cares? */
1267 SCM s_bits = SCM_EOL;
1268
1269 for (chr = scm_getc (port);
1270 (chr != EOF) && ((chr == '0') || (chr == '1'));
1271 chr = scm_getc (port))
1272 {
1273 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1274 }
1275
1276 if (chr != EOF)
1277 scm_ungetc (chr, port);
1278
1279 return maybe_annotate_source
1280 (scm_bitvector (scm_reverse_x (s_bits, SCM_EOL)),
1281 port, opts, line, column);
1282 }
1283
1284 static SCM
1285 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1286 {
1287 int bang_seen = 0;
1288
1289 for (;;)
1290 {
1291 int c = scm_getc (port);
1292
1293 if (c == EOF)
1294 scm_i_input_error ("skip_block_comment", port,
1295 "unterminated `#! ... !#' comment", SCM_EOL);
1296
1297 if (c == '!')
1298 bang_seen = 1;
1299 else if (c == '#' && bang_seen)
1300 break;
1301 else
1302 bang_seen = 0;
1303 }
1304
1305 return SCM_UNSPECIFIED;
1306 }
1307
1308 static SCM
1309 scm_read_shebang (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
1310 {
1311 char name[READER_DIRECTIVE_NAME_MAX_SIZE + 1];
1312 int c;
1313 int i = 0;
1314
1315 while (i <= READER_DIRECTIVE_NAME_MAX_SIZE)
1316 {
1317 c = scm_getc (port);
1318 if (c == EOF)
1319 scm_i_input_error ("skip_block_comment", port,
1320 "unterminated `#! ... !#' comment", SCM_EOL);
1321 else if (('a' <= c && c <= 'z') || ('0' <= c && c <= '9') || c == '-')
1322 name[i++] = c;
1323 else if (CHAR_IS_DELIMITER (c))
1324 {
1325 scm_ungetc (c, port);
1326 name[i] = '\0';
1327 if (0 == strcmp ("r6rs", name))
1328 ; /* Silently ignore */
1329 else
1330 break;
1331
1332 return SCM_UNSPECIFIED;
1333 }
1334 }
1335 while (i > 0)
1336 scm_ungetc (name[--i], port);
1337 return scm_read_scsh_block_comment (chr, port);
1338 }
1339
1340 static SCM
1341 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1342 {
1343 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1344 nested. So care must be taken. */
1345 int nesting_level = 1;
1346
1347 int a = scm_getc (port);
1348
1349 if (a == EOF)
1350 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1351 "unterminated `#| ... |#' comment", SCM_EOL);
1352
1353 while (nesting_level > 0)
1354 {
1355 int b = scm_getc (port);
1356
1357 if (b == EOF)
1358 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1359 "unterminated `#| ... |#' comment", SCM_EOL);
1360
1361 if (a == '|' && b == '#')
1362 {
1363 nesting_level--;
1364 b = EOF;
1365 }
1366 else if (a == '#' && b == '|')
1367 {
1368 nesting_level++;
1369 b = EOF;
1370 }
1371
1372 a = b;
1373 }
1374
1375 return SCM_UNSPECIFIED;
1376 }
1377
1378 static SCM
1379 scm_read_commented_expression (scm_t_wchar chr, SCM port,
1380 scm_t_read_opts *opts)
1381 {
1382 scm_t_wchar c;
1383
1384 c = flush_ws (port, opts, (char *) NULL);
1385 if (EOF == c)
1386 scm_i_input_error ("read_commented_expression", port,
1387 "no expression after #; comment", SCM_EOL);
1388 scm_ungetc (c, port);
1389 scm_read_expression (port, opts);
1390 return SCM_UNSPECIFIED;
1391 }
1392
1393 static SCM
1394 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1395 {
1396 /* Guile's extended symbol read syntax looks like this:
1397
1398 #{This is all a symbol name}#
1399
1400 So here, CHR is expected to be `{'. */
1401 int saw_brace = 0;
1402 size_t len = 0;
1403 SCM buf = scm_i_make_string (1024, NULL, 0);
1404
1405 buf = scm_i_string_start_writing (buf);
1406
1407 while ((chr = scm_getc (port)) != EOF)
1408 {
1409 if (saw_brace)
1410 {
1411 if (chr == '#')
1412 {
1413 break;
1414 }
1415 else
1416 {
1417 saw_brace = 0;
1418 scm_i_string_set_x (buf, len++, '}');
1419 }
1420 }
1421
1422 if (chr == '}')
1423 saw_brace = 1;
1424 else if (chr == '\\')
1425 {
1426 /* It used to be that print.c would print extended-read-syntax
1427 symbols with backslashes before "non-standard" chars, but
1428 this routine wouldn't do anything with those escapes.
1429 Bummer. What we've done is to change print.c to output
1430 R6RS hex escapes for those characters, relying on the fact
1431 that the extended read syntax would never put a `\' before
1432 an `x'. For now, we just ignore other instances of
1433 backslash in the string. */
1434 switch ((chr = scm_getc (port)))
1435 {
1436 case EOF:
1437 goto done;
1438 case 'x':
1439 {
1440 scm_t_wchar c;
1441
1442 SCM_READ_HEX_ESCAPE (10, ';');
1443 scm_i_string_set_x (buf, len++, c);
1444 break;
1445
1446 str_eof:
1447 chr = EOF;
1448 goto done;
1449
1450 bad_escaped:
1451 scm_i_string_stop_writing ();
1452 scm_i_input_error ("scm_read_extended_symbol", port,
1453 "illegal character in escape sequence: ~S",
1454 scm_list_1 (SCM_MAKE_CHAR (c)));
1455 break;
1456 }
1457 default:
1458 scm_i_string_set_x (buf, len++, chr);
1459 break;
1460 }
1461 }
1462 else
1463 scm_i_string_set_x (buf, len++, chr);
1464
1465 if (len >= scm_i_string_length (buf) - 2)
1466 {
1467 SCM addy;
1468
1469 scm_i_string_stop_writing ();
1470 addy = scm_i_make_string (1024, NULL, 0);
1471 buf = scm_string_append (scm_list_2 (buf, addy));
1472 len = 0;
1473 buf = scm_i_string_start_writing (buf);
1474 }
1475 }
1476
1477 done:
1478 scm_i_string_stop_writing ();
1479 if (chr == EOF)
1480 scm_i_input_error ("scm_read_extended_symbol", port,
1481 "end of file while reading symbol", SCM_EOL);
1482
1483 return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1484 }
1485
1486
1487 \f
1488 /* Top-level token readers, i.e., dispatchers. */
1489
1490 static SCM
1491 scm_read_sharp_extension (int chr, SCM port, scm_t_read_opts *opts)
1492 {
1493 SCM proc;
1494
1495 proc = scm_get_hash_procedure (chr);
1496 if (scm_is_true (scm_procedure_p (proc)))
1497 {
1498 long line = SCM_LINUM (port);
1499 int column = SCM_COL (port) - 2;
1500 SCM got;
1501
1502 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1503
1504 if (opts->record_positions_p && SCM_NIMP (got)
1505 && !scm_i_has_source_properties (got))
1506 scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1507
1508 return got;
1509 }
1510
1511 return SCM_UNSPECIFIED;
1512 }
1513
1514 /* The reader for the sharp `#' character. It basically dispatches reads
1515 among the above token readers. */
1516 static SCM
1517 scm_read_sharp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1518 long line, int column)
1519 #define FUNC_NAME "scm_lreadr"
1520 {
1521 SCM result;
1522
1523 chr = scm_getc (port);
1524
1525 result = scm_read_sharp_extension (chr, port, opts);
1526 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1527 return result;
1528
1529 switch (chr)
1530 {
1531 case '\\':
1532 return (scm_read_character (chr, port, opts));
1533 case '(':
1534 return (scm_read_vector (chr, port, opts, line, column));
1535 case 's':
1536 case 'u':
1537 case 'f':
1538 case 'c':
1539 /* This one may return either a boolean or an SRFI-4 vector. */
1540 return (scm_read_srfi4_vector (chr, port, opts, line, column));
1541 case 'v':
1542 return (scm_read_bytevector (chr, port, opts, line, column));
1543 case '*':
1544 return (scm_read_guile_bit_vector (chr, port, opts, line, column));
1545 case 't':
1546 case 'T':
1547 case 'F':
1548 return (scm_read_boolean (chr, port));
1549 case ':':
1550 return (scm_read_keyword (chr, port, opts));
1551 case '0': case '1': case '2': case '3': case '4':
1552 case '5': case '6': case '7': case '8': case '9':
1553 case '@':
1554 #if SCM_ENABLE_DEPRECATED
1555 /* See below for 'i' and 'e'. */
1556 case 'a':
1557 case 'y':
1558 case 'h':
1559 case 'l':
1560 #endif
1561 return (scm_read_array (chr, port, opts, line, column));
1562
1563 case 'i':
1564 case 'e':
1565 #if SCM_ENABLE_DEPRECATED
1566 {
1567 /* When next char is '(', it really is an old-style
1568 uniform array. */
1569 scm_t_wchar next_c = scm_getc (port);
1570 if (next_c != EOF)
1571 scm_ungetc (next_c, port);
1572 if (next_c == '(')
1573 return scm_read_array (chr, port, opts, line, column);
1574 /* Fall through. */
1575 }
1576 #endif
1577 case 'b':
1578 case 'B':
1579 case 'o':
1580 case 'O':
1581 case 'd':
1582 case 'D':
1583 case 'x':
1584 case 'X':
1585 case 'I':
1586 case 'E':
1587 return (scm_read_number_and_radix (chr, port, opts));
1588 case '{':
1589 return (scm_read_extended_symbol (chr, port));
1590 case '!':
1591 return (scm_read_shebang (chr, port, opts));
1592 case ';':
1593 return (scm_read_commented_expression (chr, port, opts));
1594 case '`':
1595 case '\'':
1596 case ',':
1597 return (scm_read_syntax (chr, port, opts));
1598 case 'n':
1599 return (scm_read_nil (chr, port, opts));
1600 default:
1601 result = scm_read_sharp_extension (chr, port, opts);
1602 if (scm_is_eq (result, SCM_UNSPECIFIED))
1603 {
1604 /* To remain compatible with 1.8 and earlier, the following
1605 characters have lower precedence than `read-hash-extend'
1606 characters. */
1607 switch (chr)
1608 {
1609 case '|':
1610 return scm_read_r6rs_block_comment (chr, port);
1611 default:
1612 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1613 scm_list_1 (SCM_MAKE_CHAR (chr)));
1614 }
1615 }
1616 else
1617 return result;
1618 }
1619
1620 return SCM_UNSPECIFIED;
1621 }
1622 #undef FUNC_NAME
1623
1624 static SCM
1625 scm_read_expression (SCM port, scm_t_read_opts *opts)
1626 #define FUNC_NAME "scm_read_expression"
1627 {
1628 while (1)
1629 {
1630 scm_t_wchar chr;
1631
1632 chr = scm_getc (port);
1633
1634 switch (chr)
1635 {
1636 case SCM_WHITE_SPACES:
1637 case SCM_LINE_INCREMENTORS:
1638 break;
1639 case ';':
1640 (void) scm_read_semicolon_comment (chr, port);
1641 break;
1642 case '[':
1643 if (!opts->square_brackets_p)
1644 return (scm_read_mixed_case_symbol (chr, port, opts));
1645 /* otherwise fall through */
1646 case '(':
1647 return (scm_read_sexp (chr, port, opts));
1648 case '"':
1649 return (scm_read_string (chr, port, opts));
1650 case '\'':
1651 case '`':
1652 case ',':
1653 return (scm_read_quote (chr, port, opts));
1654 case '#':
1655 {
1656 long line = SCM_LINUM (port);
1657 int column = SCM_COL (port) - 1;
1658 SCM result = scm_read_sharp (chr, port, opts, line, column);
1659 if (scm_is_eq (result, SCM_UNSPECIFIED))
1660 /* We read a comment or some such. */
1661 break;
1662 else
1663 return result;
1664 }
1665 case ')':
1666 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1667 break;
1668 case ']':
1669 if (opts->square_brackets_p)
1670 scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1671 /* otherwise fall through */
1672 case EOF:
1673 return SCM_EOF_VAL;
1674 case ':':
1675 if (opts->keyword_style == KEYWORD_STYLE_PREFIX)
1676 return scm_symbol_to_keyword (scm_read_expression (port, opts));
1677 /* Fall through. */
1678
1679 default:
1680 {
1681 if (((chr >= '0') && (chr <= '9'))
1682 || (strchr ("+-.", chr)))
1683 return (scm_read_number (chr, port, opts));
1684 else
1685 return (scm_read_mixed_case_symbol (chr, port, opts));
1686 }
1687 }
1688 }
1689 }
1690 #undef FUNC_NAME
1691
1692 \f
1693 /* Actual reader. */
1694
1695 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1696 (SCM port),
1697 "Read an s-expression from the input port @var{port}, or from\n"
1698 "the current input port if @var{port} is not specified.\n"
1699 "Any whitespace before the next token is discarded.")
1700 #define FUNC_NAME s_scm_read
1701 {
1702 scm_t_read_opts opts;
1703 int c;
1704
1705 if (SCM_UNBNDP (port))
1706 port = scm_current_input_port ();
1707 SCM_VALIDATE_OPINPORT (1, port);
1708
1709 init_read_options (&opts);
1710
1711 c = flush_ws (port, &opts, (char *) NULL);
1712 if (EOF == c)
1713 return SCM_EOF_VAL;
1714 scm_ungetc (c, port);
1715
1716 return (scm_read_expression (port, &opts));
1717 }
1718 #undef FUNC_NAME
1719
1720
1721 \f
1722
1723 /* Manipulate the read-hash-procedures alist. This could be written in
1724 Scheme, but maybe it will also be used by C code during initialisation. */
1725 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1726 (SCM chr, SCM proc),
1727 "Install the procedure @var{proc} for reading expressions\n"
1728 "starting with the character sequence @code{#} and @var{chr}.\n"
1729 "@var{proc} will be called with two arguments: the character\n"
1730 "@var{chr} and the port to read further data from. The object\n"
1731 "returned will be the return value of @code{read}. \n"
1732 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1733 )
1734 #define FUNC_NAME s_scm_read_hash_extend
1735 {
1736 SCM this;
1737 SCM prev;
1738
1739 SCM_VALIDATE_CHAR (1, chr);
1740 SCM_ASSERT (scm_is_false (proc)
1741 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1742 proc, SCM_ARG2, FUNC_NAME);
1743
1744 /* Check if chr is already in the alist. */
1745 this = scm_i_read_hash_procedures_ref ();
1746 prev = SCM_BOOL_F;
1747 while (1)
1748 {
1749 if (scm_is_null (this))
1750 {
1751 /* not found, so add it to the beginning. */
1752 if (scm_is_true (proc))
1753 {
1754 SCM new = scm_cons (scm_cons (chr, proc),
1755 scm_i_read_hash_procedures_ref ());
1756 scm_i_read_hash_procedures_set_x (new);
1757 }
1758 break;
1759 }
1760 if (scm_is_eq (chr, SCM_CAAR (this)))
1761 {
1762 /* already in the alist. */
1763 if (scm_is_false (proc))
1764 {
1765 /* remove it. */
1766 if (scm_is_false (prev))
1767 {
1768 SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1769 scm_i_read_hash_procedures_set_x (rest);
1770 }
1771 else
1772 scm_set_cdr_x (prev, SCM_CDR (this));
1773 }
1774 else
1775 {
1776 /* replace it. */
1777 scm_set_cdr_x (SCM_CAR (this), proc);
1778 }
1779 break;
1780 }
1781 prev = this;
1782 this = SCM_CDR (this);
1783 }
1784
1785 return SCM_UNSPECIFIED;
1786 }
1787 #undef FUNC_NAME
1788
1789 /* Recover the read-hash procedure corresponding to char c. */
1790 static SCM
1791 scm_get_hash_procedure (int c)
1792 {
1793 SCM rest = scm_i_read_hash_procedures_ref ();
1794
1795 while (1)
1796 {
1797 if (scm_is_null (rest))
1798 return SCM_BOOL_F;
1799
1800 if (SCM_CHAR (SCM_CAAR (rest)) == c)
1801 return SCM_CDAR (rest);
1802
1803 rest = SCM_CDR (rest);
1804 }
1805 }
1806
1807 #define SCM_ENCODING_SEARCH_SIZE (500)
1808
1809 /* Search the first few hundred characters of a file for an Emacs-like coding
1810 declaration. Returns either NULL or a string whose storage has been
1811 allocated with `scm_gc_malloc ()'. */
1812 char *
1813 scm_i_scan_for_encoding (SCM port)
1814 {
1815 scm_t_port *pt;
1816 char header[SCM_ENCODING_SEARCH_SIZE+1];
1817 size_t bytes_read, encoding_length, i;
1818 char *encoding = NULL;
1819 int utf8_bom = 0;
1820 char *pos, *encoding_start;
1821 int in_comment;
1822
1823 pt = SCM_PTAB_ENTRY (port);
1824
1825 if (pt->rw_active == SCM_PORT_WRITE)
1826 scm_flush (port);
1827
1828 if (pt->rw_random)
1829 pt->rw_active = SCM_PORT_READ;
1830
1831 if (pt->read_pos == pt->read_end)
1832 {
1833 /* We can use the read buffer, and thus avoid a seek. */
1834 if (scm_fill_input (port) == EOF)
1835 return NULL;
1836
1837 bytes_read = pt->read_end - pt->read_pos;
1838 if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
1839 bytes_read = SCM_ENCODING_SEARCH_SIZE;
1840
1841 if (bytes_read <= 1)
1842 /* An unbuffered port -- don't scan. */
1843 return NULL;
1844
1845 memcpy (header, pt->read_pos, bytes_read);
1846 header[bytes_read] = '\0';
1847 }
1848 else
1849 {
1850 /* Try to read some bytes and then seek back. Not all ports
1851 support seeking back; and indeed some file ports (like
1852 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
1853 check performed by SCM_FPORT_FDES---but fail to seek
1854 backwards. Hence this block comes second. We prefer to use
1855 the read buffer in-place. */
1856 if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1857 return NULL;
1858
1859 bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1860 header[bytes_read] = '\0';
1861 scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1862 }
1863
1864 if (bytes_read > 3
1865 && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1866 utf8_bom = 1;
1867
1868 /* search past "coding[:=]" */
1869 pos = header;
1870 while (1)
1871 {
1872 if ((pos = strstr(pos, "coding")) == NULL)
1873 return NULL;
1874
1875 pos += strlen("coding");
1876 if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1877 (*pos == ':' || *pos == '='))
1878 {
1879 pos ++;
1880 break;
1881 }
1882 }
1883
1884 /* skip spaces */
1885 while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1886 (*pos == ' ' || *pos == '\t'))
1887 pos ++;
1888
1889 /* grab the next token */
1890 encoding_start = pos;
1891 i = 0;
1892 while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
1893 && encoding_start + i - header < bytes_read
1894 && (isalnum ((int) encoding_start[i])
1895 || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
1896 i++;
1897
1898 encoding_length = i;
1899 if (encoding_length == 0)
1900 return NULL;
1901
1902 encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
1903 for (i = 0; i < encoding_length; i++)
1904 encoding[i] = toupper ((int) encoding[i]);
1905
1906 /* push backwards to make sure we were in a comment */
1907 in_comment = 0;
1908 pos = encoding_start;
1909 while (pos >= header)
1910 {
1911 if (*pos == ';')
1912 {
1913 in_comment = 1;
1914 break;
1915 }
1916 else if (*pos == '\n' || pos == header)
1917 {
1918 /* This wasn't in a semicolon comment. Check for a
1919 hash-bang comment. */
1920 char *beg = strstr (header, "#!");
1921 char *end = strstr (header, "!#");
1922 if (beg < encoding_start && encoding_start + encoding_length <= end)
1923 in_comment = 1;
1924 break;
1925 }
1926 else
1927 {
1928 pos --;
1929 continue;
1930 }
1931 }
1932 if (!in_comment)
1933 /* This wasn't in a comment */
1934 return NULL;
1935
1936 if (utf8_bom && strcmp(encoding, "UTF-8"))
1937 scm_misc_error (NULL,
1938 "the port input declares the encoding ~s but is encoded as UTF-8",
1939 scm_list_1 (scm_from_locale_string (encoding)));
1940
1941 return encoding;
1942 }
1943
1944 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1945 (SCM port),
1946 "Scans the port for an Emacs-like character coding declaration\n"
1947 "near the top of the contents of a port with random-accessible contents.\n"
1948 "The coding declaration is of the form\n"
1949 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1950 "\n"
1951 "Returns a string containing the character encoding of the file\n"
1952 "if a declaration was found, or @code{#f} otherwise.\n")
1953 #define FUNC_NAME s_scm_file_encoding
1954 {
1955 char *enc;
1956 SCM s_enc;
1957
1958 SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
1959
1960 enc = scm_i_scan_for_encoding (port);
1961 if (enc == NULL)
1962 return SCM_BOOL_F;
1963 else
1964 {
1965 s_enc = scm_from_locale_string (enc);
1966 return s_enc;
1967 }
1968
1969 return SCM_BOOL_F;
1970 }
1971 #undef FUNC_NAME
1972
1973 void
1974 scm_init_read ()
1975 {
1976 SCM read_hash_procs;
1977
1978 read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
1979
1980 scm_i_read_hash_procedures =
1981 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
1982
1983 scm_init_opts (scm_read_options, scm_read_opts);
1984 #include "libguile/read.x"
1985 }
1986
1987 /*
1988 Local Variables:
1989 c-file-style: "gnu"
1990 End:
1991 */