Have `scm_scan_for_encoding ()' use GC-managed memory.
[bpt/guile.git] / libguile / read.c
1 /* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009 Free Software
2 * Foundation, Inc.
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
18 */
19
20
21 \f
22
23 #ifdef HAVE_CONFIG_H
24 # include <config.h>
25 #endif
26
27 #include <stdio.h>
28 #include <ctype.h>
29 #include <string.h>
30 #include <unistd.h>
31 #include <unicase.h>
32
33 #include "libguile/_scm.h"
34 #include "libguile/bytevectors.h"
35 #include "libguile/chars.h"
36 #include "libguile/eval.h"
37 #include "libguile/arrays.h"
38 #include "libguile/bitvectors.h"
39 #include "libguile/keywords.h"
40 #include "libguile/alist.h"
41 #include "libguile/srcprop.h"
42 #include "libguile/hashtab.h"
43 #include "libguile/hash.h"
44 #include "libguile/ports.h"
45 #include "libguile/root.h"
46 #include "libguile/strings.h"
47 #include "libguile/strports.h"
48 #include "libguile/vectors.h"
49 #include "libguile/validate.h"
50 #include "libguile/srfi-4.h"
51 #include "libguile/srfi-13.h"
52
53 #include "libguile/read.h"
54 #include "libguile/private-options.h"
55
56
57 \f
58
59 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
60 SCM_SYMBOL (scm_keyword_prefix, "prefix");
61 SCM_SYMBOL (scm_keyword_postfix, "postfix");
62
63 scm_t_option scm_read_opts[] = {
64 { SCM_OPTION_BOOLEAN, "copy", 0,
65 "Copy source code expressions." },
66 { SCM_OPTION_BOOLEAN, "positions", 0,
67 "Record positions of source code expressions." },
68 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
69 "Convert symbols to lower case."},
70 { SCM_OPTION_SCM, "keywords", SCM_UNPACK (SCM_BOOL_F),
71 "Style of keyword recognition: #f, 'prefix or 'postfix."},
72 #if SCM_ENABLE_ELISP
73 { SCM_OPTION_BOOLEAN, "elisp-vectors", 0,
74 "Support Elisp vector syntax, namely `[...]'."},
75 { SCM_OPTION_BOOLEAN, "elisp-strings", 0,
76 "Support `\\(' and `\\)' in strings."},
77 #endif
78 { 0, },
79 };
80
81 /*
82 Give meaningful error messages for errors
83
84 We use the format
85
86 FILE:LINE:COL: MESSAGE
87 This happened in ....
88
89 This is not standard GNU format, but the test-suite likes the real
90 message to be in front.
91
92 */
93
94
95 void
96 scm_i_input_error (char const *function,
97 SCM port, const char *message, SCM arg)
98 {
99 SCM fn = (scm_is_string (SCM_FILENAME(port))
100 ? SCM_FILENAME(port)
101 : scm_from_locale_string ("#<unknown port>"));
102
103 SCM string_port = scm_open_output_string ();
104 SCM string = SCM_EOL;
105 scm_simple_format (string_port,
106 scm_from_locale_string ("~A:~S:~S: ~A"),
107 scm_list_4 (fn,
108 scm_from_long (SCM_LINUM (port) + 1),
109 scm_from_int (SCM_COL (port) + 1),
110 scm_from_locale_string (message)));
111
112 string = scm_get_output_string (string_port);
113 scm_close_output_port (string_port);
114 scm_error_scm (scm_from_locale_symbol ("read-error"),
115 function? scm_from_locale_string (function) : SCM_BOOL_F,
116 string,
117 arg,
118 SCM_BOOL_F);
119 }
120
121
122 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
123 (SCM setting),
124 "Option interface for the read options. Instead of using\n"
125 "this procedure directly, use the procedures @code{read-enable},\n"
126 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
127 #define FUNC_NAME s_scm_read_options
128 {
129 SCM ans = scm_options (setting,
130 scm_read_opts,
131 FUNC_NAME);
132 if (SCM_COPY_SOURCE_P)
133 SCM_RECORD_POSITIONS_P = 1;
134 return ans;
135 }
136 #undef FUNC_NAME
137
138 /* An association list mapping extra hash characters to procedures. */
139 static SCM *scm_read_hash_procedures;
140
141
142 \f
143 /* Token readers. */
144
145
146 /* Size of the C buffer used to read symbols and numbers. */
147 #define READER_BUFFER_SIZE 128
148
149 /* Size of the C buffer used to read strings. */
150 #define READER_STRING_BUFFER_SIZE 512
151
152 /* The maximum size of Scheme character names. */
153 #define READER_CHAR_NAME_MAX_SIZE 50
154
155
156 /* `isblank' is only in C99. */
157 #define CHAR_IS_BLANK_(_chr) \
158 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
159 || ((_chr) == '\f') || ((_chr) == '\r'))
160
161 #ifdef MSDOS
162 # define CHAR_IS_BLANK(_chr) \
163 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
164 #else
165 # define CHAR_IS_BLANK CHAR_IS_BLANK_
166 #endif
167
168
169 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
170 structure''). */
171 #define CHAR_IS_R5RS_DELIMITER(c) \
172 (CHAR_IS_BLANK (c) \
173 || (c == ')') || (c == '(') || (c == ';') || (c == '"'))
174
175 #define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
176
177 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
178 Structure''. */
179 #define CHAR_IS_EXPONENT_MARKER(_chr) \
180 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
181 || ((_chr) == 'd') || ((_chr) == 'l'))
182
183 /* Read an SCSH block comment. */
184 static inline SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
185 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
186 static SCM scm_read_commented_expression (scm_t_wchar, SCM);
187 static SCM scm_get_hash_procedure (int);
188
189 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Return
190 zero if the whole token fits in BUF, non-zero otherwise. */
191 static inline int
192 read_token (SCM port, SCM buf, size_t *read)
193 {
194 scm_t_wchar chr;
195 *read = 0;
196
197 buf = scm_i_string_start_writing (buf);
198 while (*read < scm_i_string_length (buf))
199 {
200 chr = scm_getc (port);
201
202 if (chr == EOF)
203 {
204 scm_i_string_stop_writing ();
205 return 0;
206 }
207
208 chr = (SCM_CASE_INSENSITIVE_P ? uc_tolower (chr) : chr);
209
210 if (CHAR_IS_DELIMITER (chr))
211 {
212 scm_i_string_stop_writing ();
213 scm_ungetc (chr, port);
214 return 0;
215 }
216
217 scm_i_string_set_x (buf, *read, chr);
218 (*read)++;
219 }
220 scm_i_string_stop_writing ();
221
222 return 1;
223 }
224
225 static SCM
226 read_complete_token (SCM port, size_t *read)
227 {
228 SCM buffer, str = SCM_EOL;
229 size_t len;
230 int overflow;
231
232 buffer = scm_i_make_string (READER_BUFFER_SIZE, NULL);
233 overflow = read_token (port, buffer, read);
234 if (!overflow)
235 return scm_i_substring (buffer, 0, *read);
236
237 str = scm_string_copy (buffer);
238 do
239 {
240 overflow = read_token (port, buffer, &len);
241 str = scm_string_append (scm_list_2 (str, buffer));
242 *read += len;
243 }
244 while (overflow);
245
246 return scm_i_substring (str, 0, *read);
247 }
248
249 /* Skip whitespace from PORT and return the first non-whitespace character
250 read. Raise an error on end-of-file. */
251 static int
252 flush_ws (SCM port, const char *eoferr)
253 {
254 register scm_t_wchar c;
255 while (1)
256 switch (c = scm_getc (port))
257 {
258 case EOF:
259 goteof:
260 if (eoferr)
261 {
262 scm_i_input_error (eoferr,
263 port,
264 "end of file",
265 SCM_EOL);
266 }
267 return c;
268
269 case ';':
270 lp:
271 switch (c = scm_getc (port))
272 {
273 case EOF:
274 goto goteof;
275 default:
276 goto lp;
277 case SCM_LINE_INCREMENTORS:
278 break;
279 }
280 break;
281
282 case '#':
283 switch (c = scm_getc (port))
284 {
285 case EOF:
286 eoferr = "read_sharp";
287 goto goteof;
288 case '!':
289 scm_read_scsh_block_comment (c, port);
290 break;
291 case ';':
292 scm_read_commented_expression (c, port);
293 break;
294 case '|':
295 if (scm_is_false (scm_get_hash_procedure (c)))
296 {
297 scm_read_r6rs_block_comment (c, port);
298 break;
299 }
300 /* fall through */
301 default:
302 scm_ungetc (c, port);
303 return '#';
304 }
305 break;
306
307 case SCM_LINE_INCREMENTORS:
308 case SCM_SINGLE_SPACES:
309 case '\t':
310 break;
311
312 default:
313 return c;
314 }
315
316 return 0;
317 }
318
319
320 \f
321 /* Token readers. */
322
323 static SCM scm_read_expression (SCM port);
324 static SCM scm_read_sharp (int chr, SCM port);
325 static SCM recsexpr (SCM obj, long line, int column, SCM filename);
326
327
328 static SCM
329 scm_read_sexp (scm_t_wchar chr, SCM port)
330 #define FUNC_NAME "scm_i_lreadparen"
331 {
332 register int c;
333 register SCM tmp;
334 register SCM tl, ans = SCM_EOL;
335 SCM tl2 = SCM_EOL, ans2 = SCM_EOL, copy = SCM_BOOL_F;
336 static const int terminating_char = ')';
337
338 /* Need to capture line and column numbers here. */
339 long line = SCM_LINUM (port);
340 int column = SCM_COL (port) - 1;
341
342
343 c = flush_ws (port, FUNC_NAME);
344 if (terminating_char == c)
345 return SCM_EOL;
346
347 scm_ungetc (c, port);
348 if (scm_is_eq (scm_sym_dot,
349 (tmp = scm_read_expression (port))))
350 {
351 ans = scm_read_expression (port);
352 if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
353 scm_i_input_error (FUNC_NAME, port, "missing close paren",
354 SCM_EOL);
355 return ans;
356 }
357
358 /* Build the head of the list structure. */
359 ans = tl = scm_cons (tmp, SCM_EOL);
360
361 if (SCM_COPY_SOURCE_P)
362 ans2 = tl2 = scm_cons (scm_is_pair (tmp)
363 ? copy
364 : tmp,
365 SCM_EOL);
366
367 while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
368 {
369 SCM new_tail;
370
371 scm_ungetc (c, port);
372 if (scm_is_eq (scm_sym_dot,
373 (tmp = scm_read_expression (port))))
374 {
375 SCM_SETCDR (tl, tmp = scm_read_expression (port));
376
377 if (SCM_COPY_SOURCE_P)
378 SCM_SETCDR (tl2, scm_cons (scm_is_pair (tmp) ? copy : tmp,
379 SCM_EOL));
380
381 c = flush_ws (port, FUNC_NAME);
382 if (terminating_char != c)
383 scm_i_input_error (FUNC_NAME, port,
384 "in pair: missing close paren", SCM_EOL);
385 goto exit;
386 }
387
388 new_tail = scm_cons (tmp, SCM_EOL);
389 SCM_SETCDR (tl, new_tail);
390 tl = new_tail;
391
392 if (SCM_COPY_SOURCE_P)
393 {
394 SCM new_tail2 = scm_cons (scm_is_pair (tmp)
395 ? copy
396 : tmp, SCM_EOL);
397 SCM_SETCDR (tl2, new_tail2);
398 tl2 = new_tail2;
399 }
400 }
401
402 exit:
403 if (SCM_RECORD_POSITIONS_P)
404 scm_whash_insert (scm_source_whash,
405 ans,
406 scm_make_srcprops (line, column,
407 SCM_FILENAME (port),
408 SCM_COPY_SOURCE_P
409 ? ans2
410 : SCM_UNDEFINED,
411 SCM_EOL));
412 return ans;
413 }
414 #undef FUNC_NAME
415
416 static SCM
417 scm_read_string (int chr, SCM port)
418 #define FUNC_NAME "scm_lreadr"
419 {
420 /* For strings smaller than C_STR, this function creates only one Scheme
421 object (the string returned). */
422
423 SCM str = SCM_BOOL_F;
424 unsigned c_str_len = 0;
425 scm_t_wchar c;
426
427 str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
428 while ('"' != (c = scm_getc (port)))
429 {
430 if (c == EOF)
431 {
432 str_eof:
433 scm_i_input_error (FUNC_NAME, port,
434 "end of file in string constant", SCM_EOL);
435 }
436
437 if (c_str_len + 1 >= scm_i_string_length (str))
438 {
439 SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
440
441 str = scm_string_append (scm_list_2 (str, addy));
442 }
443
444 if (c == '\\')
445 {
446 switch (c = scm_getc (port))
447 {
448 case EOF:
449 goto str_eof;
450 case '"':
451 case '\\':
452 break;
453 #if SCM_ENABLE_ELISP
454 case '(':
455 case ')':
456 if (SCM_ESCAPED_PARENS_P)
457 break;
458 goto bad_escaped;
459 #endif
460 case '\n':
461 continue;
462 case '0':
463 c = '\0';
464 break;
465 case 'f':
466 c = '\f';
467 break;
468 case 'n':
469 c = '\n';
470 break;
471 case 'r':
472 c = '\r';
473 break;
474 case 't':
475 c = '\t';
476 break;
477 case 'a':
478 c = '\007';
479 break;
480 case 'v':
481 c = '\v';
482 break;
483 case 'x':
484 {
485 scm_t_wchar a, b;
486 a = scm_getc (port);
487 if (a == EOF)
488 goto str_eof;
489 b = scm_getc (port);
490 if (b == EOF)
491 goto str_eof;
492 if ('0' <= a && a <= '9')
493 a -= '0';
494 else if ('A' <= a && a <= 'F')
495 a = a - 'A' + 10;
496 else if ('a' <= a && a <= 'f')
497 a = a - 'a' + 10;
498 else
499 {
500 c = a;
501 goto bad_escaped;
502 }
503 if ('0' <= b && b <= '9')
504 b -= '0';
505 else if ('A' <= b && b <= 'F')
506 b = b - 'A' + 10;
507 else if ('a' <= b && b <= 'f')
508 b = b - 'a' + 10;
509 else
510 {
511 c = b;
512 goto bad_escaped;
513 }
514 c = a * 16 + b;
515 break;
516 }
517 case 'u':
518 {
519 scm_t_wchar a;
520 int i;
521 c = 0;
522 for (i = 0; i < 4; i++)
523 {
524 a = scm_getc (port);
525 if (a == EOF)
526 goto str_eof;
527 if ('0' <= a && a <= '9')
528 a -= '0';
529 else if ('A' <= a && a <= 'F')
530 a = a - 'A' + 10;
531 else if ('a' <= a && a <= 'f')
532 a = a - 'a' + 10;
533 else
534 {
535 c = a;
536 goto bad_escaped;
537 }
538 c = c * 16 + a;
539 }
540 break;
541 }
542 case 'U':
543 {
544 scm_t_wchar a;
545 int i;
546 c = 0;
547 for (i = 0; i < 6; i++)
548 {
549 a = scm_getc (port);
550 if (a == EOF)
551 goto str_eof;
552 if ('0' <= a && a <= '9')
553 a -= '0';
554 else if ('A' <= a && a <= 'F')
555 a = a - 'A' + 10;
556 else if ('a' <= a && a <= 'f')
557 a = a - 'a' + 10;
558 else
559 {
560 c = a;
561 goto bad_escaped;
562 }
563 c = c * 16 + a;
564 }
565 break;
566 }
567 default:
568 bad_escaped:
569 scm_i_input_error (FUNC_NAME, port,
570 "illegal character in escape sequence: ~S",
571 scm_list_1 (SCM_MAKE_CHAR (c)));
572 }
573 }
574 str = scm_i_string_start_writing (str);
575 scm_i_string_set_x (str, c_str_len++, c);
576 scm_i_string_stop_writing ();
577 }
578
579 if (c_str_len > 0)
580 {
581 return scm_i_substring_copy (str, 0, c_str_len);
582 }
583
584 return scm_nullstr;
585 }
586 #undef FUNC_NAME
587
588
589 static SCM
590 scm_read_number (scm_t_wchar chr, SCM port)
591 {
592 SCM result;
593 SCM buffer;
594 size_t read;
595
596 scm_ungetc (chr, port);
597 buffer = read_complete_token (port, &read);
598 result = scm_string_to_number (buffer, SCM_UNDEFINED);
599 if (!scm_is_true (result))
600 /* Return a symbol instead of a number. */
601 result = scm_string_to_symbol (buffer);
602
603 return result;
604 }
605
606 static SCM
607 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
608 {
609 SCM result;
610 int ends_with_colon = 0;
611 SCM buffer;
612 size_t read = 0;
613 int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
614
615 scm_ungetc (chr, port);
616 buffer = read_complete_token (port, &read);
617 if (read > 0)
618 ends_with_colon = scm_i_string_ref (buffer, read - 1) == ':';
619
620 if (postfix && ends_with_colon && (read > 1))
621 result = scm_symbol_to_keyword (scm_string_to_symbol (scm_i_substring (buffer, 0, read - 1)));
622 else
623 result = scm_string_to_symbol (buffer);
624
625 return result;
626 }
627
628 static SCM
629 scm_read_number_and_radix (scm_t_wchar chr, SCM port)
630 #define FUNC_NAME "scm_lreadr"
631 {
632 SCM result;
633 size_t read;
634 SCM buffer = scm_i_make_string (READER_BUFFER_SIZE, NULL);
635 unsigned int radix;
636
637 switch (chr)
638 {
639 case 'B':
640 case 'b':
641 radix = 2;
642 break;
643
644 case 'o':
645 case 'O':
646 radix = 8;
647 break;
648
649 case 'd':
650 case 'D':
651 radix = 10;
652 break;
653
654 case 'x':
655 case 'X':
656 radix = 16;
657 break;
658
659 default:
660 scm_ungetc (chr, port);
661 scm_ungetc ('#', port);
662 radix = 10;
663 }
664
665 buffer = read_complete_token (port, &read);
666 result = scm_string_to_number (buffer, scm_from_uint (radix));
667
668 if (scm_is_true (result))
669 return result;
670
671 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
672
673 return SCM_BOOL_F;
674 }
675 #undef FUNC_NAME
676
677 static SCM
678 scm_read_quote (int chr, SCM port)
679 {
680 SCM p;
681 long line = SCM_LINUM (port);
682 int column = SCM_COL (port) - 1;
683
684 switch (chr)
685 {
686 case '`':
687 p = scm_sym_quasiquote;
688 break;
689
690 case '\'':
691 p = scm_sym_quote;
692 break;
693
694 case ',':
695 {
696 scm_t_wchar c;
697
698 c = scm_getc (port);
699 if ('@' == c)
700 p = scm_sym_uq_splicing;
701 else
702 {
703 scm_ungetc (c, port);
704 p = scm_sym_unquote;
705 }
706 break;
707 }
708
709 default:
710 fprintf (stderr, "%s: unhandled quote character (%i)\n",
711 "scm_read_quote", chr);
712 abort ();
713 }
714
715 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
716 if (SCM_RECORD_POSITIONS_P)
717 scm_whash_insert (scm_source_whash, p,
718 scm_make_srcprops (line, column,
719 SCM_FILENAME (port),
720 SCM_COPY_SOURCE_P
721 ? (scm_cons2 (SCM_CAR (p),
722 SCM_CAR (SCM_CDR (p)),
723 SCM_EOL))
724 : SCM_UNDEFINED,
725 SCM_EOL));
726
727
728 return p;
729 }
730
731 SCM_SYMBOL (sym_syntax, "syntax");
732 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
733 SCM_SYMBOL (sym_unsyntax, "unsyntax");
734 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
735
736 static SCM
737 scm_read_syntax (int chr, SCM port)
738 {
739 SCM p;
740 long line = SCM_LINUM (port);
741 int column = SCM_COL (port) - 1;
742
743 switch (chr)
744 {
745 case '`':
746 p = sym_quasisyntax;
747 break;
748
749 case '\'':
750 p = sym_syntax;
751 break;
752
753 case ',':
754 {
755 int c;
756
757 c = scm_getc (port);
758 if ('@' == c)
759 p = sym_unsyntax_splicing;
760 else
761 {
762 scm_ungetc (c, port);
763 p = sym_unsyntax;
764 }
765 break;
766 }
767
768 default:
769 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
770 "scm_read_syntax", chr);
771 abort ();
772 }
773
774 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
775 if (SCM_RECORD_POSITIONS_P)
776 scm_whash_insert (scm_source_whash, p,
777 scm_make_srcprops (line, column,
778 SCM_FILENAME (port),
779 SCM_COPY_SOURCE_P
780 ? (scm_cons2 (SCM_CAR (p),
781 SCM_CAR (SCM_CDR (p)),
782 SCM_EOL))
783 : SCM_UNDEFINED,
784 SCM_EOL));
785
786
787 return p;
788 }
789
790 static inline SCM
791 scm_read_semicolon_comment (int chr, SCM port)
792 {
793 int c;
794
795 /* We use the get_byte here because there is no need to get the
796 locale correct with comment input. This presumes that newline
797 always represents itself no matter what the encoding is. */
798 for (c = scm_get_byte_or_eof (port);
799 (c != EOF) && (c != '\n');
800 c = scm_getc (port));
801
802 return SCM_UNSPECIFIED;
803 }
804
805 \f
806 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
807
808 static SCM
809 scm_read_boolean (int chr, SCM port)
810 {
811 switch (chr)
812 {
813 case 't':
814 case 'T':
815 return SCM_BOOL_T;
816
817 case 'f':
818 case 'F':
819 return SCM_BOOL_F;
820 }
821
822 return SCM_UNSPECIFIED;
823 }
824
825 static SCM
826 scm_read_character (scm_t_wchar chr, SCM port)
827 #define FUNC_NAME "scm_lreadr"
828 {
829 SCM charname = scm_i_make_string (READER_CHAR_NAME_MAX_SIZE, NULL);
830 size_t charname_len;
831 scm_t_wchar cp;
832 int overflow;
833
834 overflow = read_token (port, charname, &charname_len);
835 charname = scm_c_substring (charname, 0, charname_len);
836
837 if (overflow)
838 goto char_error;
839
840 if (charname_len == 0)
841 {
842 chr = scm_getc (port);
843 if (chr == EOF)
844 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
845 "while reading character", SCM_EOL);
846
847 /* CHR must be a token delimiter, like a whitespace. */
848 return (SCM_MAKE_CHAR (chr));
849 }
850
851 if (charname_len == 1)
852 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 0));
853
854 cp = scm_i_string_ref (charname, 0);
855 if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
856 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
857
858 if (cp >= '0' && cp < '8')
859 {
860 /* Dirk:FIXME:: This type of character syntax is not R5RS
861 * compliant. Further, it should be verified that the constant
862 * does only consist of octal digits. */
863 SCM p = scm_string_to_number (charname, scm_from_uint (8));
864 if (SCM_I_INUMP (p))
865 {
866 scm_t_wchar c = SCM_I_INUM (p);
867 if (SCM_IS_UNICODE_CHAR (c))
868 return SCM_MAKE_CHAR (c);
869 else
870 scm_i_input_error (FUNC_NAME, port,
871 "out-of-range octal character escape: ~a",
872 scm_list_1 (charname));
873 }
874 }
875
876 /* The names of characters should never have non-Latin1
877 characters. */
878 if (scm_i_is_narrow_string (charname)
879 || scm_i_try_narrow_string (charname))
880 { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
881 charname_len);
882 if (scm_is_true (ch))
883 return ch;
884 }
885
886 char_error:
887 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
888 scm_list_1 (charname));
889
890 return SCM_UNSPECIFIED;
891 }
892 #undef FUNC_NAME
893
894 static inline SCM
895 scm_read_keyword (int chr, SCM port)
896 {
897 SCM symbol;
898
899 /* Read the symbol that comprises the keyword. Doing this instead of
900 invoking a specific symbol reader function allows `scm_read_keyword ()'
901 to adapt to the delimiters currently valid of symbols.
902
903 XXX: This implementation allows sloppy syntaxes like `#: key'. */
904 symbol = scm_read_expression (port);
905 if (!scm_is_symbol (symbol))
906 scm_i_input_error ("scm_read_keyword", port,
907 "keyword prefix `~a' not followed by a symbol: ~s",
908 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
909
910 return (scm_symbol_to_keyword (symbol));
911 }
912
913 static inline SCM
914 scm_read_vector (int chr, SCM port)
915 {
916 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
917 guarantee that it's going to do what we want. After all, this is an
918 implementation detail of `scm_read_vector ()', not a desirable
919 property. */
920 return (scm_vector (scm_read_sexp (chr, port)));
921 }
922
923 static inline SCM
924 scm_read_srfi4_vector (int chr, SCM port)
925 {
926 return scm_i_read_array (port, chr);
927 }
928
929 static SCM
930 scm_read_bytevector (scm_t_wchar chr, SCM port)
931 {
932 chr = scm_getc (port);
933 if (chr != 'u')
934 goto syntax;
935
936 chr = scm_getc (port);
937 if (chr != '8')
938 goto syntax;
939
940 chr = scm_getc (port);
941 if (chr != '(')
942 goto syntax;
943
944 return scm_u8_list_to_bytevector (scm_read_sexp (chr, port));
945
946 syntax:
947 scm_i_input_error ("read_bytevector", port,
948 "invalid bytevector prefix",
949 SCM_MAKE_CHAR (chr));
950 return SCM_UNSPECIFIED;
951 }
952
953 static SCM
954 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port)
955 {
956 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
957 terribly inefficient but who cares? */
958 SCM s_bits = SCM_EOL;
959
960 for (chr = scm_getc (port);
961 (chr != EOF) && ((chr == '0') || (chr == '1'));
962 chr = scm_getc (port))
963 {
964 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
965 }
966
967 if (chr != EOF)
968 scm_ungetc (chr, port);
969
970 return scm_bitvector (scm_reverse_x (s_bits, SCM_EOL));
971 }
972
973 static inline SCM
974 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
975 {
976 int bang_seen = 0;
977
978 /* We can use the get_byte here because there is no need to get the
979 locale correct when reading comments. This presumes that
980 hash and exclamation points always represent themselves no
981 matter what the source encoding is.*/
982 for (;;)
983 {
984 int c = scm_get_byte_or_eof (port);
985
986 if (c == EOF)
987 scm_i_input_error ("skip_block_comment", port,
988 "unterminated `#! ... !#' comment", SCM_EOL);
989
990 if (c == '!')
991 bang_seen = 1;
992 else if (c == '#' && bang_seen)
993 break;
994 else
995 bang_seen = 0;
996 }
997
998 return SCM_UNSPECIFIED;
999 }
1000
1001 static SCM
1002 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1003 {
1004 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1005 nested. So care must be taken. */
1006 int nesting_level = 1;
1007 int opening_seen = 0, closing_seen = 0;
1008
1009 while (nesting_level > 0)
1010 {
1011 int c = scm_getc (port);
1012
1013 if (c == EOF)
1014 scm_i_input_error (__FUNCTION__, port,
1015 "unterminated `#| ... |#' comment", SCM_EOL);
1016
1017 if (opening_seen)
1018 {
1019 if (c == '|')
1020 nesting_level++;
1021 opening_seen = 0;
1022 }
1023 else if (closing_seen)
1024 {
1025 if (c == '#')
1026 nesting_level--;
1027 closing_seen = 0;
1028 }
1029 else if (c == '|')
1030 closing_seen = 1;
1031 else if (c == '#')
1032 opening_seen = 1;
1033 else
1034 opening_seen = closing_seen = 0;
1035 }
1036
1037 return SCM_UNSPECIFIED;
1038 }
1039
1040 static SCM
1041 scm_read_commented_expression (scm_t_wchar chr, SCM port)
1042 {
1043 scm_t_wchar c;
1044
1045 c = flush_ws (port, (char *) NULL);
1046 if (EOF == c)
1047 scm_i_input_error ("read_commented_expression", port,
1048 "no expression after #; comment", SCM_EOL);
1049 scm_ungetc (c, port);
1050 scm_read_expression (port);
1051 return SCM_UNSPECIFIED;
1052 }
1053
1054 static SCM
1055 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1056 {
1057 /* Guile's extended symbol read syntax looks like this:
1058
1059 #{This is all a symbol name}#
1060
1061 So here, CHR is expected to be `{'. */
1062 int saw_brace = 0, finished = 0;
1063 size_t len = 0;
1064 SCM buf = scm_i_make_string (1024, NULL);
1065
1066 buf = scm_i_string_start_writing (buf);
1067
1068 while ((chr = scm_getc (port)) != EOF)
1069 {
1070 if (saw_brace)
1071 {
1072 if (chr == '#')
1073 {
1074 finished = 1;
1075 break;
1076 }
1077 else
1078 {
1079 saw_brace = 0;
1080 scm_i_string_set_x (buf, len++, '}');
1081 scm_i_string_set_x (buf, len++, chr);
1082 }
1083 }
1084 else if (chr == '}')
1085 saw_brace = 1;
1086 else
1087 scm_i_string_set_x (buf, len++, chr);
1088
1089 if (len >= scm_i_string_length (buf) - 2)
1090 {
1091 scm_i_string_stop_writing ();
1092 SCM addy = scm_i_make_string (1024, NULL);
1093 buf = scm_string_append (scm_list_2 (buf, addy));
1094 len = 0;
1095 buf = scm_i_string_start_writing (buf);
1096 }
1097
1098 if (finished)
1099 break;
1100 }
1101 scm_i_string_stop_writing ();
1102
1103 return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1104 }
1105
1106
1107 \f
1108 /* Top-level token readers, i.e., dispatchers. */
1109
1110 static SCM
1111 scm_read_sharp_extension (int chr, SCM port)
1112 {
1113 SCM proc;
1114
1115 proc = scm_get_hash_procedure (chr);
1116 if (scm_is_true (scm_procedure_p (proc)))
1117 {
1118 long line = SCM_LINUM (port);
1119 int column = SCM_COL (port) - 2;
1120 SCM got;
1121
1122 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1123 if (!scm_is_eq (got, SCM_UNSPECIFIED))
1124 {
1125 if (SCM_RECORD_POSITIONS_P)
1126 return (recsexpr (got, line, column,
1127 SCM_FILENAME (port)));
1128 else
1129 return got;
1130 }
1131 }
1132
1133 return SCM_UNSPECIFIED;
1134 }
1135
1136 /* The reader for the sharp `#' character. It basically dispatches reads
1137 among the above token readers. */
1138 static SCM
1139 scm_read_sharp (scm_t_wchar chr, SCM port)
1140 #define FUNC_NAME "scm_lreadr"
1141 {
1142 SCM result;
1143
1144 chr = scm_getc (port);
1145
1146 result = scm_read_sharp_extension (chr, port);
1147 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1148 return result;
1149
1150 switch (chr)
1151 {
1152 case '\\':
1153 return (scm_read_character (chr, port));
1154 case '(':
1155 return (scm_read_vector (chr, port));
1156 case 's':
1157 case 'u':
1158 case 'f':
1159 /* This one may return either a boolean or an SRFI-4 vector. */
1160 return (scm_read_srfi4_vector (chr, port));
1161 case 'v':
1162 return (scm_read_bytevector (chr, port));
1163 case '*':
1164 return (scm_read_guile_bit_vector (chr, port));
1165 case 't':
1166 case 'T':
1167 case 'F':
1168 /* This one may return either a boolean or an SRFI-4 vector. */
1169 return (scm_read_boolean (chr, port));
1170 case ':':
1171 return (scm_read_keyword (chr, port));
1172 case '0': case '1': case '2': case '3': case '4':
1173 case '5': case '6': case '7': case '8': case '9':
1174 case '@':
1175 #if SCM_ENABLE_DEPRECATED
1176 /* See below for 'i' and 'e'. */
1177 case 'a':
1178 case 'c':
1179 case 'y':
1180 case 'h':
1181 case 'l':
1182 #endif
1183 return (scm_i_read_array (port, chr));
1184
1185 case 'i':
1186 case 'e':
1187 #if SCM_ENABLE_DEPRECATED
1188 {
1189 /* When next char is '(', it really is an old-style
1190 uniform array. */
1191 scm_t_wchar next_c = scm_getc (port);
1192 if (next_c != EOF)
1193 scm_ungetc (next_c, port);
1194 if (next_c == '(')
1195 return scm_i_read_array (port, chr);
1196 /* Fall through. */
1197 }
1198 #endif
1199 case 'b':
1200 case 'B':
1201 case 'o':
1202 case 'O':
1203 case 'd':
1204 case 'D':
1205 case 'x':
1206 case 'X':
1207 case 'I':
1208 case 'E':
1209 return (scm_read_number_and_radix (chr, port));
1210 case '{':
1211 return (scm_read_extended_symbol (chr, port));
1212 case '!':
1213 return (scm_read_scsh_block_comment (chr, port));
1214 case ';':
1215 return (scm_read_commented_expression (chr, port));
1216 case '`':
1217 case '\'':
1218 case ',':
1219 return (scm_read_syntax (chr, port));
1220 default:
1221 result = scm_read_sharp_extension (chr, port);
1222 if (scm_is_eq (result, SCM_UNSPECIFIED))
1223 {
1224 /* To remain compatible with 1.8 and earlier, the following
1225 characters have lower precedence than `read-hash-extend'
1226 characters. */
1227 switch (chr)
1228 {
1229 case '|':
1230 return scm_read_r6rs_block_comment (chr, port);
1231 default:
1232 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1233 scm_list_1 (SCM_MAKE_CHAR (chr)));
1234 }
1235 }
1236 else
1237 return result;
1238 }
1239
1240 return SCM_UNSPECIFIED;
1241 }
1242 #undef FUNC_NAME
1243
1244 static SCM
1245 scm_read_expression (SCM port)
1246 #define FUNC_NAME "scm_read_expression"
1247 {
1248 while (1)
1249 {
1250 register scm_t_wchar chr;
1251
1252 chr = scm_getc (port);
1253
1254 switch (chr)
1255 {
1256 case SCM_WHITE_SPACES:
1257 case SCM_LINE_INCREMENTORS:
1258 break;
1259 case ';':
1260 (void) scm_read_semicolon_comment (chr, port);
1261 break;
1262 case '(':
1263 return (scm_read_sexp (chr, port));
1264 case '"':
1265 return (scm_read_string (chr, port));
1266 case '\'':
1267 case '`':
1268 case ',':
1269 return (scm_read_quote (chr, port));
1270 case '#':
1271 {
1272 SCM result;
1273 result = scm_read_sharp (chr, port);
1274 if (scm_is_eq (result, SCM_UNSPECIFIED))
1275 /* We read a comment or some such. */
1276 break;
1277 else
1278 return result;
1279 }
1280 case ')':
1281 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1282 break;
1283 case EOF:
1284 return SCM_EOF_VAL;
1285 case ':':
1286 if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1287 return scm_symbol_to_keyword (scm_read_expression (port));
1288 /* Fall through. */
1289
1290 default:
1291 {
1292 if (((chr >= '0') && (chr <= '9'))
1293 || (strchr ("+-.", chr)))
1294 return (scm_read_number (chr, port));
1295 else
1296 return (scm_read_mixed_case_symbol (chr, port));
1297 }
1298 }
1299 }
1300 }
1301 #undef FUNC_NAME
1302
1303 \f
1304 /* Actual reader. */
1305
1306 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1307 (SCM port),
1308 "Read an s-expression from the input port @var{port}, or from\n"
1309 "the current input port if @var{port} is not specified.\n"
1310 "Any whitespace before the next token is discarded.")
1311 #define FUNC_NAME s_scm_read
1312 {
1313 int c;
1314
1315 if (SCM_UNBNDP (port))
1316 port = scm_current_input_port ();
1317 SCM_VALIDATE_OPINPORT (1, port);
1318
1319 c = flush_ws (port, (char *) NULL);
1320 if (EOF == c)
1321 return SCM_EOF_VAL;
1322 scm_ungetc (c, port);
1323
1324 return (scm_read_expression (port));
1325 }
1326 #undef FUNC_NAME
1327
1328
1329 \f
1330
1331 /* Used when recording expressions constructed by `scm_read_sharp ()'. */
1332 static SCM
1333 recsexpr (SCM obj, long line, int column, SCM filename)
1334 {
1335 if (!scm_is_pair(obj)) {
1336 return obj;
1337 } else {
1338 SCM tmp = obj, copy;
1339 /* If this sexpr is visible in the read:sharp source, we want to
1340 keep that information, so only record non-constant cons cells
1341 which haven't previously been read by the reader. */
1342 if (scm_is_false (scm_whash_lookup (scm_source_whash, obj)))
1343 {
1344 if (SCM_COPY_SOURCE_P)
1345 {
1346 copy = scm_cons (recsexpr (SCM_CAR (obj), line, column, filename),
1347 SCM_UNDEFINED);
1348 while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1349 {
1350 SCM_SETCDR (copy, scm_cons (recsexpr (SCM_CAR (tmp),
1351 line,
1352 column,
1353 filename),
1354 SCM_UNDEFINED));
1355 copy = SCM_CDR (copy);
1356 }
1357 SCM_SETCDR (copy, tmp);
1358 }
1359 else
1360 {
1361 recsexpr (SCM_CAR (obj), line, column, filename);
1362 while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1363 recsexpr (SCM_CAR (tmp), line, column, filename);
1364 copy = SCM_UNDEFINED;
1365 }
1366 scm_whash_insert (scm_source_whash,
1367 obj,
1368 scm_make_srcprops (line,
1369 column,
1370 filename,
1371 copy,
1372 SCM_EOL));
1373 }
1374 return obj;
1375 }
1376 }
1377
1378 /* Manipulate the read-hash-procedures alist. This could be written in
1379 Scheme, but maybe it will also be used by C code during initialisation. */
1380 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1381 (SCM chr, SCM proc),
1382 "Install the procedure @var{proc} for reading expressions\n"
1383 "starting with the character sequence @code{#} and @var{chr}.\n"
1384 "@var{proc} will be called with two arguments: the character\n"
1385 "@var{chr} and the port to read further data from. The object\n"
1386 "returned will be the return value of @code{read}. \n"
1387 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1388 )
1389 #define FUNC_NAME s_scm_read_hash_extend
1390 {
1391 SCM this;
1392 SCM prev;
1393
1394 SCM_VALIDATE_CHAR (1, chr);
1395 SCM_ASSERT (scm_is_false (proc)
1396 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1397 proc, SCM_ARG2, FUNC_NAME);
1398
1399 /* Check if chr is already in the alist. */
1400 this = *scm_read_hash_procedures;
1401 prev = SCM_BOOL_F;
1402 while (1)
1403 {
1404 if (scm_is_null (this))
1405 {
1406 /* not found, so add it to the beginning. */
1407 if (scm_is_true (proc))
1408 {
1409 *scm_read_hash_procedures =
1410 scm_cons (scm_cons (chr, proc), *scm_read_hash_procedures);
1411 }
1412 break;
1413 }
1414 if (scm_is_eq (chr, SCM_CAAR (this)))
1415 {
1416 /* already in the alist. */
1417 if (scm_is_false (proc))
1418 {
1419 /* remove it. */
1420 if (scm_is_false (prev))
1421 {
1422 *scm_read_hash_procedures =
1423 SCM_CDR (*scm_read_hash_procedures);
1424 }
1425 else
1426 scm_set_cdr_x (prev, SCM_CDR (this));
1427 }
1428 else
1429 {
1430 /* replace it. */
1431 scm_set_cdr_x (SCM_CAR (this), proc);
1432 }
1433 break;
1434 }
1435 prev = this;
1436 this = SCM_CDR (this);
1437 }
1438
1439 return SCM_UNSPECIFIED;
1440 }
1441 #undef FUNC_NAME
1442
1443 /* Recover the read-hash procedure corresponding to char c. */
1444 static SCM
1445 scm_get_hash_procedure (int c)
1446 {
1447 SCM rest = *scm_read_hash_procedures;
1448
1449 while (1)
1450 {
1451 if (scm_is_null (rest))
1452 return SCM_BOOL_F;
1453
1454 if (SCM_CHAR (SCM_CAAR (rest)) == c)
1455 return SCM_CDAR (rest);
1456
1457 rest = SCM_CDR (rest);
1458 }
1459 }
1460
1461 #define SCM_ENCODING_SEARCH_SIZE (500)
1462
1463 /* Search the first few hundred characters of a file for an Emacs-like coding
1464 declaration. Returns either NULL or a string whose storage has been
1465 allocated with `scm_gc_malloc ()'. */
1466 char *
1467 scm_i_scan_for_encoding (SCM port)
1468 {
1469 char header[SCM_ENCODING_SEARCH_SIZE+1];
1470 size_t bytes_read;
1471 char *encoding = NULL;
1472 int utf8_bom = 0;
1473 char *pos;
1474 int i;
1475 int in_comment;
1476
1477 bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1478 scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1479
1480 if (bytes_read > 3
1481 && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1482 utf8_bom = 1;
1483
1484 /* search past "coding[:=]" */
1485 pos = header;
1486 while (1)
1487 {
1488 if ((pos = strstr(pos, "coding")) == NULL)
1489 return NULL;
1490
1491 pos += strlen("coding");
1492 if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1493 (*pos == ':' || *pos == '='))
1494 {
1495 pos ++;
1496 break;
1497 }
1498 }
1499
1500 /* skip spaces */
1501 while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1502 (*pos == ' ' || *pos == '\t'))
1503 pos ++;
1504
1505 /* grab the next token */
1506 i = 0;
1507 while (pos + i - header <= SCM_ENCODING_SEARCH_SIZE
1508 && pos + i - header < bytes_read
1509 && (isalnum((int) pos[i]) || pos[i] == '_' || pos[i] == '-'
1510 || pos[i] == '.'))
1511 i++;
1512
1513 if (i == 0)
1514 return NULL;
1515
1516 encoding = scm_gc_strndup (pos, i + 1, "encoding");
1517 for (i = 0; i < strlen (encoding); i++)
1518 encoding[i] = toupper ((int) encoding[i]);
1519
1520 /* push backwards to make sure we were in a comment */
1521 in_comment = 0;
1522 while (pos - i - header > 0)
1523 {
1524 if (*(pos - i) == '\n')
1525 {
1526 /* This wasn't in a semicolon comment. Check for a
1527 hash-bang comment. */
1528 char *beg = strstr (header, "#!");
1529 char *end = strstr (header, "!#");
1530 if (beg < pos && pos < end)
1531 in_comment = 1;
1532 break;
1533 }
1534 if (*(pos - i) == ';')
1535 {
1536 in_comment = 1;
1537 break;
1538 }
1539 i ++;
1540 }
1541 if (!in_comment)
1542 /* This wasn't in a comment */
1543 return NULL;
1544
1545 if (utf8_bom && strcmp(encoding, "UTF-8"))
1546 scm_misc_error (NULL,
1547 "the port input declares the encoding ~s but is encoded as UTF-8",
1548 scm_list_1 (scm_from_locale_string (encoding)));
1549
1550 return encoding;
1551 }
1552
1553 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1554 (SCM port),
1555 "Scans the port for an EMACS-like character coding declaration\n"
1556 "near the top of the contents of a port with random-acessible contents.\n"
1557 "The coding declaration is of the form\n"
1558 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1559 "\n"
1560 "Returns a string containing the character encoding of the file\n"
1561 "if a declaration was found, or @code{#f} otherwise.\n")
1562 #define FUNC_NAME s_scm_file_encoding
1563 {
1564 char *enc;
1565 SCM s_enc;
1566
1567 enc = scm_i_scan_for_encoding (port);
1568 if (enc == NULL)
1569 return SCM_BOOL_F;
1570 else
1571 {
1572 s_enc = scm_from_locale_string (enc);
1573 return s_enc;
1574 }
1575
1576 return SCM_BOOL_F;
1577 }
1578 #undef FUNC_NAME
1579
1580 void
1581 scm_init_read ()
1582 {
1583 scm_read_hash_procedures =
1584 SCM_VARIABLE_LOC (scm_c_define ("read-hash-procedures", SCM_EOL));
1585
1586 scm_init_opts (scm_read_options, scm_read_opts);
1587 #include "libguile/read.x"
1588 }
1589
1590 /*
1591 Local Variables:
1592 c-file-style: "gnu"
1593 End:
1594 */