Add Unicode strings and symbols
[bpt/guile.git] / libguile / read.c
1 /* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009 Free Software
2 * Foundation, Inc.
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
18 */
19
20
21 \f
22
23 #ifdef HAVE_CONFIG_H
24 # include <config.h>
25 #endif
26
27 #include <stdio.h>
28 #include <ctype.h>
29 #include <string.h>
30
31 #include "libguile/_scm.h"
32 #include "libguile/bytevectors.h"
33 #include "libguile/chars.h"
34 #include "libguile/eval.h"
35 #include "libguile/unif.h"
36 #include "libguile/keywords.h"
37 #include "libguile/alist.h"
38 #include "libguile/srcprop.h"
39 #include "libguile/hashtab.h"
40 #include "libguile/hash.h"
41 #include "libguile/ports.h"
42 #include "libguile/root.h"
43 #include "libguile/strings.h"
44 #include "libguile/strports.h"
45 #include "libguile/vectors.h"
46 #include "libguile/validate.h"
47 #include "libguile/srfi-4.h"
48 #include "libguile/srfi-13.h"
49
50 #include "libguile/read.h"
51 #include "libguile/private-options.h"
52
53
54 \f
55
56 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
57 SCM_SYMBOL (scm_keyword_prefix, "prefix");
58 SCM_SYMBOL (scm_keyword_postfix, "postfix");
59
60 scm_t_option scm_read_opts[] = {
61 { SCM_OPTION_BOOLEAN, "copy", 0,
62 "Copy source code expressions." },
63 { SCM_OPTION_BOOLEAN, "positions", 0,
64 "Record positions of source code expressions." },
65 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
66 "Convert symbols to lower case."},
67 { SCM_OPTION_SCM, "keywords", SCM_UNPACK (SCM_BOOL_F),
68 "Style of keyword recognition: #f, 'prefix or 'postfix."},
69 #if SCM_ENABLE_ELISP
70 { SCM_OPTION_BOOLEAN, "elisp-vectors", 0,
71 "Support Elisp vector syntax, namely `[...]'."},
72 { SCM_OPTION_BOOLEAN, "elisp-strings", 0,
73 "Support `\\(' and `\\)' in strings."},
74 #endif
75 { 0, },
76 };
77
78 /*
79 Give meaningful error messages for errors
80
81 We use the format
82
83 FILE:LINE:COL: MESSAGE
84 This happened in ....
85
86 This is not standard GNU format, but the test-suite likes the real
87 message to be in front.
88
89 */
90
91
92 void
93 scm_i_input_error (char const *function,
94 SCM port, const char *message, SCM arg)
95 {
96 SCM fn = (scm_is_string (SCM_FILENAME(port))
97 ? SCM_FILENAME(port)
98 : scm_from_locale_string ("#<unknown port>"));
99
100 SCM string_port = scm_open_output_string ();
101 SCM string = SCM_EOL;
102 scm_simple_format (string_port,
103 scm_from_locale_string ("~A:~S:~S: ~A"),
104 scm_list_4 (fn,
105 scm_from_long (SCM_LINUM (port) + 1),
106 scm_from_int (SCM_COL (port) + 1),
107 scm_from_locale_string (message)));
108
109 string = scm_get_output_string (string_port);
110 scm_close_output_port (string_port);
111 scm_error_scm (scm_from_locale_symbol ("read-error"),
112 function? scm_from_locale_string (function) : SCM_BOOL_F,
113 string,
114 arg,
115 SCM_BOOL_F);
116 }
117
118
119 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
120 (SCM setting),
121 "Option interface for the read options. Instead of using\n"
122 "this procedure directly, use the procedures @code{read-enable},\n"
123 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
124 #define FUNC_NAME s_scm_read_options
125 {
126 SCM ans = scm_options (setting,
127 scm_read_opts,
128 FUNC_NAME);
129 if (SCM_COPY_SOURCE_P)
130 SCM_RECORD_POSITIONS_P = 1;
131 return ans;
132 }
133 #undef FUNC_NAME
134
135 /* An association list mapping extra hash characters to procedures. */
136 static SCM *scm_read_hash_procedures;
137
138
139 \f
140 /* Token readers. */
141
142
143 /* Size of the C buffer used to read symbols and numbers. */
144 #define READER_BUFFER_SIZE 128
145
146 /* Size of the C buffer used to read strings. */
147 #define READER_STRING_BUFFER_SIZE 512
148
149 /* The maximum size of Scheme character names. */
150 #define READER_CHAR_NAME_MAX_SIZE 50
151
152
153 /* `isblank' is only in C99. */
154 #define CHAR_IS_BLANK_(_chr) \
155 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
156 || ((_chr) == '\f') || ((_chr) == '\r'))
157
158 #ifdef MSDOS
159 # define CHAR_IS_BLANK(_chr) \
160 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
161 #else
162 # define CHAR_IS_BLANK CHAR_IS_BLANK_
163 #endif
164
165
166 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
167 structure''). */
168 #define CHAR_IS_R5RS_DELIMITER(c) \
169 (CHAR_IS_BLANK (c) \
170 || (c == ')') || (c == '(') || (c == ';') || (c == '"'))
171
172 #define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
173
174 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
175 Structure''. */
176 #define CHAR_IS_EXPONENT_MARKER(_chr) \
177 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
178 || ((_chr) == 'd') || ((_chr) == 'l'))
179
180 /* An inlinable version of `scm_c_downcase ()'. */
181 #define CHAR_DOWNCASE(_chr) \
182 (((_chr) <= UCHAR_MAX) ? tolower (_chr) : (_chr))
183
184
185 /* Read an SCSH block comment. */
186 static inline SCM scm_read_scsh_block_comment (int chr, SCM port);
187 static SCM scm_read_commented_expression (int chr, SCM port);
188
189 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Return
190 zero if the whole token fits in BUF, non-zero otherwise. */
191 static inline int
192 read_token (SCM port, char *buf, size_t buf_size, size_t *read)
193 {
194 *read = 0;
195
196 while (*read < buf_size)
197 {
198 int chr;
199
200 chr = scm_getc (port);
201 chr = (SCM_CASE_INSENSITIVE_P ? CHAR_DOWNCASE (chr) : chr);
202
203 if (chr == EOF)
204 return 0;
205 else if (CHAR_IS_DELIMITER (chr))
206 {
207 scm_ungetc (chr, port);
208 return 0;
209 }
210 else
211 {
212 *buf = (char) chr;
213 buf++, (*read)++;
214 }
215 }
216
217 return 1;
218 }
219
220
221 /* Skip whitespace from PORT and return the first non-whitespace character
222 read. Raise an error on end-of-file. */
223 static int
224 flush_ws (SCM port, const char *eoferr)
225 {
226 register int c;
227 while (1)
228 switch (c = scm_getc (port))
229 {
230 case EOF:
231 goteof:
232 if (eoferr)
233 {
234 scm_i_input_error (eoferr,
235 port,
236 "end of file",
237 SCM_EOL);
238 }
239 return c;
240
241 case ';':
242 lp:
243 switch (c = scm_getc (port))
244 {
245 case EOF:
246 goto goteof;
247 default:
248 goto lp;
249 case SCM_LINE_INCREMENTORS:
250 break;
251 }
252 break;
253
254 case '#':
255 switch (c = scm_getc (port))
256 {
257 case EOF:
258 eoferr = "read_sharp";
259 goto goteof;
260 case '!':
261 scm_read_scsh_block_comment (c, port);
262 break;
263 case ';':
264 scm_read_commented_expression (c, port);
265 break;
266 default:
267 scm_ungetc (c, port);
268 return '#';
269 }
270 break;
271
272 case SCM_LINE_INCREMENTORS:
273 case SCM_SINGLE_SPACES:
274 case '\t':
275 break;
276
277 default:
278 return c;
279 }
280
281 return 0;
282 }
283
284
285 \f
286 /* Token readers. */
287
288 static SCM scm_read_expression (SCM port);
289 static SCM scm_read_sharp (int chr, SCM port);
290 static SCM scm_get_hash_procedure (int c);
291 static SCM recsexpr (SCM obj, long line, int column, SCM filename);
292
293
294 static SCM
295 scm_read_sexp (int chr, SCM port)
296 #define FUNC_NAME "scm_i_lreadparen"
297 {
298 register int c;
299 register SCM tmp;
300 register SCM tl, ans = SCM_EOL;
301 SCM tl2 = SCM_EOL, ans2 = SCM_EOL, copy = SCM_BOOL_F;
302 static const int terminating_char = ')';
303
304 /* Need to capture line and column numbers here. */
305 long line = SCM_LINUM (port);
306 int column = SCM_COL (port) - 1;
307
308
309 c = flush_ws (port, FUNC_NAME);
310 if (terminating_char == c)
311 return SCM_EOL;
312
313 scm_ungetc (c, port);
314 if (scm_is_eq (scm_sym_dot,
315 (tmp = scm_read_expression (port))))
316 {
317 ans = scm_read_expression (port);
318 if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
319 scm_i_input_error (FUNC_NAME, port, "missing close paren",
320 SCM_EOL);
321 return ans;
322 }
323
324 /* Build the head of the list structure. */
325 ans = tl = scm_cons (tmp, SCM_EOL);
326
327 if (SCM_COPY_SOURCE_P)
328 ans2 = tl2 = scm_cons (scm_is_pair (tmp)
329 ? copy
330 : tmp,
331 SCM_EOL);
332
333 while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
334 {
335 SCM new_tail;
336
337 scm_ungetc (c, port);
338 if (scm_is_eq (scm_sym_dot,
339 (tmp = scm_read_expression (port))))
340 {
341 SCM_SETCDR (tl, tmp = scm_read_expression (port));
342
343 if (SCM_COPY_SOURCE_P)
344 SCM_SETCDR (tl2, scm_cons (scm_is_pair (tmp) ? copy : tmp,
345 SCM_EOL));
346
347 c = flush_ws (port, FUNC_NAME);
348 if (terminating_char != c)
349 scm_i_input_error (FUNC_NAME, port,
350 "in pair: missing close paren", SCM_EOL);
351 goto exit;
352 }
353
354 new_tail = scm_cons (tmp, SCM_EOL);
355 SCM_SETCDR (tl, new_tail);
356 tl = new_tail;
357
358 if (SCM_COPY_SOURCE_P)
359 {
360 SCM new_tail2 = scm_cons (scm_is_pair (tmp)
361 ? copy
362 : tmp, SCM_EOL);
363 SCM_SETCDR (tl2, new_tail2);
364 tl2 = new_tail2;
365 }
366 }
367
368 exit:
369 if (SCM_RECORD_POSITIONS_P)
370 scm_whash_insert (scm_source_whash,
371 ans,
372 scm_make_srcprops (line, column,
373 SCM_FILENAME (port),
374 SCM_COPY_SOURCE_P
375 ? ans2
376 : SCM_UNDEFINED,
377 SCM_EOL));
378 return ans;
379 }
380 #undef FUNC_NAME
381
382 static SCM
383 scm_read_string (int chr, SCM port)
384 #define FUNC_NAME "scm_lreadr"
385 {
386 /* For strings smaller than C_STR, this function creates only one Scheme
387 object (the string returned). */
388
389 SCM str = SCM_BOOL_F;
390 unsigned c_str_len = 0;
391 scm_t_wchar c;
392
393 str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
394 while ('"' != (c = scm_getc (port)))
395 {
396 if (c == EOF)
397 {
398 str_eof:
399 scm_i_input_error (FUNC_NAME, port,
400 "end of file in string constant", SCM_EOL);
401 }
402
403 if (c_str_len + 1 >= scm_i_string_length (str))
404 {
405 SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
406
407 str = scm_string_append (scm_list_2 (str, addy));
408 }
409
410 if (c == '\\')
411 {
412 switch (c = scm_getc (port))
413 {
414 case EOF:
415 goto str_eof;
416 case '"':
417 case '\\':
418 break;
419 #if SCM_ENABLE_ELISP
420 case '(':
421 case ')':
422 if (SCM_ESCAPED_PARENS_P)
423 break;
424 goto bad_escaped;
425 #endif
426 case '\n':
427 continue;
428 case '0':
429 c = '\0';
430 break;
431 case 'f':
432 c = '\f';
433 break;
434 case 'n':
435 c = '\n';
436 break;
437 case 'r':
438 c = '\r';
439 break;
440 case 't':
441 c = '\t';
442 break;
443 case 'a':
444 c = '\007';
445 break;
446 case 'v':
447 c = '\v';
448 break;
449 case 'x':
450 {
451 scm_t_wchar a, b;
452 a = scm_getc (port);
453 if (a == EOF)
454 goto str_eof;
455 b = scm_getc (port);
456 if (b == EOF)
457 goto str_eof;
458 if ('0' <= a && a <= '9')
459 a -= '0';
460 else if ('A' <= a && a <= 'F')
461 a = a - 'A' + 10;
462 else if ('a' <= a && a <= 'f')
463 a = a - 'a' + 10;
464 else
465 {
466 c = a;
467 goto bad_escaped;
468 }
469 if ('0' <= b && b <= '9')
470 b -= '0';
471 else if ('A' <= b && b <= 'F')
472 b = b - 'A' + 10;
473 else if ('a' <= b && b <= 'f')
474 b = b - 'a' + 10;
475 else
476 {
477 c = b;
478 goto bad_escaped;
479 }
480 c = a * 16 + b;
481 break;
482 }
483 case 'u':
484 {
485 scm_t_wchar a;
486 int i;
487 c = 0;
488 for (i = 0; i < 4; i++)
489 {
490 a = scm_getc (port);
491 if (a == EOF)
492 goto str_eof;
493 if ('0' <= a && a <= '9')
494 a -= '0';
495 else if ('A' <= a && a <= 'F')
496 a = a - 'A' + 10;
497 else if ('a' <= a && a <= 'f')
498 a = a - 'a' + 10;
499 else
500 {
501 c = a;
502 goto bad_escaped;
503 }
504 c = c * 16 + a;
505 }
506 break;
507 }
508 case 'U':
509 {
510 scm_t_wchar a;
511 int i;
512 c = 0;
513 for (i = 0; i < 6; i++)
514 {
515 a = scm_getc (port);
516 if (a == EOF)
517 goto str_eof;
518 if ('0' <= a && a <= '9')
519 a -= '0';
520 else if ('A' <= a && a <= 'F')
521 a = a - 'A' + 10;
522 else if ('a' <= a && a <= 'f')
523 a = a - 'a' + 10;
524 else
525 {
526 c = a;
527 goto bad_escaped;
528 }
529 c = c * 16 + a;
530 }
531 break;
532 }
533 default:
534 bad_escaped:
535 scm_i_input_error (FUNC_NAME, port,
536 "illegal character in escape sequence: ~S",
537 scm_list_1 (SCM_MAKE_CHAR (c)));
538 }
539 }
540 str = scm_i_string_start_writing (str);
541 scm_i_string_set_x (str, c_str_len++, c);
542 scm_i_string_stop_writing ();
543 }
544
545 if (c_str_len > 0)
546 {
547 return scm_i_substring_copy (str, 0, c_str_len);
548 }
549
550 return scm_nullstr;
551 }
552 #undef FUNC_NAME
553
554
555 static SCM
556 scm_read_number (int chr, SCM port)
557 {
558 SCM result, str = SCM_EOL;
559 char buffer[READER_BUFFER_SIZE];
560 size_t read;
561 int overflow = 0;
562
563 scm_ungetc (chr, port);
564 do
565 {
566 overflow = read_token (port, buffer, sizeof (buffer), &read);
567
568 if ((overflow) || (scm_is_pair (str)))
569 str = scm_cons (scm_from_locale_stringn (buffer, read), str);
570 }
571 while (overflow);
572
573 if (scm_is_pair (str))
574 {
575 /* The slow path. */
576
577 str = scm_string_concatenate (scm_reverse_x (str, SCM_EOL));
578 result = scm_string_to_number (str, SCM_UNDEFINED);
579 if (!scm_is_true (result))
580 /* Return a symbol instead of a number. */
581 result = scm_string_to_symbol (str);
582 }
583 else
584 {
585 result = scm_c_locale_stringn_to_number (buffer, read, 10);
586 if (!scm_is_true (result))
587 /* Return a symbol instead of a number. */
588 result = scm_from_locale_symboln (buffer, read);
589 }
590
591 return result;
592 }
593
594 static SCM
595 scm_read_mixed_case_symbol (int chr, SCM port)
596 {
597 SCM result, str = SCM_EOL;
598 int overflow = 0, ends_with_colon = 0;
599 char buffer[READER_BUFFER_SIZE];
600 size_t read = 0;
601 int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
602
603 scm_ungetc (chr, port);
604 do
605 {
606 overflow = read_token (port, buffer, sizeof (buffer), &read);
607
608 if (read > 0)
609 ends_with_colon = (buffer[read - 1] == ':');
610
611 if ((overflow) || (scm_is_pair (str)))
612 str = scm_cons (scm_from_locale_stringn (buffer, read), str);
613 }
614 while (overflow);
615
616 if (scm_is_pair (str))
617 {
618 size_t len;
619
620 str = scm_string_concatenate (scm_reverse_x (str, SCM_EOL));
621 len = scm_c_string_length (str);
622
623 /* Per SRFI-88, `:' alone is an identifier, not a keyword. */
624 if (postfix && ends_with_colon && (len > 1))
625 {
626 /* Strip off colon. */
627 str = scm_c_substring (str, 0, len-1);
628 result = scm_string_to_symbol (str);
629 result = scm_symbol_to_keyword (result);
630 }
631 else
632 result = scm_string_to_symbol (str);
633 }
634 else
635 {
636 /* For symbols smaller than `sizeof (buffer)', we don't need to recur
637 to Scheme strings. Therefore, we only create one Scheme object (a
638 symbol) per symbol read. */
639 if (postfix && ends_with_colon && (read > 1))
640 result = scm_from_locale_keywordn (buffer, read - 1);
641 else
642 result = scm_from_locale_symboln (buffer, read);
643 }
644
645 return result;
646 }
647
648 static SCM
649 scm_read_number_and_radix (int chr, SCM port)
650 #define FUNC_NAME "scm_lreadr"
651 {
652 SCM result, str = SCM_EOL;
653 size_t read;
654 char buffer[READER_BUFFER_SIZE];
655 unsigned int radix;
656 int overflow = 0;
657
658 switch (chr)
659 {
660 case 'B':
661 case 'b':
662 radix = 2;
663 break;
664
665 case 'o':
666 case 'O':
667 radix = 8;
668 break;
669
670 case 'd':
671 case 'D':
672 radix = 10;
673 break;
674
675 case 'x':
676 case 'X':
677 radix = 16;
678 break;
679
680 default:
681 scm_ungetc (chr, port);
682 scm_ungetc ('#', port);
683 radix = 10;
684 }
685
686 do
687 {
688 overflow = read_token (port, buffer, sizeof (buffer), &read);
689
690 if ((overflow) || (scm_is_pair (str)))
691 str = scm_cons (scm_from_locale_stringn (buffer, read), str);
692 }
693 while (overflow);
694
695 if (scm_is_pair (str))
696 {
697 str = scm_string_concatenate (scm_reverse_x (str, SCM_EOL));
698 result = scm_string_to_number (str, scm_from_uint (radix));
699 }
700 else
701 result = scm_c_locale_stringn_to_number (buffer, read, radix);
702
703 if (scm_is_true (result))
704 return result;
705
706 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
707
708 return SCM_BOOL_F;
709 }
710 #undef FUNC_NAME
711
712 static SCM
713 scm_read_quote (int chr, SCM port)
714 {
715 SCM p;
716 long line = SCM_LINUM (port);
717 int column = SCM_COL (port) - 1;
718
719 switch (chr)
720 {
721 case '`':
722 p = scm_sym_quasiquote;
723 break;
724
725 case '\'':
726 p = scm_sym_quote;
727 break;
728
729 case ',':
730 {
731 int c;
732
733 c = scm_getc (port);
734 if ('@' == c)
735 p = scm_sym_uq_splicing;
736 else
737 {
738 scm_ungetc (c, port);
739 p = scm_sym_unquote;
740 }
741 break;
742 }
743
744 default:
745 fprintf (stderr, "%s: unhandled quote character (%i)\n",
746 "scm_read_quote", chr);
747 abort ();
748 }
749
750 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
751 if (SCM_RECORD_POSITIONS_P)
752 scm_whash_insert (scm_source_whash, p,
753 scm_make_srcprops (line, column,
754 SCM_FILENAME (port),
755 SCM_COPY_SOURCE_P
756 ? (scm_cons2 (SCM_CAR (p),
757 SCM_CAR (SCM_CDR (p)),
758 SCM_EOL))
759 : SCM_UNDEFINED,
760 SCM_EOL));
761
762
763 return p;
764 }
765
766 SCM_SYMBOL (sym_syntax, "syntax");
767 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
768 SCM_SYMBOL (sym_unsyntax, "unsyntax");
769 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
770
771 static SCM
772 scm_read_syntax (int chr, SCM port)
773 {
774 SCM p;
775 long line = SCM_LINUM (port);
776 int column = SCM_COL (port) - 1;
777
778 switch (chr)
779 {
780 case '`':
781 p = sym_quasisyntax;
782 break;
783
784 case '\'':
785 p = sym_syntax;
786 break;
787
788 case ',':
789 {
790 int c;
791
792 c = scm_getc (port);
793 if ('@' == c)
794 p = sym_unsyntax_splicing;
795 else
796 {
797 scm_ungetc (c, port);
798 p = sym_unsyntax;
799 }
800 break;
801 }
802
803 default:
804 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
805 "scm_read_syntax", chr);
806 abort ();
807 }
808
809 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
810 if (SCM_RECORD_POSITIONS_P)
811 scm_whash_insert (scm_source_whash, p,
812 scm_make_srcprops (line, column,
813 SCM_FILENAME (port),
814 SCM_COPY_SOURCE_P
815 ? (scm_cons2 (SCM_CAR (p),
816 SCM_CAR (SCM_CDR (p)),
817 SCM_EOL))
818 : SCM_UNDEFINED,
819 SCM_EOL));
820
821
822 return p;
823 }
824
825 static inline SCM
826 scm_read_semicolon_comment (int chr, SCM port)
827 {
828 int c;
829
830 for (c = scm_getc (port);
831 (c != EOF) && (c != '\n');
832 c = scm_getc (port));
833
834 return SCM_UNSPECIFIED;
835 }
836
837 \f
838 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
839
840 static SCM
841 scm_read_boolean (int chr, SCM port)
842 {
843 switch (chr)
844 {
845 case 't':
846 case 'T':
847 return SCM_BOOL_T;
848
849 case 'f':
850 case 'F':
851 return SCM_BOOL_F;
852 }
853
854 return SCM_UNSPECIFIED;
855 }
856
857 static SCM
858 scm_read_character (int chr, SCM port)
859 #define FUNC_NAME "scm_lreadr"
860 {
861 SCM ch;
862 char charname[READER_CHAR_NAME_MAX_SIZE];
863 size_t charname_len;
864
865 if (read_token (port, charname, sizeof (charname), &charname_len))
866 goto char_error;
867
868 if (charname_len == 0)
869 {
870 chr = scm_getc (port);
871 if (chr == EOF)
872 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
873 "while reading character", SCM_EOL);
874
875 /* CHR must be a token delimiter, like a whitespace. */
876 return (SCM_MAKE_CHAR (chr));
877 }
878
879 if (charname_len == 1)
880 return SCM_MAKE_CHAR (charname[0]);
881
882 if (*charname >= '0' && *charname < '8')
883 {
884 /* Dirk:FIXME:: This type of character syntax is not R5RS
885 * compliant. Further, it should be verified that the constant
886 * does only consist of octal digits. Finally, it should be
887 * checked whether the resulting fixnum is in the range of
888 * characters. */
889 SCM p = scm_c_locale_stringn_to_number (charname, charname_len, 8);
890 if (SCM_I_INUMP (p))
891 return SCM_MAKE_CHAR (SCM_I_INUM (p));
892 }
893
894 ch = scm_i_charname_to_char (charname, charname_len);
895 if (scm_is_true (ch))
896 return ch;
897
898 char_error:
899 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
900 scm_list_1 (scm_from_locale_stringn (charname,
901 charname_len)));
902
903 return SCM_UNSPECIFIED;
904 }
905 #undef FUNC_NAME
906
907 static inline SCM
908 scm_read_keyword (int chr, SCM port)
909 {
910 SCM symbol;
911
912 /* Read the symbol that comprises the keyword. Doing this instead of
913 invoking a specific symbol reader function allows `scm_read_keyword ()'
914 to adapt to the delimiters currently valid of symbols.
915
916 XXX: This implementation allows sloppy syntaxes like `#: key'. */
917 symbol = scm_read_expression (port);
918 if (!scm_is_symbol (symbol))
919 scm_i_input_error ("scm_read_keyword", port,
920 "keyword prefix `~a' not followed by a symbol: ~s",
921 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
922
923 return (scm_symbol_to_keyword (symbol));
924 }
925
926 static inline SCM
927 scm_read_vector (int chr, SCM port)
928 {
929 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
930 guarantee that it's going to do what we want. After all, this is an
931 implementation detail of `scm_read_vector ()', not a desirable
932 property. */
933 return (scm_vector (scm_read_sexp (chr, port)));
934 }
935
936 static inline SCM
937 scm_read_srfi4_vector (int chr, SCM port)
938 {
939 return scm_i_read_array (port, chr);
940 }
941
942 static SCM
943 scm_read_bytevector (int chr, SCM port)
944 {
945 chr = scm_getc (port);
946 if (chr != 'u')
947 goto syntax;
948
949 chr = scm_getc (port);
950 if (chr != '8')
951 goto syntax;
952
953 chr = scm_getc (port);
954 if (chr != '(')
955 goto syntax;
956
957 return scm_u8_list_to_bytevector (scm_read_sexp (chr, port));
958
959 syntax:
960 scm_i_input_error ("read_bytevector", port,
961 "invalid bytevector prefix",
962 SCM_MAKE_CHAR (chr));
963 return SCM_UNSPECIFIED;
964 }
965
966 static SCM
967 scm_read_guile_bit_vector (int chr, SCM port)
968 {
969 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
970 terribly inefficient but who cares? */
971 SCM s_bits = SCM_EOL;
972
973 for (chr = scm_getc (port);
974 (chr != EOF) && ((chr == '0') || (chr == '1'));
975 chr = scm_getc (port))
976 {
977 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
978 }
979
980 if (chr != EOF)
981 scm_ungetc (chr, port);
982
983 return scm_bitvector (scm_reverse_x (s_bits, SCM_EOL));
984 }
985
986 static inline SCM
987 scm_read_scsh_block_comment (int chr, SCM port)
988 {
989 int bang_seen = 0;
990
991 for (;;)
992 {
993 int c = scm_getc (port);
994
995 if (c == EOF)
996 scm_i_input_error ("skip_block_comment", port,
997 "unterminated `#! ... !#' comment", SCM_EOL);
998
999 if (c == '!')
1000 bang_seen = 1;
1001 else if (c == '#' && bang_seen)
1002 break;
1003 else
1004 bang_seen = 0;
1005 }
1006
1007 return SCM_UNSPECIFIED;
1008 }
1009
1010 static SCM
1011 scm_read_commented_expression (int chr, SCM port)
1012 {
1013 int c;
1014
1015 c = flush_ws (port, (char *) NULL);
1016 if (EOF == c)
1017 scm_i_input_error ("read_commented_expression", port,
1018 "no expression after #; comment", SCM_EOL);
1019 scm_ungetc (c, port);
1020 scm_read_expression (port);
1021 return SCM_UNSPECIFIED;
1022 }
1023
1024 static SCM
1025 scm_read_extended_symbol (int chr, SCM port)
1026 {
1027 /* Guile's extended symbol read syntax looks like this:
1028
1029 #{This is all a symbol name}#
1030
1031 So here, CHR is expected to be `{'. */
1032 SCM result;
1033 int saw_brace = 0, finished = 0;
1034 size_t len = 0;
1035 char buf[1024];
1036
1037 result = scm_c_make_string (0, SCM_MAKE_CHAR ('X'));
1038
1039 while ((chr = scm_getc (port)) != EOF)
1040 {
1041 if (saw_brace)
1042 {
1043 if (chr == '#')
1044 {
1045 finished = 1;
1046 break;
1047 }
1048 else
1049 {
1050 saw_brace = 0;
1051 buf[len++] = '}';
1052 buf[len++] = chr;
1053 }
1054 }
1055 else if (chr == '}')
1056 saw_brace = 1;
1057 else
1058 buf[len++] = chr;
1059
1060 if (len >= sizeof (buf) - 2)
1061 {
1062 scm_string_append (scm_list_2 (result,
1063 scm_from_locale_stringn (buf, len)));
1064 len = 0;
1065 }
1066
1067 if (finished)
1068 break;
1069 }
1070
1071 if (len)
1072 result = scm_string_append (scm_list_2
1073 (result,
1074 scm_from_locale_stringn (buf, len)));
1075
1076 return (scm_string_to_symbol (result));
1077 }
1078
1079
1080 \f
1081 /* Top-level token readers, i.e., dispatchers. */
1082
1083 static SCM
1084 scm_read_sharp_extension (int chr, SCM port)
1085 {
1086 SCM proc;
1087
1088 proc = scm_get_hash_procedure (chr);
1089 if (scm_is_true (scm_procedure_p (proc)))
1090 {
1091 long line = SCM_LINUM (port);
1092 int column = SCM_COL (port) - 2;
1093 SCM got;
1094
1095 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1096 if (!scm_is_eq (got, SCM_UNSPECIFIED))
1097 {
1098 if (SCM_RECORD_POSITIONS_P)
1099 return (recsexpr (got, line, column,
1100 SCM_FILENAME (port)));
1101 else
1102 return got;
1103 }
1104 }
1105
1106 return SCM_UNSPECIFIED;
1107 }
1108
1109 /* The reader for the sharp `#' character. It basically dispatches reads
1110 among the above token readers. */
1111 static SCM
1112 scm_read_sharp (int chr, SCM port)
1113 #define FUNC_NAME "scm_lreadr"
1114 {
1115 SCM result;
1116
1117 chr = scm_getc (port);
1118
1119 result = scm_read_sharp_extension (chr, port);
1120 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1121 return result;
1122
1123 switch (chr)
1124 {
1125 case '\\':
1126 return (scm_read_character (chr, port));
1127 case '(':
1128 return (scm_read_vector (chr, port));
1129 case 's':
1130 case 'u':
1131 case 'f':
1132 /* This one may return either a boolean or an SRFI-4 vector. */
1133 return (scm_read_srfi4_vector (chr, port));
1134 case 'v':
1135 return (scm_read_bytevector (chr, port));
1136 case '*':
1137 return (scm_read_guile_bit_vector (chr, port));
1138 case 't':
1139 case 'T':
1140 case 'F':
1141 /* This one may return either a boolean or an SRFI-4 vector. */
1142 return (scm_read_boolean (chr, port));
1143 case ':':
1144 return (scm_read_keyword (chr, port));
1145 case '0': case '1': case '2': case '3': case '4':
1146 case '5': case '6': case '7': case '8': case '9':
1147 case '@':
1148 #if SCM_ENABLE_DEPRECATED
1149 /* See below for 'i' and 'e'. */
1150 case 'a':
1151 case 'c':
1152 case 'y':
1153 case 'h':
1154 case 'l':
1155 #endif
1156 return (scm_i_read_array (port, chr));
1157
1158 case 'i':
1159 case 'e':
1160 #if SCM_ENABLE_DEPRECATED
1161 {
1162 /* When next char is '(', it really is an old-style
1163 uniform array. */
1164 int next_c = scm_getc (port);
1165 if (next_c != EOF)
1166 scm_ungetc (next_c, port);
1167 if (next_c == '(')
1168 return scm_i_read_array (port, chr);
1169 /* Fall through. */
1170 }
1171 #endif
1172 case 'b':
1173 case 'B':
1174 case 'o':
1175 case 'O':
1176 case 'd':
1177 case 'D':
1178 case 'x':
1179 case 'X':
1180 case 'I':
1181 case 'E':
1182 return (scm_read_number_and_radix (chr, port));
1183 case '{':
1184 return (scm_read_extended_symbol (chr, port));
1185 case '!':
1186 return (scm_read_scsh_block_comment (chr, port));
1187 case ';':
1188 return (scm_read_commented_expression (chr, port));
1189 case '`':
1190 case '\'':
1191 case ',':
1192 return (scm_read_syntax (chr, port));
1193 default:
1194 result = scm_read_sharp_extension (chr, port);
1195 if (scm_is_eq (result, SCM_UNSPECIFIED))
1196 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1197 scm_list_1 (SCM_MAKE_CHAR (chr)));
1198 else
1199 return result;
1200 }
1201
1202 return SCM_UNSPECIFIED;
1203 }
1204 #undef FUNC_NAME
1205
1206 static SCM
1207 scm_read_expression (SCM port)
1208 #define FUNC_NAME "scm_read_expression"
1209 {
1210 while (1)
1211 {
1212 register int chr;
1213
1214 chr = scm_getc (port);
1215
1216 switch (chr)
1217 {
1218 case SCM_WHITE_SPACES:
1219 case SCM_LINE_INCREMENTORS:
1220 break;
1221 case ';':
1222 (void) scm_read_semicolon_comment (chr, port);
1223 break;
1224 case '(':
1225 return (scm_read_sexp (chr, port));
1226 case '"':
1227 return (scm_read_string (chr, port));
1228 case '\'':
1229 case '`':
1230 case ',':
1231 return (scm_read_quote (chr, port));
1232 case '#':
1233 {
1234 SCM result;
1235 result = scm_read_sharp (chr, port);
1236 if (scm_is_eq (result, SCM_UNSPECIFIED))
1237 /* We read a comment or some such. */
1238 break;
1239 else
1240 return result;
1241 }
1242 case ')':
1243 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1244 break;
1245 case EOF:
1246 return SCM_EOF_VAL;
1247 case ':':
1248 if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1249 return scm_symbol_to_keyword (scm_read_expression (port));
1250 /* Fall through. */
1251
1252 default:
1253 {
1254 if (((chr >= '0') && (chr <= '9'))
1255 || (strchr ("+-.", chr)))
1256 return (scm_read_number (chr, port));
1257 else
1258 return (scm_read_mixed_case_symbol (chr, port));
1259 }
1260 }
1261 }
1262 }
1263 #undef FUNC_NAME
1264
1265 \f
1266 /* Actual reader. */
1267
1268 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1269 (SCM port),
1270 "Read an s-expression from the input port @var{port}, or from\n"
1271 "the current input port if @var{port} is not specified.\n"
1272 "Any whitespace before the next token is discarded.")
1273 #define FUNC_NAME s_scm_read
1274 {
1275 int c;
1276
1277 if (SCM_UNBNDP (port))
1278 port = scm_current_input_port ();
1279 SCM_VALIDATE_OPINPORT (1, port);
1280
1281 c = flush_ws (port, (char *) NULL);
1282 if (EOF == c)
1283 return SCM_EOF_VAL;
1284 scm_ungetc (c, port);
1285
1286 return (scm_read_expression (port));
1287 }
1288 #undef FUNC_NAME
1289
1290
1291 \f
1292
1293 /* Used when recording expressions constructed by `scm_read_sharp ()'. */
1294 static SCM
1295 recsexpr (SCM obj, long line, int column, SCM filename)
1296 {
1297 if (!scm_is_pair(obj)) {
1298 return obj;
1299 } else {
1300 SCM tmp = obj, copy;
1301 /* If this sexpr is visible in the read:sharp source, we want to
1302 keep that information, so only record non-constant cons cells
1303 which haven't previously been read by the reader. */
1304 if (scm_is_false (scm_whash_lookup (scm_source_whash, obj)))
1305 {
1306 if (SCM_COPY_SOURCE_P)
1307 {
1308 copy = scm_cons (recsexpr (SCM_CAR (obj), line, column, filename),
1309 SCM_UNDEFINED);
1310 while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1311 {
1312 SCM_SETCDR (copy, scm_cons (recsexpr (SCM_CAR (tmp),
1313 line,
1314 column,
1315 filename),
1316 SCM_UNDEFINED));
1317 copy = SCM_CDR (copy);
1318 }
1319 SCM_SETCDR (copy, tmp);
1320 }
1321 else
1322 {
1323 recsexpr (SCM_CAR (obj), line, column, filename);
1324 while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1325 recsexpr (SCM_CAR (tmp), line, column, filename);
1326 copy = SCM_UNDEFINED;
1327 }
1328 scm_whash_insert (scm_source_whash,
1329 obj,
1330 scm_make_srcprops (line,
1331 column,
1332 filename,
1333 copy,
1334 SCM_EOL));
1335 }
1336 return obj;
1337 }
1338 }
1339
1340 /* Manipulate the read-hash-procedures alist. This could be written in
1341 Scheme, but maybe it will also be used by C code during initialisation. */
1342 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1343 (SCM chr, SCM proc),
1344 "Install the procedure @var{proc} for reading expressions\n"
1345 "starting with the character sequence @code{#} and @var{chr}.\n"
1346 "@var{proc} will be called with two arguments: the character\n"
1347 "@var{chr} and the port to read further data from. The object\n"
1348 "returned will be the return value of @code{read}. \n"
1349 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1350 )
1351 #define FUNC_NAME s_scm_read_hash_extend
1352 {
1353 SCM this;
1354 SCM prev;
1355
1356 SCM_VALIDATE_CHAR (1, chr);
1357 SCM_ASSERT (scm_is_false (proc)
1358 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1359 proc, SCM_ARG2, FUNC_NAME);
1360
1361 /* Check if chr is already in the alist. */
1362 this = *scm_read_hash_procedures;
1363 prev = SCM_BOOL_F;
1364 while (1)
1365 {
1366 if (scm_is_null (this))
1367 {
1368 /* not found, so add it to the beginning. */
1369 if (scm_is_true (proc))
1370 {
1371 *scm_read_hash_procedures =
1372 scm_cons (scm_cons (chr, proc), *scm_read_hash_procedures);
1373 }
1374 break;
1375 }
1376 if (scm_is_eq (chr, SCM_CAAR (this)))
1377 {
1378 /* already in the alist. */
1379 if (scm_is_false (proc))
1380 {
1381 /* remove it. */
1382 if (scm_is_false (prev))
1383 {
1384 *scm_read_hash_procedures =
1385 SCM_CDR (*scm_read_hash_procedures);
1386 }
1387 else
1388 scm_set_cdr_x (prev, SCM_CDR (this));
1389 }
1390 else
1391 {
1392 /* replace it. */
1393 scm_set_cdr_x (SCM_CAR (this), proc);
1394 }
1395 break;
1396 }
1397 prev = this;
1398 this = SCM_CDR (this);
1399 }
1400
1401 return SCM_UNSPECIFIED;
1402 }
1403 #undef FUNC_NAME
1404
1405 /* Recover the read-hash procedure corresponding to char c. */
1406 static SCM
1407 scm_get_hash_procedure (int c)
1408 {
1409 SCM rest = *scm_read_hash_procedures;
1410
1411 while (1)
1412 {
1413 if (scm_is_null (rest))
1414 return SCM_BOOL_F;
1415
1416 if (SCM_CHAR (SCM_CAAR (rest)) == c)
1417 return SCM_CDAR (rest);
1418
1419 rest = SCM_CDR (rest);
1420 }
1421 }
1422
1423 void
1424 scm_init_read ()
1425 {
1426 scm_read_hash_procedures =
1427 SCM_VARIABLE_LOC (scm_c_define ("read-hash-procedures", SCM_EOL));
1428
1429 scm_init_opts (scm_read_options, scm_read_opts);
1430 #include "libguile/read.x"
1431 }
1432
1433 /*
1434 Local Variables:
1435 c-file-style: "gnu"
1436 End:
1437 */