3493ba03ff4a06cee97e43816bc49870e965cdf3
[bpt/guile.git] / libguile / read.c
1 /* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008 Free Software
2 * Foundation, Inc.
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public
6 * License as published by the Free Software Foundation; either
7 * version 2.1 of the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18
19
20 \f
21
22 #ifdef HAVE_CONFIG_H
23 # include <config.h>
24 #endif
25
26 #include <stdio.h>
27 #include <ctype.h>
28 #include <string.h>
29
30 #include "libguile/_scm.h"
31 #include "libguile/chars.h"
32 #include "libguile/eval.h"
33 #include "libguile/unif.h"
34 #include "libguile/keywords.h"
35 #include "libguile/alist.h"
36 #include "libguile/srcprop.h"
37 #include "libguile/hashtab.h"
38 #include "libguile/hash.h"
39 #include "libguile/ports.h"
40 #include "libguile/root.h"
41 #include "libguile/strings.h"
42 #include "libguile/strports.h"
43 #include "libguile/vectors.h"
44 #include "libguile/validate.h"
45 #include "libguile/srfi-4.h"
46 #include "libguile/srfi-13.h"
47
48 #include "libguile/read.h"
49 #include "libguile/private-options.h"
50
51
52 \f
53
54 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
55 SCM_SYMBOL (scm_keyword_prefix, "prefix");
56 SCM_SYMBOL (scm_keyword_postfix, "postfix");
57
58 scm_t_option scm_read_opts[] = {
59 { SCM_OPTION_BOOLEAN, "copy", 0,
60 "Copy source code expressions." },
61 { SCM_OPTION_BOOLEAN, "positions", 0,
62 "Record positions of source code expressions." },
63 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
64 "Convert symbols to lower case."},
65 { SCM_OPTION_SCM, "keywords", SCM_UNPACK (SCM_BOOL_F),
66 "Style of keyword recognition: #f, 'prefix or 'postfix."},
67 #if SCM_ENABLE_ELISP
68 { SCM_OPTION_BOOLEAN, "elisp-vectors", 0,
69 "Support Elisp vector syntax, namely `[...]'."},
70 { SCM_OPTION_BOOLEAN, "elisp-strings", 0,
71 "Support `\\(' and `\\)' in strings."},
72 #endif
73 { 0, },
74 };
75
76 /*
77 Give meaningful error messages for errors
78
79 We use the format
80
81 FILE:LINE:COL: MESSAGE
82 This happened in ....
83
84 This is not standard GNU format, but the test-suite likes the real
85 message to be in front.
86
87 */
88
89
90 void
91 scm_i_input_error (char const *function,
92 SCM port, const char *message, SCM arg)
93 {
94 SCM fn = (scm_is_string (SCM_FILENAME(port))
95 ? SCM_FILENAME(port)
96 : scm_from_locale_string ("#<unknown port>"));
97
98 SCM string_port = scm_open_output_string ();
99 SCM string = SCM_EOL;
100 scm_simple_format (string_port,
101 scm_from_locale_string ("~A:~S:~S: ~A"),
102 scm_list_4 (fn,
103 scm_from_long (SCM_LINUM (port) + 1),
104 scm_from_int (SCM_COL (port) + 1),
105 scm_from_locale_string (message)));
106
107 string = scm_get_output_string (string_port);
108 scm_close_output_port (string_port);
109 scm_error_scm (scm_from_locale_symbol ("read-error"),
110 function? scm_from_locale_string (function) : SCM_BOOL_F,
111 string,
112 arg,
113 SCM_BOOL_F);
114 }
115
116
117 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
118 (SCM setting),
119 "Option interface for the read options. Instead of using\n"
120 "this procedure directly, use the procedures @code{read-enable},\n"
121 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
122 #define FUNC_NAME s_scm_read_options
123 {
124 SCM ans = scm_options (setting,
125 scm_read_opts,
126 FUNC_NAME);
127 if (SCM_COPY_SOURCE_P)
128 SCM_RECORD_POSITIONS_P = 1;
129 return ans;
130 }
131 #undef FUNC_NAME
132
133 /* An association list mapping extra hash characters to procedures. */
134 static SCM *scm_read_hash_procedures;
135
136
137 \f
138 /* Token readers. */
139
140
141 /* Size of the C buffer used to read symbols and numbers. */
142 #define READER_BUFFER_SIZE 128
143
144 /* Size of the C buffer used to read strings. */
145 #define READER_STRING_BUFFER_SIZE 512
146
147 /* The maximum size of Scheme character names. */
148 #define READER_CHAR_NAME_MAX_SIZE 50
149
150
151 /* `isblank' is only in C99. */
152 #define CHAR_IS_BLANK_(_chr) \
153 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
154 || ((_chr) == '\f') || ((_chr) == '\r'))
155
156 #ifdef MSDOS
157 # define CHAR_IS_BLANK(_chr) \
158 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
159 #else
160 # define CHAR_IS_BLANK CHAR_IS_BLANK_
161 #endif
162
163
164 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
165 structure''). */
166 #define CHAR_IS_R5RS_DELIMITER(c) \
167 (CHAR_IS_BLANK (c) \
168 || (c == ')') || (c == '(') || (c == ';') || (c == '"'))
169
170 #define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
171
172 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
173 Structure''. */
174 #define CHAR_IS_EXPONENT_MARKER(_chr) \
175 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
176 || ((_chr) == 'd') || ((_chr) == 'l'))
177
178 /* An inlinable version of `scm_c_downcase ()'. */
179 #define CHAR_DOWNCASE(_chr) \
180 (((_chr) <= UCHAR_MAX) ? tolower (_chr) : (_chr))
181
182
183 /* Read an SCSH block comment. */
184 static inline SCM scm_read_scsh_block_comment (int chr, SCM port);
185 static SCM scm_read_commented_expression (int chr, SCM port);
186
187 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Return
188 zero if the whole token fits in BUF, non-zero otherwise. */
189 static inline int
190 read_token (SCM port, char *buf, size_t buf_size, size_t *read)
191 {
192 *read = 0;
193
194 while (*read < buf_size)
195 {
196 int chr;
197
198 chr = scm_getc (port);
199 chr = (SCM_CASE_INSENSITIVE_P ? CHAR_DOWNCASE (chr) : chr);
200
201 if (chr == EOF)
202 return 0;
203 else if (CHAR_IS_DELIMITER (chr))
204 {
205 scm_ungetc (chr, port);
206 return 0;
207 }
208 else
209 {
210 *buf = (char) chr;
211 buf++, (*read)++;
212 }
213 }
214
215 return 1;
216 }
217
218
219 /* Skip whitespace from PORT and return the first non-whitespace character
220 read. Raise an error on end-of-file. */
221 static int
222 flush_ws (SCM port, const char *eoferr)
223 {
224 register int c;
225 while (1)
226 switch (c = scm_getc (port))
227 {
228 case EOF:
229 goteof:
230 if (eoferr)
231 {
232 scm_i_input_error (eoferr,
233 port,
234 "end of file",
235 SCM_EOL);
236 }
237 return c;
238
239 case ';':
240 lp:
241 switch (c = scm_getc (port))
242 {
243 case EOF:
244 goto goteof;
245 default:
246 goto lp;
247 case SCM_LINE_INCREMENTORS:
248 break;
249 }
250 break;
251
252 case '#':
253 switch (c = scm_getc (port))
254 {
255 case EOF:
256 eoferr = "read_sharp";
257 goto goteof;
258 case '!':
259 scm_read_scsh_block_comment (c, port);
260 break;
261 case ';':
262 scm_read_commented_expression (c, port);
263 break;
264 default:
265 scm_ungetc (c, port);
266 return '#';
267 }
268 break;
269
270 case SCM_LINE_INCREMENTORS:
271 case SCM_SINGLE_SPACES:
272 case '\t':
273 break;
274
275 default:
276 return c;
277 }
278
279 return 0;
280 }
281
282
283 \f
284 /* Token readers. */
285
286 static SCM scm_read_expression (SCM port);
287 static SCM scm_read_sharp (int chr, SCM port);
288 static SCM scm_get_hash_procedure (int c);
289 static SCM recsexpr (SCM obj, long line, int column, SCM filename);
290
291
292 static SCM
293 scm_read_sexp (int chr, SCM port)
294 #define FUNC_NAME "scm_i_lreadparen"
295 {
296 register int c;
297 register SCM tmp;
298 register SCM tl, ans = SCM_EOL;
299 SCM tl2 = SCM_EOL, ans2 = SCM_EOL, copy = SCM_BOOL_F;
300 static const int terminating_char = ')';
301
302 /* Need to capture line and column numbers here. */
303 long line = SCM_LINUM (port);
304 int column = SCM_COL (port) - 1;
305
306
307 c = flush_ws (port, FUNC_NAME);
308 if (terminating_char == c)
309 return SCM_EOL;
310
311 scm_ungetc (c, port);
312 if (scm_is_eq (scm_sym_dot,
313 (tmp = scm_read_expression (port))))
314 {
315 ans = scm_read_expression (port);
316 if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
317 scm_i_input_error (FUNC_NAME, port, "missing close paren",
318 SCM_EOL);
319 return ans;
320 }
321
322 /* Build the head of the list structure. */
323 ans = tl = scm_cons (tmp, SCM_EOL);
324
325 if (SCM_COPY_SOURCE_P)
326 ans2 = tl2 = scm_cons (scm_is_pair (tmp)
327 ? copy
328 : tmp,
329 SCM_EOL);
330
331 while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
332 {
333 SCM new_tail;
334
335 scm_ungetc (c, port);
336 if (scm_is_eq (scm_sym_dot,
337 (tmp = scm_read_expression (port))))
338 {
339 SCM_SETCDR (tl, tmp = scm_read_expression (port));
340
341 if (SCM_COPY_SOURCE_P)
342 SCM_SETCDR (tl2, scm_cons (scm_is_pair (tmp) ? copy : tmp,
343 SCM_EOL));
344
345 c = flush_ws (port, FUNC_NAME);
346 if (terminating_char != c)
347 scm_i_input_error (FUNC_NAME, port,
348 "in pair: missing close paren", SCM_EOL);
349 goto exit;
350 }
351
352 new_tail = scm_cons (tmp, SCM_EOL);
353 SCM_SETCDR (tl, new_tail);
354 tl = new_tail;
355
356 if (SCM_COPY_SOURCE_P)
357 {
358 SCM new_tail2 = scm_cons (scm_is_pair (tmp)
359 ? copy
360 : tmp, SCM_EOL);
361 SCM_SETCDR (tl2, new_tail2);
362 tl2 = new_tail2;
363 }
364 }
365
366 exit:
367 if (SCM_RECORD_POSITIONS_P)
368 scm_whash_insert (scm_source_whash,
369 ans,
370 scm_make_srcprops (line, column,
371 SCM_FILENAME (port),
372 SCM_COPY_SOURCE_P
373 ? ans2
374 : SCM_UNDEFINED,
375 SCM_EOL));
376 return ans;
377 }
378 #undef FUNC_NAME
379
380 static SCM
381 scm_read_string (int chr, SCM port)
382 #define FUNC_NAME "scm_lreadr"
383 {
384 /* For strings smaller than C_STR, this function creates only one Scheme
385 object (the string returned). */
386
387 SCM str = SCM_BOOL_F;
388 char c_str[READER_STRING_BUFFER_SIZE];
389 unsigned c_str_len = 0;
390 int c;
391
392 while ('"' != (c = scm_getc (port)))
393 {
394 if (c == EOF)
395 str_eof: scm_i_input_error (FUNC_NAME, port,
396 "end of file in string constant",
397 SCM_EOL);
398
399 if (c_str_len + 1 >= sizeof (c_str))
400 {
401 /* Flush the C buffer onto a Scheme string. */
402 SCM addy;
403
404 if (str == SCM_BOOL_F)
405 str = scm_c_make_string (0, SCM_MAKE_CHAR ('X'));
406
407 addy = scm_from_locale_stringn (c_str, c_str_len);
408 str = scm_string_append_shared (scm_list_2 (str, addy));
409
410 c_str_len = 0;
411 }
412
413 if (c == '\\')
414 switch (c = scm_getc (port))
415 {
416 case EOF:
417 goto str_eof;
418 case '"':
419 case '\\':
420 break;
421 #if SCM_ENABLE_ELISP
422 case '(':
423 case ')':
424 if (SCM_ESCAPED_PARENS_P)
425 break;
426 goto bad_escaped;
427 #endif
428 case '\n':
429 continue;
430 case '0':
431 c = '\0';
432 break;
433 case 'f':
434 c = '\f';
435 break;
436 case 'n':
437 c = '\n';
438 break;
439 case 'r':
440 c = '\r';
441 break;
442 case 't':
443 c = '\t';
444 break;
445 case 'a':
446 c = '\007';
447 break;
448 case 'v':
449 c = '\v';
450 break;
451 case 'x':
452 {
453 int a, b;
454 a = scm_getc (port);
455 if (a == EOF) goto str_eof;
456 b = scm_getc (port);
457 if (b == EOF) goto str_eof;
458 if ('0' <= a && a <= '9') a -= '0';
459 else if ('A' <= a && a <= 'F') a = a - 'A' + 10;
460 else if ('a' <= a && a <= 'f') a = a - 'a' + 10;
461 else goto bad_escaped;
462 if ('0' <= b && b <= '9') b -= '0';
463 else if ('A' <= b && b <= 'F') b = b - 'A' + 10;
464 else if ('a' <= b && b <= 'f') b = b - 'a' + 10;
465 else goto bad_escaped;
466 c = a * 16 + b;
467 break;
468 }
469 default:
470 bad_escaped:
471 scm_i_input_error (FUNC_NAME, port,
472 "illegal character in escape sequence: ~S",
473 scm_list_1 (SCM_MAKE_CHAR (c)));
474 }
475 c_str[c_str_len++] = c;
476 }
477
478 if (c_str_len > 0)
479 {
480 SCM addy;
481
482 addy = scm_from_locale_stringn (c_str, c_str_len);
483 if (str == SCM_BOOL_F)
484 str = addy;
485 else
486 str = scm_string_append_shared (scm_list_2 (str, addy));
487 }
488 else
489 str = (str == SCM_BOOL_F) ? scm_nullstr : str;
490
491 return str;
492 }
493 #undef FUNC_NAME
494
495
496 static SCM
497 scm_read_number (int chr, SCM port)
498 {
499 SCM result, str = SCM_EOL;
500 char buffer[READER_BUFFER_SIZE];
501 size_t read;
502 int overflow = 0;
503
504 scm_ungetc (chr, port);
505 do
506 {
507 overflow = read_token (port, buffer, sizeof (buffer), &read);
508
509 if ((overflow) || (scm_is_pair (str)))
510 str = scm_cons (scm_from_locale_stringn (buffer, read), str);
511 }
512 while (overflow);
513
514 if (scm_is_pair (str))
515 {
516 /* The slow path. */
517
518 str = scm_string_concatenate (scm_reverse_x (str, SCM_EOL));
519 result = scm_string_to_number (str, SCM_UNDEFINED);
520 if (!scm_is_true (result))
521 /* Return a symbol instead of a number. */
522 result = scm_string_to_symbol (str);
523 }
524 else
525 {
526 result = scm_c_locale_stringn_to_number (buffer, read, 10);
527 if (!scm_is_true (result))
528 /* Return a symbol instead of a number. */
529 result = scm_from_locale_symboln (buffer, read);
530 }
531
532 return result;
533 }
534
535 static SCM
536 scm_read_mixed_case_symbol (int chr, SCM port)
537 {
538 SCM result, str = SCM_EOL;
539 int overflow = 0, ends_with_colon = 0;
540 char buffer[READER_BUFFER_SIZE];
541 size_t read = 0;
542 int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
543
544 scm_ungetc (chr, port);
545 do
546 {
547 overflow = read_token (port, buffer, sizeof (buffer), &read);
548
549 if (read > 0)
550 ends_with_colon = (buffer[read - 1] == ':');
551
552 if ((overflow) || (scm_is_pair (str)))
553 str = scm_cons (scm_from_locale_stringn (buffer, read), str);
554 }
555 while (overflow);
556
557 if (scm_is_pair (str))
558 {
559 size_t len;
560
561 str = scm_string_concatenate (scm_reverse_x (str, SCM_EOL));
562 len = scm_c_string_length (str);
563
564 /* Per SRFI-88, `:' alone is an identifier, not a keyword. */
565 if (postfix && ends_with_colon && (len > 1))
566 {
567 /* Strip off colon. */
568 str = scm_c_substring (str, 0, len-1);
569 result = scm_string_to_symbol (str);
570 result = scm_symbol_to_keyword (result);
571 }
572 else
573 result = scm_string_to_symbol (str);
574 }
575 else
576 {
577 /* For symbols smaller than `sizeof (buffer)', we don't need to recur
578 to Scheme strings. Therefore, we only create one Scheme object (a
579 symbol) per symbol read. */
580 if (postfix && ends_with_colon && (read > 1))
581 result = scm_from_locale_keywordn (buffer, read - 1);
582 else
583 result = scm_from_locale_symboln (buffer, read);
584 }
585
586 return result;
587 }
588
589 static SCM
590 scm_read_number_and_radix (int chr, SCM port)
591 #define FUNC_NAME "scm_lreadr"
592 {
593 SCM result, str = SCM_EOL;
594 size_t read;
595 char buffer[READER_BUFFER_SIZE];
596 unsigned int radix;
597 int overflow = 0;
598
599 switch (chr)
600 {
601 case 'B':
602 case 'b':
603 radix = 2;
604 break;
605
606 case 'o':
607 case 'O':
608 radix = 8;
609 break;
610
611 case 'd':
612 case 'D':
613 radix = 10;
614 break;
615
616 case 'x':
617 case 'X':
618 radix = 16;
619 break;
620
621 default:
622 scm_ungetc (chr, port);
623 scm_ungetc ('#', port);
624 radix = 10;
625 }
626
627 do
628 {
629 overflow = read_token (port, buffer, sizeof (buffer), &read);
630
631 if ((overflow) || (scm_is_pair (str)))
632 str = scm_cons (scm_from_locale_stringn (buffer, read), str);
633 }
634 while (overflow);
635
636 if (scm_is_pair (str))
637 {
638 str = scm_string_concatenate (scm_reverse_x (str, SCM_EOL));
639 result = scm_string_to_number (str, scm_from_uint (radix));
640 }
641 else
642 result = scm_c_locale_stringn_to_number (buffer, read, radix);
643
644 if (scm_is_true (result))
645 return result;
646
647 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
648
649 return SCM_BOOL_F;
650 }
651 #undef FUNC_NAME
652
653 static SCM
654 scm_read_quote (int chr, SCM port)
655 {
656 SCM p;
657 long line = SCM_LINUM (port);
658 int column = SCM_COL (port) - 1;
659
660 switch (chr)
661 {
662 case '`':
663 p = scm_sym_quasiquote;
664 break;
665
666 case '\'':
667 p = scm_sym_quote;
668 break;
669
670 case ',':
671 {
672 int c;
673
674 c = scm_getc (port);
675 if ('@' == c)
676 p = scm_sym_uq_splicing;
677 else
678 {
679 scm_ungetc (c, port);
680 p = scm_sym_unquote;
681 }
682 break;
683 }
684
685 default:
686 fprintf (stderr, "%s: unhandled quote character (%i)\n",
687 "scm_read_quote", chr);
688 abort ();
689 }
690
691 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
692 if (SCM_RECORD_POSITIONS_P)
693 scm_whash_insert (scm_source_whash, p,
694 scm_make_srcprops (line, column,
695 SCM_FILENAME (port),
696 SCM_COPY_SOURCE_P
697 ? (scm_cons2 (SCM_CAR (p),
698 SCM_CAR (SCM_CDR (p)),
699 SCM_EOL))
700 : SCM_UNDEFINED,
701 SCM_EOL));
702
703
704 return p;
705 }
706
707 SCM_SYMBOL (sym_syntax, "syntax");
708 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
709 SCM_SYMBOL (sym_unsyntax, "unsyntax");
710 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
711
712 static SCM
713 scm_read_syntax (int chr, SCM port)
714 {
715 SCM p;
716 long line = SCM_LINUM (port);
717 int column = SCM_COL (port) - 1;
718
719 switch (chr)
720 {
721 case '`':
722 p = sym_quasisyntax;
723 break;
724
725 case '\'':
726 p = sym_syntax;
727 break;
728
729 case ',':
730 {
731 int c;
732
733 c = scm_getc (port);
734 if ('@' == c)
735 p = sym_unsyntax_splicing;
736 else
737 {
738 scm_ungetc (c, port);
739 p = sym_unsyntax;
740 }
741 break;
742 }
743
744 default:
745 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
746 "scm_read_syntax", chr);
747 abort ();
748 }
749
750 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
751 if (SCM_RECORD_POSITIONS_P)
752 scm_whash_insert (scm_source_whash, p,
753 scm_make_srcprops (line, column,
754 SCM_FILENAME (port),
755 SCM_COPY_SOURCE_P
756 ? (scm_cons2 (SCM_CAR (p),
757 SCM_CAR (SCM_CDR (p)),
758 SCM_EOL))
759 : SCM_UNDEFINED,
760 SCM_EOL));
761
762
763 return p;
764 }
765
766 static inline SCM
767 scm_read_semicolon_comment (int chr, SCM port)
768 {
769 int c;
770
771 for (c = scm_getc (port);
772 (c != EOF) && (c != '\n');
773 c = scm_getc (port));
774
775 return SCM_UNSPECIFIED;
776 }
777
778 \f
779 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
780
781 static SCM
782 scm_read_boolean (int chr, SCM port)
783 {
784 switch (chr)
785 {
786 case 't':
787 case 'T':
788 return SCM_BOOL_T;
789
790 case 'f':
791 case 'F':
792 return SCM_BOOL_F;
793 }
794
795 return SCM_UNSPECIFIED;
796 }
797
798 static SCM
799 scm_read_character (int chr, SCM port)
800 #define FUNC_NAME "scm_lreadr"
801 {
802 unsigned c;
803 char charname[READER_CHAR_NAME_MAX_SIZE];
804 size_t charname_len;
805
806 if (read_token (port, charname, sizeof (charname), &charname_len))
807 goto char_error;
808
809 if (charname_len == 0)
810 {
811 chr = scm_getc (port);
812 if (chr == EOF)
813 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
814 "while reading character", SCM_EOL);
815
816 /* CHR must be a token delimiter, like a whitespace. */
817 return (SCM_MAKE_CHAR (chr));
818 }
819
820 if (charname_len == 1)
821 return SCM_MAKE_CHAR (charname[0]);
822
823 if (*charname >= '0' && *charname < '8')
824 {
825 /* Dirk:FIXME:: This type of character syntax is not R5RS
826 * compliant. Further, it should be verified that the constant
827 * does only consist of octal digits. Finally, it should be
828 * checked whether the resulting fixnum is in the range of
829 * characters. */
830 SCM p = scm_c_locale_stringn_to_number (charname, charname_len, 8);
831 if (SCM_I_INUMP (p))
832 return SCM_MAKE_CHAR (SCM_I_INUM (p));
833 }
834
835 for (c = 0; c < scm_n_charnames; c++)
836 if (scm_charnames[c]
837 && (!strncasecmp (scm_charnames[c], charname, charname_len)))
838 return SCM_MAKE_CHAR (scm_charnums[c]);
839
840 char_error:
841 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
842 scm_list_1 (scm_from_locale_stringn (charname,
843 charname_len)));
844
845 return SCM_UNSPECIFIED;
846 }
847 #undef FUNC_NAME
848
849 static inline SCM
850 scm_read_keyword (int chr, SCM port)
851 {
852 SCM symbol;
853
854 /* Read the symbol that comprises the keyword. Doing this instead of
855 invoking a specific symbol reader function allows `scm_read_keyword ()'
856 to adapt to the delimiters currently valid of symbols.
857
858 XXX: This implementation allows sloppy syntaxes like `#: key'. */
859 symbol = scm_read_expression (port);
860 if (!scm_is_symbol (symbol))
861 scm_i_input_error ("scm_read_keyword", port,
862 "keyword prefix `~a' not followed by a symbol: ~s",
863 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
864
865 return (scm_symbol_to_keyword (symbol));
866 }
867
868 static inline SCM
869 scm_read_vector (int chr, SCM port)
870 {
871 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
872 guarantee that it's going to do what we want. After all, this is an
873 implementation detail of `scm_read_vector ()', not a desirable
874 property. */
875 return (scm_vector (scm_read_sexp (chr, port)));
876 }
877
878 static inline SCM
879 scm_read_srfi4_vector (int chr, SCM port)
880 {
881 return scm_i_read_array (port, chr);
882 }
883
884 static SCM
885 scm_read_guile_bit_vector (int chr, SCM port)
886 {
887 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
888 terribly inefficient but who cares? */
889 SCM s_bits = SCM_EOL;
890
891 for (chr = scm_getc (port);
892 (chr != EOF) && ((chr == '0') || (chr == '1'));
893 chr = scm_getc (port))
894 {
895 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
896 }
897
898 if (chr != EOF)
899 scm_ungetc (chr, port);
900
901 return scm_bitvector (scm_reverse_x (s_bits, SCM_EOL));
902 }
903
904 static inline SCM
905 scm_read_scsh_block_comment (int chr, SCM port)
906 {
907 int bang_seen = 0;
908
909 for (;;)
910 {
911 int c = scm_getc (port);
912
913 if (c == EOF)
914 scm_i_input_error ("skip_block_comment", port,
915 "unterminated `#! ... !#' comment", SCM_EOL);
916
917 if (c == '!')
918 bang_seen = 1;
919 else if (c == '#' && bang_seen)
920 break;
921 else
922 bang_seen = 0;
923 }
924
925 return SCM_UNSPECIFIED;
926 }
927
928 static SCM
929 scm_read_commented_expression (int chr, SCM port)
930 {
931 int c;
932
933 c = flush_ws (port, (char *) NULL);
934 if (EOF == c)
935 scm_i_input_error ("read_commented_expression", port,
936 "no expression after #; comment", SCM_EOL);
937 scm_ungetc (c, port);
938 scm_read_expression (port);
939 return SCM_UNSPECIFIED;
940 }
941
942 static SCM
943 scm_read_extended_symbol (int chr, SCM port)
944 {
945 /* Guile's extended symbol read syntax looks like this:
946
947 #{This is all a symbol name}#
948
949 So here, CHR is expected to be `{'. */
950 SCM result;
951 int saw_brace = 0, finished = 0;
952 size_t len = 0;
953 char buf[1024];
954
955 result = scm_c_make_string (0, SCM_MAKE_CHAR ('X'));
956
957 while ((chr = scm_getc (port)) != EOF)
958 {
959 if (saw_brace)
960 {
961 if (chr == '#')
962 {
963 finished = 1;
964 break;
965 }
966 else
967 {
968 saw_brace = 0;
969 buf[len++] = '}';
970 buf[len++] = chr;
971 }
972 }
973 else if (chr == '}')
974 saw_brace = 1;
975 else
976 buf[len++] = chr;
977
978 if (len >= sizeof (buf) - 2)
979 {
980 scm_string_append (scm_list_2 (result,
981 scm_from_locale_stringn (buf, len)));
982 len = 0;
983 }
984
985 if (finished)
986 break;
987 }
988
989 if (len)
990 result = scm_string_append (scm_list_2
991 (result,
992 scm_from_locale_stringn (buf, len)));
993
994 return (scm_string_to_symbol (result));
995 }
996
997
998 \f
999 /* Top-level token readers, i.e., dispatchers. */
1000
1001 static SCM
1002 scm_read_sharp_extension (int chr, SCM port)
1003 {
1004 SCM proc;
1005
1006 proc = scm_get_hash_procedure (chr);
1007 if (scm_is_true (scm_procedure_p (proc)))
1008 {
1009 long line = SCM_LINUM (port);
1010 int column = SCM_COL (port) - 2;
1011 SCM got;
1012
1013 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1014 if (!scm_is_eq (got, SCM_UNSPECIFIED))
1015 {
1016 if (SCM_RECORD_POSITIONS_P)
1017 return (recsexpr (got, line, column,
1018 SCM_FILENAME (port)));
1019 else
1020 return got;
1021 }
1022 }
1023
1024 return SCM_UNSPECIFIED;
1025 }
1026
1027 /* The reader for the sharp `#' character. It basically dispatches reads
1028 among the above token readers. */
1029 static SCM
1030 scm_read_sharp (int chr, SCM port)
1031 #define FUNC_NAME "scm_lreadr"
1032 {
1033 SCM result;
1034
1035 chr = scm_getc (port);
1036
1037 result = scm_read_sharp_extension (chr, port);
1038 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1039 return result;
1040
1041 switch (chr)
1042 {
1043 case '\\':
1044 return (scm_read_character (chr, port));
1045 case '(':
1046 return (scm_read_vector (chr, port));
1047 case 's':
1048 case 'u':
1049 case 'f':
1050 /* This one may return either a boolean or an SRFI-4 vector. */
1051 return (scm_read_srfi4_vector (chr, port));
1052 case '*':
1053 return (scm_read_guile_bit_vector (chr, port));
1054 case 't':
1055 case 'T':
1056 case 'F':
1057 /* This one may return either a boolean or an SRFI-4 vector. */
1058 return (scm_read_boolean (chr, port));
1059 case ':':
1060 return (scm_read_keyword (chr, port));
1061 case '0': case '1': case '2': case '3': case '4':
1062 case '5': case '6': case '7': case '8': case '9':
1063 case '@':
1064 #if SCM_ENABLE_DEPRECATED
1065 /* See below for 'i' and 'e'. */
1066 case 'a':
1067 case 'c':
1068 case 'y':
1069 case 'h':
1070 case 'l':
1071 #endif
1072 return (scm_i_read_array (port, chr));
1073
1074 case 'i':
1075 case 'e':
1076 #if SCM_ENABLE_DEPRECATED
1077 {
1078 /* When next char is '(', it really is an old-style
1079 uniform array. */
1080 int next_c = scm_getc (port);
1081 if (next_c != EOF)
1082 scm_ungetc (next_c, port);
1083 if (next_c == '(')
1084 return scm_i_read_array (port, chr);
1085 /* Fall through. */
1086 }
1087 #endif
1088 case 'b':
1089 case 'B':
1090 case 'o':
1091 case 'O':
1092 case 'd':
1093 case 'D':
1094 case 'x':
1095 case 'X':
1096 case 'I':
1097 case 'E':
1098 return (scm_read_number_and_radix (chr, port));
1099 case '{':
1100 return (scm_read_extended_symbol (chr, port));
1101 case '!':
1102 return (scm_read_scsh_block_comment (chr, port));
1103 case ';':
1104 return (scm_read_commented_expression (chr, port));
1105 case '`':
1106 case '\'':
1107 case ',':
1108 return (scm_read_syntax (chr, port));
1109 default:
1110 result = scm_read_sharp_extension (chr, port);
1111 if (scm_is_eq (result, SCM_UNSPECIFIED))
1112 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1113 scm_list_1 (SCM_MAKE_CHAR (chr)));
1114 else
1115 return result;
1116 }
1117
1118 return SCM_UNSPECIFIED;
1119 }
1120 #undef FUNC_NAME
1121
1122 static SCM
1123 scm_read_expression (SCM port)
1124 #define FUNC_NAME "scm_read_expression"
1125 {
1126 while (1)
1127 {
1128 register int chr;
1129
1130 chr = scm_getc (port);
1131
1132 switch (chr)
1133 {
1134 case SCM_WHITE_SPACES:
1135 case SCM_LINE_INCREMENTORS:
1136 break;
1137 case ';':
1138 (void) scm_read_semicolon_comment (chr, port);
1139 break;
1140 case '(':
1141 return (scm_read_sexp (chr, port));
1142 case '"':
1143 return (scm_read_string (chr, port));
1144 case '\'':
1145 case '`':
1146 case ',':
1147 return (scm_read_quote (chr, port));
1148 case '#':
1149 {
1150 SCM result;
1151 result = scm_read_sharp (chr, port);
1152 if (scm_is_eq (result, SCM_UNSPECIFIED))
1153 /* We read a comment or some such. */
1154 break;
1155 else
1156 return result;
1157 }
1158 case ')':
1159 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1160 break;
1161 case EOF:
1162 return SCM_EOF_VAL;
1163 case ':':
1164 if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1165 return scm_symbol_to_keyword (scm_read_expression (port));
1166 /* Fall through. */
1167
1168 default:
1169 {
1170 if (((chr >= '0') && (chr <= '9'))
1171 || (strchr ("+-.", chr)))
1172 return (scm_read_number (chr, port));
1173 else
1174 return (scm_read_mixed_case_symbol (chr, port));
1175 }
1176 }
1177 }
1178 }
1179 #undef FUNC_NAME
1180
1181 \f
1182 /* Actual reader. */
1183
1184 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1185 (SCM port),
1186 "Read an s-expression from the input port @var{port}, or from\n"
1187 "the current input port if @var{port} is not specified.\n"
1188 "Any whitespace before the next token is discarded.")
1189 #define FUNC_NAME s_scm_read
1190 {
1191 int c;
1192
1193 if (SCM_UNBNDP (port))
1194 port = scm_current_input_port ();
1195 SCM_VALIDATE_OPINPORT (1, port);
1196
1197 c = flush_ws (port, (char *) NULL);
1198 if (EOF == c)
1199 return SCM_EOF_VAL;
1200 scm_ungetc (c, port);
1201
1202 return (scm_read_expression (port));
1203 }
1204 #undef FUNC_NAME
1205
1206
1207 \f
1208
1209 /* Used when recording expressions constructed by `scm_read_sharp ()'. */
1210 static SCM
1211 recsexpr (SCM obj, long line, int column, SCM filename)
1212 {
1213 if (!scm_is_pair(obj)) {
1214 return obj;
1215 } else {
1216 SCM tmp = obj, copy;
1217 /* If this sexpr is visible in the read:sharp source, we want to
1218 keep that information, so only record non-constant cons cells
1219 which haven't previously been read by the reader. */
1220 if (scm_is_false (scm_whash_lookup (scm_source_whash, obj)))
1221 {
1222 if (SCM_COPY_SOURCE_P)
1223 {
1224 copy = scm_cons (recsexpr (SCM_CAR (obj), line, column, filename),
1225 SCM_UNDEFINED);
1226 while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1227 {
1228 SCM_SETCDR (copy, scm_cons (recsexpr (SCM_CAR (tmp),
1229 line,
1230 column,
1231 filename),
1232 SCM_UNDEFINED));
1233 copy = SCM_CDR (copy);
1234 }
1235 SCM_SETCDR (copy, tmp);
1236 }
1237 else
1238 {
1239 recsexpr (SCM_CAR (obj), line, column, filename);
1240 while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1241 recsexpr (SCM_CAR (tmp), line, column, filename);
1242 copy = SCM_UNDEFINED;
1243 }
1244 scm_whash_insert (scm_source_whash,
1245 obj,
1246 scm_make_srcprops (line,
1247 column,
1248 filename,
1249 copy,
1250 SCM_EOL));
1251 }
1252 return obj;
1253 }
1254 }
1255
1256 /* Manipulate the read-hash-procedures alist. This could be written in
1257 Scheme, but maybe it will also be used by C code during initialisation. */
1258 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1259 (SCM chr, SCM proc),
1260 "Install the procedure @var{proc} for reading expressions\n"
1261 "starting with the character sequence @code{#} and @var{chr}.\n"
1262 "@var{proc} will be called with two arguments: the character\n"
1263 "@var{chr} and the port to read further data from. The object\n"
1264 "returned will be the return value of @code{read}. \n"
1265 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1266 )
1267 #define FUNC_NAME s_scm_read_hash_extend
1268 {
1269 SCM this;
1270 SCM prev;
1271
1272 SCM_VALIDATE_CHAR (1, chr);
1273 SCM_ASSERT (scm_is_false (proc)
1274 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1275 proc, SCM_ARG2, FUNC_NAME);
1276
1277 /* Check if chr is already in the alist. */
1278 this = *scm_read_hash_procedures;
1279 prev = SCM_BOOL_F;
1280 while (1)
1281 {
1282 if (scm_is_null (this))
1283 {
1284 /* not found, so add it to the beginning. */
1285 if (scm_is_true (proc))
1286 {
1287 *scm_read_hash_procedures =
1288 scm_cons (scm_cons (chr, proc), *scm_read_hash_procedures);
1289 }
1290 break;
1291 }
1292 if (scm_is_eq (chr, SCM_CAAR (this)))
1293 {
1294 /* already in the alist. */
1295 if (scm_is_false (proc))
1296 {
1297 /* remove it. */
1298 if (scm_is_false (prev))
1299 {
1300 *scm_read_hash_procedures =
1301 SCM_CDR (*scm_read_hash_procedures);
1302 }
1303 else
1304 scm_set_cdr_x (prev, SCM_CDR (this));
1305 }
1306 else
1307 {
1308 /* replace it. */
1309 scm_set_cdr_x (SCM_CAR (this), proc);
1310 }
1311 break;
1312 }
1313 prev = this;
1314 this = SCM_CDR (this);
1315 }
1316
1317 return SCM_UNSPECIFIED;
1318 }
1319 #undef FUNC_NAME
1320
1321 /* Recover the read-hash procedure corresponding to char c. */
1322 static SCM
1323 scm_get_hash_procedure (int c)
1324 {
1325 SCM rest = *scm_read_hash_procedures;
1326
1327 while (1)
1328 {
1329 if (scm_is_null (rest))
1330 return SCM_BOOL_F;
1331
1332 if (SCM_CHAR (SCM_CAAR (rest)) == c)
1333 return SCM_CDAR (rest);
1334
1335 rest = SCM_CDR (rest);
1336 }
1337 }
1338
1339 void
1340 scm_init_read ()
1341 {
1342 scm_read_hash_procedures =
1343 SCM_VARIABLE_LOC (scm_c_define ("read-hash-procedures", SCM_EOL));
1344
1345 scm_init_opts (scm_read_options, scm_read_opts);
1346 #include "libguile/read.x"
1347 }
1348
1349 /*
1350 Local Variables:
1351 c-file-style: "gnu"
1352 End:
1353 */