Always cast input to toupper as int
[bpt/guile.git] / libguile / read.c
1 /* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009 Free Software
2 * Foundation, Inc.
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
18 */
19
20
21 \f
22
23 #ifdef HAVE_CONFIG_H
24 # include <config.h>
25 #endif
26
27 #include <stdio.h>
28 #include <ctype.h>
29 #include <string.h>
30 #include <unistd.h>
31 #include <unicase.h>
32
33 #include "libguile/_scm.h"
34 #include "libguile/bytevectors.h"
35 #include "libguile/chars.h"
36 #include "libguile/eval.h"
37 #include "libguile/arrays.h"
38 #include "libguile/bitvectors.h"
39 #include "libguile/keywords.h"
40 #include "libguile/alist.h"
41 #include "libguile/srcprop.h"
42 #include "libguile/hashtab.h"
43 #include "libguile/hash.h"
44 #include "libguile/ports.h"
45 #include "libguile/root.h"
46 #include "libguile/strings.h"
47 #include "libguile/strports.h"
48 #include "libguile/vectors.h"
49 #include "libguile/validate.h"
50 #include "libguile/srfi-4.h"
51 #include "libguile/srfi-13.h"
52
53 #include "libguile/read.h"
54 #include "libguile/private-options.h"
55
56
57 \f
58
59 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
60 SCM_SYMBOL (scm_keyword_prefix, "prefix");
61 SCM_SYMBOL (scm_keyword_postfix, "postfix");
62
63 scm_t_option scm_read_opts[] = {
64 { SCM_OPTION_BOOLEAN, "copy", 0,
65 "Copy source code expressions." },
66 { SCM_OPTION_BOOLEAN, "positions", 0,
67 "Record positions of source code expressions." },
68 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
69 "Convert symbols to lower case."},
70 { SCM_OPTION_SCM, "keywords", SCM_UNPACK (SCM_BOOL_F),
71 "Style of keyword recognition: #f, 'prefix or 'postfix."},
72 #if SCM_ENABLE_ELISP
73 { SCM_OPTION_BOOLEAN, "elisp-vectors", 0,
74 "Support Elisp vector syntax, namely `[...]'."},
75 { SCM_OPTION_BOOLEAN, "elisp-strings", 0,
76 "Support `\\(' and `\\)' in strings."},
77 #endif
78 { 0, },
79 };
80
81 /*
82 Give meaningful error messages for errors
83
84 We use the format
85
86 FILE:LINE:COL: MESSAGE
87 This happened in ....
88
89 This is not standard GNU format, but the test-suite likes the real
90 message to be in front.
91
92 */
93
94
95 void
96 scm_i_input_error (char const *function,
97 SCM port, const char *message, SCM arg)
98 {
99 SCM fn = (scm_is_string (SCM_FILENAME(port))
100 ? SCM_FILENAME(port)
101 : scm_from_locale_string ("#<unknown port>"));
102
103 SCM string_port = scm_open_output_string ();
104 SCM string = SCM_EOL;
105 scm_simple_format (string_port,
106 scm_from_locale_string ("~A:~S:~S: ~A"),
107 scm_list_4 (fn,
108 scm_from_long (SCM_LINUM (port) + 1),
109 scm_from_int (SCM_COL (port) + 1),
110 scm_from_locale_string (message)));
111
112 string = scm_get_output_string (string_port);
113 scm_close_output_port (string_port);
114 scm_error_scm (scm_from_locale_symbol ("read-error"),
115 function? scm_from_locale_string (function) : SCM_BOOL_F,
116 string,
117 arg,
118 SCM_BOOL_F);
119 }
120
121
122 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
123 (SCM setting),
124 "Option interface for the read options. Instead of using\n"
125 "this procedure directly, use the procedures @code{read-enable},\n"
126 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
127 #define FUNC_NAME s_scm_read_options
128 {
129 SCM ans = scm_options (setting,
130 scm_read_opts,
131 FUNC_NAME);
132 if (SCM_COPY_SOURCE_P)
133 SCM_RECORD_POSITIONS_P = 1;
134 return ans;
135 }
136 #undef FUNC_NAME
137
138 /* An association list mapping extra hash characters to procedures. */
139 static SCM *scm_read_hash_procedures;
140
141
142 \f
143 /* Token readers. */
144
145
146 /* Size of the C buffer used to read symbols and numbers. */
147 #define READER_BUFFER_SIZE 128
148
149 /* Size of the C buffer used to read strings. */
150 #define READER_STRING_BUFFER_SIZE 512
151
152 /* The maximum size of Scheme character names. */
153 #define READER_CHAR_NAME_MAX_SIZE 50
154
155
156 /* `isblank' is only in C99. */
157 #define CHAR_IS_BLANK_(_chr) \
158 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
159 || ((_chr) == '\f') || ((_chr) == '\r'))
160
161 #ifdef MSDOS
162 # define CHAR_IS_BLANK(_chr) \
163 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
164 #else
165 # define CHAR_IS_BLANK CHAR_IS_BLANK_
166 #endif
167
168
169 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
170 structure''). */
171 #define CHAR_IS_R5RS_DELIMITER(c) \
172 (CHAR_IS_BLANK (c) \
173 || (c == ')') || (c == '(') || (c == ';') || (c == '"'))
174
175 #define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
176
177 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
178 Structure''. */
179 #define CHAR_IS_EXPONENT_MARKER(_chr) \
180 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
181 || ((_chr) == 'd') || ((_chr) == 'l'))
182
183 /* Read an SCSH block comment. */
184 static inline SCM scm_read_scsh_block_comment (int chr, SCM port);
185 static SCM scm_read_commented_expression (int chr, SCM port);
186
187 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Return
188 zero if the whole token fits in BUF, non-zero otherwise. */
189 static inline int
190 read_token (SCM port, SCM buf, size_t *read)
191 {
192 scm_t_wchar chr;
193 *read = 0;
194
195 buf = scm_i_string_start_writing (buf);
196 while (*read < scm_i_string_length (buf))
197 {
198 chr = scm_getc (port);
199
200 if (chr == EOF)
201 {
202 scm_i_string_stop_writing ();
203 return 0;
204 }
205
206 chr = (SCM_CASE_INSENSITIVE_P ? uc_tolower (chr) : chr);
207
208 if (CHAR_IS_DELIMITER (chr))
209 {
210 scm_i_string_stop_writing ();
211 scm_ungetc (chr, port);
212 return 0;
213 }
214
215 scm_i_string_set_x (buf, *read, chr);
216 (*read)++;
217 }
218 scm_i_string_stop_writing ();
219
220 return 1;
221 }
222
223 static SCM
224 read_complete_token (SCM port, size_t *read)
225 {
226 SCM buffer, str = SCM_EOL;
227 size_t len;
228 int overflow;
229
230 buffer = scm_i_make_string (READER_BUFFER_SIZE, NULL);
231 overflow = read_token (port, buffer, read);
232 if (!overflow)
233 return scm_i_substring (buffer, 0, *read);
234
235 str = scm_string_copy (buffer);
236 do
237 {
238 overflow = read_token (port, buffer, &len);
239 str = scm_string_append (scm_list_2 (str, buffer));
240 *read += len;
241 }
242 while (overflow);
243
244 return scm_i_substring (str, 0, *read);
245 }
246
247 /* Skip whitespace from PORT and return the first non-whitespace character
248 read. Raise an error on end-of-file. */
249 static int
250 flush_ws (SCM port, const char *eoferr)
251 {
252 register scm_t_wchar c;
253 while (1)
254 switch (c = scm_getc (port))
255 {
256 case EOF:
257 goteof:
258 if (eoferr)
259 {
260 scm_i_input_error (eoferr,
261 port,
262 "end of file",
263 SCM_EOL);
264 }
265 return c;
266
267 case ';':
268 lp:
269 switch (c = scm_getc (port))
270 {
271 case EOF:
272 goto goteof;
273 default:
274 goto lp;
275 case SCM_LINE_INCREMENTORS:
276 break;
277 }
278 break;
279
280 case '#':
281 switch (c = scm_getc (port))
282 {
283 case EOF:
284 eoferr = "read_sharp";
285 goto goteof;
286 case '!':
287 scm_read_scsh_block_comment (c, port);
288 break;
289 case ';':
290 scm_read_commented_expression (c, port);
291 break;
292 default:
293 scm_ungetc (c, port);
294 return '#';
295 }
296 break;
297
298 case SCM_LINE_INCREMENTORS:
299 case SCM_SINGLE_SPACES:
300 case '\t':
301 break;
302
303 default:
304 return c;
305 }
306
307 return 0;
308 }
309
310
311 \f
312 /* Token readers. */
313
314 static SCM scm_read_expression (SCM port);
315 static SCM scm_read_sharp (int chr, SCM port);
316 static SCM scm_get_hash_procedure (int c);
317 static SCM recsexpr (SCM obj, long line, int column, SCM filename);
318
319
320 static SCM
321 scm_read_sexp (scm_t_wchar chr, SCM port)
322 #define FUNC_NAME "scm_i_lreadparen"
323 {
324 register int c;
325 register SCM tmp;
326 register SCM tl, ans = SCM_EOL;
327 SCM tl2 = SCM_EOL, ans2 = SCM_EOL, copy = SCM_BOOL_F;
328 static const int terminating_char = ')';
329
330 /* Need to capture line and column numbers here. */
331 long line = SCM_LINUM (port);
332 int column = SCM_COL (port) - 1;
333
334
335 c = flush_ws (port, FUNC_NAME);
336 if (terminating_char == c)
337 return SCM_EOL;
338
339 scm_ungetc (c, port);
340 if (scm_is_eq (scm_sym_dot,
341 (tmp = scm_read_expression (port))))
342 {
343 ans = scm_read_expression (port);
344 if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
345 scm_i_input_error (FUNC_NAME, port, "missing close paren",
346 SCM_EOL);
347 return ans;
348 }
349
350 /* Build the head of the list structure. */
351 ans = tl = scm_cons (tmp, SCM_EOL);
352
353 if (SCM_COPY_SOURCE_P)
354 ans2 = tl2 = scm_cons (scm_is_pair (tmp)
355 ? copy
356 : tmp,
357 SCM_EOL);
358
359 while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
360 {
361 SCM new_tail;
362
363 scm_ungetc (c, port);
364 if (scm_is_eq (scm_sym_dot,
365 (tmp = scm_read_expression (port))))
366 {
367 SCM_SETCDR (tl, tmp = scm_read_expression (port));
368
369 if (SCM_COPY_SOURCE_P)
370 SCM_SETCDR (tl2, scm_cons (scm_is_pair (tmp) ? copy : tmp,
371 SCM_EOL));
372
373 c = flush_ws (port, FUNC_NAME);
374 if (terminating_char != c)
375 scm_i_input_error (FUNC_NAME, port,
376 "in pair: missing close paren", SCM_EOL);
377 goto exit;
378 }
379
380 new_tail = scm_cons (tmp, SCM_EOL);
381 SCM_SETCDR (tl, new_tail);
382 tl = new_tail;
383
384 if (SCM_COPY_SOURCE_P)
385 {
386 SCM new_tail2 = scm_cons (scm_is_pair (tmp)
387 ? copy
388 : tmp, SCM_EOL);
389 SCM_SETCDR (tl2, new_tail2);
390 tl2 = new_tail2;
391 }
392 }
393
394 exit:
395 if (SCM_RECORD_POSITIONS_P)
396 scm_whash_insert (scm_source_whash,
397 ans,
398 scm_make_srcprops (line, column,
399 SCM_FILENAME (port),
400 SCM_COPY_SOURCE_P
401 ? ans2
402 : SCM_UNDEFINED,
403 SCM_EOL));
404 return ans;
405 }
406 #undef FUNC_NAME
407
408 static SCM
409 scm_read_string (int chr, SCM port)
410 #define FUNC_NAME "scm_lreadr"
411 {
412 /* For strings smaller than C_STR, this function creates only one Scheme
413 object (the string returned). */
414
415 SCM str = SCM_BOOL_F;
416 unsigned c_str_len = 0;
417 scm_t_wchar c;
418
419 str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
420 while ('"' != (c = scm_getc (port)))
421 {
422 if (c == EOF)
423 {
424 str_eof:
425 scm_i_input_error (FUNC_NAME, port,
426 "end of file in string constant", SCM_EOL);
427 }
428
429 if (c_str_len + 1 >= scm_i_string_length (str))
430 {
431 SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
432
433 str = scm_string_append (scm_list_2 (str, addy));
434 }
435
436 if (c == '\\')
437 {
438 switch (c = scm_getc (port))
439 {
440 case EOF:
441 goto str_eof;
442 case '"':
443 case '\\':
444 break;
445 #if SCM_ENABLE_ELISP
446 case '(':
447 case ')':
448 if (SCM_ESCAPED_PARENS_P)
449 break;
450 goto bad_escaped;
451 #endif
452 case '\n':
453 continue;
454 case '0':
455 c = '\0';
456 break;
457 case 'f':
458 c = '\f';
459 break;
460 case 'n':
461 c = '\n';
462 break;
463 case 'r':
464 c = '\r';
465 break;
466 case 't':
467 c = '\t';
468 break;
469 case 'a':
470 c = '\007';
471 break;
472 case 'v':
473 c = '\v';
474 break;
475 case 'x':
476 {
477 scm_t_wchar a, b;
478 a = scm_getc (port);
479 if (a == EOF)
480 goto str_eof;
481 b = scm_getc (port);
482 if (b == EOF)
483 goto str_eof;
484 if ('0' <= a && a <= '9')
485 a -= '0';
486 else if ('A' <= a && a <= 'F')
487 a = a - 'A' + 10;
488 else if ('a' <= a && a <= 'f')
489 a = a - 'a' + 10;
490 else
491 {
492 c = a;
493 goto bad_escaped;
494 }
495 if ('0' <= b && b <= '9')
496 b -= '0';
497 else if ('A' <= b && b <= 'F')
498 b = b - 'A' + 10;
499 else if ('a' <= b && b <= 'f')
500 b = b - 'a' + 10;
501 else
502 {
503 c = b;
504 goto bad_escaped;
505 }
506 c = a * 16 + b;
507 break;
508 }
509 case 'u':
510 {
511 scm_t_wchar a;
512 int i;
513 c = 0;
514 for (i = 0; i < 4; i++)
515 {
516 a = scm_getc (port);
517 if (a == EOF)
518 goto str_eof;
519 if ('0' <= a && a <= '9')
520 a -= '0';
521 else if ('A' <= a && a <= 'F')
522 a = a - 'A' + 10;
523 else if ('a' <= a && a <= 'f')
524 a = a - 'a' + 10;
525 else
526 {
527 c = a;
528 goto bad_escaped;
529 }
530 c = c * 16 + a;
531 }
532 break;
533 }
534 case 'U':
535 {
536 scm_t_wchar a;
537 int i;
538 c = 0;
539 for (i = 0; i < 6; i++)
540 {
541 a = scm_getc (port);
542 if (a == EOF)
543 goto str_eof;
544 if ('0' <= a && a <= '9')
545 a -= '0';
546 else if ('A' <= a && a <= 'F')
547 a = a - 'A' + 10;
548 else if ('a' <= a && a <= 'f')
549 a = a - 'a' + 10;
550 else
551 {
552 c = a;
553 goto bad_escaped;
554 }
555 c = c * 16 + a;
556 }
557 break;
558 }
559 default:
560 bad_escaped:
561 scm_i_input_error (FUNC_NAME, port,
562 "illegal character in escape sequence: ~S",
563 scm_list_1 (SCM_MAKE_CHAR (c)));
564 }
565 }
566 str = scm_i_string_start_writing (str);
567 scm_i_string_set_x (str, c_str_len++, c);
568 scm_i_string_stop_writing ();
569 }
570
571 if (c_str_len > 0)
572 {
573 return scm_i_substring_copy (str, 0, c_str_len);
574 }
575
576 return scm_nullstr;
577 }
578 #undef FUNC_NAME
579
580
581 static SCM
582 scm_read_number (scm_t_wchar chr, SCM port)
583 {
584 SCM result;
585 SCM buffer;
586 size_t read;
587
588 scm_ungetc (chr, port);
589 buffer = read_complete_token (port, &read);
590 result = scm_string_to_number (buffer, SCM_UNDEFINED);
591 if (!scm_is_true (result))
592 /* Return a symbol instead of a number. */
593 result = scm_string_to_symbol (buffer);
594
595 return result;
596 }
597
598 static SCM
599 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
600 {
601 SCM result;
602 int ends_with_colon = 0;
603 SCM buffer;
604 size_t read = 0;
605 int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
606
607 scm_ungetc (chr, port);
608 buffer = read_complete_token (port, &read);
609 if (read > 0)
610 ends_with_colon = scm_i_string_ref (buffer, read - 1) == ':';
611
612 if (postfix && ends_with_colon && (read > 1))
613 result = scm_symbol_to_keyword (scm_string_to_symbol (scm_i_substring (buffer, 0, read - 1)));
614 else
615 result = scm_string_to_symbol (buffer);
616
617 return result;
618 }
619
620 static SCM
621 scm_read_number_and_radix (scm_t_wchar chr, SCM port)
622 #define FUNC_NAME "scm_lreadr"
623 {
624 SCM result;
625 size_t read;
626 SCM buffer = scm_i_make_string (READER_BUFFER_SIZE, NULL);
627 unsigned int radix;
628
629 switch (chr)
630 {
631 case 'B':
632 case 'b':
633 radix = 2;
634 break;
635
636 case 'o':
637 case 'O':
638 radix = 8;
639 break;
640
641 case 'd':
642 case 'D':
643 radix = 10;
644 break;
645
646 case 'x':
647 case 'X':
648 radix = 16;
649 break;
650
651 default:
652 scm_ungetc (chr, port);
653 scm_ungetc ('#', port);
654 radix = 10;
655 }
656
657 buffer = read_complete_token (port, &read);
658 result = scm_string_to_number (buffer, scm_from_uint (radix));
659
660 if (scm_is_true (result))
661 return result;
662
663 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
664
665 return SCM_BOOL_F;
666 }
667 #undef FUNC_NAME
668
669 static SCM
670 scm_read_quote (int chr, SCM port)
671 {
672 SCM p;
673 long line = SCM_LINUM (port);
674 int column = SCM_COL (port) - 1;
675
676 switch (chr)
677 {
678 case '`':
679 p = scm_sym_quasiquote;
680 break;
681
682 case '\'':
683 p = scm_sym_quote;
684 break;
685
686 case ',':
687 {
688 scm_t_wchar c;
689
690 c = scm_getc (port);
691 if ('@' == c)
692 p = scm_sym_uq_splicing;
693 else
694 {
695 scm_ungetc (c, port);
696 p = scm_sym_unquote;
697 }
698 break;
699 }
700
701 default:
702 fprintf (stderr, "%s: unhandled quote character (%i)\n",
703 "scm_read_quote", chr);
704 abort ();
705 }
706
707 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
708 if (SCM_RECORD_POSITIONS_P)
709 scm_whash_insert (scm_source_whash, p,
710 scm_make_srcprops (line, column,
711 SCM_FILENAME (port),
712 SCM_COPY_SOURCE_P
713 ? (scm_cons2 (SCM_CAR (p),
714 SCM_CAR (SCM_CDR (p)),
715 SCM_EOL))
716 : SCM_UNDEFINED,
717 SCM_EOL));
718
719
720 return p;
721 }
722
723 SCM_SYMBOL (sym_syntax, "syntax");
724 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
725 SCM_SYMBOL (sym_unsyntax, "unsyntax");
726 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
727
728 static SCM
729 scm_read_syntax (int chr, SCM port)
730 {
731 SCM p;
732 long line = SCM_LINUM (port);
733 int column = SCM_COL (port) - 1;
734
735 switch (chr)
736 {
737 case '`':
738 p = sym_quasisyntax;
739 break;
740
741 case '\'':
742 p = sym_syntax;
743 break;
744
745 case ',':
746 {
747 int c;
748
749 c = scm_getc (port);
750 if ('@' == c)
751 p = sym_unsyntax_splicing;
752 else
753 {
754 scm_ungetc (c, port);
755 p = sym_unsyntax;
756 }
757 break;
758 }
759
760 default:
761 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
762 "scm_read_syntax", chr);
763 abort ();
764 }
765
766 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
767 if (SCM_RECORD_POSITIONS_P)
768 scm_whash_insert (scm_source_whash, p,
769 scm_make_srcprops (line, column,
770 SCM_FILENAME (port),
771 SCM_COPY_SOURCE_P
772 ? (scm_cons2 (SCM_CAR (p),
773 SCM_CAR (SCM_CDR (p)),
774 SCM_EOL))
775 : SCM_UNDEFINED,
776 SCM_EOL));
777
778
779 return p;
780 }
781
782 static inline SCM
783 scm_read_semicolon_comment (int chr, SCM port)
784 {
785 int c;
786
787 /* We use the get_byte here because there is no need to get the
788 locale correct with comment input. This presumes that newline
789 always represents itself no matter what the encoding is. */
790 for (c = scm_get_byte_or_eof (port);
791 (c != EOF) && (c != '\n');
792 c = scm_getc (port));
793
794 return SCM_UNSPECIFIED;
795 }
796
797 \f
798 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
799
800 static SCM
801 scm_read_boolean (int chr, SCM port)
802 {
803 switch (chr)
804 {
805 case 't':
806 case 'T':
807 return SCM_BOOL_T;
808
809 case 'f':
810 case 'F':
811 return SCM_BOOL_F;
812 }
813
814 return SCM_UNSPECIFIED;
815 }
816
817 static SCM
818 scm_read_character (scm_t_wchar chr, SCM port)
819 #define FUNC_NAME "scm_lreadr"
820 {
821 SCM charname = scm_i_make_string (READER_CHAR_NAME_MAX_SIZE, NULL);
822 size_t charname_len;
823 scm_t_wchar cp;
824 int overflow;
825
826 overflow = read_token (port, charname, &charname_len);
827 charname = scm_c_substring (charname, 0, charname_len);
828
829 if (overflow)
830 goto char_error;
831
832 if (charname_len == 0)
833 {
834 chr = scm_getc (port);
835 if (chr == EOF)
836 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
837 "while reading character", SCM_EOL);
838
839 /* CHR must be a token delimiter, like a whitespace. */
840 return (SCM_MAKE_CHAR (chr));
841 }
842
843 if (charname_len == 1)
844 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 0));
845
846 cp = scm_i_string_ref (charname, 0);
847 if (cp >= '0' && cp < '8')
848 {
849 /* Dirk:FIXME:: This type of character syntax is not R5RS
850 * compliant. Further, it should be verified that the constant
851 * does only consist of octal digits. Finally, it should be
852 * checked whether the resulting fixnum is in the range of
853 * characters. */
854 SCM p = scm_string_to_number (charname, scm_from_uint (8));
855 if (SCM_I_INUMP (p))
856 return SCM_MAKE_CHAR (SCM_I_INUM (p));
857 }
858
859 /* The names of characters should never have non-Latin1
860 characters. */
861 if (scm_i_is_narrow_string (charname)
862 || scm_i_try_narrow_string (charname))
863 { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
864 charname_len);
865 if (scm_is_true (ch))
866 return ch;
867 }
868
869 char_error:
870 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
871 scm_list_1 (charname));
872
873 return SCM_UNSPECIFIED;
874 }
875 #undef FUNC_NAME
876
877 static inline SCM
878 scm_read_keyword (int chr, SCM port)
879 {
880 SCM symbol;
881
882 /* Read the symbol that comprises the keyword. Doing this instead of
883 invoking a specific symbol reader function allows `scm_read_keyword ()'
884 to adapt to the delimiters currently valid of symbols.
885
886 XXX: This implementation allows sloppy syntaxes like `#: key'. */
887 symbol = scm_read_expression (port);
888 if (!scm_is_symbol (symbol))
889 scm_i_input_error ("scm_read_keyword", port,
890 "keyword prefix `~a' not followed by a symbol: ~s",
891 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
892
893 return (scm_symbol_to_keyword (symbol));
894 }
895
896 static inline SCM
897 scm_read_vector (int chr, SCM port)
898 {
899 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
900 guarantee that it's going to do what we want. After all, this is an
901 implementation detail of `scm_read_vector ()', not a desirable
902 property. */
903 return (scm_vector (scm_read_sexp (chr, port)));
904 }
905
906 static inline SCM
907 scm_read_srfi4_vector (int chr, SCM port)
908 {
909 return scm_i_read_array (port, chr);
910 }
911
912 static SCM
913 scm_read_bytevector (scm_t_wchar chr, SCM port)
914 {
915 chr = scm_getc (port);
916 if (chr != 'u')
917 goto syntax;
918
919 chr = scm_getc (port);
920 if (chr != '8')
921 goto syntax;
922
923 chr = scm_getc (port);
924 if (chr != '(')
925 goto syntax;
926
927 return scm_u8_list_to_bytevector (scm_read_sexp (chr, port));
928
929 syntax:
930 scm_i_input_error ("read_bytevector", port,
931 "invalid bytevector prefix",
932 SCM_MAKE_CHAR (chr));
933 return SCM_UNSPECIFIED;
934 }
935
936 static SCM
937 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port)
938 {
939 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
940 terribly inefficient but who cares? */
941 SCM s_bits = SCM_EOL;
942
943 for (chr = scm_getc (port);
944 (chr != EOF) && ((chr == '0') || (chr == '1'));
945 chr = scm_getc (port))
946 {
947 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
948 }
949
950 if (chr != EOF)
951 scm_ungetc (chr, port);
952
953 return scm_bitvector (scm_reverse_x (s_bits, SCM_EOL));
954 }
955
956 static inline SCM
957 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
958 {
959 int bang_seen = 0;
960
961 /* We can use the get_byte here because there is no need to get the
962 locale correct when reading comments. This presumes that
963 hash and exclamation points always represent themselves no
964 matter what the source encoding is.*/
965 for (;;)
966 {
967 int c = scm_get_byte_or_eof (port);
968
969 if (c == EOF)
970 scm_i_input_error ("skip_block_comment", port,
971 "unterminated `#! ... !#' comment", SCM_EOL);
972
973 if (c == '!')
974 bang_seen = 1;
975 else if (c == '#' && bang_seen)
976 break;
977 else
978 bang_seen = 0;
979 }
980
981 return SCM_UNSPECIFIED;
982 }
983
984 static SCM
985 scm_read_commented_expression (scm_t_wchar chr, SCM port)
986 {
987 scm_t_wchar c;
988
989 c = flush_ws (port, (char *) NULL);
990 if (EOF == c)
991 scm_i_input_error ("read_commented_expression", port,
992 "no expression after #; comment", SCM_EOL);
993 scm_ungetc (c, port);
994 scm_read_expression (port);
995 return SCM_UNSPECIFIED;
996 }
997
998 static SCM
999 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1000 {
1001 /* Guile's extended symbol read syntax looks like this:
1002
1003 #{This is all a symbol name}#
1004
1005 So here, CHR is expected to be `{'. */
1006 int saw_brace = 0, finished = 0;
1007 size_t len = 0;
1008 SCM buf = scm_i_make_string (1024, NULL);
1009
1010 buf = scm_i_string_start_writing (buf);
1011
1012 while ((chr = scm_getc (port)) != EOF)
1013 {
1014 if (saw_brace)
1015 {
1016 if (chr == '#')
1017 {
1018 finished = 1;
1019 break;
1020 }
1021 else
1022 {
1023 saw_brace = 0;
1024 scm_i_string_set_x (buf, len++, '}');
1025 scm_i_string_set_x (buf, len++, chr);
1026 }
1027 }
1028 else if (chr == '}')
1029 saw_brace = 1;
1030 else
1031 scm_i_string_set_x (buf, len++, chr);
1032
1033 if (len >= scm_i_string_length (buf) - 2)
1034 {
1035 scm_i_string_stop_writing ();
1036 SCM addy = scm_i_make_string (1024, NULL);
1037 buf = scm_string_append (scm_list_2 (buf, addy));
1038 len = 0;
1039 buf = scm_i_string_start_writing (buf);
1040 }
1041
1042 if (finished)
1043 break;
1044 }
1045 scm_i_string_stop_writing ();
1046
1047 return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1048 }
1049
1050
1051 \f
1052 /* Top-level token readers, i.e., dispatchers. */
1053
1054 static SCM
1055 scm_read_sharp_extension (int chr, SCM port)
1056 {
1057 SCM proc;
1058
1059 proc = scm_get_hash_procedure (chr);
1060 if (scm_is_true (scm_procedure_p (proc)))
1061 {
1062 long line = SCM_LINUM (port);
1063 int column = SCM_COL (port) - 2;
1064 SCM got;
1065
1066 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1067 if (!scm_is_eq (got, SCM_UNSPECIFIED))
1068 {
1069 if (SCM_RECORD_POSITIONS_P)
1070 return (recsexpr (got, line, column,
1071 SCM_FILENAME (port)));
1072 else
1073 return got;
1074 }
1075 }
1076
1077 return SCM_UNSPECIFIED;
1078 }
1079
1080 /* The reader for the sharp `#' character. It basically dispatches reads
1081 among the above token readers. */
1082 static SCM
1083 scm_read_sharp (scm_t_wchar chr, SCM port)
1084 #define FUNC_NAME "scm_lreadr"
1085 {
1086 SCM result;
1087
1088 chr = scm_getc (port);
1089
1090 result = scm_read_sharp_extension (chr, port);
1091 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1092 return result;
1093
1094 switch (chr)
1095 {
1096 case '\\':
1097 return (scm_read_character (chr, port));
1098 case '(':
1099 return (scm_read_vector (chr, port));
1100 case 's':
1101 case 'u':
1102 case 'f':
1103 /* This one may return either a boolean or an SRFI-4 vector. */
1104 return (scm_read_srfi4_vector (chr, port));
1105 case 'v':
1106 return (scm_read_bytevector (chr, port));
1107 case '*':
1108 return (scm_read_guile_bit_vector (chr, port));
1109 case 't':
1110 case 'T':
1111 case 'F':
1112 /* This one may return either a boolean or an SRFI-4 vector. */
1113 return (scm_read_boolean (chr, port));
1114 case ':':
1115 return (scm_read_keyword (chr, port));
1116 case '0': case '1': case '2': case '3': case '4':
1117 case '5': case '6': case '7': case '8': case '9':
1118 case '@':
1119 #if SCM_ENABLE_DEPRECATED
1120 /* See below for 'i' and 'e'. */
1121 case 'a':
1122 case 'c':
1123 case 'y':
1124 case 'h':
1125 case 'l':
1126 #endif
1127 return (scm_i_read_array (port, chr));
1128
1129 case 'i':
1130 case 'e':
1131 #if SCM_ENABLE_DEPRECATED
1132 {
1133 /* When next char is '(', it really is an old-style
1134 uniform array. */
1135 scm_t_wchar next_c = scm_getc (port);
1136 if (next_c != EOF)
1137 scm_ungetc (next_c, port);
1138 if (next_c == '(')
1139 return scm_i_read_array (port, chr);
1140 /* Fall through. */
1141 }
1142 #endif
1143 case 'b':
1144 case 'B':
1145 case 'o':
1146 case 'O':
1147 case 'd':
1148 case 'D':
1149 case 'x':
1150 case 'X':
1151 case 'I':
1152 case 'E':
1153 return (scm_read_number_and_radix (chr, port));
1154 case '{':
1155 return (scm_read_extended_symbol (chr, port));
1156 case '!':
1157 return (scm_read_scsh_block_comment (chr, port));
1158 case ';':
1159 return (scm_read_commented_expression (chr, port));
1160 case '`':
1161 case '\'':
1162 case ',':
1163 return (scm_read_syntax (chr, port));
1164 default:
1165 result = scm_read_sharp_extension (chr, port);
1166 if (scm_is_eq (result, SCM_UNSPECIFIED))
1167 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1168 scm_list_1 (SCM_MAKE_CHAR (chr)));
1169 else
1170 return result;
1171 }
1172
1173 return SCM_UNSPECIFIED;
1174 }
1175 #undef FUNC_NAME
1176
1177 static SCM
1178 scm_read_expression (SCM port)
1179 #define FUNC_NAME "scm_read_expression"
1180 {
1181 while (1)
1182 {
1183 register scm_t_wchar chr;
1184
1185 chr = scm_getc (port);
1186
1187 switch (chr)
1188 {
1189 case SCM_WHITE_SPACES:
1190 case SCM_LINE_INCREMENTORS:
1191 break;
1192 case ';':
1193 (void) scm_read_semicolon_comment (chr, port);
1194 break;
1195 case '(':
1196 return (scm_read_sexp (chr, port));
1197 case '"':
1198 return (scm_read_string (chr, port));
1199 case '\'':
1200 case '`':
1201 case ',':
1202 return (scm_read_quote (chr, port));
1203 case '#':
1204 {
1205 SCM result;
1206 result = scm_read_sharp (chr, port);
1207 if (scm_is_eq (result, SCM_UNSPECIFIED))
1208 /* We read a comment or some such. */
1209 break;
1210 else
1211 return result;
1212 }
1213 case ')':
1214 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1215 break;
1216 case EOF:
1217 return SCM_EOF_VAL;
1218 case ':':
1219 if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1220 return scm_symbol_to_keyword (scm_read_expression (port));
1221 /* Fall through. */
1222
1223 default:
1224 {
1225 if (((chr >= '0') && (chr <= '9'))
1226 || (strchr ("+-.", chr)))
1227 return (scm_read_number (chr, port));
1228 else
1229 return (scm_read_mixed_case_symbol (chr, port));
1230 }
1231 }
1232 }
1233 }
1234 #undef FUNC_NAME
1235
1236 \f
1237 /* Actual reader. */
1238
1239 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1240 (SCM port),
1241 "Read an s-expression from the input port @var{port}, or from\n"
1242 "the current input port if @var{port} is not specified.\n"
1243 "Any whitespace before the next token is discarded.")
1244 #define FUNC_NAME s_scm_read
1245 {
1246 int c;
1247
1248 if (SCM_UNBNDP (port))
1249 port = scm_current_input_port ();
1250 SCM_VALIDATE_OPINPORT (1, port);
1251
1252 c = flush_ws (port, (char *) NULL);
1253 if (EOF == c)
1254 return SCM_EOF_VAL;
1255 scm_ungetc (c, port);
1256
1257 return (scm_read_expression (port));
1258 }
1259 #undef FUNC_NAME
1260
1261
1262 \f
1263
1264 /* Used when recording expressions constructed by `scm_read_sharp ()'. */
1265 static SCM
1266 recsexpr (SCM obj, long line, int column, SCM filename)
1267 {
1268 if (!scm_is_pair(obj)) {
1269 return obj;
1270 } else {
1271 SCM tmp = obj, copy;
1272 /* If this sexpr is visible in the read:sharp source, we want to
1273 keep that information, so only record non-constant cons cells
1274 which haven't previously been read by the reader. */
1275 if (scm_is_false (scm_whash_lookup (scm_source_whash, obj)))
1276 {
1277 if (SCM_COPY_SOURCE_P)
1278 {
1279 copy = scm_cons (recsexpr (SCM_CAR (obj), line, column, filename),
1280 SCM_UNDEFINED);
1281 while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1282 {
1283 SCM_SETCDR (copy, scm_cons (recsexpr (SCM_CAR (tmp),
1284 line,
1285 column,
1286 filename),
1287 SCM_UNDEFINED));
1288 copy = SCM_CDR (copy);
1289 }
1290 SCM_SETCDR (copy, tmp);
1291 }
1292 else
1293 {
1294 recsexpr (SCM_CAR (obj), line, column, filename);
1295 while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1296 recsexpr (SCM_CAR (tmp), line, column, filename);
1297 copy = SCM_UNDEFINED;
1298 }
1299 scm_whash_insert (scm_source_whash,
1300 obj,
1301 scm_make_srcprops (line,
1302 column,
1303 filename,
1304 copy,
1305 SCM_EOL));
1306 }
1307 return obj;
1308 }
1309 }
1310
1311 /* Manipulate the read-hash-procedures alist. This could be written in
1312 Scheme, but maybe it will also be used by C code during initialisation. */
1313 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1314 (SCM chr, SCM proc),
1315 "Install the procedure @var{proc} for reading expressions\n"
1316 "starting with the character sequence @code{#} and @var{chr}.\n"
1317 "@var{proc} will be called with two arguments: the character\n"
1318 "@var{chr} and the port to read further data from. The object\n"
1319 "returned will be the return value of @code{read}. \n"
1320 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1321 )
1322 #define FUNC_NAME s_scm_read_hash_extend
1323 {
1324 SCM this;
1325 SCM prev;
1326
1327 SCM_VALIDATE_CHAR (1, chr);
1328 SCM_ASSERT (scm_is_false (proc)
1329 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1330 proc, SCM_ARG2, FUNC_NAME);
1331
1332 /* Check if chr is already in the alist. */
1333 this = *scm_read_hash_procedures;
1334 prev = SCM_BOOL_F;
1335 while (1)
1336 {
1337 if (scm_is_null (this))
1338 {
1339 /* not found, so add it to the beginning. */
1340 if (scm_is_true (proc))
1341 {
1342 *scm_read_hash_procedures =
1343 scm_cons (scm_cons (chr, proc), *scm_read_hash_procedures);
1344 }
1345 break;
1346 }
1347 if (scm_is_eq (chr, SCM_CAAR (this)))
1348 {
1349 /* already in the alist. */
1350 if (scm_is_false (proc))
1351 {
1352 /* remove it. */
1353 if (scm_is_false (prev))
1354 {
1355 *scm_read_hash_procedures =
1356 SCM_CDR (*scm_read_hash_procedures);
1357 }
1358 else
1359 scm_set_cdr_x (prev, SCM_CDR (this));
1360 }
1361 else
1362 {
1363 /* replace it. */
1364 scm_set_cdr_x (SCM_CAR (this), proc);
1365 }
1366 break;
1367 }
1368 prev = this;
1369 this = SCM_CDR (this);
1370 }
1371
1372 return SCM_UNSPECIFIED;
1373 }
1374 #undef FUNC_NAME
1375
1376 /* Recover the read-hash procedure corresponding to char c. */
1377 static SCM
1378 scm_get_hash_procedure (int c)
1379 {
1380 SCM rest = *scm_read_hash_procedures;
1381
1382 while (1)
1383 {
1384 if (scm_is_null (rest))
1385 return SCM_BOOL_F;
1386
1387 if (SCM_CHAR (SCM_CAAR (rest)) == c)
1388 return SCM_CDAR (rest);
1389
1390 rest = SCM_CDR (rest);
1391 }
1392 }
1393
1394 #define SCM_ENCODING_SEARCH_SIZE (500)
1395
1396 /* Search the first few hundred characters of a file for
1397 an emacs-like coding declaration. */
1398 char *
1399 scm_scan_for_encoding (SCM port)
1400 {
1401 char header[SCM_ENCODING_SEARCH_SIZE+1];
1402 size_t bytes_read;
1403 char *encoding = NULL;
1404 int utf8_bom = 0;
1405 char *pos;
1406 int i;
1407 int in_comment;
1408
1409 bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1410 scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1411
1412 if (bytes_read > 3
1413 && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1414 utf8_bom = 1;
1415
1416 /* search past "coding[:=]" */
1417 pos = header;
1418 while (1)
1419 {
1420 if ((pos = strstr(pos, "coding")) == NULL)
1421 return NULL;
1422
1423 pos += strlen("coding");
1424 if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1425 (*pos == ':' || *pos == '='))
1426 {
1427 pos ++;
1428 break;
1429 }
1430 }
1431
1432 /* skip spaces */
1433 while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1434 (*pos == ' ' || *pos == '\t'))
1435 pos ++;
1436
1437 /* grab the next token */
1438 i = 0;
1439 while (pos + i - header <= SCM_ENCODING_SEARCH_SIZE
1440 && (isalnum(pos[i]) || pos[i] == '_' || pos[i] == '-' || pos[i] == '.'))
1441 i++;
1442
1443 if (i == 0)
1444 return NULL;
1445
1446 encoding = scm_malloc (i+1);
1447 memcpy (encoding, pos, i);
1448 encoding[i] ='\0';
1449 for (i = 0; i < strlen (encoding); i++)
1450 encoding[i] = toupper ((int) encoding[i]);
1451
1452 /* push backwards to make sure we were in a comment */
1453 in_comment = 0;
1454 while (pos - i - header > 0)
1455 {
1456 if (*(pos - i) == '\n')
1457 {
1458 /* This wasn't in a semicolon comment. Check for a
1459 hash-bang comment. */
1460 char *beg = strstr (header, "#!");
1461 char *end = strstr (header, "!#");
1462 if (beg < pos && pos < end)
1463 in_comment = 1;
1464 break;
1465 }
1466 if (*(pos - i) == ';')
1467 {
1468 in_comment = 1;
1469 break;
1470 }
1471 i ++;
1472 }
1473 if (!in_comment)
1474 {
1475 /* This wasn't in a comment */
1476 free (encoding);
1477 return NULL;
1478 }
1479 if (utf8_bom && strcmp(encoding, "UTF-8"))
1480 scm_misc_error (NULL,
1481 "the port input declares the encoding ~s but is encoded as UTF-8",
1482 scm_list_1 (scm_from_locale_string (encoding)));
1483
1484 return encoding;
1485 }
1486
1487 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1488 (SCM port),
1489 "Scans the port for an EMACS-like character coding declaration\n"
1490 "near the top of the contents of a port with random-acessible contents.\n"
1491 "The coding declaration is of the form\n"
1492 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1493 "\n"
1494 "Returns a string containing the character encoding of the file\n"
1495 "if a declaration was found, or @code{#f} otherwise.\n")
1496 #define FUNC_NAME s_scm_file_encoding
1497 {
1498 char *enc;
1499 SCM s_enc;
1500
1501 enc = scm_scan_for_encoding (port);
1502 if (enc == NULL)
1503 return SCM_BOOL_F;
1504 else
1505 {
1506 s_enc = scm_from_locale_string (enc);
1507 free (enc);
1508 return s_enc;
1509 }
1510
1511 return SCM_BOOL_F;
1512 }
1513 #undef FUNC_NAME
1514
1515 void
1516 scm_init_read ()
1517 {
1518 scm_read_hash_procedures =
1519 SCM_VARIABLE_LOC (scm_c_define ("read-hash-procedures", SCM_EOL));
1520
1521 scm_init_opts (scm_read_options, scm_read_opts);
1522 #include "libguile/read.x"
1523 }
1524
1525 /*
1526 Local Variables:
1527 c-file-style: "gnu"
1528 End:
1529 */