Modify read and print of combining characters
[bpt/guile.git] / libguile / read.c
1 /* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009 Free Software
2 * Foundation, Inc.
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
18 */
19
20
21 \f
22
23 #ifdef HAVE_CONFIG_H
24 # include <config.h>
25 #endif
26
27 #include <stdio.h>
28 #include <ctype.h>
29 #include <string.h>
30 #include <unistd.h>
31 #include <unicase.h>
32
33 #include "libguile/_scm.h"
34 #include "libguile/bytevectors.h"
35 #include "libguile/chars.h"
36 #include "libguile/eval.h"
37 #include "libguile/arrays.h"
38 #include "libguile/bitvectors.h"
39 #include "libguile/keywords.h"
40 #include "libguile/alist.h"
41 #include "libguile/srcprop.h"
42 #include "libguile/hashtab.h"
43 #include "libguile/hash.h"
44 #include "libguile/ports.h"
45 #include "libguile/root.h"
46 #include "libguile/strings.h"
47 #include "libguile/strports.h"
48 #include "libguile/vectors.h"
49 #include "libguile/validate.h"
50 #include "libguile/srfi-4.h"
51 #include "libguile/srfi-13.h"
52
53 #include "libguile/read.h"
54 #include "libguile/private-options.h"
55
56
57 \f
58
59 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
60 SCM_SYMBOL (scm_keyword_prefix, "prefix");
61 SCM_SYMBOL (scm_keyword_postfix, "postfix");
62
63 scm_t_option scm_read_opts[] = {
64 { SCM_OPTION_BOOLEAN, "copy", 0,
65 "Copy source code expressions." },
66 { SCM_OPTION_BOOLEAN, "positions", 0,
67 "Record positions of source code expressions." },
68 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
69 "Convert symbols to lower case."},
70 { SCM_OPTION_SCM, "keywords", SCM_UNPACK (SCM_BOOL_F),
71 "Style of keyword recognition: #f, 'prefix or 'postfix."},
72 #if SCM_ENABLE_ELISP
73 { SCM_OPTION_BOOLEAN, "elisp-vectors", 0,
74 "Support Elisp vector syntax, namely `[...]'."},
75 { SCM_OPTION_BOOLEAN, "elisp-strings", 0,
76 "Support `\\(' and `\\)' in strings."},
77 #endif
78 { 0, },
79 };
80
81 /*
82 Give meaningful error messages for errors
83
84 We use the format
85
86 FILE:LINE:COL: MESSAGE
87 This happened in ....
88
89 This is not standard GNU format, but the test-suite likes the real
90 message to be in front.
91
92 */
93
94
95 void
96 scm_i_input_error (char const *function,
97 SCM port, const char *message, SCM arg)
98 {
99 SCM fn = (scm_is_string (SCM_FILENAME(port))
100 ? SCM_FILENAME(port)
101 : scm_from_locale_string ("#<unknown port>"));
102
103 SCM string_port = scm_open_output_string ();
104 SCM string = SCM_EOL;
105 scm_simple_format (string_port,
106 scm_from_locale_string ("~A:~S:~S: ~A"),
107 scm_list_4 (fn,
108 scm_from_long (SCM_LINUM (port) + 1),
109 scm_from_int (SCM_COL (port) + 1),
110 scm_from_locale_string (message)));
111
112 string = scm_get_output_string (string_port);
113 scm_close_output_port (string_port);
114 scm_error_scm (scm_from_locale_symbol ("read-error"),
115 function? scm_from_locale_string (function) : SCM_BOOL_F,
116 string,
117 arg,
118 SCM_BOOL_F);
119 }
120
121
122 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
123 (SCM setting),
124 "Option interface for the read options. Instead of using\n"
125 "this procedure directly, use the procedures @code{read-enable},\n"
126 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
127 #define FUNC_NAME s_scm_read_options
128 {
129 SCM ans = scm_options (setting,
130 scm_read_opts,
131 FUNC_NAME);
132 if (SCM_COPY_SOURCE_P)
133 SCM_RECORD_POSITIONS_P = 1;
134 return ans;
135 }
136 #undef FUNC_NAME
137
138 /* An association list mapping extra hash characters to procedures. */
139 static SCM *scm_read_hash_procedures;
140
141
142 \f
143 /* Token readers. */
144
145
146 /* Size of the C buffer used to read symbols and numbers. */
147 #define READER_BUFFER_SIZE 128
148
149 /* Size of the C buffer used to read strings. */
150 #define READER_STRING_BUFFER_SIZE 512
151
152 /* The maximum size of Scheme character names. */
153 #define READER_CHAR_NAME_MAX_SIZE 50
154
155
156 /* `isblank' is only in C99. */
157 #define CHAR_IS_BLANK_(_chr) \
158 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
159 || ((_chr) == '\f') || ((_chr) == '\r'))
160
161 #ifdef MSDOS
162 # define CHAR_IS_BLANK(_chr) \
163 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
164 #else
165 # define CHAR_IS_BLANK CHAR_IS_BLANK_
166 #endif
167
168
169 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
170 structure''). */
171 #define CHAR_IS_R5RS_DELIMITER(c) \
172 (CHAR_IS_BLANK (c) \
173 || (c == ')') || (c == '(') || (c == ';') || (c == '"'))
174
175 #define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
176
177 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
178 Structure''. */
179 #define CHAR_IS_EXPONENT_MARKER(_chr) \
180 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
181 || ((_chr) == 'd') || ((_chr) == 'l'))
182
183 /* Read an SCSH block comment. */
184 static inline SCM scm_read_scsh_block_comment (int chr, SCM port);
185 static SCM scm_read_commented_expression (int chr, SCM port);
186
187 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Return
188 zero if the whole token fits in BUF, non-zero otherwise. */
189 static inline int
190 read_token (SCM port, SCM buf, size_t *read)
191 {
192 scm_t_wchar chr;
193 *read = 0;
194
195 buf = scm_i_string_start_writing (buf);
196 while (*read < scm_i_string_length (buf))
197 {
198 chr = scm_getc (port);
199
200 if (chr == EOF)
201 {
202 scm_i_string_stop_writing ();
203 return 0;
204 }
205
206 chr = (SCM_CASE_INSENSITIVE_P ? uc_tolower (chr) : chr);
207
208 if (CHAR_IS_DELIMITER (chr))
209 {
210 scm_i_string_stop_writing ();
211 scm_ungetc (chr, port);
212 return 0;
213 }
214
215 scm_i_string_set_x (buf, *read, chr);
216 (*read)++;
217 }
218 scm_i_string_stop_writing ();
219
220 return 1;
221 }
222
223 static SCM
224 read_complete_token (SCM port, size_t *read)
225 {
226 SCM buffer, str = SCM_EOL;
227 size_t len;
228 int overflow;
229
230 buffer = scm_i_make_string (READER_BUFFER_SIZE, NULL);
231 overflow = read_token (port, buffer, read);
232 if (!overflow)
233 return scm_i_substring (buffer, 0, *read);
234
235 str = scm_string_copy (buffer);
236 do
237 {
238 overflow = read_token (port, buffer, &len);
239 str = scm_string_append (scm_list_2 (str, buffer));
240 *read += len;
241 }
242 while (overflow);
243
244 return scm_i_substring (str, 0, *read);
245 }
246
247 /* Skip whitespace from PORT and return the first non-whitespace character
248 read. Raise an error on end-of-file. */
249 static int
250 flush_ws (SCM port, const char *eoferr)
251 {
252 register scm_t_wchar c;
253 while (1)
254 switch (c = scm_getc (port))
255 {
256 case EOF:
257 goteof:
258 if (eoferr)
259 {
260 scm_i_input_error (eoferr,
261 port,
262 "end of file",
263 SCM_EOL);
264 }
265 return c;
266
267 case ';':
268 lp:
269 switch (c = scm_getc (port))
270 {
271 case EOF:
272 goto goteof;
273 default:
274 goto lp;
275 case SCM_LINE_INCREMENTORS:
276 break;
277 }
278 break;
279
280 case '#':
281 switch (c = scm_getc (port))
282 {
283 case EOF:
284 eoferr = "read_sharp";
285 goto goteof;
286 case '!':
287 scm_read_scsh_block_comment (c, port);
288 break;
289 case ';':
290 scm_read_commented_expression (c, port);
291 break;
292 default:
293 scm_ungetc (c, port);
294 return '#';
295 }
296 break;
297
298 case SCM_LINE_INCREMENTORS:
299 case SCM_SINGLE_SPACES:
300 case '\t':
301 break;
302
303 default:
304 return c;
305 }
306
307 return 0;
308 }
309
310
311 \f
312 /* Token readers. */
313
314 static SCM scm_read_expression (SCM port);
315 static SCM scm_read_sharp (int chr, SCM port);
316 static SCM scm_get_hash_procedure (int c);
317 static SCM recsexpr (SCM obj, long line, int column, SCM filename);
318
319
320 static SCM
321 scm_read_sexp (scm_t_wchar chr, SCM port)
322 #define FUNC_NAME "scm_i_lreadparen"
323 {
324 register int c;
325 register SCM tmp;
326 register SCM tl, ans = SCM_EOL;
327 SCM tl2 = SCM_EOL, ans2 = SCM_EOL, copy = SCM_BOOL_F;
328 static const int terminating_char = ')';
329
330 /* Need to capture line and column numbers here. */
331 long line = SCM_LINUM (port);
332 int column = SCM_COL (port) - 1;
333
334
335 c = flush_ws (port, FUNC_NAME);
336 if (terminating_char == c)
337 return SCM_EOL;
338
339 scm_ungetc (c, port);
340 if (scm_is_eq (scm_sym_dot,
341 (tmp = scm_read_expression (port))))
342 {
343 ans = scm_read_expression (port);
344 if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
345 scm_i_input_error (FUNC_NAME, port, "missing close paren",
346 SCM_EOL);
347 return ans;
348 }
349
350 /* Build the head of the list structure. */
351 ans = tl = scm_cons (tmp, SCM_EOL);
352
353 if (SCM_COPY_SOURCE_P)
354 ans2 = tl2 = scm_cons (scm_is_pair (tmp)
355 ? copy
356 : tmp,
357 SCM_EOL);
358
359 while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
360 {
361 SCM new_tail;
362
363 scm_ungetc (c, port);
364 if (scm_is_eq (scm_sym_dot,
365 (tmp = scm_read_expression (port))))
366 {
367 SCM_SETCDR (tl, tmp = scm_read_expression (port));
368
369 if (SCM_COPY_SOURCE_P)
370 SCM_SETCDR (tl2, scm_cons (scm_is_pair (tmp) ? copy : tmp,
371 SCM_EOL));
372
373 c = flush_ws (port, FUNC_NAME);
374 if (terminating_char != c)
375 scm_i_input_error (FUNC_NAME, port,
376 "in pair: missing close paren", SCM_EOL);
377 goto exit;
378 }
379
380 new_tail = scm_cons (tmp, SCM_EOL);
381 SCM_SETCDR (tl, new_tail);
382 tl = new_tail;
383
384 if (SCM_COPY_SOURCE_P)
385 {
386 SCM new_tail2 = scm_cons (scm_is_pair (tmp)
387 ? copy
388 : tmp, SCM_EOL);
389 SCM_SETCDR (tl2, new_tail2);
390 tl2 = new_tail2;
391 }
392 }
393
394 exit:
395 if (SCM_RECORD_POSITIONS_P)
396 scm_whash_insert (scm_source_whash,
397 ans,
398 scm_make_srcprops (line, column,
399 SCM_FILENAME (port),
400 SCM_COPY_SOURCE_P
401 ? ans2
402 : SCM_UNDEFINED,
403 SCM_EOL));
404 return ans;
405 }
406 #undef FUNC_NAME
407
408 static SCM
409 scm_read_string (int chr, SCM port)
410 #define FUNC_NAME "scm_lreadr"
411 {
412 /* For strings smaller than C_STR, this function creates only one Scheme
413 object (the string returned). */
414
415 SCM str = SCM_BOOL_F;
416 unsigned c_str_len = 0;
417 scm_t_wchar c;
418
419 str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
420 while ('"' != (c = scm_getc (port)))
421 {
422 if (c == EOF)
423 {
424 str_eof:
425 scm_i_input_error (FUNC_NAME, port,
426 "end of file in string constant", SCM_EOL);
427 }
428
429 if (c_str_len + 1 >= scm_i_string_length (str))
430 {
431 SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
432
433 str = scm_string_append (scm_list_2 (str, addy));
434 }
435
436 if (c == '\\')
437 {
438 switch (c = scm_getc (port))
439 {
440 case EOF:
441 goto str_eof;
442 case '"':
443 case '\\':
444 break;
445 #if SCM_ENABLE_ELISP
446 case '(':
447 case ')':
448 if (SCM_ESCAPED_PARENS_P)
449 break;
450 goto bad_escaped;
451 #endif
452 case '\n':
453 continue;
454 case '0':
455 c = '\0';
456 break;
457 case 'f':
458 c = '\f';
459 break;
460 case 'n':
461 c = '\n';
462 break;
463 case 'r':
464 c = '\r';
465 break;
466 case 't':
467 c = '\t';
468 break;
469 case 'a':
470 c = '\007';
471 break;
472 case 'v':
473 c = '\v';
474 break;
475 case 'x':
476 {
477 scm_t_wchar a, b;
478 a = scm_getc (port);
479 if (a == EOF)
480 goto str_eof;
481 b = scm_getc (port);
482 if (b == EOF)
483 goto str_eof;
484 if ('0' <= a && a <= '9')
485 a -= '0';
486 else if ('A' <= a && a <= 'F')
487 a = a - 'A' + 10;
488 else if ('a' <= a && a <= 'f')
489 a = a - 'a' + 10;
490 else
491 {
492 c = a;
493 goto bad_escaped;
494 }
495 if ('0' <= b && b <= '9')
496 b -= '0';
497 else if ('A' <= b && b <= 'F')
498 b = b - 'A' + 10;
499 else if ('a' <= b && b <= 'f')
500 b = b - 'a' + 10;
501 else
502 {
503 c = b;
504 goto bad_escaped;
505 }
506 c = a * 16 + b;
507 break;
508 }
509 case 'u':
510 {
511 scm_t_wchar a;
512 int i;
513 c = 0;
514 for (i = 0; i < 4; i++)
515 {
516 a = scm_getc (port);
517 if (a == EOF)
518 goto str_eof;
519 if ('0' <= a && a <= '9')
520 a -= '0';
521 else if ('A' <= a && a <= 'F')
522 a = a - 'A' + 10;
523 else if ('a' <= a && a <= 'f')
524 a = a - 'a' + 10;
525 else
526 {
527 c = a;
528 goto bad_escaped;
529 }
530 c = c * 16 + a;
531 }
532 break;
533 }
534 case 'U':
535 {
536 scm_t_wchar a;
537 int i;
538 c = 0;
539 for (i = 0; i < 6; i++)
540 {
541 a = scm_getc (port);
542 if (a == EOF)
543 goto str_eof;
544 if ('0' <= a && a <= '9')
545 a -= '0';
546 else if ('A' <= a && a <= 'F')
547 a = a - 'A' + 10;
548 else if ('a' <= a && a <= 'f')
549 a = a - 'a' + 10;
550 else
551 {
552 c = a;
553 goto bad_escaped;
554 }
555 c = c * 16 + a;
556 }
557 break;
558 }
559 default:
560 bad_escaped:
561 scm_i_input_error (FUNC_NAME, port,
562 "illegal character in escape sequence: ~S",
563 scm_list_1 (SCM_MAKE_CHAR (c)));
564 }
565 }
566 str = scm_i_string_start_writing (str);
567 scm_i_string_set_x (str, c_str_len++, c);
568 scm_i_string_stop_writing ();
569 }
570
571 if (c_str_len > 0)
572 {
573 return scm_i_substring_copy (str, 0, c_str_len);
574 }
575
576 return scm_nullstr;
577 }
578 #undef FUNC_NAME
579
580
581 static SCM
582 scm_read_number (scm_t_wchar chr, SCM port)
583 {
584 SCM result;
585 SCM buffer;
586 size_t read;
587
588 scm_ungetc (chr, port);
589 buffer = read_complete_token (port, &read);
590 result = scm_string_to_number (buffer, SCM_UNDEFINED);
591 if (!scm_is_true (result))
592 /* Return a symbol instead of a number. */
593 result = scm_string_to_symbol (buffer);
594
595 return result;
596 }
597
598 static SCM
599 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
600 {
601 SCM result;
602 int ends_with_colon = 0;
603 SCM buffer;
604 size_t read = 0;
605 int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
606
607 scm_ungetc (chr, port);
608 buffer = read_complete_token (port, &read);
609 if (read > 0)
610 ends_with_colon = scm_i_string_ref (buffer, read - 1) == ':';
611
612 if (postfix && ends_with_colon && (read > 1))
613 result = scm_symbol_to_keyword (scm_string_to_symbol (scm_i_substring (buffer, 0, read - 1)));
614 else
615 result = scm_string_to_symbol (buffer);
616
617 return result;
618 }
619
620 static SCM
621 scm_read_number_and_radix (scm_t_wchar chr, SCM port)
622 #define FUNC_NAME "scm_lreadr"
623 {
624 SCM result;
625 size_t read;
626 SCM buffer = scm_i_make_string (READER_BUFFER_SIZE, NULL);
627 unsigned int radix;
628
629 switch (chr)
630 {
631 case 'B':
632 case 'b':
633 radix = 2;
634 break;
635
636 case 'o':
637 case 'O':
638 radix = 8;
639 break;
640
641 case 'd':
642 case 'D':
643 radix = 10;
644 break;
645
646 case 'x':
647 case 'X':
648 radix = 16;
649 break;
650
651 default:
652 scm_ungetc (chr, port);
653 scm_ungetc ('#', port);
654 radix = 10;
655 }
656
657 buffer = read_complete_token (port, &read);
658 result = scm_string_to_number (buffer, scm_from_uint (radix));
659
660 if (scm_is_true (result))
661 return result;
662
663 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
664
665 return SCM_BOOL_F;
666 }
667 #undef FUNC_NAME
668
669 static SCM
670 scm_read_quote (int chr, SCM port)
671 {
672 SCM p;
673 long line = SCM_LINUM (port);
674 int column = SCM_COL (port) - 1;
675
676 switch (chr)
677 {
678 case '`':
679 p = scm_sym_quasiquote;
680 break;
681
682 case '\'':
683 p = scm_sym_quote;
684 break;
685
686 case ',':
687 {
688 scm_t_wchar c;
689
690 c = scm_getc (port);
691 if ('@' == c)
692 p = scm_sym_uq_splicing;
693 else
694 {
695 scm_ungetc (c, port);
696 p = scm_sym_unquote;
697 }
698 break;
699 }
700
701 default:
702 fprintf (stderr, "%s: unhandled quote character (%i)\n",
703 "scm_read_quote", chr);
704 abort ();
705 }
706
707 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
708 if (SCM_RECORD_POSITIONS_P)
709 scm_whash_insert (scm_source_whash, p,
710 scm_make_srcprops (line, column,
711 SCM_FILENAME (port),
712 SCM_COPY_SOURCE_P
713 ? (scm_cons2 (SCM_CAR (p),
714 SCM_CAR (SCM_CDR (p)),
715 SCM_EOL))
716 : SCM_UNDEFINED,
717 SCM_EOL));
718
719
720 return p;
721 }
722
723 SCM_SYMBOL (sym_syntax, "syntax");
724 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
725 SCM_SYMBOL (sym_unsyntax, "unsyntax");
726 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
727
728 static SCM
729 scm_read_syntax (int chr, SCM port)
730 {
731 SCM p;
732 long line = SCM_LINUM (port);
733 int column = SCM_COL (port) - 1;
734
735 switch (chr)
736 {
737 case '`':
738 p = sym_quasisyntax;
739 break;
740
741 case '\'':
742 p = sym_syntax;
743 break;
744
745 case ',':
746 {
747 int c;
748
749 c = scm_getc (port);
750 if ('@' == c)
751 p = sym_unsyntax_splicing;
752 else
753 {
754 scm_ungetc (c, port);
755 p = sym_unsyntax;
756 }
757 break;
758 }
759
760 default:
761 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
762 "scm_read_syntax", chr);
763 abort ();
764 }
765
766 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
767 if (SCM_RECORD_POSITIONS_P)
768 scm_whash_insert (scm_source_whash, p,
769 scm_make_srcprops (line, column,
770 SCM_FILENAME (port),
771 SCM_COPY_SOURCE_P
772 ? (scm_cons2 (SCM_CAR (p),
773 SCM_CAR (SCM_CDR (p)),
774 SCM_EOL))
775 : SCM_UNDEFINED,
776 SCM_EOL));
777
778
779 return p;
780 }
781
782 static inline SCM
783 scm_read_semicolon_comment (int chr, SCM port)
784 {
785 int c;
786
787 /* We use the get_byte here because there is no need to get the
788 locale correct with comment input. This presumes that newline
789 always represents itself no matter what the encoding is. */
790 for (c = scm_get_byte_or_eof (port);
791 (c != EOF) && (c != '\n');
792 c = scm_getc (port));
793
794 return SCM_UNSPECIFIED;
795 }
796
797 \f
798 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
799
800 static SCM
801 scm_read_boolean (int chr, SCM port)
802 {
803 switch (chr)
804 {
805 case 't':
806 case 'T':
807 return SCM_BOOL_T;
808
809 case 'f':
810 case 'F':
811 return SCM_BOOL_F;
812 }
813
814 return SCM_UNSPECIFIED;
815 }
816
817 static SCM
818 scm_read_character (scm_t_wchar chr, SCM port)
819 #define FUNC_NAME "scm_lreadr"
820 {
821 SCM charname = scm_i_make_string (READER_CHAR_NAME_MAX_SIZE, NULL);
822 size_t charname_len;
823 scm_t_wchar cp;
824 int overflow;
825
826 overflow = read_token (port, charname, &charname_len);
827 charname = scm_c_substring (charname, 0, charname_len);
828
829 if (overflow)
830 goto char_error;
831
832 if (charname_len == 0)
833 {
834 chr = scm_getc (port);
835 if (chr == EOF)
836 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
837 "while reading character", SCM_EOL);
838
839 /* CHR must be a token delimiter, like a whitespace. */
840 return (SCM_MAKE_CHAR (chr));
841 }
842
843 if (charname_len == 1)
844 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 0));
845
846 cp = scm_i_string_ref (charname, 0);
847 if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
848 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
849
850 if (cp >= '0' && cp < '8')
851 {
852 /* Dirk:FIXME:: This type of character syntax is not R5RS
853 * compliant. Further, it should be verified that the constant
854 * does only consist of octal digits. */
855 SCM p = scm_string_to_number (charname, scm_from_uint (8));
856 if (SCM_I_INUMP (p))
857 {
858 scm_t_wchar c = SCM_I_INUM (p);
859 if (SCM_IS_UNICODE_CHAR (c))
860 return SCM_MAKE_CHAR (c);
861 else
862 scm_i_input_error (FUNC_NAME, port,
863 "out-of-range octal character escape: ~a",
864 scm_list_1 (charname));
865 }
866 }
867
868 /* The names of characters should never have non-Latin1
869 characters. */
870 if (scm_i_is_narrow_string (charname)
871 || scm_i_try_narrow_string (charname))
872 { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
873 charname_len);
874 if (scm_is_true (ch))
875 return ch;
876 }
877
878 char_error:
879 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
880 scm_list_1 (charname));
881
882 return SCM_UNSPECIFIED;
883 }
884 #undef FUNC_NAME
885
886 static inline SCM
887 scm_read_keyword (int chr, SCM port)
888 {
889 SCM symbol;
890
891 /* Read the symbol that comprises the keyword. Doing this instead of
892 invoking a specific symbol reader function allows `scm_read_keyword ()'
893 to adapt to the delimiters currently valid of symbols.
894
895 XXX: This implementation allows sloppy syntaxes like `#: key'. */
896 symbol = scm_read_expression (port);
897 if (!scm_is_symbol (symbol))
898 scm_i_input_error ("scm_read_keyword", port,
899 "keyword prefix `~a' not followed by a symbol: ~s",
900 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
901
902 return (scm_symbol_to_keyword (symbol));
903 }
904
905 static inline SCM
906 scm_read_vector (int chr, SCM port)
907 {
908 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
909 guarantee that it's going to do what we want. After all, this is an
910 implementation detail of `scm_read_vector ()', not a desirable
911 property. */
912 return (scm_vector (scm_read_sexp (chr, port)));
913 }
914
915 static inline SCM
916 scm_read_srfi4_vector (int chr, SCM port)
917 {
918 return scm_i_read_array (port, chr);
919 }
920
921 static SCM
922 scm_read_bytevector (scm_t_wchar chr, SCM port)
923 {
924 chr = scm_getc (port);
925 if (chr != 'u')
926 goto syntax;
927
928 chr = scm_getc (port);
929 if (chr != '8')
930 goto syntax;
931
932 chr = scm_getc (port);
933 if (chr != '(')
934 goto syntax;
935
936 return scm_u8_list_to_bytevector (scm_read_sexp (chr, port));
937
938 syntax:
939 scm_i_input_error ("read_bytevector", port,
940 "invalid bytevector prefix",
941 SCM_MAKE_CHAR (chr));
942 return SCM_UNSPECIFIED;
943 }
944
945 static SCM
946 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port)
947 {
948 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
949 terribly inefficient but who cares? */
950 SCM s_bits = SCM_EOL;
951
952 for (chr = scm_getc (port);
953 (chr != EOF) && ((chr == '0') || (chr == '1'));
954 chr = scm_getc (port))
955 {
956 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
957 }
958
959 if (chr != EOF)
960 scm_ungetc (chr, port);
961
962 return scm_bitvector (scm_reverse_x (s_bits, SCM_EOL));
963 }
964
965 static inline SCM
966 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
967 {
968 int bang_seen = 0;
969
970 /* We can use the get_byte here because there is no need to get the
971 locale correct when reading comments. This presumes that
972 hash and exclamation points always represent themselves no
973 matter what the source encoding is.*/
974 for (;;)
975 {
976 int c = scm_get_byte_or_eof (port);
977
978 if (c == EOF)
979 scm_i_input_error ("skip_block_comment", port,
980 "unterminated `#! ... !#' comment", SCM_EOL);
981
982 if (c == '!')
983 bang_seen = 1;
984 else if (c == '#' && bang_seen)
985 break;
986 else
987 bang_seen = 0;
988 }
989
990 return SCM_UNSPECIFIED;
991 }
992
993 static SCM
994 scm_read_commented_expression (scm_t_wchar chr, SCM port)
995 {
996 scm_t_wchar c;
997
998 c = flush_ws (port, (char *) NULL);
999 if (EOF == c)
1000 scm_i_input_error ("read_commented_expression", port,
1001 "no expression after #; comment", SCM_EOL);
1002 scm_ungetc (c, port);
1003 scm_read_expression (port);
1004 return SCM_UNSPECIFIED;
1005 }
1006
1007 static SCM
1008 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1009 {
1010 /* Guile's extended symbol read syntax looks like this:
1011
1012 #{This is all a symbol name}#
1013
1014 So here, CHR is expected to be `{'. */
1015 int saw_brace = 0, finished = 0;
1016 size_t len = 0;
1017 SCM buf = scm_i_make_string (1024, NULL);
1018
1019 buf = scm_i_string_start_writing (buf);
1020
1021 while ((chr = scm_getc (port)) != EOF)
1022 {
1023 if (saw_brace)
1024 {
1025 if (chr == '#')
1026 {
1027 finished = 1;
1028 break;
1029 }
1030 else
1031 {
1032 saw_brace = 0;
1033 scm_i_string_set_x (buf, len++, '}');
1034 scm_i_string_set_x (buf, len++, chr);
1035 }
1036 }
1037 else if (chr == '}')
1038 saw_brace = 1;
1039 else
1040 scm_i_string_set_x (buf, len++, chr);
1041
1042 if (len >= scm_i_string_length (buf) - 2)
1043 {
1044 scm_i_string_stop_writing ();
1045 SCM addy = scm_i_make_string (1024, NULL);
1046 buf = scm_string_append (scm_list_2 (buf, addy));
1047 len = 0;
1048 buf = scm_i_string_start_writing (buf);
1049 }
1050
1051 if (finished)
1052 break;
1053 }
1054 scm_i_string_stop_writing ();
1055
1056 return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1057 }
1058
1059
1060 \f
1061 /* Top-level token readers, i.e., dispatchers. */
1062
1063 static SCM
1064 scm_read_sharp_extension (int chr, SCM port)
1065 {
1066 SCM proc;
1067
1068 proc = scm_get_hash_procedure (chr);
1069 if (scm_is_true (scm_procedure_p (proc)))
1070 {
1071 long line = SCM_LINUM (port);
1072 int column = SCM_COL (port) - 2;
1073 SCM got;
1074
1075 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1076 if (!scm_is_eq (got, SCM_UNSPECIFIED))
1077 {
1078 if (SCM_RECORD_POSITIONS_P)
1079 return (recsexpr (got, line, column,
1080 SCM_FILENAME (port)));
1081 else
1082 return got;
1083 }
1084 }
1085
1086 return SCM_UNSPECIFIED;
1087 }
1088
1089 /* The reader for the sharp `#' character. It basically dispatches reads
1090 among the above token readers. */
1091 static SCM
1092 scm_read_sharp (scm_t_wchar chr, SCM port)
1093 #define FUNC_NAME "scm_lreadr"
1094 {
1095 SCM result;
1096
1097 chr = scm_getc (port);
1098
1099 result = scm_read_sharp_extension (chr, port);
1100 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1101 return result;
1102
1103 switch (chr)
1104 {
1105 case '\\':
1106 return (scm_read_character (chr, port));
1107 case '(':
1108 return (scm_read_vector (chr, port));
1109 case 's':
1110 case 'u':
1111 case 'f':
1112 /* This one may return either a boolean or an SRFI-4 vector. */
1113 return (scm_read_srfi4_vector (chr, port));
1114 case 'v':
1115 return (scm_read_bytevector (chr, port));
1116 case '*':
1117 return (scm_read_guile_bit_vector (chr, port));
1118 case 't':
1119 case 'T':
1120 case 'F':
1121 /* This one may return either a boolean or an SRFI-4 vector. */
1122 return (scm_read_boolean (chr, port));
1123 case ':':
1124 return (scm_read_keyword (chr, port));
1125 case '0': case '1': case '2': case '3': case '4':
1126 case '5': case '6': case '7': case '8': case '9':
1127 case '@':
1128 #if SCM_ENABLE_DEPRECATED
1129 /* See below for 'i' and 'e'. */
1130 case 'a':
1131 case 'c':
1132 case 'y':
1133 case 'h':
1134 case 'l':
1135 #endif
1136 return (scm_i_read_array (port, chr));
1137
1138 case 'i':
1139 case 'e':
1140 #if SCM_ENABLE_DEPRECATED
1141 {
1142 /* When next char is '(', it really is an old-style
1143 uniform array. */
1144 scm_t_wchar next_c = scm_getc (port);
1145 if (next_c != EOF)
1146 scm_ungetc (next_c, port);
1147 if (next_c == '(')
1148 return scm_i_read_array (port, chr);
1149 /* Fall through. */
1150 }
1151 #endif
1152 case 'b':
1153 case 'B':
1154 case 'o':
1155 case 'O':
1156 case 'd':
1157 case 'D':
1158 case 'x':
1159 case 'X':
1160 case 'I':
1161 case 'E':
1162 return (scm_read_number_and_radix (chr, port));
1163 case '{':
1164 return (scm_read_extended_symbol (chr, port));
1165 case '!':
1166 return (scm_read_scsh_block_comment (chr, port));
1167 case ';':
1168 return (scm_read_commented_expression (chr, port));
1169 case '`':
1170 case '\'':
1171 case ',':
1172 return (scm_read_syntax (chr, port));
1173 default:
1174 result = scm_read_sharp_extension (chr, port);
1175 if (scm_is_eq (result, SCM_UNSPECIFIED))
1176 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1177 scm_list_1 (SCM_MAKE_CHAR (chr)));
1178 else
1179 return result;
1180 }
1181
1182 return SCM_UNSPECIFIED;
1183 }
1184 #undef FUNC_NAME
1185
1186 static SCM
1187 scm_read_expression (SCM port)
1188 #define FUNC_NAME "scm_read_expression"
1189 {
1190 while (1)
1191 {
1192 register scm_t_wchar chr;
1193
1194 chr = scm_getc (port);
1195
1196 switch (chr)
1197 {
1198 case SCM_WHITE_SPACES:
1199 case SCM_LINE_INCREMENTORS:
1200 break;
1201 case ';':
1202 (void) scm_read_semicolon_comment (chr, port);
1203 break;
1204 case '(':
1205 return (scm_read_sexp (chr, port));
1206 case '"':
1207 return (scm_read_string (chr, port));
1208 case '\'':
1209 case '`':
1210 case ',':
1211 return (scm_read_quote (chr, port));
1212 case '#':
1213 {
1214 SCM result;
1215 result = scm_read_sharp (chr, port);
1216 if (scm_is_eq (result, SCM_UNSPECIFIED))
1217 /* We read a comment or some such. */
1218 break;
1219 else
1220 return result;
1221 }
1222 case ')':
1223 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1224 break;
1225 case EOF:
1226 return SCM_EOF_VAL;
1227 case ':':
1228 if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1229 return scm_symbol_to_keyword (scm_read_expression (port));
1230 /* Fall through. */
1231
1232 default:
1233 {
1234 if (((chr >= '0') && (chr <= '9'))
1235 || (strchr ("+-.", chr)))
1236 return (scm_read_number (chr, port));
1237 else
1238 return (scm_read_mixed_case_symbol (chr, port));
1239 }
1240 }
1241 }
1242 }
1243 #undef FUNC_NAME
1244
1245 \f
1246 /* Actual reader. */
1247
1248 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1249 (SCM port),
1250 "Read an s-expression from the input port @var{port}, or from\n"
1251 "the current input port if @var{port} is not specified.\n"
1252 "Any whitespace before the next token is discarded.")
1253 #define FUNC_NAME s_scm_read
1254 {
1255 int c;
1256
1257 if (SCM_UNBNDP (port))
1258 port = scm_current_input_port ();
1259 SCM_VALIDATE_OPINPORT (1, port);
1260
1261 c = flush_ws (port, (char *) NULL);
1262 if (EOF == c)
1263 return SCM_EOF_VAL;
1264 scm_ungetc (c, port);
1265
1266 return (scm_read_expression (port));
1267 }
1268 #undef FUNC_NAME
1269
1270
1271 \f
1272
1273 /* Used when recording expressions constructed by `scm_read_sharp ()'. */
1274 static SCM
1275 recsexpr (SCM obj, long line, int column, SCM filename)
1276 {
1277 if (!scm_is_pair(obj)) {
1278 return obj;
1279 } else {
1280 SCM tmp = obj, copy;
1281 /* If this sexpr is visible in the read:sharp source, we want to
1282 keep that information, so only record non-constant cons cells
1283 which haven't previously been read by the reader. */
1284 if (scm_is_false (scm_whash_lookup (scm_source_whash, obj)))
1285 {
1286 if (SCM_COPY_SOURCE_P)
1287 {
1288 copy = scm_cons (recsexpr (SCM_CAR (obj), line, column, filename),
1289 SCM_UNDEFINED);
1290 while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1291 {
1292 SCM_SETCDR (copy, scm_cons (recsexpr (SCM_CAR (tmp),
1293 line,
1294 column,
1295 filename),
1296 SCM_UNDEFINED));
1297 copy = SCM_CDR (copy);
1298 }
1299 SCM_SETCDR (copy, tmp);
1300 }
1301 else
1302 {
1303 recsexpr (SCM_CAR (obj), line, column, filename);
1304 while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1305 recsexpr (SCM_CAR (tmp), line, column, filename);
1306 copy = SCM_UNDEFINED;
1307 }
1308 scm_whash_insert (scm_source_whash,
1309 obj,
1310 scm_make_srcprops (line,
1311 column,
1312 filename,
1313 copy,
1314 SCM_EOL));
1315 }
1316 return obj;
1317 }
1318 }
1319
1320 /* Manipulate the read-hash-procedures alist. This could be written in
1321 Scheme, but maybe it will also be used by C code during initialisation. */
1322 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1323 (SCM chr, SCM proc),
1324 "Install the procedure @var{proc} for reading expressions\n"
1325 "starting with the character sequence @code{#} and @var{chr}.\n"
1326 "@var{proc} will be called with two arguments: the character\n"
1327 "@var{chr} and the port to read further data from. The object\n"
1328 "returned will be the return value of @code{read}. \n"
1329 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1330 )
1331 #define FUNC_NAME s_scm_read_hash_extend
1332 {
1333 SCM this;
1334 SCM prev;
1335
1336 SCM_VALIDATE_CHAR (1, chr);
1337 SCM_ASSERT (scm_is_false (proc)
1338 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1339 proc, SCM_ARG2, FUNC_NAME);
1340
1341 /* Check if chr is already in the alist. */
1342 this = *scm_read_hash_procedures;
1343 prev = SCM_BOOL_F;
1344 while (1)
1345 {
1346 if (scm_is_null (this))
1347 {
1348 /* not found, so add it to the beginning. */
1349 if (scm_is_true (proc))
1350 {
1351 *scm_read_hash_procedures =
1352 scm_cons (scm_cons (chr, proc), *scm_read_hash_procedures);
1353 }
1354 break;
1355 }
1356 if (scm_is_eq (chr, SCM_CAAR (this)))
1357 {
1358 /* already in the alist. */
1359 if (scm_is_false (proc))
1360 {
1361 /* remove it. */
1362 if (scm_is_false (prev))
1363 {
1364 *scm_read_hash_procedures =
1365 SCM_CDR (*scm_read_hash_procedures);
1366 }
1367 else
1368 scm_set_cdr_x (prev, SCM_CDR (this));
1369 }
1370 else
1371 {
1372 /* replace it. */
1373 scm_set_cdr_x (SCM_CAR (this), proc);
1374 }
1375 break;
1376 }
1377 prev = this;
1378 this = SCM_CDR (this);
1379 }
1380
1381 return SCM_UNSPECIFIED;
1382 }
1383 #undef FUNC_NAME
1384
1385 /* Recover the read-hash procedure corresponding to char c. */
1386 static SCM
1387 scm_get_hash_procedure (int c)
1388 {
1389 SCM rest = *scm_read_hash_procedures;
1390
1391 while (1)
1392 {
1393 if (scm_is_null (rest))
1394 return SCM_BOOL_F;
1395
1396 if (SCM_CHAR (SCM_CAAR (rest)) == c)
1397 return SCM_CDAR (rest);
1398
1399 rest = SCM_CDR (rest);
1400 }
1401 }
1402
1403 #define SCM_ENCODING_SEARCH_SIZE (500)
1404
1405 /* Search the first few hundred characters of a file for
1406 an emacs-like coding declaration. */
1407 char *
1408 scm_scan_for_encoding (SCM port)
1409 {
1410 char header[SCM_ENCODING_SEARCH_SIZE+1];
1411 size_t bytes_read;
1412 char *encoding = NULL;
1413 int utf8_bom = 0;
1414 char *pos;
1415 int i;
1416 int in_comment;
1417
1418 bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1419 scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1420
1421 if (bytes_read > 3
1422 && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1423 utf8_bom = 1;
1424
1425 /* search past "coding[:=]" */
1426 pos = header;
1427 while (1)
1428 {
1429 if ((pos = strstr(pos, "coding")) == NULL)
1430 return NULL;
1431
1432 pos += strlen("coding");
1433 if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1434 (*pos == ':' || *pos == '='))
1435 {
1436 pos ++;
1437 break;
1438 }
1439 }
1440
1441 /* skip spaces */
1442 while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1443 (*pos == ' ' || *pos == '\t'))
1444 pos ++;
1445
1446 /* grab the next token */
1447 i = 0;
1448 while (pos + i - header <= SCM_ENCODING_SEARCH_SIZE
1449 && (isalnum((int) pos[i]) || pos[i] == '_' || pos[i] == '-'
1450 || pos[i] == '.'))
1451 i++;
1452
1453 if (i == 0)
1454 return NULL;
1455
1456 encoding = scm_malloc (i+1);
1457 memcpy (encoding, pos, i);
1458 encoding[i] ='\0';
1459 for (i = 0; i < strlen (encoding); i++)
1460 encoding[i] = toupper ((int) encoding[i]);
1461
1462 /* push backwards to make sure we were in a comment */
1463 in_comment = 0;
1464 while (pos - i - header > 0)
1465 {
1466 if (*(pos - i) == '\n')
1467 {
1468 /* This wasn't in a semicolon comment. Check for a
1469 hash-bang comment. */
1470 char *beg = strstr (header, "#!");
1471 char *end = strstr (header, "!#");
1472 if (beg < pos && pos < end)
1473 in_comment = 1;
1474 break;
1475 }
1476 if (*(pos - i) == ';')
1477 {
1478 in_comment = 1;
1479 break;
1480 }
1481 i ++;
1482 }
1483 if (!in_comment)
1484 {
1485 /* This wasn't in a comment */
1486 free (encoding);
1487 return NULL;
1488 }
1489 if (utf8_bom && strcmp(encoding, "UTF-8"))
1490 scm_misc_error (NULL,
1491 "the port input declares the encoding ~s but is encoded as UTF-8",
1492 scm_list_1 (scm_from_locale_string (encoding)));
1493
1494 return encoding;
1495 }
1496
1497 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1498 (SCM port),
1499 "Scans the port for an EMACS-like character coding declaration\n"
1500 "near the top of the contents of a port with random-acessible contents.\n"
1501 "The coding declaration is of the form\n"
1502 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1503 "\n"
1504 "Returns a string containing the character encoding of the file\n"
1505 "if a declaration was found, or @code{#f} otherwise.\n")
1506 #define FUNC_NAME s_scm_file_encoding
1507 {
1508 char *enc;
1509 SCM s_enc;
1510
1511 enc = scm_scan_for_encoding (port);
1512 if (enc == NULL)
1513 return SCM_BOOL_F;
1514 else
1515 {
1516 s_enc = scm_from_locale_string (enc);
1517 free (enc);
1518 return s_enc;
1519 }
1520
1521 return SCM_BOOL_F;
1522 }
1523 #undef FUNC_NAME
1524
1525 void
1526 scm_init_read ()
1527 {
1528 scm_read_hash_procedures =
1529 SCM_VARIABLE_LOC (scm_c_define ("read-hash-procedures", SCM_EOL));
1530
1531 scm_init_opts (scm_read_options, scm_read_opts);
1532 #include "libguile/read.x"
1533 }
1534
1535 /*
1536 Local Variables:
1537 c-file-style: "gnu"
1538 End:
1539 */