merge strictness branch from 2.0
[bpt/guile.git] / libguile / read.c
1 /* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
2 * Foundation, Inc.
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
18 */
19
20
21 \f
22
23 #ifdef HAVE_CONFIG_H
24 # include <config.h>
25 #endif
26
27 #include <stdio.h>
28 #include <ctype.h>
29 #include <string.h>
30 #include <unistd.h>
31 #include <unicase.h>
32 #include <unictype.h>
33
34 #include "libguile/_scm.h"
35 #include "libguile/bytevectors.h"
36 #include "libguile/chars.h"
37 #include "libguile/eval.h"
38 #include "libguile/arrays.h"
39 #include "libguile/bitvectors.h"
40 #include "libguile/keywords.h"
41 #include "libguile/alist.h"
42 #include "libguile/srcprop.h"
43 #include "libguile/hashtab.h"
44 #include "libguile/hash.h"
45 #include "libguile/ports.h"
46 #include "libguile/fports.h"
47 #include "libguile/root.h"
48 #include "libguile/strings.h"
49 #include "libguile/strports.h"
50 #include "libguile/vectors.h"
51 #include "libguile/validate.h"
52 #include "libguile/srfi-4.h"
53 #include "libguile/srfi-13.h"
54
55 #include "libguile/read.h"
56 #include "libguile/private-options.h"
57
58
59 \f
60
61 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
62 SCM_SYMBOL (scm_keyword_prefix, "prefix");
63 SCM_SYMBOL (scm_keyword_postfix, "postfix");
64 SCM_SYMBOL (sym_nil, "nil");
65
66 scm_t_option scm_read_opts[] = {
67 { SCM_OPTION_BOOLEAN, "copy", 0,
68 "Copy source code expressions." },
69 { SCM_OPTION_BOOLEAN, "positions", 1,
70 "Record positions of source code expressions." },
71 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
72 "Convert symbols to lower case."},
73 { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
74 "Style of keyword recognition: #f, 'prefix or 'postfix."},
75 { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
76 "Use R6RS variable-length character and string hex escapes."},
77 { SCM_OPTION_BOOLEAN, "square-brackets", 1,
78 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
79 { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
80 "In strings, consume leading whitespace after an escaped end-of-line."},
81 { 0, },
82 };
83
84 /*
85 Give meaningful error messages for errors
86
87 We use the format
88
89 FILE:LINE:COL: MESSAGE
90 This happened in ....
91
92 This is not standard GNU format, but the test-suite likes the real
93 message to be in front.
94
95 */
96
97
98 void
99 scm_i_input_error (char const *function,
100 SCM port, const char *message, SCM arg)
101 {
102 SCM fn = (scm_is_string (SCM_FILENAME(port))
103 ? SCM_FILENAME(port)
104 : scm_from_locale_string ("#<unknown port>"));
105
106 SCM string_port = scm_open_output_string ();
107 SCM string = SCM_EOL;
108 scm_simple_format (string_port,
109 scm_from_locale_string ("~A:~S:~S: ~A"),
110 scm_list_4 (fn,
111 scm_from_long (SCM_LINUM (port) + 1),
112 scm_from_int (SCM_COL (port) + 1),
113 scm_from_locale_string (message)));
114
115 string = scm_get_output_string (string_port);
116 scm_close_output_port (string_port);
117 scm_error_scm (scm_from_latin1_symbol ("read-error"),
118 function? scm_from_locale_string (function) : SCM_BOOL_F,
119 string,
120 arg,
121 SCM_BOOL_F);
122 }
123
124
125 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
126 (SCM setting),
127 "Option interface for the read options. Instead of using\n"
128 "this procedure directly, use the procedures @code{read-enable},\n"
129 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
130 #define FUNC_NAME s_scm_read_options
131 {
132 SCM ans = scm_options (setting,
133 scm_read_opts,
134 FUNC_NAME);
135 if (SCM_COPY_SOURCE_P)
136 SCM_RECORD_POSITIONS_P = 1;
137 return ans;
138 }
139 #undef FUNC_NAME
140
141 /* A fluid referring to an association list mapping extra hash
142 characters to procedures. */
143 static SCM *scm_i_read_hash_procedures;
144
145 static inline SCM
146 scm_i_read_hash_procedures_ref (void)
147 {
148 return scm_fluid_ref (*scm_i_read_hash_procedures);
149 }
150
151 static inline void
152 scm_i_read_hash_procedures_set_x (SCM value)
153 {
154 scm_fluid_set_x (*scm_i_read_hash_procedures, value);
155 }
156
157 \f
158 /* Token readers. */
159
160
161 /* Size of the C buffer used to read symbols and numbers. */
162 #define READER_BUFFER_SIZE 128
163
164 /* Size of the C buffer used to read strings. */
165 #define READER_STRING_BUFFER_SIZE 512
166
167 /* The maximum size of Scheme character names. */
168 #define READER_CHAR_NAME_MAX_SIZE 50
169
170
171 /* `isblank' is only in C99. */
172 #define CHAR_IS_BLANK_(_chr) \
173 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
174 || ((_chr) == '\f') || ((_chr) == '\r'))
175
176 #ifdef MSDOS
177 # define CHAR_IS_BLANK(_chr) \
178 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
179 #else
180 # define CHAR_IS_BLANK CHAR_IS_BLANK_
181 #endif
182
183
184 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
185 structure''). */
186 #define CHAR_IS_R5RS_DELIMITER(c) \
187 (CHAR_IS_BLANK (c) \
188 || (c == ')') || (c == '(') || (c == ';') || (c == '"') \
189 || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
190
191 #define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
192
193 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
194 Structure''. */
195 #define CHAR_IS_EXPONENT_MARKER(_chr) \
196 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
197 || ((_chr) == 'd') || ((_chr) == 'l'))
198
199 /* Read an SCSH block comment. */
200 static inline SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
201 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
202 static SCM scm_read_commented_expression (scm_t_wchar, SCM);
203 static SCM scm_read_shebang (scm_t_wchar, SCM);
204 static SCM scm_get_hash_procedure (int);
205
206 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
207 result in the pre-allocated buffer BUF. Return zero if the whole token has
208 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
209 bytes actually read. */
210 static inline int
211 read_token (SCM port, char *buf, const size_t buf_size, size_t *read)
212 {
213 *read = 0;
214
215 while (*read < buf_size)
216 {
217 int chr;
218
219 chr = scm_get_byte_or_eof (port);
220
221 if (chr == EOF)
222 return 0;
223 else if (CHAR_IS_DELIMITER (chr))
224 {
225 scm_unget_byte (chr, port);
226 return 0;
227 }
228 else
229 {
230 *buf = (char) chr;
231 buf++, (*read)++;
232 }
233 }
234
235 return 1;
236 }
237
238 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
239 result in the pre-allocated buffer BUFFER, if the whole token has fewer than
240 BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
241 caller. Return zero if the token fits in BUFFER, non-zero otherwise. READ
242 will be set the number of bytes actually read. */
243 static int
244 read_complete_token (SCM port, char *buffer, const size_t buffer_size,
245 char **overflow_buffer, size_t *read)
246 {
247 int overflow = 0;
248 size_t bytes_read, overflow_size;
249
250 *overflow_buffer = NULL;
251 overflow_size = 0;
252
253 do
254 {
255 overflow = read_token (port, buffer, buffer_size, &bytes_read);
256 if (bytes_read == 0)
257 break;
258 if (overflow || overflow_size != 0)
259 {
260 if (overflow_size == 0)
261 {
262 *overflow_buffer = scm_malloc (bytes_read);
263 memcpy (*overflow_buffer, buffer, bytes_read);
264 overflow_size = bytes_read;
265 }
266 else
267 {
268 *overflow_buffer = scm_realloc (*overflow_buffer, overflow_size + bytes_read);
269 memcpy (*overflow_buffer + overflow_size, buffer, bytes_read);
270 overflow_size += bytes_read;
271 }
272 }
273 }
274 while (overflow);
275
276 if (overflow_size)
277 *read = overflow_size;
278 else
279 *read = bytes_read;
280
281 return (overflow_size != 0);
282 }
283
284 /* Skip whitespace from PORT and return the first non-whitespace character
285 read. Raise an error on end-of-file. */
286 static int
287 flush_ws (SCM port, const char *eoferr)
288 {
289 register scm_t_wchar c;
290 while (1)
291 switch (c = scm_getc (port))
292 {
293 case EOF:
294 goteof:
295 if (eoferr)
296 {
297 scm_i_input_error (eoferr,
298 port,
299 "end of file",
300 SCM_EOL);
301 }
302 return c;
303
304 case ';':
305 lp:
306 switch (c = scm_getc (port))
307 {
308 case EOF:
309 goto goteof;
310 default:
311 goto lp;
312 case SCM_LINE_INCREMENTORS:
313 break;
314 }
315 break;
316
317 case '#':
318 switch (c = scm_getc (port))
319 {
320 case EOF:
321 eoferr = "read_sharp";
322 goto goteof;
323 case '!':
324 scm_read_shebang (c, port);
325 break;
326 case ';':
327 scm_read_commented_expression (c, port);
328 break;
329 case '|':
330 if (scm_is_false (scm_get_hash_procedure (c)))
331 {
332 scm_read_r6rs_block_comment (c, port);
333 break;
334 }
335 /* fall through */
336 default:
337 scm_ungetc (c, port);
338 return '#';
339 }
340 break;
341
342 case SCM_LINE_INCREMENTORS:
343 case SCM_SINGLE_SPACES:
344 case '\t':
345 break;
346
347 default:
348 return c;
349 }
350
351 return 0;
352 }
353
354
355 \f
356 /* Token readers. */
357
358 static SCM scm_read_expression (SCM port);
359 static SCM scm_read_sharp (int chr, SCM port);
360 static SCM recsexpr (SCM obj, long line, int column, SCM filename);
361
362
363 static SCM
364 scm_read_sexp (scm_t_wchar chr, SCM port)
365 #define FUNC_NAME "scm_i_lreadparen"
366 {
367 register int c;
368 register SCM tmp;
369 register SCM tl, ans = SCM_EOL;
370 SCM tl2 = SCM_EOL, ans2 = SCM_EOL, copy = SCM_BOOL_F;
371 const int terminating_char = ((chr == '[') ? ']' : ')');
372
373 /* Need to capture line and column numbers here. */
374 long line = SCM_LINUM (port);
375 int column = SCM_COL (port) - 1;
376
377
378 c = flush_ws (port, FUNC_NAME);
379 if (terminating_char == c)
380 return SCM_EOL;
381
382 scm_ungetc (c, port);
383 if (scm_is_eq (scm_sym_dot,
384 (tmp = scm_read_expression (port))))
385 {
386 ans = scm_read_expression (port);
387 if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
388 scm_i_input_error (FUNC_NAME, port, "missing close paren",
389 SCM_EOL);
390 return ans;
391 }
392
393 /* Build the head of the list structure. */
394 ans = tl = scm_cons (tmp, SCM_EOL);
395
396 if (SCM_COPY_SOURCE_P)
397 ans2 = tl2 = scm_cons (scm_is_pair (tmp)
398 ? copy
399 : tmp,
400 SCM_EOL);
401
402 while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
403 {
404 SCM new_tail;
405
406 if (c == ')' || (SCM_SQUARE_BRACKETS_P && c == ']'))
407 scm_i_input_error (FUNC_NAME, port,
408 "in pair: mismatched close paren: ~A",
409 scm_list_1 (SCM_MAKE_CHAR (c)));
410
411 scm_ungetc (c, port);
412 tmp = scm_read_expression (port);
413
414 if (scm_is_eq (scm_sym_dot, tmp))
415 {
416 SCM_SETCDR (tl, tmp = scm_read_expression (port));
417
418 if (SCM_COPY_SOURCE_P)
419 SCM_SETCDR (tl2, scm_cons (scm_is_pair (tmp) ? copy : tmp,
420 SCM_EOL));
421
422 c = flush_ws (port, FUNC_NAME);
423 if (terminating_char != c)
424 scm_i_input_error (FUNC_NAME, port,
425 "in pair: missing close paren", SCM_EOL);
426 goto exit;
427 }
428
429 new_tail = scm_cons (tmp, SCM_EOL);
430 SCM_SETCDR (tl, new_tail);
431 tl = new_tail;
432
433 if (SCM_COPY_SOURCE_P)
434 {
435 SCM new_tail2 = scm_cons (scm_is_pair (tmp)
436 ? copy
437 : tmp, SCM_EOL);
438 SCM_SETCDR (tl2, new_tail2);
439 tl2 = new_tail2;
440 }
441 }
442
443 exit:
444 if (SCM_RECORD_POSITIONS_P)
445 scm_hashq_set_x (scm_source_whash,
446 ans,
447 scm_make_srcprops (line, column,
448 SCM_FILENAME (port),
449 SCM_COPY_SOURCE_P
450 ? ans2
451 : SCM_UNDEFINED,
452 SCM_EOL));
453 return ans;
454 }
455 #undef FUNC_NAME
456
457
458 /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
459 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
460 found. */
461 #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
462 do \
463 { \
464 scm_t_wchar a; \
465 size_t i = 0; \
466 c = 0; \
467 while (i < ndigits) \
468 { \
469 a = scm_getc (port); \
470 if (a == EOF) \
471 goto str_eof; \
472 if (terminator \
473 && (a == (scm_t_wchar) terminator) \
474 && (i > 0)) \
475 break; \
476 if ('0' <= a && a <= '9') \
477 a -= '0'; \
478 else if ('A' <= a && a <= 'F') \
479 a = a - 'A' + 10; \
480 else if ('a' <= a && a <= 'f') \
481 a = a - 'a' + 10; \
482 else \
483 { \
484 c = a; \
485 goto bad_escaped; \
486 } \
487 c = c * 16 + a; \
488 i ++; \
489 } \
490 } while (0)
491
492 static void
493 skip_intraline_whitespace (SCM port)
494 {
495 scm_t_wchar c;
496
497 do
498 {
499 c = scm_getc (port);
500 if (c == EOF)
501 return;
502 }
503 while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
504
505 scm_ungetc (c, port);
506 }
507
508 static SCM
509 scm_read_string (int chr, SCM port)
510 #define FUNC_NAME "scm_lreadr"
511 {
512 /* For strings smaller than C_STR, this function creates only one Scheme
513 object (the string returned). */
514
515 SCM str = SCM_BOOL_F;
516 unsigned c_str_len = 0;
517 scm_t_wchar c;
518
519 str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
520 while ('"' != (c = scm_getc (port)))
521 {
522 if (c == EOF)
523 {
524 str_eof:
525 scm_i_input_error (FUNC_NAME, port,
526 "end of file in string constant", SCM_EOL);
527 }
528
529 if (c_str_len + 1 >= scm_i_string_length (str))
530 {
531 SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
532
533 str = scm_string_append (scm_list_2 (str, addy));
534 }
535
536 if (c == '\\')
537 {
538 switch (c = scm_getc (port))
539 {
540 case EOF:
541 goto str_eof;
542 case '"':
543 case '\\':
544 break;
545 case '\n':
546 if (SCM_HUNGRY_EOL_ESCAPES_P)
547 skip_intraline_whitespace (port);
548 continue;
549 case '0':
550 c = '\0';
551 break;
552 case 'f':
553 c = '\f';
554 break;
555 case 'n':
556 c = '\n';
557 break;
558 case 'r':
559 c = '\r';
560 break;
561 case 't':
562 c = '\t';
563 break;
564 case 'a':
565 c = '\007';
566 break;
567 case 'v':
568 c = '\v';
569 break;
570 case 'b':
571 c = '\010';
572 break;
573 case 'x':
574 if (SCM_R6RS_ESCAPES_P)
575 SCM_READ_HEX_ESCAPE (10, ';');
576 else
577 SCM_READ_HEX_ESCAPE (2, '\0');
578 break;
579 case 'u':
580 if (!SCM_R6RS_ESCAPES_P)
581 {
582 SCM_READ_HEX_ESCAPE (4, '\0');
583 break;
584 }
585 case 'U':
586 if (!SCM_R6RS_ESCAPES_P)
587 {
588 SCM_READ_HEX_ESCAPE (6, '\0');
589 break;
590 }
591 default:
592 bad_escaped:
593 scm_i_input_error (FUNC_NAME, port,
594 "illegal character in escape sequence: ~S",
595 scm_list_1 (SCM_MAKE_CHAR (c)));
596 }
597 }
598 str = scm_i_string_start_writing (str);
599 scm_i_string_set_x (str, c_str_len++, c);
600 scm_i_string_stop_writing ();
601 }
602
603 if (c_str_len > 0)
604 {
605 return scm_i_substring_copy (str, 0, c_str_len);
606 }
607
608 return scm_nullstr;
609 }
610 #undef FUNC_NAME
611
612
613 static SCM
614 scm_read_number (scm_t_wchar chr, SCM port)
615 {
616 SCM result, str = SCM_EOL;
617 char buffer[READER_BUFFER_SIZE];
618 char *overflow_buffer = NULL;
619 size_t bytes_read;
620 int overflow;
621 scm_t_port *pt = SCM_PTAB_ENTRY (port);
622
623 scm_ungetc (chr, port);
624 overflow = read_complete_token (port, buffer, sizeof (buffer),
625 &overflow_buffer, &bytes_read);
626
627 if (!overflow)
628 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
629 else
630 str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
631 pt->ilseq_handler);
632
633 result = scm_string_to_number (str, SCM_UNDEFINED);
634 if (!scm_is_true (result))
635 {
636 /* Return a symbol instead of a number */
637 if (SCM_CASE_INSENSITIVE_P)
638 str = scm_string_downcase_x (str);
639 result = scm_string_to_symbol (str);
640 }
641
642 if (overflow)
643 free (overflow_buffer);
644 SCM_COL (port) += scm_i_string_length (str);
645 return result;
646 }
647
648 static SCM
649 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
650 {
651 SCM result;
652 int ends_with_colon = 0;
653 size_t bytes_read;
654 int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
655 int overflow;
656 char buffer[READER_BUFFER_SIZE], *overflow_buffer;
657 scm_t_port *pt = SCM_PTAB_ENTRY (port);
658 SCM str;
659
660 scm_ungetc (chr, port);
661 overflow = read_complete_token (port, buffer, READER_BUFFER_SIZE,
662 &overflow_buffer, &bytes_read);
663 if (bytes_read > 0)
664 {
665 if (!overflow)
666 ends_with_colon = buffer[bytes_read - 1] == ':';
667 else
668 ends_with_colon = overflow_buffer[bytes_read - 1] == ':';
669 }
670
671 if (postfix && ends_with_colon && (bytes_read > 1))
672 {
673 if (!overflow)
674 str = scm_from_stringn (buffer, bytes_read - 1, pt->encoding, pt->ilseq_handler);
675 else
676 str = scm_from_stringn (overflow_buffer, bytes_read - 1, pt->encoding,
677 pt->ilseq_handler);
678
679 if (SCM_CASE_INSENSITIVE_P)
680 str = scm_string_downcase_x (str);
681 result = scm_symbol_to_keyword (scm_string_to_symbol (str));
682 }
683 else
684 {
685 if (!overflow)
686 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
687 else
688 str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
689 pt->ilseq_handler);
690
691 if (SCM_CASE_INSENSITIVE_P)
692 str = scm_string_downcase_x (str);
693 result = scm_string_to_symbol (str);
694 }
695
696 if (overflow)
697 free (overflow_buffer);
698 SCM_COL (port) += scm_i_string_length (str);
699 return result;
700 }
701
702 static SCM
703 scm_read_number_and_radix (scm_t_wchar chr, SCM port)
704 #define FUNC_NAME "scm_lreadr"
705 {
706 SCM result;
707 size_t read;
708 char buffer[READER_BUFFER_SIZE], *overflow_buffer;
709 int overflow;
710 unsigned int radix;
711 SCM str;
712 scm_t_port *pt;
713
714 switch (chr)
715 {
716 case 'B':
717 case 'b':
718 radix = 2;
719 break;
720
721 case 'o':
722 case 'O':
723 radix = 8;
724 break;
725
726 case 'd':
727 case 'D':
728 radix = 10;
729 break;
730
731 case 'x':
732 case 'X':
733 radix = 16;
734 break;
735
736 default:
737 scm_ungetc (chr, port);
738 scm_ungetc ('#', port);
739 radix = 10;
740 }
741
742 overflow = read_complete_token (port, buffer, sizeof (buffer),
743 &overflow_buffer, &read);
744
745 pt = SCM_PTAB_ENTRY (port);
746 if (!overflow)
747 str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
748 else
749 str = scm_from_stringn (overflow_buffer, read, pt->encoding,
750 pt->ilseq_handler);
751
752 result = scm_string_to_number (str, scm_from_uint (radix));
753
754 if (overflow)
755 free (overflow_buffer);
756
757 SCM_COL (port) += scm_i_string_length (str);
758
759 if (scm_is_true (result))
760 return result;
761
762 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
763
764 return SCM_BOOL_F;
765 }
766 #undef FUNC_NAME
767
768 static SCM
769 scm_read_quote (int chr, SCM port)
770 {
771 SCM p;
772 long line = SCM_LINUM (port);
773 int column = SCM_COL (port) - 1;
774
775 switch (chr)
776 {
777 case '`':
778 p = scm_sym_quasiquote;
779 break;
780
781 case '\'':
782 p = scm_sym_quote;
783 break;
784
785 case ',':
786 {
787 scm_t_wchar c;
788
789 c = scm_getc (port);
790 if ('@' == c)
791 p = scm_sym_uq_splicing;
792 else
793 {
794 scm_ungetc (c, port);
795 p = scm_sym_unquote;
796 }
797 break;
798 }
799
800 default:
801 fprintf (stderr, "%s: unhandled quote character (%i)\n",
802 "scm_read_quote", chr);
803 abort ();
804 }
805
806 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
807 if (SCM_RECORD_POSITIONS_P)
808 scm_hashq_set_x (scm_source_whash, p,
809 scm_make_srcprops (line, column,
810 SCM_FILENAME (port),
811 SCM_COPY_SOURCE_P
812 ? (scm_cons2 (SCM_CAR (p),
813 SCM_CAR (SCM_CDR (p)),
814 SCM_EOL))
815 : SCM_UNDEFINED,
816 SCM_EOL));
817
818
819 return p;
820 }
821
822 SCM_SYMBOL (sym_syntax, "syntax");
823 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
824 SCM_SYMBOL (sym_unsyntax, "unsyntax");
825 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
826
827 static SCM
828 scm_read_syntax (int chr, SCM port)
829 {
830 SCM p;
831 long line = SCM_LINUM (port);
832 int column = SCM_COL (port) - 1;
833
834 switch (chr)
835 {
836 case '`':
837 p = sym_quasisyntax;
838 break;
839
840 case '\'':
841 p = sym_syntax;
842 break;
843
844 case ',':
845 {
846 int c;
847
848 c = scm_getc (port);
849 if ('@' == c)
850 p = sym_unsyntax_splicing;
851 else
852 {
853 scm_ungetc (c, port);
854 p = sym_unsyntax;
855 }
856 break;
857 }
858
859 default:
860 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
861 "scm_read_syntax", chr);
862 abort ();
863 }
864
865 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
866 if (SCM_RECORD_POSITIONS_P)
867 scm_hashq_set_x (scm_source_whash, p,
868 scm_make_srcprops (line, column,
869 SCM_FILENAME (port),
870 SCM_COPY_SOURCE_P
871 ? (scm_cons2 (SCM_CAR (p),
872 SCM_CAR (SCM_CDR (p)),
873 SCM_EOL))
874 : SCM_UNDEFINED,
875 SCM_EOL));
876
877
878 return p;
879 }
880
881 static inline SCM
882 scm_read_nil (int chr, SCM port)
883 {
884 SCM id = scm_read_mixed_case_symbol (chr, port);
885
886 if (!scm_is_eq (id, sym_nil))
887 scm_i_input_error ("scm_read_nil", port,
888 "unexpected input while reading #nil: ~a",
889 scm_list_1 (id));
890
891 return SCM_ELISP_NIL;
892 }
893
894 static inline SCM
895 scm_read_semicolon_comment (int chr, SCM port)
896 {
897 int c;
898
899 /* We use the get_byte here because there is no need to get the
900 locale correct with comment input. This presumes that newline
901 always represents itself no matter what the encoding is. */
902 for (c = scm_get_byte_or_eof (port);
903 (c != EOF) && (c != '\n');
904 c = scm_get_byte_or_eof (port));
905
906 return SCM_UNSPECIFIED;
907 }
908
909 \f
910 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
911
912 static SCM
913 scm_read_boolean (int chr, SCM port)
914 {
915 switch (chr)
916 {
917 case 't':
918 case 'T':
919 return SCM_BOOL_T;
920
921 case 'f':
922 case 'F':
923 return SCM_BOOL_F;
924 }
925
926 return SCM_UNSPECIFIED;
927 }
928
929 static SCM
930 scm_read_character (scm_t_wchar chr, SCM port)
931 #define FUNC_NAME "scm_lreadr"
932 {
933 char buffer[READER_CHAR_NAME_MAX_SIZE];
934 SCM charname;
935 size_t charname_len, bytes_read;
936 scm_t_wchar cp;
937 int overflow;
938 scm_t_port *pt;
939
940 overflow = read_token (port, buffer, READER_CHAR_NAME_MAX_SIZE, &bytes_read);
941 if (overflow)
942 scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
943
944 if (bytes_read == 0)
945 {
946 chr = scm_getc (port);
947 if (chr == EOF)
948 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
949 "while reading character", SCM_EOL);
950
951 /* CHR must be a token delimiter, like a whitespace. */
952 return (SCM_MAKE_CHAR (chr));
953 }
954
955 pt = SCM_PTAB_ENTRY (port);
956
957 /* Simple ASCII characters can be processed immediately. Also, simple
958 ISO-8859-1 characters can be processed immediately if the encoding for this
959 port is ISO-8859-1. */
960 if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
961 {
962 SCM_COL (port) += 1;
963 return SCM_MAKE_CHAR (buffer[0]);
964 }
965
966 /* Otherwise, convert the buffer into a proper scheme string for
967 processing. */
968 charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
969 pt->ilseq_handler);
970 charname_len = scm_i_string_length (charname);
971 SCM_COL (port) += charname_len;
972 cp = scm_i_string_ref (charname, 0);
973 if (charname_len == 1)
974 return SCM_MAKE_CHAR (cp);
975
976 /* Ignore dotted circles, which may be used to keep combining characters from
977 combining with the backslash in #\charname. */
978 if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
979 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
980
981 if (cp >= '0' && cp < '8')
982 {
983 /* Dirk:FIXME:: This type of character syntax is not R5RS
984 * compliant. Further, it should be verified that the constant
985 * does only consist of octal digits. */
986 SCM p = scm_string_to_number (charname, scm_from_uint (8));
987 if (SCM_I_INUMP (p))
988 {
989 scm_t_wchar c = scm_to_uint32 (p);
990 if (SCM_IS_UNICODE_CHAR (c))
991 return SCM_MAKE_CHAR (c);
992 else
993 scm_i_input_error (FUNC_NAME, port,
994 "out-of-range octal character escape: ~a",
995 scm_list_1 (charname));
996 }
997 }
998
999 if (cp == 'x' && (charname_len > 1))
1000 {
1001 SCM p;
1002
1003 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1004 p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
1005 scm_from_uint (16));
1006 if (SCM_I_INUMP (p))
1007 {
1008 scm_t_wchar c = scm_to_uint32 (p);
1009 if (SCM_IS_UNICODE_CHAR (c))
1010 return SCM_MAKE_CHAR (c);
1011 else
1012 scm_i_input_error (FUNC_NAME, port,
1013 "out-of-range hex character escape: ~a",
1014 scm_list_1 (charname));
1015 }
1016 }
1017
1018 /* The names of characters should never have non-Latin1
1019 characters. */
1020 if (scm_i_is_narrow_string (charname)
1021 || scm_i_try_narrow_string (charname))
1022 { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
1023 charname_len);
1024 if (scm_is_true (ch))
1025 return ch;
1026 }
1027
1028 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
1029 scm_list_1 (charname));
1030
1031 return SCM_UNSPECIFIED;
1032 }
1033 #undef FUNC_NAME
1034
1035 static inline SCM
1036 scm_read_keyword (int chr, SCM port)
1037 {
1038 SCM symbol;
1039
1040 /* Read the symbol that comprises the keyword. Doing this instead of
1041 invoking a specific symbol reader function allows `scm_read_keyword ()'
1042 to adapt to the delimiters currently valid of symbols.
1043
1044 XXX: This implementation allows sloppy syntaxes like `#: key'. */
1045 symbol = scm_read_expression (port);
1046 if (!scm_is_symbol (symbol))
1047 scm_i_input_error ("scm_read_keyword", port,
1048 "keyword prefix `~a' not followed by a symbol: ~s",
1049 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1050
1051 return (scm_symbol_to_keyword (symbol));
1052 }
1053
1054 static inline SCM
1055 scm_read_vector (int chr, SCM port)
1056 {
1057 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1058 guarantee that it's going to do what we want. After all, this is an
1059 implementation detail of `scm_read_vector ()', not a desirable
1060 property. */
1061 return (scm_vector (scm_read_sexp (chr, port)));
1062 }
1063
1064 static inline SCM
1065 scm_read_srfi4_vector (int chr, SCM port)
1066 {
1067 return scm_i_read_array (port, chr);
1068 }
1069
1070 static SCM
1071 scm_read_bytevector (scm_t_wchar chr, SCM port)
1072 {
1073 chr = scm_getc (port);
1074 if (chr != 'u')
1075 goto syntax;
1076
1077 chr = scm_getc (port);
1078 if (chr != '8')
1079 goto syntax;
1080
1081 chr = scm_getc (port);
1082 if (chr != '(')
1083 goto syntax;
1084
1085 return scm_u8_list_to_bytevector (scm_read_sexp (chr, port));
1086
1087 syntax:
1088 scm_i_input_error ("read_bytevector", port,
1089 "invalid bytevector prefix",
1090 SCM_MAKE_CHAR (chr));
1091 return SCM_UNSPECIFIED;
1092 }
1093
1094 static SCM
1095 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port)
1096 {
1097 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1098 terribly inefficient but who cares? */
1099 SCM s_bits = SCM_EOL;
1100
1101 for (chr = scm_getc (port);
1102 (chr != EOF) && ((chr == '0') || (chr == '1'));
1103 chr = scm_getc (port))
1104 {
1105 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1106 }
1107
1108 if (chr != EOF)
1109 scm_ungetc (chr, port);
1110
1111 return scm_bitvector (scm_reverse_x (s_bits, SCM_EOL));
1112 }
1113
1114 static inline SCM
1115 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1116 {
1117 int bang_seen = 0;
1118
1119 for (;;)
1120 {
1121 int c = scm_getc (port);
1122
1123 if (c == EOF)
1124 scm_i_input_error ("skip_block_comment", port,
1125 "unterminated `#! ... !#' comment", SCM_EOL);
1126
1127 if (c == '!')
1128 bang_seen = 1;
1129 else if (c == '#' && bang_seen)
1130 break;
1131 else
1132 bang_seen = 0;
1133 }
1134
1135 return SCM_UNSPECIFIED;
1136 }
1137
1138 static SCM
1139 scm_read_shebang (scm_t_wchar chr, SCM port)
1140 {
1141 int c = 0;
1142 if ((c = scm_get_byte_or_eof (port)) != 'r')
1143 {
1144 scm_ungetc (c, port);
1145 return scm_read_scsh_block_comment (chr, port);
1146 }
1147 if ((c = scm_get_byte_or_eof (port)) != '6')
1148 {
1149 scm_ungetc (c, port);
1150 scm_ungetc ('r', port);
1151 return scm_read_scsh_block_comment (chr, port);
1152 }
1153 if ((c = scm_get_byte_or_eof (port)) != 'r')
1154 {
1155 scm_ungetc (c, port);
1156 scm_ungetc ('6', port);
1157 scm_ungetc ('r', port);
1158 return scm_read_scsh_block_comment (chr, port);
1159 }
1160 if ((c = scm_get_byte_or_eof (port)) != 's')
1161 {
1162 scm_ungetc (c, port);
1163 scm_ungetc ('r', port);
1164 scm_ungetc ('6', port);
1165 scm_ungetc ('r', port);
1166 return scm_read_scsh_block_comment (chr, port);
1167 }
1168
1169 return SCM_UNSPECIFIED;
1170 }
1171
1172 static SCM
1173 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1174 {
1175 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1176 nested. So care must be taken. */
1177 int nesting_level = 1;
1178 int opening_seen = 0, closing_seen = 0;
1179
1180 while (nesting_level > 0)
1181 {
1182 int c = scm_getc (port);
1183
1184 if (c == EOF)
1185 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1186 "unterminated `#| ... |#' comment", SCM_EOL);
1187
1188 if (opening_seen)
1189 {
1190 if (c == '|')
1191 nesting_level++;
1192 opening_seen = 0;
1193 }
1194 else if (closing_seen)
1195 {
1196 if (c == '#')
1197 nesting_level--;
1198 closing_seen = 0;
1199 }
1200 else if (c == '|')
1201 closing_seen = 1;
1202 else if (c == '#')
1203 opening_seen = 1;
1204 else
1205 opening_seen = closing_seen = 0;
1206 }
1207
1208 return SCM_UNSPECIFIED;
1209 }
1210
1211 static SCM
1212 scm_read_commented_expression (scm_t_wchar chr, SCM port)
1213 {
1214 scm_t_wchar c;
1215
1216 c = flush_ws (port, (char *) NULL);
1217 if (EOF == c)
1218 scm_i_input_error ("read_commented_expression", port,
1219 "no expression after #; comment", SCM_EOL);
1220 scm_ungetc (c, port);
1221 scm_read_expression (port);
1222 return SCM_UNSPECIFIED;
1223 }
1224
1225 static SCM
1226 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1227 {
1228 /* Guile's extended symbol read syntax looks like this:
1229
1230 #{This is all a symbol name}#
1231
1232 So here, CHR is expected to be `{'. */
1233 int saw_brace = 0;
1234 size_t len = 0;
1235 SCM buf = scm_i_make_string (1024, NULL, 0);
1236
1237 buf = scm_i_string_start_writing (buf);
1238
1239 while ((chr = scm_getc (port)) != EOF)
1240 {
1241 if (saw_brace)
1242 {
1243 if (chr == '#')
1244 {
1245 break;
1246 }
1247 else
1248 {
1249 saw_brace = 0;
1250 scm_i_string_set_x (buf, len++, '}');
1251 }
1252 }
1253
1254 if (chr == '}')
1255 saw_brace = 1;
1256 else if (chr == '\\')
1257 {
1258 /* It used to be that print.c would print extended-read-syntax
1259 symbols with backslashes before "non-standard" chars, but
1260 this routine wouldn't do anything with those escapes.
1261 Bummer. What we've done is to change print.c to output
1262 R6RS hex escapes for those characters, relying on the fact
1263 that the extended read syntax would never put a `\' before
1264 an `x'. For now, we just ignore other instances of
1265 backslash in the string. */
1266 switch ((chr = scm_getc (port)))
1267 {
1268 case EOF:
1269 goto done;
1270 case 'x':
1271 {
1272 scm_t_wchar c;
1273
1274 SCM_READ_HEX_ESCAPE (10, ';');
1275 scm_i_string_set_x (buf, len++, c);
1276 break;
1277
1278 str_eof:
1279 chr = EOF;
1280 goto done;
1281
1282 bad_escaped:
1283 scm_i_string_stop_writing ();
1284 scm_i_input_error ("scm_read_extended_symbol", port,
1285 "illegal character in escape sequence: ~S",
1286 scm_list_1 (SCM_MAKE_CHAR (c)));
1287 break;
1288 }
1289 default:
1290 scm_i_string_set_x (buf, len++, chr);
1291 break;
1292 }
1293 }
1294 else
1295 scm_i_string_set_x (buf, len++, chr);
1296
1297 if (len >= scm_i_string_length (buf) - 2)
1298 {
1299 SCM addy;
1300
1301 scm_i_string_stop_writing ();
1302 addy = scm_i_make_string (1024, NULL, 0);
1303 buf = scm_string_append (scm_list_2 (buf, addy));
1304 len = 0;
1305 buf = scm_i_string_start_writing (buf);
1306 }
1307 }
1308
1309 done:
1310 scm_i_string_stop_writing ();
1311 if (chr == EOF)
1312 scm_i_input_error ("scm_read_extended_symbol", port,
1313 "end of file while reading symbol", SCM_EOL);
1314
1315 return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1316 }
1317
1318
1319 \f
1320 /* Top-level token readers, i.e., dispatchers. */
1321
1322 static SCM
1323 scm_read_sharp_extension (int chr, SCM port)
1324 {
1325 SCM proc;
1326
1327 proc = scm_get_hash_procedure (chr);
1328 if (scm_is_true (scm_procedure_p (proc)))
1329 {
1330 long line = SCM_LINUM (port);
1331 int column = SCM_COL (port) - 2;
1332 SCM got;
1333
1334 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1335 if (!scm_is_eq (got, SCM_UNSPECIFIED))
1336 {
1337 if (SCM_RECORD_POSITIONS_P)
1338 return (recsexpr (got, line, column,
1339 SCM_FILENAME (port)));
1340 else
1341 return got;
1342 }
1343 }
1344
1345 return SCM_UNSPECIFIED;
1346 }
1347
1348 /* The reader for the sharp `#' character. It basically dispatches reads
1349 among the above token readers. */
1350 static SCM
1351 scm_read_sharp (scm_t_wchar chr, SCM port)
1352 #define FUNC_NAME "scm_lreadr"
1353 {
1354 SCM result;
1355
1356 chr = scm_getc (port);
1357
1358 result = scm_read_sharp_extension (chr, port);
1359 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1360 return result;
1361
1362 switch (chr)
1363 {
1364 case '\\':
1365 return (scm_read_character (chr, port));
1366 case '(':
1367 return (scm_read_vector (chr, port));
1368 case 's':
1369 case 'u':
1370 case 'f':
1371 case 'c':
1372 /* This one may return either a boolean or an SRFI-4 vector. */
1373 return (scm_read_srfi4_vector (chr, port));
1374 case 'v':
1375 return (scm_read_bytevector (chr, port));
1376 case '*':
1377 return (scm_read_guile_bit_vector (chr, port));
1378 case 't':
1379 case 'T':
1380 case 'F':
1381 /* This one may return either a boolean or an SRFI-4 vector. */
1382 return (scm_read_boolean (chr, port));
1383 case ':':
1384 return (scm_read_keyword (chr, port));
1385 case '0': case '1': case '2': case '3': case '4':
1386 case '5': case '6': case '7': case '8': case '9':
1387 case '@':
1388 return (scm_i_read_array (port, chr));
1389
1390 case 'i':
1391 case 'e':
1392 case 'b':
1393 case 'B':
1394 case 'o':
1395 case 'O':
1396 case 'd':
1397 case 'D':
1398 case 'x':
1399 case 'X':
1400 case 'I':
1401 case 'E':
1402 return (scm_read_number_and_radix (chr, port));
1403 case '{':
1404 return (scm_read_extended_symbol (chr, port));
1405 case '!':
1406 return (scm_read_shebang (chr, port));
1407 case ';':
1408 return (scm_read_commented_expression (chr, port));
1409 case '`':
1410 case '\'':
1411 case ',':
1412 return (scm_read_syntax (chr, port));
1413 case 'n':
1414 return (scm_read_nil (chr, port));
1415 default:
1416 result = scm_read_sharp_extension (chr, port);
1417 if (scm_is_eq (result, SCM_UNSPECIFIED))
1418 {
1419 /* To remain compatible with 1.8 and earlier, the following
1420 characters have lower precedence than `read-hash-extend'
1421 characters. */
1422 switch (chr)
1423 {
1424 case '|':
1425 return scm_read_r6rs_block_comment (chr, port);
1426 default:
1427 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1428 scm_list_1 (SCM_MAKE_CHAR (chr)));
1429 }
1430 }
1431 else
1432 return result;
1433 }
1434
1435 return SCM_UNSPECIFIED;
1436 }
1437 #undef FUNC_NAME
1438
1439 static SCM
1440 scm_read_expression (SCM port)
1441 #define FUNC_NAME "scm_read_expression"
1442 {
1443 while (1)
1444 {
1445 register scm_t_wchar chr;
1446
1447 chr = scm_getc (port);
1448
1449 switch (chr)
1450 {
1451 case SCM_WHITE_SPACES:
1452 case SCM_LINE_INCREMENTORS:
1453 break;
1454 case ';':
1455 (void) scm_read_semicolon_comment (chr, port);
1456 break;
1457 case '[':
1458 if (!SCM_SQUARE_BRACKETS_P)
1459 return (scm_read_mixed_case_symbol (chr, port));
1460 /* otherwise fall through */
1461 case '(':
1462 return (scm_read_sexp (chr, port));
1463 case '"':
1464 return (scm_read_string (chr, port));
1465 case '\'':
1466 case '`':
1467 case ',':
1468 return (scm_read_quote (chr, port));
1469 case '#':
1470 {
1471 SCM result;
1472 result = scm_read_sharp (chr, port);
1473 if (scm_is_eq (result, SCM_UNSPECIFIED))
1474 /* We read a comment or some such. */
1475 break;
1476 else
1477 return result;
1478 }
1479 case ')':
1480 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1481 break;
1482 case ']':
1483 if (SCM_SQUARE_BRACKETS_P)
1484 scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1485 /* otherwise fall through */
1486 case EOF:
1487 return SCM_EOF_VAL;
1488 case ':':
1489 if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1490 return scm_symbol_to_keyword (scm_read_expression (port));
1491 /* Fall through. */
1492
1493 default:
1494 {
1495 if (((chr >= '0') && (chr <= '9'))
1496 || (strchr ("+-.", chr)))
1497 return (scm_read_number (chr, port));
1498 else
1499 return (scm_read_mixed_case_symbol (chr, port));
1500 }
1501 }
1502 }
1503 }
1504 #undef FUNC_NAME
1505
1506 \f
1507 /* Actual reader. */
1508
1509 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1510 (SCM port),
1511 "Read an s-expression from the input port @var{port}, or from\n"
1512 "the current input port if @var{port} is not specified.\n"
1513 "Any whitespace before the next token is discarded.")
1514 #define FUNC_NAME s_scm_read
1515 {
1516 int c;
1517
1518 if (SCM_UNBNDP (port))
1519 port = scm_current_input_port ();
1520 SCM_VALIDATE_OPINPORT (1, port);
1521
1522 c = flush_ws (port, (char *) NULL);
1523 if (EOF == c)
1524 return SCM_EOF_VAL;
1525 scm_ungetc (c, port);
1526
1527 return (scm_read_expression (port));
1528 }
1529 #undef FUNC_NAME
1530
1531
1532 \f
1533
1534 /* Used when recording expressions constructed by `scm_read_sharp ()'. */
1535 static SCM
1536 recsexpr (SCM obj, long line, int column, SCM filename)
1537 {
1538 if (!scm_is_pair(obj)) {
1539 return obj;
1540 } else {
1541 SCM tmp, copy;
1542 /* If this sexpr is visible in the read:sharp source, we want to
1543 keep that information, so only record non-constant cons cells
1544 which haven't previously been read by the reader. */
1545 if (scm_is_false (scm_hashq_ref (scm_source_whash, obj, SCM_BOOL_F)))
1546 {
1547 if (SCM_COPY_SOURCE_P)
1548 {
1549 copy = scm_cons (recsexpr (SCM_CAR (obj), line, column, filename),
1550 SCM_UNDEFINED);
1551 for (tmp = obj; scm_is_pair (tmp); tmp = SCM_CDR (tmp))
1552 {
1553 SCM_SETCDR (copy, scm_cons (recsexpr (SCM_CAR (tmp),
1554 line,
1555 column,
1556 filename),
1557 SCM_UNDEFINED));
1558 copy = SCM_CDR (copy);
1559 }
1560 SCM_SETCDR (copy, tmp);
1561 }
1562 else
1563 {
1564 recsexpr (SCM_CAR (obj), line, column, filename);
1565 for (tmp = obj; scm_is_pair (tmp); tmp = SCM_CDR (tmp))
1566 recsexpr (SCM_CAR (tmp), line, column, filename);
1567 copy = SCM_UNDEFINED;
1568 }
1569 scm_hashq_set_x (scm_source_whash,
1570 obj,
1571 scm_make_srcprops (line,
1572 column,
1573 filename,
1574 copy,
1575 SCM_EOL));
1576 }
1577 return obj;
1578 }
1579 }
1580
1581 /* Manipulate the read-hash-procedures alist. This could be written in
1582 Scheme, but maybe it will also be used by C code during initialisation. */
1583 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1584 (SCM chr, SCM proc),
1585 "Install the procedure @var{proc} for reading expressions\n"
1586 "starting with the character sequence @code{#} and @var{chr}.\n"
1587 "@var{proc} will be called with two arguments: the character\n"
1588 "@var{chr} and the port to read further data from. The object\n"
1589 "returned will be the return value of @code{read}. \n"
1590 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1591 )
1592 #define FUNC_NAME s_scm_read_hash_extend
1593 {
1594 SCM this;
1595 SCM prev;
1596
1597 SCM_VALIDATE_CHAR (1, chr);
1598 SCM_ASSERT (scm_is_false (proc)
1599 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1600 proc, SCM_ARG2, FUNC_NAME);
1601
1602 /* Check if chr is already in the alist. */
1603 this = scm_i_read_hash_procedures_ref ();
1604 prev = SCM_BOOL_F;
1605 while (1)
1606 {
1607 if (scm_is_null (this))
1608 {
1609 /* not found, so add it to the beginning. */
1610 if (scm_is_true (proc))
1611 {
1612 SCM new = scm_cons (scm_cons (chr, proc),
1613 scm_i_read_hash_procedures_ref ());
1614 scm_i_read_hash_procedures_set_x (new);
1615 }
1616 break;
1617 }
1618 if (scm_is_eq (chr, SCM_CAAR (this)))
1619 {
1620 /* already in the alist. */
1621 if (scm_is_false (proc))
1622 {
1623 /* remove it. */
1624 if (scm_is_false (prev))
1625 {
1626 SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1627 scm_i_read_hash_procedures_set_x (rest);
1628 }
1629 else
1630 scm_set_cdr_x (prev, SCM_CDR (this));
1631 }
1632 else
1633 {
1634 /* replace it. */
1635 scm_set_cdr_x (SCM_CAR (this), proc);
1636 }
1637 break;
1638 }
1639 prev = this;
1640 this = SCM_CDR (this);
1641 }
1642
1643 return SCM_UNSPECIFIED;
1644 }
1645 #undef FUNC_NAME
1646
1647 /* Recover the read-hash procedure corresponding to char c. */
1648 static SCM
1649 scm_get_hash_procedure (int c)
1650 {
1651 SCM rest = scm_i_read_hash_procedures_ref ();
1652
1653 while (1)
1654 {
1655 if (scm_is_null (rest))
1656 return SCM_BOOL_F;
1657
1658 if (SCM_CHAR (SCM_CAAR (rest)) == c)
1659 return SCM_CDAR (rest);
1660
1661 rest = SCM_CDR (rest);
1662 }
1663 }
1664
1665 #define SCM_ENCODING_SEARCH_SIZE (500)
1666
1667 /* Search the first few hundred characters of a file for an Emacs-like coding
1668 declaration. Returns either NULL or a string whose storage has been
1669 allocated with `scm_gc_malloc ()'. */
1670 char *
1671 scm_i_scan_for_encoding (SCM port)
1672 {
1673 scm_t_port *pt;
1674 char header[SCM_ENCODING_SEARCH_SIZE+1];
1675 size_t bytes_read, encoding_length, i;
1676 char *encoding = NULL;
1677 int utf8_bom = 0;
1678 char *pos, *encoding_start;
1679 int in_comment;
1680
1681 pt = SCM_PTAB_ENTRY (port);
1682
1683 if (pt->rw_active == SCM_PORT_WRITE)
1684 scm_flush (port);
1685
1686 if (pt->rw_random)
1687 pt->rw_active = SCM_PORT_READ;
1688
1689 if (pt->read_pos == pt->read_end)
1690 {
1691 /* We can use the read buffer, and thus avoid a seek. */
1692 if (scm_fill_input (port) == EOF)
1693 return NULL;
1694
1695 bytes_read = pt->read_end - pt->read_pos;
1696 if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
1697 bytes_read = SCM_ENCODING_SEARCH_SIZE;
1698
1699 if (bytes_read <= 1)
1700 /* An unbuffered port -- don't scan. */
1701 return NULL;
1702
1703 memcpy (header, pt->read_pos, bytes_read);
1704 header[bytes_read] = '\0';
1705 }
1706 else
1707 {
1708 /* Try to read some bytes and then seek back. Not all ports
1709 support seeking back; and indeed some file ports (like
1710 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
1711 check performed by SCM_FPORT_FDES---but fail to seek
1712 backwards. Hence this block comes second. We prefer to use
1713 the read buffer in-place. */
1714 if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1715 return NULL;
1716
1717 bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1718 header[bytes_read] = '\0';
1719 scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1720 }
1721
1722 if (bytes_read > 3
1723 && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1724 utf8_bom = 1;
1725
1726 /* search past "coding[:=]" */
1727 pos = header;
1728 while (1)
1729 {
1730 if ((pos = strstr(pos, "coding")) == NULL)
1731 return NULL;
1732
1733 pos += strlen("coding");
1734 if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1735 (*pos == ':' || *pos == '='))
1736 {
1737 pos ++;
1738 break;
1739 }
1740 }
1741
1742 /* skip spaces */
1743 while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1744 (*pos == ' ' || *pos == '\t'))
1745 pos ++;
1746
1747 /* grab the next token */
1748 encoding_start = pos;
1749 i = 0;
1750 while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
1751 && encoding_start + i - header < bytes_read
1752 && (isalnum ((int) encoding_start[i])
1753 || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
1754 i++;
1755
1756 encoding_length = i;
1757 if (encoding_length == 0)
1758 return NULL;
1759
1760 encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
1761 for (i = 0; i < encoding_length; i++)
1762 encoding[i] = toupper ((int) encoding[i]);
1763
1764 /* push backwards to make sure we were in a comment */
1765 in_comment = 0;
1766 pos = encoding_start;
1767 while (pos >= header)
1768 {
1769 if (*pos == ';')
1770 {
1771 in_comment = 1;
1772 break;
1773 }
1774 else if (*pos == '\n' || pos == header)
1775 {
1776 /* This wasn't in a semicolon comment. Check for a
1777 hash-bang comment. */
1778 char *beg = strstr (header, "#!");
1779 char *end = strstr (header, "!#");
1780 if (beg < encoding_start && encoding_start + encoding_length <= end)
1781 in_comment = 1;
1782 break;
1783 }
1784 else
1785 {
1786 pos --;
1787 continue;
1788 }
1789 }
1790 if (!in_comment)
1791 /* This wasn't in a comment */
1792 return NULL;
1793
1794 if (utf8_bom && strcmp(encoding, "UTF-8"))
1795 scm_misc_error (NULL,
1796 "the port input declares the encoding ~s but is encoded as UTF-8",
1797 scm_list_1 (scm_from_locale_string (encoding)));
1798
1799 return encoding;
1800 }
1801
1802 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1803 (SCM port),
1804 "Scans the port for an Emacs-like character coding declaration\n"
1805 "near the top of the contents of a port with random-accessible contents.\n"
1806 "The coding declaration is of the form\n"
1807 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1808 "\n"
1809 "Returns a string containing the character encoding of the file\n"
1810 "if a declaration was found, or @code{#f} otherwise.\n")
1811 #define FUNC_NAME s_scm_file_encoding
1812 {
1813 char *enc;
1814 SCM s_enc;
1815
1816 SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
1817
1818 enc = scm_i_scan_for_encoding (port);
1819 if (enc == NULL)
1820 return SCM_BOOL_F;
1821 else
1822 {
1823 s_enc = scm_from_locale_string (enc);
1824 return s_enc;
1825 }
1826
1827 return SCM_BOOL_F;
1828 }
1829 #undef FUNC_NAME
1830
1831 void
1832 scm_init_read ()
1833 {
1834 SCM read_hash_procs;
1835
1836 read_hash_procs = scm_make_fluid ();
1837 scm_fluid_set_x (read_hash_procs, SCM_EOL);
1838
1839 scm_i_read_hash_procedures =
1840 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
1841
1842 scm_init_opts (scm_read_options, scm_read_opts);
1843 #include "libguile/read.x"
1844 }
1845
1846 /*
1847 Local Variables:
1848 c-file-style: "gnu"
1849 End:
1850 */