support for new GC_move_disappearing_link
[bpt/guile.git] / libguile / read.c
1 /* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
2 * Foundation, Inc.
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
18 */
19
20
21 \f
22
23 #ifdef HAVE_CONFIG_H
24 # include <config.h>
25 #endif
26
27 #include <stdio.h>
28 #include <ctype.h>
29 #include <string.h>
30 #include <unistd.h>
31 #include <unicase.h>
32 #include <unictype.h>
33
34 #include "libguile/_scm.h"
35 #include "libguile/bytevectors.h"
36 #include "libguile/chars.h"
37 #include "libguile/eval.h"
38 #include "libguile/arrays.h"
39 #include "libguile/bitvectors.h"
40 #include "libguile/keywords.h"
41 #include "libguile/alist.h"
42 #include "libguile/srcprop.h"
43 #include "libguile/hashtab.h"
44 #include "libguile/hash.h"
45 #include "libguile/ports.h"
46 #include "libguile/fports.h"
47 #include "libguile/root.h"
48 #include "libguile/strings.h"
49 #include "libguile/strports.h"
50 #include "libguile/vectors.h"
51 #include "libguile/validate.h"
52 #include "libguile/srfi-4.h"
53 #include "libguile/srfi-13.h"
54
55 #include "libguile/read.h"
56 #include "libguile/private-options.h"
57
58
59 \f
60
61 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
62 SCM_SYMBOL (scm_keyword_prefix, "prefix");
63 SCM_SYMBOL (scm_keyword_postfix, "postfix");
64 SCM_SYMBOL (sym_nil, "nil");
65
66 scm_t_option scm_read_opts[] = {
67 { SCM_OPTION_BOOLEAN, "copy", 0,
68 "Copy source code expressions." },
69 { SCM_OPTION_BOOLEAN, "positions", 1,
70 "Record positions of source code expressions." },
71 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
72 "Convert symbols to lower case."},
73 { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
74 "Style of keyword recognition: #f, 'prefix or 'postfix."},
75 { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
76 "Use R6RS variable-length character and string hex escapes."},
77 { SCM_OPTION_BOOLEAN, "square-brackets", 1,
78 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
79 { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
80 "In strings, consume leading whitespace after an escaped end-of-line."},
81 { 0, },
82 };
83
84 /*
85 Give meaningful error messages for errors
86
87 We use the format
88
89 FILE:LINE:COL: MESSAGE
90 This happened in ....
91
92 This is not standard GNU format, but the test-suite likes the real
93 message to be in front.
94
95 */
96
97
98 void
99 scm_i_input_error (char const *function,
100 SCM port, const char *message, SCM arg)
101 {
102 SCM fn = (scm_is_string (SCM_FILENAME(port))
103 ? SCM_FILENAME(port)
104 : scm_from_locale_string ("#<unknown port>"));
105
106 SCM string_port = scm_open_output_string ();
107 SCM string = SCM_EOL;
108 scm_simple_format (string_port,
109 scm_from_locale_string ("~A:~S:~S: ~A"),
110 scm_list_4 (fn,
111 scm_from_long (SCM_LINUM (port) + 1),
112 scm_from_int (SCM_COL (port) + 1),
113 scm_from_locale_string (message)));
114
115 string = scm_get_output_string (string_port);
116 scm_close_output_port (string_port);
117 scm_error_scm (scm_from_latin1_symbol ("read-error"),
118 function? scm_from_locale_string (function) : SCM_BOOL_F,
119 string,
120 arg,
121 SCM_BOOL_F);
122 }
123
124
125 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
126 (SCM setting),
127 "Option interface for the read options. Instead of using\n"
128 "this procedure directly, use the procedures @code{read-enable},\n"
129 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
130 #define FUNC_NAME s_scm_read_options
131 {
132 SCM ans = scm_options (setting,
133 scm_read_opts,
134 FUNC_NAME);
135 if (SCM_COPY_SOURCE_P)
136 SCM_RECORD_POSITIONS_P = 1;
137 return ans;
138 }
139 #undef FUNC_NAME
140
141 /* A fluid referring to an association list mapping extra hash
142 characters to procedures. */
143 static SCM *scm_i_read_hash_procedures;
144
145 static inline SCM
146 scm_i_read_hash_procedures_ref (void)
147 {
148 return scm_fluid_ref (*scm_i_read_hash_procedures);
149 }
150
151 static inline void
152 scm_i_read_hash_procedures_set_x (SCM value)
153 {
154 scm_fluid_set_x (*scm_i_read_hash_procedures, value);
155 }
156
157 \f
158 /* Token readers. */
159
160
161 /* Size of the C buffer used to read symbols and numbers. */
162 #define READER_BUFFER_SIZE 128
163
164 /* Size of the C buffer used to read strings. */
165 #define READER_STRING_BUFFER_SIZE 512
166
167 /* The maximum size of Scheme character names. */
168 #define READER_CHAR_NAME_MAX_SIZE 50
169
170
171 /* `isblank' is only in C99. */
172 #define CHAR_IS_BLANK_(_chr) \
173 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
174 || ((_chr) == '\f') || ((_chr) == '\r'))
175
176 #ifdef MSDOS
177 # define CHAR_IS_BLANK(_chr) \
178 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
179 #else
180 # define CHAR_IS_BLANK CHAR_IS_BLANK_
181 #endif
182
183
184 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
185 structure''). */
186 #define CHAR_IS_R5RS_DELIMITER(c) \
187 (CHAR_IS_BLANK (c) \
188 || (c == ')') || (c == '(') || (c == ';') || (c == '"') \
189 || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
190
191 #define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
192
193 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
194 Structure''. */
195 #define CHAR_IS_EXPONENT_MARKER(_chr) \
196 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
197 || ((_chr) == 'd') || ((_chr) == 'l'))
198
199 /* Read an SCSH block comment. */
200 static inline SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
201 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
202 static SCM scm_read_commented_expression (scm_t_wchar, SCM);
203 static SCM scm_read_shebang (scm_t_wchar, SCM);
204 static SCM scm_get_hash_procedure (int);
205
206 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
207 result in the pre-allocated buffer BUF. Return zero if the whole token has
208 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
209 bytes actually read. */
210 static inline int
211 read_token (SCM port, char *buf, const size_t buf_size, size_t *read)
212 {
213 *read = 0;
214
215 while (*read < buf_size)
216 {
217 int chr;
218
219 chr = scm_get_byte_or_eof_unlocked (port);
220
221 if (chr == EOF)
222 return 0;
223 else if (CHAR_IS_DELIMITER (chr))
224 {
225 scm_unget_byte_unlocked (chr, port);
226 return 0;
227 }
228 else
229 {
230 *buf = (char) chr;
231 buf++, (*read)++;
232 }
233 }
234
235 return 1;
236 }
237
238 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
239 result in the pre-allocated buffer BUFFER, if the whole token has fewer than
240 BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
241 caller. Return zero if the token fits in BUFFER, non-zero otherwise. READ
242 will be set the number of bytes actually read. */
243 static int
244 read_complete_token (SCM port, char *buffer, const size_t buffer_size,
245 char **overflow_buffer, size_t *read)
246 {
247 int overflow = 0;
248 size_t bytes_read, overflow_size;
249
250 *overflow_buffer = NULL;
251 overflow_size = 0;
252
253 do
254 {
255 overflow = read_token (port, buffer, buffer_size, &bytes_read);
256 if (bytes_read == 0)
257 break;
258 if (overflow || overflow_size != 0)
259 {
260 if (overflow_size == 0)
261 {
262 *overflow_buffer = scm_malloc (bytes_read);
263 memcpy (*overflow_buffer, buffer, bytes_read);
264 overflow_size = bytes_read;
265 }
266 else
267 {
268 *overflow_buffer = scm_realloc (*overflow_buffer, overflow_size + bytes_read);
269 memcpy (*overflow_buffer + overflow_size, buffer, bytes_read);
270 overflow_size += bytes_read;
271 }
272 }
273 }
274 while (overflow);
275
276 if (overflow_size)
277 *read = overflow_size;
278 else
279 *read = bytes_read;
280
281 return (overflow_size != 0);
282 }
283
284 /* Skip whitespace from PORT and return the first non-whitespace character
285 read. Raise an error on end-of-file. */
286 static int
287 flush_ws (SCM port, const char *eoferr)
288 {
289 register scm_t_wchar c;
290 while (1)
291 switch (c = scm_getc_unlocked (port))
292 {
293 case EOF:
294 goteof:
295 if (eoferr)
296 {
297 scm_i_input_error (eoferr,
298 port,
299 "end of file",
300 SCM_EOL);
301 }
302 return c;
303
304 case ';':
305 lp:
306 switch (c = scm_getc_unlocked (port))
307 {
308 case EOF:
309 goto goteof;
310 default:
311 goto lp;
312 case SCM_LINE_INCREMENTORS:
313 break;
314 }
315 break;
316
317 case '#':
318 switch (c = scm_getc_unlocked (port))
319 {
320 case EOF:
321 eoferr = "read_sharp";
322 goto goteof;
323 case '!':
324 scm_read_shebang (c, port);
325 break;
326 case ';':
327 scm_read_commented_expression (c, port);
328 break;
329 case '|':
330 if (scm_is_false (scm_get_hash_procedure (c)))
331 {
332 scm_read_r6rs_block_comment (c, port);
333 break;
334 }
335 /* fall through */
336 default:
337 scm_ungetc_unlocked (c, port);
338 return '#';
339 }
340 break;
341
342 case SCM_LINE_INCREMENTORS:
343 case SCM_SINGLE_SPACES:
344 case '\t':
345 break;
346
347 default:
348 return c;
349 }
350
351 return 0;
352 }
353
354
355 \f
356 /* Token readers. */
357
358 static SCM scm_read_expression (SCM port);
359 static SCM scm_read_sharp (int chr, SCM port);
360
361
362 static SCM
363 scm_read_sexp (scm_t_wchar chr, SCM port)
364 #define FUNC_NAME "scm_i_lreadparen"
365 {
366 int c;
367 SCM tmp, tl, ans = SCM_EOL;
368 const int terminating_char = ((chr == '[') ? ']' : ')');
369
370 /* Need to capture line and column numbers here. */
371 long line = SCM_LINUM (port);
372 int column = SCM_COL (port) - 1;
373
374 c = flush_ws (port, FUNC_NAME);
375 if (terminating_char == c)
376 return SCM_EOL;
377
378 scm_ungetc_unlocked (c, port);
379 tmp = scm_read_expression (port);
380
381 /* Note that it is possible for scm_read_expression to return
382 scm_sym_dot, but not as part of a dotted pair: as in #{.}#. So
383 check that it's a real dot by checking `c'. */
384 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
385 {
386 ans = scm_read_expression (port);
387 if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
388 scm_i_input_error (FUNC_NAME, port, "missing close paren",
389 SCM_EOL);
390 return ans;
391 }
392
393 /* Build the head of the list structure. */
394 ans = tl = scm_cons (tmp, SCM_EOL);
395
396 while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
397 {
398 SCM new_tail;
399
400 if (c == ')' || (SCM_SQUARE_BRACKETS_P && c == ']'))
401 scm_i_input_error (FUNC_NAME, port,
402 "in pair: mismatched close paren: ~A",
403 scm_list_1 (SCM_MAKE_CHAR (c)));
404
405 scm_ungetc_unlocked (c, port);
406 tmp = scm_read_expression (port);
407
408 /* See above note about scm_sym_dot. */
409 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
410 {
411 SCM_SETCDR (tl, scm_read_expression (port));
412
413 c = flush_ws (port, FUNC_NAME);
414 if (terminating_char != c)
415 scm_i_input_error (FUNC_NAME, port,
416 "in pair: missing close paren", SCM_EOL);
417 goto exit;
418 }
419
420 new_tail = scm_cons (tmp, SCM_EOL);
421 SCM_SETCDR (tl, new_tail);
422 tl = new_tail;
423 }
424
425 exit:
426 if (SCM_RECORD_POSITIONS_P)
427 scm_i_set_source_properties_x (ans, line, column, SCM_FILENAME (port));
428
429 return ans;
430 }
431 #undef FUNC_NAME
432
433
434 /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
435 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
436 found. */
437 #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
438 do \
439 { \
440 scm_t_wchar a; \
441 size_t i = 0; \
442 c = 0; \
443 while (i < ndigits) \
444 { \
445 a = scm_getc_unlocked (port); \
446 if (a == EOF) \
447 goto str_eof; \
448 if (terminator \
449 && (a == (scm_t_wchar) terminator) \
450 && (i > 0)) \
451 break; \
452 if ('0' <= a && a <= '9') \
453 a -= '0'; \
454 else if ('A' <= a && a <= 'F') \
455 a = a - 'A' + 10; \
456 else if ('a' <= a && a <= 'f') \
457 a = a - 'a' + 10; \
458 else \
459 { \
460 c = a; \
461 goto bad_escaped; \
462 } \
463 c = c * 16 + a; \
464 i ++; \
465 } \
466 } while (0)
467
468 static void
469 skip_intraline_whitespace (SCM port)
470 {
471 scm_t_wchar c;
472
473 do
474 {
475 c = scm_getc_unlocked (port);
476 if (c == EOF)
477 return;
478 }
479 while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
480
481 scm_ungetc_unlocked (c, port);
482 }
483
484 static SCM
485 scm_read_string (int chr, SCM port)
486 #define FUNC_NAME "scm_lreadr"
487 {
488 /* For strings smaller than C_STR, this function creates only one Scheme
489 object (the string returned). */
490
491 SCM str = SCM_BOOL_F;
492 unsigned c_str_len = 0;
493 scm_t_wchar c;
494
495 str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
496 while ('"' != (c = scm_getc_unlocked (port)))
497 {
498 if (c == EOF)
499 {
500 str_eof:
501 scm_i_input_error (FUNC_NAME, port,
502 "end of file in string constant", SCM_EOL);
503 }
504
505 if (c_str_len + 1 >= scm_i_string_length (str))
506 {
507 SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
508
509 str = scm_string_append (scm_list_2 (str, addy));
510 }
511
512 if (c == '\\')
513 {
514 switch (c = scm_getc_unlocked (port))
515 {
516 case EOF:
517 goto str_eof;
518 case '"':
519 case '\\':
520 break;
521 case '\n':
522 if (SCM_HUNGRY_EOL_ESCAPES_P)
523 skip_intraline_whitespace (port);
524 continue;
525 case '0':
526 c = '\0';
527 break;
528 case 'f':
529 c = '\f';
530 break;
531 case 'n':
532 c = '\n';
533 break;
534 case 'r':
535 c = '\r';
536 break;
537 case 't':
538 c = '\t';
539 break;
540 case 'a':
541 c = '\007';
542 break;
543 case 'v':
544 c = '\v';
545 break;
546 case 'b':
547 c = '\010';
548 break;
549 case 'x':
550 if (SCM_R6RS_ESCAPES_P)
551 SCM_READ_HEX_ESCAPE (10, ';');
552 else
553 SCM_READ_HEX_ESCAPE (2, '\0');
554 break;
555 case 'u':
556 if (!SCM_R6RS_ESCAPES_P)
557 {
558 SCM_READ_HEX_ESCAPE (4, '\0');
559 break;
560 }
561 case 'U':
562 if (!SCM_R6RS_ESCAPES_P)
563 {
564 SCM_READ_HEX_ESCAPE (6, '\0');
565 break;
566 }
567 default:
568 bad_escaped:
569 scm_i_input_error (FUNC_NAME, port,
570 "illegal character in escape sequence: ~S",
571 scm_list_1 (SCM_MAKE_CHAR (c)));
572 }
573 }
574 str = scm_i_string_start_writing (str);
575 scm_i_string_set_x (str, c_str_len++, c);
576 scm_i_string_stop_writing ();
577 }
578
579 if (c_str_len > 0)
580 {
581 return scm_i_substring_copy (str, 0, c_str_len);
582 }
583
584 return scm_nullstr;
585 }
586 #undef FUNC_NAME
587
588
589 static SCM
590 scm_read_number (scm_t_wchar chr, SCM port)
591 {
592 SCM result, str = SCM_EOL;
593 char buffer[READER_BUFFER_SIZE];
594 char *overflow_buffer = NULL;
595 size_t bytes_read;
596 int overflow;
597 scm_t_port *pt = SCM_PTAB_ENTRY (port);
598
599 scm_ungetc_unlocked (chr, port);
600 overflow = read_complete_token (port, buffer, sizeof (buffer),
601 &overflow_buffer, &bytes_read);
602
603 if (!overflow)
604 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
605 else
606 str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
607 pt->ilseq_handler);
608
609 result = scm_string_to_number (str, SCM_UNDEFINED);
610 if (!scm_is_true (result))
611 {
612 /* Return a symbol instead of a number */
613 if (SCM_CASE_INSENSITIVE_P)
614 str = scm_string_downcase_x (str);
615 result = scm_string_to_symbol (str);
616 }
617
618 if (overflow)
619 free (overflow_buffer);
620 SCM_COL (port) += scm_i_string_length (str);
621 return result;
622 }
623
624 static SCM
625 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
626 {
627 SCM result;
628 int ends_with_colon = 0;
629 size_t bytes_read;
630 int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
631 int overflow;
632 char buffer[READER_BUFFER_SIZE], *overflow_buffer;
633 scm_t_port *pt = SCM_PTAB_ENTRY (port);
634 SCM str;
635
636 scm_ungetc_unlocked (chr, port);
637 overflow = read_complete_token (port, buffer, READER_BUFFER_SIZE,
638 &overflow_buffer, &bytes_read);
639 if (bytes_read > 0)
640 {
641 if (!overflow)
642 ends_with_colon = buffer[bytes_read - 1] == ':';
643 else
644 ends_with_colon = overflow_buffer[bytes_read - 1] == ':';
645 }
646
647 if (postfix && ends_with_colon && (bytes_read > 1))
648 {
649 if (!overflow)
650 str = scm_from_stringn (buffer, bytes_read - 1, pt->encoding, pt->ilseq_handler);
651 else
652 str = scm_from_stringn (overflow_buffer, bytes_read - 1, pt->encoding,
653 pt->ilseq_handler);
654
655 if (SCM_CASE_INSENSITIVE_P)
656 str = scm_string_downcase_x (str);
657 result = scm_symbol_to_keyword (scm_string_to_symbol (str));
658 }
659 else
660 {
661 if (!overflow)
662 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
663 else
664 str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
665 pt->ilseq_handler);
666
667 if (SCM_CASE_INSENSITIVE_P)
668 str = scm_string_downcase_x (str);
669 result = scm_string_to_symbol (str);
670 }
671
672 if (overflow)
673 free (overflow_buffer);
674 SCM_COL (port) += scm_i_string_length (str);
675 return result;
676 }
677
678 static SCM
679 scm_read_number_and_radix (scm_t_wchar chr, SCM port)
680 #define FUNC_NAME "scm_lreadr"
681 {
682 SCM result;
683 size_t read;
684 char buffer[READER_BUFFER_SIZE], *overflow_buffer;
685 int overflow;
686 unsigned int radix;
687 SCM str;
688 scm_t_port *pt;
689
690 switch (chr)
691 {
692 case 'B':
693 case 'b':
694 radix = 2;
695 break;
696
697 case 'o':
698 case 'O':
699 radix = 8;
700 break;
701
702 case 'd':
703 case 'D':
704 radix = 10;
705 break;
706
707 case 'x':
708 case 'X':
709 radix = 16;
710 break;
711
712 default:
713 scm_ungetc_unlocked (chr, port);
714 scm_ungetc_unlocked ('#', port);
715 radix = 10;
716 }
717
718 overflow = read_complete_token (port, buffer, sizeof (buffer),
719 &overflow_buffer, &read);
720
721 pt = SCM_PTAB_ENTRY (port);
722 if (!overflow)
723 str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
724 else
725 str = scm_from_stringn (overflow_buffer, read, pt->encoding,
726 pt->ilseq_handler);
727
728 result = scm_string_to_number (str, scm_from_uint (radix));
729
730 if (overflow)
731 free (overflow_buffer);
732
733 SCM_COL (port) += scm_i_string_length (str);
734
735 if (scm_is_true (result))
736 return result;
737
738 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
739
740 return SCM_BOOL_F;
741 }
742 #undef FUNC_NAME
743
744 static SCM
745 scm_read_quote (int chr, SCM port)
746 {
747 SCM p;
748 long line = SCM_LINUM (port);
749 int column = SCM_COL (port) - 1;
750
751 switch (chr)
752 {
753 case '`':
754 p = scm_sym_quasiquote;
755 break;
756
757 case '\'':
758 p = scm_sym_quote;
759 break;
760
761 case ',':
762 {
763 scm_t_wchar c;
764
765 c = scm_getc_unlocked (port);
766 if ('@' == c)
767 p = scm_sym_uq_splicing;
768 else
769 {
770 scm_ungetc_unlocked (c, port);
771 p = scm_sym_unquote;
772 }
773 break;
774 }
775
776 default:
777 fprintf (stderr, "%s: unhandled quote character (%i)\n",
778 "scm_read_quote", chr);
779 abort ();
780 }
781
782 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
783 if (SCM_RECORD_POSITIONS_P)
784 scm_i_set_source_properties_x (p, line, column, SCM_FILENAME (port));
785
786 return p;
787 }
788
789 SCM_SYMBOL (sym_syntax, "syntax");
790 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
791 SCM_SYMBOL (sym_unsyntax, "unsyntax");
792 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
793
794 static SCM
795 scm_read_syntax (int chr, SCM port)
796 {
797 SCM p;
798 long line = SCM_LINUM (port);
799 int column = SCM_COL (port) - 1;
800
801 switch (chr)
802 {
803 case '`':
804 p = sym_quasisyntax;
805 break;
806
807 case '\'':
808 p = sym_syntax;
809 break;
810
811 case ',':
812 {
813 int c;
814
815 c = scm_getc_unlocked (port);
816 if ('@' == c)
817 p = sym_unsyntax_splicing;
818 else
819 {
820 scm_ungetc_unlocked (c, port);
821 p = sym_unsyntax;
822 }
823 break;
824 }
825
826 default:
827 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
828 "scm_read_syntax", chr);
829 abort ();
830 }
831
832 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
833 if (SCM_RECORD_POSITIONS_P)
834 scm_i_set_source_properties_x (p, line, column, SCM_FILENAME (port));
835
836 return p;
837 }
838
839 static inline SCM
840 scm_read_nil (int chr, SCM port)
841 {
842 SCM id = scm_read_mixed_case_symbol (chr, port);
843
844 if (!scm_is_eq (id, sym_nil))
845 scm_i_input_error ("scm_read_nil", port,
846 "unexpected input while reading #nil: ~a",
847 scm_list_1 (id));
848
849 return SCM_ELISP_NIL;
850 }
851
852 static inline SCM
853 scm_read_semicolon_comment (int chr, SCM port)
854 {
855 int c;
856
857 /* We use the get_byte here because there is no need to get the
858 locale correct with comment input. This presumes that newline
859 always represents itself no matter what the encoding is. */
860 for (c = scm_get_byte_or_eof_unlocked (port);
861 (c != EOF) && (c != '\n');
862 c = scm_get_byte_or_eof_unlocked (port));
863
864 return SCM_UNSPECIFIED;
865 }
866
867 \f
868 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
869
870 static SCM
871 scm_read_boolean (int chr, SCM port)
872 {
873 switch (chr)
874 {
875 case 't':
876 case 'T':
877 return SCM_BOOL_T;
878
879 case 'f':
880 case 'F':
881 return SCM_BOOL_F;
882 }
883
884 return SCM_UNSPECIFIED;
885 }
886
887 static SCM
888 scm_read_character (scm_t_wchar chr, SCM port)
889 #define FUNC_NAME "scm_lreadr"
890 {
891 char buffer[READER_CHAR_NAME_MAX_SIZE];
892 SCM charname;
893 size_t charname_len, bytes_read;
894 scm_t_wchar cp;
895 int overflow;
896 scm_t_port *pt;
897
898 overflow = read_token (port, buffer, READER_CHAR_NAME_MAX_SIZE, &bytes_read);
899 if (overflow)
900 scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
901
902 if (bytes_read == 0)
903 {
904 chr = scm_getc_unlocked (port);
905 if (chr == EOF)
906 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
907 "while reading character", SCM_EOL);
908
909 /* CHR must be a token delimiter, like a whitespace. */
910 return (SCM_MAKE_CHAR (chr));
911 }
912
913 pt = SCM_PTAB_ENTRY (port);
914
915 /* Simple ASCII characters can be processed immediately. Also, simple
916 ISO-8859-1 characters can be processed immediately if the encoding for this
917 port is ISO-8859-1. */
918 if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
919 {
920 SCM_COL (port) += 1;
921 return SCM_MAKE_CHAR (buffer[0]);
922 }
923
924 /* Otherwise, convert the buffer into a proper scheme string for
925 processing. */
926 charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
927 pt->ilseq_handler);
928 charname_len = scm_i_string_length (charname);
929 SCM_COL (port) += charname_len;
930 cp = scm_i_string_ref (charname, 0);
931 if (charname_len == 1)
932 return SCM_MAKE_CHAR (cp);
933
934 /* Ignore dotted circles, which may be used to keep combining characters from
935 combining with the backslash in #\charname. */
936 if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
937 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
938
939 if (cp >= '0' && cp < '8')
940 {
941 /* Dirk:FIXME:: This type of character syntax is not R5RS
942 * compliant. Further, it should be verified that the constant
943 * does only consist of octal digits. */
944 SCM p = scm_string_to_number (charname, scm_from_uint (8));
945 if (SCM_I_INUMP (p))
946 {
947 scm_t_wchar c = scm_to_uint32 (p);
948 if (SCM_IS_UNICODE_CHAR (c))
949 return SCM_MAKE_CHAR (c);
950 else
951 scm_i_input_error (FUNC_NAME, port,
952 "out-of-range octal character escape: ~a",
953 scm_list_1 (charname));
954 }
955 }
956
957 if (cp == 'x' && (charname_len > 1))
958 {
959 SCM p;
960
961 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
962 p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
963 scm_from_uint (16));
964 if (SCM_I_INUMP (p))
965 {
966 scm_t_wchar c = scm_to_uint32 (p);
967 if (SCM_IS_UNICODE_CHAR (c))
968 return SCM_MAKE_CHAR (c);
969 else
970 scm_i_input_error (FUNC_NAME, port,
971 "out-of-range hex character escape: ~a",
972 scm_list_1 (charname));
973 }
974 }
975
976 /* The names of characters should never have non-Latin1
977 characters. */
978 if (scm_i_is_narrow_string (charname)
979 || scm_i_try_narrow_string (charname))
980 { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
981 charname_len);
982 if (scm_is_true (ch))
983 return ch;
984 }
985
986 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
987 scm_list_1 (charname));
988
989 return SCM_UNSPECIFIED;
990 }
991 #undef FUNC_NAME
992
993 static inline SCM
994 scm_read_keyword (int chr, SCM port)
995 {
996 SCM symbol;
997
998 /* Read the symbol that comprises the keyword. Doing this instead of
999 invoking a specific symbol reader function allows `scm_read_keyword ()'
1000 to adapt to the delimiters currently valid of symbols.
1001
1002 XXX: This implementation allows sloppy syntaxes like `#: key'. */
1003 symbol = scm_read_expression (port);
1004 if (!scm_is_symbol (symbol))
1005 scm_i_input_error ("scm_read_keyword", port,
1006 "keyword prefix `~a' not followed by a symbol: ~s",
1007 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1008
1009 return (scm_symbol_to_keyword (symbol));
1010 }
1011
1012 static inline SCM
1013 scm_read_vector (int chr, SCM port)
1014 {
1015 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1016 guarantee that it's going to do what we want. After all, this is an
1017 implementation detail of `scm_read_vector ()', not a desirable
1018 property. */
1019 return (scm_vector (scm_read_sexp (chr, port)));
1020 }
1021
1022 static inline SCM
1023 scm_read_srfi4_vector (int chr, SCM port)
1024 {
1025 return scm_i_read_array (port, chr);
1026 }
1027
1028 static SCM
1029 scm_read_bytevector (scm_t_wchar chr, SCM port)
1030 {
1031 chr = scm_getc_unlocked (port);
1032 if (chr != 'u')
1033 goto syntax;
1034
1035 chr = scm_getc_unlocked (port);
1036 if (chr != '8')
1037 goto syntax;
1038
1039 chr = scm_getc_unlocked (port);
1040 if (chr != '(')
1041 goto syntax;
1042
1043 return scm_u8_list_to_bytevector (scm_read_sexp (chr, port));
1044
1045 syntax:
1046 scm_i_input_error ("read_bytevector", port,
1047 "invalid bytevector prefix",
1048 SCM_MAKE_CHAR (chr));
1049 return SCM_UNSPECIFIED;
1050 }
1051
1052 static SCM
1053 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port)
1054 {
1055 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1056 terribly inefficient but who cares? */
1057 SCM s_bits = SCM_EOL;
1058
1059 for (chr = scm_getc_unlocked (port);
1060 (chr != EOF) && ((chr == '0') || (chr == '1'));
1061 chr = scm_getc_unlocked (port))
1062 {
1063 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1064 }
1065
1066 if (chr != EOF)
1067 scm_ungetc_unlocked (chr, port);
1068
1069 return scm_bitvector (scm_reverse_x (s_bits, SCM_EOL));
1070 }
1071
1072 static inline SCM
1073 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1074 {
1075 int bang_seen = 0;
1076
1077 for (;;)
1078 {
1079 int c = scm_getc_unlocked (port);
1080
1081 if (c == EOF)
1082 scm_i_input_error ("skip_block_comment", port,
1083 "unterminated `#! ... !#' comment", SCM_EOL);
1084
1085 if (c == '!')
1086 bang_seen = 1;
1087 else if (c == '#' && bang_seen)
1088 break;
1089 else
1090 bang_seen = 0;
1091 }
1092
1093 return SCM_UNSPECIFIED;
1094 }
1095
1096 static SCM
1097 scm_read_shebang (scm_t_wchar chr, SCM port)
1098 {
1099 int c = 0;
1100 if ((c = scm_get_byte_or_eof_unlocked (port)) != 'r')
1101 {
1102 scm_ungetc_unlocked (c, port);
1103 return scm_read_scsh_block_comment (chr, port);
1104 }
1105 if ((c = scm_get_byte_or_eof_unlocked (port)) != '6')
1106 {
1107 scm_ungetc_unlocked (c, port);
1108 scm_ungetc_unlocked ('r', port);
1109 return scm_read_scsh_block_comment (chr, port);
1110 }
1111 if ((c = scm_get_byte_or_eof_unlocked (port)) != 'r')
1112 {
1113 scm_ungetc_unlocked (c, port);
1114 scm_ungetc_unlocked ('6', port);
1115 scm_ungetc_unlocked ('r', port);
1116 return scm_read_scsh_block_comment (chr, port);
1117 }
1118 if ((c = scm_get_byte_or_eof_unlocked (port)) != 's')
1119 {
1120 scm_ungetc_unlocked (c, port);
1121 scm_ungetc_unlocked ('r', port);
1122 scm_ungetc_unlocked ('6', port);
1123 scm_ungetc_unlocked ('r', port);
1124 return scm_read_scsh_block_comment (chr, port);
1125 }
1126
1127 return SCM_UNSPECIFIED;
1128 }
1129
1130 static SCM
1131 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1132 {
1133 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1134 nested. So care must be taken. */
1135 int nesting_level = 1;
1136
1137 int a = scm_getc_unlocked (port);
1138
1139 if (a == EOF)
1140 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1141 "unterminated `#| ... |#' comment", SCM_EOL);
1142
1143 while (nesting_level > 0)
1144 {
1145 int b = scm_getc_unlocked (port);
1146
1147 if (b == EOF)
1148 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1149 "unterminated `#| ... |#' comment", SCM_EOL);
1150
1151 if (a == '|' && b == '#')
1152 {
1153 nesting_level--;
1154 b = EOF;
1155 }
1156 else if (a == '#' && b == '|')
1157 {
1158 nesting_level++;
1159 b = EOF;
1160 }
1161
1162 a = b;
1163 }
1164
1165 return SCM_UNSPECIFIED;
1166 }
1167
1168 static SCM
1169 scm_read_commented_expression (scm_t_wchar chr, SCM port)
1170 {
1171 scm_t_wchar c;
1172
1173 c = flush_ws (port, (char *) NULL);
1174 if (EOF == c)
1175 scm_i_input_error ("read_commented_expression", port,
1176 "no expression after #; comment", SCM_EOL);
1177 scm_ungetc_unlocked (c, port);
1178 scm_read_expression (port);
1179 return SCM_UNSPECIFIED;
1180 }
1181
1182 static SCM
1183 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1184 {
1185 /* Guile's extended symbol read syntax looks like this:
1186
1187 #{This is all a symbol name}#
1188
1189 So here, CHR is expected to be `{'. */
1190 int saw_brace = 0;
1191 size_t len = 0;
1192 SCM buf = scm_i_make_string (1024, NULL, 0);
1193
1194 buf = scm_i_string_start_writing (buf);
1195
1196 while ((chr = scm_getc_unlocked (port)) != EOF)
1197 {
1198 if (saw_brace)
1199 {
1200 if (chr == '#')
1201 {
1202 break;
1203 }
1204 else
1205 {
1206 saw_brace = 0;
1207 scm_i_string_set_x (buf, len++, '}');
1208 }
1209 }
1210
1211 if (chr == '}')
1212 saw_brace = 1;
1213 else if (chr == '\\')
1214 {
1215 /* It used to be that print.c would print extended-read-syntax
1216 symbols with backslashes before "non-standard" chars, but
1217 this routine wouldn't do anything with those escapes.
1218 Bummer. What we've done is to change print.c to output
1219 R6RS hex escapes for those characters, relying on the fact
1220 that the extended read syntax would never put a `\' before
1221 an `x'. For now, we just ignore other instances of
1222 backslash in the string. */
1223 switch ((chr = scm_getc_unlocked (port)))
1224 {
1225 case EOF:
1226 goto done;
1227 case 'x':
1228 {
1229 scm_t_wchar c;
1230
1231 SCM_READ_HEX_ESCAPE (10, ';');
1232 scm_i_string_set_x (buf, len++, c);
1233 break;
1234
1235 str_eof:
1236 chr = EOF;
1237 goto done;
1238
1239 bad_escaped:
1240 scm_i_string_stop_writing ();
1241 scm_i_input_error ("scm_read_extended_symbol", port,
1242 "illegal character in escape sequence: ~S",
1243 scm_list_1 (SCM_MAKE_CHAR (c)));
1244 break;
1245 }
1246 default:
1247 scm_i_string_set_x (buf, len++, chr);
1248 break;
1249 }
1250 }
1251 else
1252 scm_i_string_set_x (buf, len++, chr);
1253
1254 if (len >= scm_i_string_length (buf) - 2)
1255 {
1256 SCM addy;
1257
1258 scm_i_string_stop_writing ();
1259 addy = scm_i_make_string (1024, NULL, 0);
1260 buf = scm_string_append (scm_list_2 (buf, addy));
1261 len = 0;
1262 buf = scm_i_string_start_writing (buf);
1263 }
1264 }
1265
1266 done:
1267 scm_i_string_stop_writing ();
1268 if (chr == EOF)
1269 scm_i_input_error ("scm_read_extended_symbol", port,
1270 "end of file while reading symbol", SCM_EOL);
1271
1272 return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1273 }
1274
1275
1276 \f
1277 /* Top-level token readers, i.e., dispatchers. */
1278
1279 static SCM
1280 scm_read_sharp_extension (int chr, SCM port)
1281 {
1282 SCM proc;
1283
1284 proc = scm_get_hash_procedure (chr);
1285 if (scm_is_true (scm_procedure_p (proc)))
1286 {
1287 long line = SCM_LINUM (port);
1288 int column = SCM_COL (port) - 2;
1289 SCM got;
1290
1291 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1292
1293 if (scm_is_pair (got) && !scm_i_has_source_properties (got))
1294 scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1295
1296 return got;
1297 }
1298
1299 return SCM_UNSPECIFIED;
1300 }
1301
1302 /* The reader for the sharp `#' character. It basically dispatches reads
1303 among the above token readers. */
1304 static SCM
1305 scm_read_sharp (scm_t_wchar chr, SCM port)
1306 #define FUNC_NAME "scm_lreadr"
1307 {
1308 SCM result;
1309
1310 chr = scm_getc_unlocked (port);
1311
1312 result = scm_read_sharp_extension (chr, port);
1313 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1314 return result;
1315
1316 switch (chr)
1317 {
1318 case '\\':
1319 return (scm_read_character (chr, port));
1320 case '(':
1321 return (scm_read_vector (chr, port));
1322 case 's':
1323 case 'u':
1324 case 'f':
1325 case 'c':
1326 /* This one may return either a boolean or an SRFI-4 vector. */
1327 return (scm_read_srfi4_vector (chr, port));
1328 case 'v':
1329 return (scm_read_bytevector (chr, port));
1330 case '*':
1331 return (scm_read_guile_bit_vector (chr, port));
1332 case 't':
1333 case 'T':
1334 case 'F':
1335 /* This one may return either a boolean or an SRFI-4 vector. */
1336 return (scm_read_boolean (chr, port));
1337 case ':':
1338 return (scm_read_keyword (chr, port));
1339 case '0': case '1': case '2': case '3': case '4':
1340 case '5': case '6': case '7': case '8': case '9':
1341 case '@':
1342 return (scm_i_read_array (port, chr));
1343
1344 case 'i':
1345 case 'e':
1346 case 'b':
1347 case 'B':
1348 case 'o':
1349 case 'O':
1350 case 'd':
1351 case 'D':
1352 case 'x':
1353 case 'X':
1354 case 'I':
1355 case 'E':
1356 return (scm_read_number_and_radix (chr, port));
1357 case '{':
1358 return (scm_read_extended_symbol (chr, port));
1359 case '!':
1360 return (scm_read_shebang (chr, port));
1361 case ';':
1362 return (scm_read_commented_expression (chr, port));
1363 case '`':
1364 case '\'':
1365 case ',':
1366 return (scm_read_syntax (chr, port));
1367 case 'n':
1368 return (scm_read_nil (chr, port));
1369 default:
1370 result = scm_read_sharp_extension (chr, port);
1371 if (scm_is_eq (result, SCM_UNSPECIFIED))
1372 {
1373 /* To remain compatible with 1.8 and earlier, the following
1374 characters have lower precedence than `read-hash-extend'
1375 characters. */
1376 switch (chr)
1377 {
1378 case '|':
1379 return scm_read_r6rs_block_comment (chr, port);
1380 default:
1381 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1382 scm_list_1 (SCM_MAKE_CHAR (chr)));
1383 }
1384 }
1385 else
1386 return result;
1387 }
1388
1389 return SCM_UNSPECIFIED;
1390 }
1391 #undef FUNC_NAME
1392
1393 static SCM
1394 scm_read_expression (SCM port)
1395 #define FUNC_NAME "scm_read_expression"
1396 {
1397 while (1)
1398 {
1399 register scm_t_wchar chr;
1400
1401 chr = scm_getc_unlocked (port);
1402
1403 switch (chr)
1404 {
1405 case SCM_WHITE_SPACES:
1406 case SCM_LINE_INCREMENTORS:
1407 break;
1408 case ';':
1409 (void) scm_read_semicolon_comment (chr, port);
1410 break;
1411 case '[':
1412 if (!SCM_SQUARE_BRACKETS_P)
1413 return (scm_read_mixed_case_symbol (chr, port));
1414 /* otherwise fall through */
1415 case '(':
1416 return (scm_read_sexp (chr, port));
1417 case '"':
1418 return (scm_read_string (chr, port));
1419 case '\'':
1420 case '`':
1421 case ',':
1422 return (scm_read_quote (chr, port));
1423 case '#':
1424 {
1425 SCM result;
1426 result = scm_read_sharp (chr, port);
1427 if (scm_is_eq (result, SCM_UNSPECIFIED))
1428 /* We read a comment or some such. */
1429 break;
1430 else
1431 return result;
1432 }
1433 case ')':
1434 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1435 break;
1436 case ']':
1437 if (SCM_SQUARE_BRACKETS_P)
1438 scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1439 /* otherwise fall through */
1440 case EOF:
1441 return SCM_EOF_VAL;
1442 case ':':
1443 if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1444 return scm_symbol_to_keyword (scm_read_expression (port));
1445 /* Fall through. */
1446
1447 default:
1448 {
1449 if (((chr >= '0') && (chr <= '9'))
1450 || (strchr ("+-.", chr)))
1451 return (scm_read_number (chr, port));
1452 else
1453 return (scm_read_mixed_case_symbol (chr, port));
1454 }
1455 }
1456 }
1457 }
1458 #undef FUNC_NAME
1459
1460 \f
1461 /* Actual reader. */
1462
1463 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1464 (SCM port),
1465 "Read an s-expression from the input port @var{port}, or from\n"
1466 "the current input port if @var{port} is not specified.\n"
1467 "Any whitespace before the next token is discarded.")
1468 #define FUNC_NAME s_scm_read
1469 {
1470 int c;
1471
1472 if (SCM_UNBNDP (port))
1473 port = scm_current_input_port ();
1474 SCM_VALIDATE_OPINPORT (1, port);
1475
1476 c = flush_ws (port, (char *) NULL);
1477 if (EOF == c)
1478 return SCM_EOF_VAL;
1479 scm_ungetc_unlocked (c, port);
1480
1481 return (scm_read_expression (port));
1482 }
1483 #undef FUNC_NAME
1484
1485
1486 \f
1487
1488 /* Manipulate the read-hash-procedures alist. This could be written in
1489 Scheme, but maybe it will also be used by C code during initialisation. */
1490 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1491 (SCM chr, SCM proc),
1492 "Install the procedure @var{proc} for reading expressions\n"
1493 "starting with the character sequence @code{#} and @var{chr}.\n"
1494 "@var{proc} will be called with two arguments: the character\n"
1495 "@var{chr} and the port to read further data from. The object\n"
1496 "returned will be the return value of @code{read}. \n"
1497 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1498 )
1499 #define FUNC_NAME s_scm_read_hash_extend
1500 {
1501 SCM this;
1502 SCM prev;
1503
1504 SCM_VALIDATE_CHAR (1, chr);
1505 SCM_ASSERT (scm_is_false (proc)
1506 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1507 proc, SCM_ARG2, FUNC_NAME);
1508
1509 /* Check if chr is already in the alist. */
1510 this = scm_i_read_hash_procedures_ref ();
1511 prev = SCM_BOOL_F;
1512 while (1)
1513 {
1514 if (scm_is_null (this))
1515 {
1516 /* not found, so add it to the beginning. */
1517 if (scm_is_true (proc))
1518 {
1519 SCM new = scm_cons (scm_cons (chr, proc),
1520 scm_i_read_hash_procedures_ref ());
1521 scm_i_read_hash_procedures_set_x (new);
1522 }
1523 break;
1524 }
1525 if (scm_is_eq (chr, SCM_CAAR (this)))
1526 {
1527 /* already in the alist. */
1528 if (scm_is_false (proc))
1529 {
1530 /* remove it. */
1531 if (scm_is_false (prev))
1532 {
1533 SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1534 scm_i_read_hash_procedures_set_x (rest);
1535 }
1536 else
1537 scm_set_cdr_x (prev, SCM_CDR (this));
1538 }
1539 else
1540 {
1541 /* replace it. */
1542 scm_set_cdr_x (SCM_CAR (this), proc);
1543 }
1544 break;
1545 }
1546 prev = this;
1547 this = SCM_CDR (this);
1548 }
1549
1550 return SCM_UNSPECIFIED;
1551 }
1552 #undef FUNC_NAME
1553
1554 /* Recover the read-hash procedure corresponding to char c. */
1555 static SCM
1556 scm_get_hash_procedure (int c)
1557 {
1558 SCM rest = scm_i_read_hash_procedures_ref ();
1559
1560 while (1)
1561 {
1562 if (scm_is_null (rest))
1563 return SCM_BOOL_F;
1564
1565 if (SCM_CHAR (SCM_CAAR (rest)) == c)
1566 return SCM_CDAR (rest);
1567
1568 rest = SCM_CDR (rest);
1569 }
1570 }
1571
1572 #define SCM_ENCODING_SEARCH_SIZE (500)
1573
1574 /* Search the first few hundred characters of a file for an Emacs-like coding
1575 declaration. Returns either NULL or a string whose storage has been
1576 allocated with `scm_gc_malloc ()'. */
1577 char *
1578 scm_i_scan_for_encoding (SCM port)
1579 {
1580 scm_t_port *pt;
1581 char header[SCM_ENCODING_SEARCH_SIZE+1];
1582 size_t bytes_read, encoding_length, i;
1583 char *encoding = NULL;
1584 int utf8_bom = 0;
1585 char *pos, *encoding_start;
1586 int in_comment;
1587
1588 pt = SCM_PTAB_ENTRY (port);
1589
1590 if (pt->rw_active == SCM_PORT_WRITE)
1591 scm_flush_unlocked (port);
1592
1593 if (pt->rw_random)
1594 pt->rw_active = SCM_PORT_READ;
1595
1596 if (pt->read_pos == pt->read_end)
1597 {
1598 /* We can use the read buffer, and thus avoid a seek. */
1599 if (scm_fill_input_unlocked (port) == EOF)
1600 return NULL;
1601
1602 bytes_read = pt->read_end - pt->read_pos;
1603 if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
1604 bytes_read = SCM_ENCODING_SEARCH_SIZE;
1605
1606 if (bytes_read <= 1)
1607 /* An unbuffered port -- don't scan. */
1608 return NULL;
1609
1610 memcpy (header, pt->read_pos, bytes_read);
1611 header[bytes_read] = '\0';
1612 }
1613 else
1614 {
1615 /* Try to read some bytes and then seek back. Not all ports
1616 support seeking back; and indeed some file ports (like
1617 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
1618 check performed by SCM_FPORT_FDES---but fail to seek
1619 backwards. Hence this block comes second. We prefer to use
1620 the read buffer in-place. */
1621 if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1622 return NULL;
1623
1624 bytes_read = scm_c_read_unlocked (port, header, SCM_ENCODING_SEARCH_SIZE);
1625 header[bytes_read] = '\0';
1626 scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1627 }
1628
1629 if (bytes_read > 3
1630 && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1631 utf8_bom = 1;
1632
1633 /* search past "coding[:=]" */
1634 pos = header;
1635 while (1)
1636 {
1637 if ((pos = strstr(pos, "coding")) == NULL)
1638 return NULL;
1639
1640 pos += strlen("coding");
1641 if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1642 (*pos == ':' || *pos == '='))
1643 {
1644 pos ++;
1645 break;
1646 }
1647 }
1648
1649 /* skip spaces */
1650 while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1651 (*pos == ' ' || *pos == '\t'))
1652 pos ++;
1653
1654 /* grab the next token */
1655 encoding_start = pos;
1656 i = 0;
1657 while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
1658 && encoding_start + i - header < bytes_read
1659 && (isalnum ((int) encoding_start[i])
1660 || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
1661 i++;
1662
1663 encoding_length = i;
1664 if (encoding_length == 0)
1665 return NULL;
1666
1667 encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
1668 for (i = 0; i < encoding_length; i++)
1669 encoding[i] = toupper ((int) encoding[i]);
1670
1671 /* push backwards to make sure we were in a comment */
1672 in_comment = 0;
1673 pos = encoding_start;
1674 while (pos >= header)
1675 {
1676 if (*pos == ';')
1677 {
1678 in_comment = 1;
1679 break;
1680 }
1681 else if (*pos == '\n' || pos == header)
1682 {
1683 /* This wasn't in a semicolon comment. Check for a
1684 hash-bang comment. */
1685 char *beg = strstr (header, "#!");
1686 char *end = strstr (header, "!#");
1687 if (beg < encoding_start && encoding_start + encoding_length <= end)
1688 in_comment = 1;
1689 break;
1690 }
1691 else
1692 {
1693 pos --;
1694 continue;
1695 }
1696 }
1697 if (!in_comment)
1698 /* This wasn't in a comment */
1699 return NULL;
1700
1701 if (utf8_bom && strcmp(encoding, "UTF-8"))
1702 scm_misc_error (NULL,
1703 "the port input declares the encoding ~s but is encoded as UTF-8",
1704 scm_list_1 (scm_from_locale_string (encoding)));
1705
1706 return encoding;
1707 }
1708
1709 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1710 (SCM port),
1711 "Scans the port for an Emacs-like character coding declaration\n"
1712 "near the top of the contents of a port with random-accessible contents.\n"
1713 "The coding declaration is of the form\n"
1714 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1715 "\n"
1716 "Returns a string containing the character encoding of the file\n"
1717 "if a declaration was found, or @code{#f} otherwise.\n")
1718 #define FUNC_NAME s_scm_file_encoding
1719 {
1720 char *enc;
1721 SCM s_enc;
1722
1723 SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
1724
1725 enc = scm_i_scan_for_encoding (port);
1726 if (enc == NULL)
1727 return SCM_BOOL_F;
1728 else
1729 {
1730 s_enc = scm_from_locale_string (enc);
1731 return s_enc;
1732 }
1733
1734 return SCM_BOOL_F;
1735 }
1736 #undef FUNC_NAME
1737
1738 void
1739 scm_init_read ()
1740 {
1741 SCM read_hash_procs;
1742
1743 read_hash_procs = scm_make_fluid ();
1744 scm_fluid_set_x (read_hash_procs, SCM_EOL);
1745
1746 scm_i_read_hash_procedures =
1747 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
1748
1749 scm_init_opts (scm_read_options, scm_read_opts);
1750 #include "libguile/read.x"
1751 }
1752
1753 /*
1754 Local Variables:
1755 c-file-style: "gnu"
1756 End:
1757 */