add read syntax for #nil
[bpt/guile.git] / libguile / read.c
1 /* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009, 2010 Free Software
2 * Foundation, Inc.
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
18 */
19
20
21 \f
22
23 #ifdef HAVE_CONFIG_H
24 # include <config.h>
25 #endif
26
27 #include <stdio.h>
28 #include <ctype.h>
29 #include <string.h>
30 #include <unistd.h>
31 #include <unicase.h>
32
33 #include "libguile/_scm.h"
34 #include "libguile/bytevectors.h"
35 #include "libguile/chars.h"
36 #include "libguile/eval.h"
37 #include "libguile/arrays.h"
38 #include "libguile/bitvectors.h"
39 #include "libguile/keywords.h"
40 #include "libguile/alist.h"
41 #include "libguile/srcprop.h"
42 #include "libguile/hashtab.h"
43 #include "libguile/hash.h"
44 #include "libguile/ports.h"
45 #include "libguile/fports.h"
46 #include "libguile/root.h"
47 #include "libguile/strings.h"
48 #include "libguile/strports.h"
49 #include "libguile/vectors.h"
50 #include "libguile/validate.h"
51 #include "libguile/srfi-4.h"
52 #include "libguile/srfi-13.h"
53
54 #include "libguile/read.h"
55 #include "libguile/private-options.h"
56
57
58 \f
59
60 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
61 SCM_SYMBOL (scm_keyword_prefix, "prefix");
62 SCM_SYMBOL (scm_keyword_postfix, "postfix");
63 SCM_SYMBOL (sym_nil, "nil");
64
65 scm_t_option scm_read_opts[] = {
66 { SCM_OPTION_BOOLEAN, "copy", 0,
67 "Copy source code expressions." },
68 { SCM_OPTION_BOOLEAN, "positions", 0,
69 "Record positions of source code expressions." },
70 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
71 "Convert symbols to lower case."},
72 { SCM_OPTION_SCM, "keywords", (unsigned long) SCM_BOOL_F,
73 "Style of keyword recognition: #f, 'prefix or 'postfix."},
74 { SCM_OPTION_BOOLEAN, "elisp-vectors", 0,
75 "Support Elisp vector syntax, namely `[...]'."},
76 { SCM_OPTION_BOOLEAN, "elisp-strings", 0,
77 "Support `\\(' and `\\)' in strings."},
78 { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
79 "Use R6RS variable-length character and string hex escapes."},
80 { SCM_OPTION_BOOLEAN, "square-brackets", 1,
81 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
82 { 0, },
83 };
84
85 /*
86 Give meaningful error messages for errors
87
88 We use the format
89
90 FILE:LINE:COL: MESSAGE
91 This happened in ....
92
93 This is not standard GNU format, but the test-suite likes the real
94 message to be in front.
95
96 */
97
98
99 void
100 scm_i_input_error (char const *function,
101 SCM port, const char *message, SCM arg)
102 {
103 SCM fn = (scm_is_string (SCM_FILENAME(port))
104 ? SCM_FILENAME(port)
105 : scm_from_locale_string ("#<unknown port>"));
106
107 SCM string_port = scm_open_output_string ();
108 SCM string = SCM_EOL;
109 scm_simple_format (string_port,
110 scm_from_locale_string ("~A:~S:~S: ~A"),
111 scm_list_4 (fn,
112 scm_from_long (SCM_LINUM (port) + 1),
113 scm_from_int (SCM_COL (port) + 1),
114 scm_from_locale_string (message)));
115
116 string = scm_get_output_string (string_port);
117 scm_close_output_port (string_port);
118 scm_error_scm (scm_from_locale_symbol ("read-error"),
119 function? scm_from_locale_string (function) : SCM_BOOL_F,
120 string,
121 arg,
122 SCM_BOOL_F);
123 }
124
125
126 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
127 (SCM setting),
128 "Option interface for the read options. Instead of using\n"
129 "this procedure directly, use the procedures @code{read-enable},\n"
130 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
131 #define FUNC_NAME s_scm_read_options
132 {
133 SCM ans = scm_options (setting,
134 scm_read_opts,
135 FUNC_NAME);
136 if (SCM_COPY_SOURCE_P)
137 SCM_RECORD_POSITIONS_P = 1;
138 return ans;
139 }
140 #undef FUNC_NAME
141
142 /* An association list mapping extra hash characters to procedures. */
143 static SCM *scm_read_hash_procedures;
144
145
146 \f
147 /* Token readers. */
148
149
150 /* Size of the C buffer used to read symbols and numbers. */
151 #define READER_BUFFER_SIZE 128
152
153 /* Size of the C buffer used to read strings. */
154 #define READER_STRING_BUFFER_SIZE 512
155
156 /* The maximum size of Scheme character names. */
157 #define READER_CHAR_NAME_MAX_SIZE 50
158
159
160 /* `isblank' is only in C99. */
161 #define CHAR_IS_BLANK_(_chr) \
162 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
163 || ((_chr) == '\f') || ((_chr) == '\r'))
164
165 #ifdef MSDOS
166 # define CHAR_IS_BLANK(_chr) \
167 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
168 #else
169 # define CHAR_IS_BLANK CHAR_IS_BLANK_
170 #endif
171
172
173 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
174 structure''). */
175 #define CHAR_IS_R5RS_DELIMITER(c) \
176 (CHAR_IS_BLANK (c) \
177 || (c == ')') || (c == '(') || (c == ';') || (c == '"') \
178 || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
179
180 #define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
181
182 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
183 Structure''. */
184 #define CHAR_IS_EXPONENT_MARKER(_chr) \
185 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
186 || ((_chr) == 'd') || ((_chr) == 'l'))
187
188 /* Read an SCSH block comment. */
189 static inline SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
190 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
191 static SCM scm_read_commented_expression (scm_t_wchar, SCM);
192 static SCM scm_get_hash_procedure (int);
193
194 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
195 result in the pre-allocated buffer BUF. Return zero if the whole token has
196 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
197 bytes actually read. */
198 static inline int
199 read_token (SCM port, char *buf, const size_t buf_size, size_t *read)
200 {
201 *read = 0;
202
203 while (*read < buf_size)
204 {
205 int chr;
206
207 chr = scm_get_byte_or_eof (port);
208
209 if (chr == EOF)
210 return 0;
211 else if (CHAR_IS_DELIMITER (chr))
212 {
213 scm_unget_byte (chr, port);
214 return 0;
215 }
216 else
217 {
218 *buf = (char) chr;
219 buf++, (*read)++;
220 }
221 }
222
223 return 1;
224 }
225
226 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
227 result in the pre-allocated buffer BUFFER, if the whole token has fewer than
228 BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
229 caller. Return zero if the token fits in BUFFER, non-zero otherwise. READ
230 will be set the number of bytes actually read. */
231 static int
232 read_complete_token (SCM port, char *buffer, const size_t buffer_size,
233 char **overflow_buffer, size_t *read)
234 {
235 int overflow = 0;
236 size_t bytes_read, overflow_size;
237
238 *overflow_buffer = NULL;
239 overflow_size = 0;
240
241 do
242 {
243 overflow = read_token (port, buffer, buffer_size, &bytes_read);
244 if (bytes_read == 0)
245 break;
246 if (overflow || overflow_size != 0)
247 {
248 if (overflow_size == 0)
249 {
250 *overflow_buffer = scm_malloc (bytes_read);
251 memcpy (*overflow_buffer, buffer, bytes_read);
252 overflow_size = bytes_read;
253 }
254 else
255 {
256 *overflow_buffer = scm_realloc (*overflow_buffer, overflow_size + bytes_read);
257 memcpy (*overflow_buffer + overflow_size, buffer, bytes_read);
258 overflow_size += bytes_read;
259 }
260 }
261 }
262 while (overflow);
263
264 if (overflow_size)
265 *read = overflow_size;
266 else
267 *read = bytes_read;
268
269 return (overflow_size != 0);
270 }
271
272 /* Skip whitespace from PORT and return the first non-whitespace character
273 read. Raise an error on end-of-file. */
274 static int
275 flush_ws (SCM port, const char *eoferr)
276 {
277 register scm_t_wchar c;
278 while (1)
279 switch (c = scm_getc (port))
280 {
281 case EOF:
282 goteof:
283 if (eoferr)
284 {
285 scm_i_input_error (eoferr,
286 port,
287 "end of file",
288 SCM_EOL);
289 }
290 return c;
291
292 case ';':
293 lp:
294 switch (c = scm_getc (port))
295 {
296 case EOF:
297 goto goteof;
298 default:
299 goto lp;
300 case SCM_LINE_INCREMENTORS:
301 break;
302 }
303 break;
304
305 case '#':
306 switch (c = scm_getc (port))
307 {
308 case EOF:
309 eoferr = "read_sharp";
310 goto goteof;
311 case '!':
312 scm_read_scsh_block_comment (c, port);
313 break;
314 case ';':
315 scm_read_commented_expression (c, port);
316 break;
317 case '|':
318 if (scm_is_false (scm_get_hash_procedure (c)))
319 {
320 scm_read_r6rs_block_comment (c, port);
321 break;
322 }
323 /* fall through */
324 default:
325 scm_ungetc (c, port);
326 return '#';
327 }
328 break;
329
330 case SCM_LINE_INCREMENTORS:
331 case SCM_SINGLE_SPACES:
332 case '\t':
333 break;
334
335 default:
336 return c;
337 }
338
339 return 0;
340 }
341
342
343 \f
344 /* Token readers. */
345
346 static SCM scm_read_expression (SCM port);
347 static SCM scm_read_sharp (int chr, SCM port);
348 static SCM recsexpr (SCM obj, long line, int column, SCM filename);
349
350
351 static SCM
352 scm_read_sexp (scm_t_wchar chr, SCM port)
353 #define FUNC_NAME "scm_i_lreadparen"
354 {
355 register int c;
356 register SCM tmp;
357 register SCM tl, ans = SCM_EOL;
358 SCM tl2 = SCM_EOL, ans2 = SCM_EOL, copy = SCM_BOOL_F;
359 const int terminating_char = ((chr == '[') ? ']' : ')');
360
361 /* Need to capture line and column numbers here. */
362 long line = SCM_LINUM (port);
363 int column = SCM_COL (port) - 1;
364
365
366 c = flush_ws (port, FUNC_NAME);
367 if (terminating_char == c)
368 return SCM_EOL;
369
370 scm_ungetc (c, port);
371 if (scm_is_eq (scm_sym_dot,
372 (tmp = scm_read_expression (port))))
373 {
374 ans = scm_read_expression (port);
375 if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
376 scm_i_input_error (FUNC_NAME, port, "missing close paren",
377 SCM_EOL);
378 return ans;
379 }
380
381 /* Build the head of the list structure. */
382 ans = tl = scm_cons (tmp, SCM_EOL);
383
384 if (SCM_COPY_SOURCE_P)
385 ans2 = tl2 = scm_cons (scm_is_pair (tmp)
386 ? copy
387 : tmp,
388 SCM_EOL);
389
390 while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
391 {
392 SCM new_tail;
393
394 scm_ungetc (c, port);
395 if (scm_is_eq (scm_sym_dot,
396 (tmp = scm_read_expression (port))))
397 {
398 SCM_SETCDR (tl, tmp = scm_read_expression (port));
399
400 if (SCM_COPY_SOURCE_P)
401 SCM_SETCDR (tl2, scm_cons (scm_is_pair (tmp) ? copy : tmp,
402 SCM_EOL));
403
404 c = flush_ws (port, FUNC_NAME);
405 if (terminating_char != c)
406 scm_i_input_error (FUNC_NAME, port,
407 "in pair: missing close paren", SCM_EOL);
408 goto exit;
409 }
410
411 new_tail = scm_cons (tmp, SCM_EOL);
412 SCM_SETCDR (tl, new_tail);
413 tl = new_tail;
414
415 if (SCM_COPY_SOURCE_P)
416 {
417 SCM new_tail2 = scm_cons (scm_is_pair (tmp)
418 ? copy
419 : tmp, SCM_EOL);
420 SCM_SETCDR (tl2, new_tail2);
421 tl2 = new_tail2;
422 }
423 }
424
425 exit:
426 if (SCM_RECORD_POSITIONS_P)
427 scm_whash_insert (scm_source_whash,
428 ans,
429 scm_make_srcprops (line, column,
430 SCM_FILENAME (port),
431 SCM_COPY_SOURCE_P
432 ? ans2
433 : SCM_UNDEFINED,
434 SCM_EOL));
435 return ans;
436 }
437 #undef FUNC_NAME
438
439
440 /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
441 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
442 found. */
443 #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
444 do \
445 { \
446 scm_t_wchar a; \
447 size_t i = 0; \
448 c = 0; \
449 while (i < ndigits) \
450 { \
451 a = scm_getc (port); \
452 if (a == EOF) \
453 goto str_eof; \
454 if (terminator \
455 && (a == (scm_t_wchar) terminator) \
456 && (i > 0)) \
457 break; \
458 if ('0' <= a && a <= '9') \
459 a -= '0'; \
460 else if ('A' <= a && a <= 'F') \
461 a = a - 'A' + 10; \
462 else if ('a' <= a && a <= 'f') \
463 a = a - 'a' + 10; \
464 else \
465 { \
466 c = a; \
467 goto bad_escaped; \
468 } \
469 c = c * 16 + a; \
470 i ++; \
471 } \
472 } while (0)
473
474 static SCM
475 scm_read_string (int chr, SCM port)
476 #define FUNC_NAME "scm_lreadr"
477 {
478 /* For strings smaller than C_STR, this function creates only one Scheme
479 object (the string returned). */
480
481 SCM str = SCM_BOOL_F;
482 unsigned c_str_len = 0;
483 scm_t_wchar c;
484
485 str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
486 while ('"' != (c = scm_getc (port)))
487 {
488 if (c == EOF)
489 {
490 str_eof:
491 scm_i_input_error (FUNC_NAME, port,
492 "end of file in string constant", SCM_EOL);
493 }
494
495 if (c_str_len + 1 >= scm_i_string_length (str))
496 {
497 SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
498
499 str = scm_string_append (scm_list_2 (str, addy));
500 }
501
502 if (c == '\\')
503 {
504 switch (c = scm_getc (port))
505 {
506 case EOF:
507 goto str_eof;
508 case '"':
509 case '\\':
510 break;
511 case '(':
512 case ')':
513 if (SCM_ESCAPED_PARENS_P)
514 break;
515 goto bad_escaped;
516 case '\n':
517 continue;
518 case '0':
519 c = '\0';
520 break;
521 case 'f':
522 c = '\f';
523 break;
524 case 'n':
525 c = '\n';
526 break;
527 case 'r':
528 c = '\r';
529 break;
530 case 't':
531 c = '\t';
532 break;
533 case 'a':
534 c = '\007';
535 break;
536 case 'v':
537 c = '\v';
538 break;
539 case 'b':
540 c = '\010';
541 break;
542 case 'x':
543 if (SCM_R6RS_ESCAPES_P)
544 SCM_READ_HEX_ESCAPE (10, ';');
545 else
546 SCM_READ_HEX_ESCAPE (2, '\0');
547 break;
548 case 'u':
549 if (!SCM_R6RS_ESCAPES_P)
550 {
551 SCM_READ_HEX_ESCAPE (4, '\0');
552 break;
553 }
554 case 'U':
555 if (!SCM_R6RS_ESCAPES_P)
556 {
557 SCM_READ_HEX_ESCAPE (6, '\0');
558 break;
559 }
560 default:
561 bad_escaped:
562 scm_i_input_error (FUNC_NAME, port,
563 "illegal character in escape sequence: ~S",
564 scm_list_1 (SCM_MAKE_CHAR (c)));
565 }
566 }
567 str = scm_i_string_start_writing (str);
568 scm_i_string_set_x (str, c_str_len++, c);
569 scm_i_string_stop_writing ();
570 }
571
572 if (c_str_len > 0)
573 {
574 return scm_i_substring_copy (str, 0, c_str_len);
575 }
576
577 return scm_nullstr;
578 }
579 #undef FUNC_NAME
580
581
582 static SCM
583 scm_read_number (scm_t_wchar chr, SCM port)
584 {
585 SCM result, str = SCM_EOL;
586 char buffer[READER_BUFFER_SIZE];
587 char *overflow_buffer = NULL;
588 size_t bytes_read;
589 int overflow;
590 scm_t_port *pt = SCM_PTAB_ENTRY (port);
591
592 scm_ungetc (chr, port);
593 overflow = read_complete_token (port, buffer, sizeof (buffer),
594 &overflow_buffer, &bytes_read);
595
596 if (!overflow)
597 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
598 else
599 str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
600 pt->ilseq_handler);
601
602 result = scm_string_to_number (str, SCM_UNDEFINED);
603 if (!scm_is_true (result))
604 {
605 /* Return a symbol instead of a number */
606 if (SCM_CASE_INSENSITIVE_P)
607 str = scm_string_downcase_x (str);
608 result = scm_string_to_symbol (str);
609 }
610
611 if (overflow)
612 free (overflow_buffer);
613 SCM_COL (port) += scm_i_string_length (str);
614 return result;
615 }
616
617 static SCM
618 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
619 {
620 SCM result;
621 int ends_with_colon = 0;
622 size_t bytes_read;
623 int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
624 int overflow;
625 char buffer[READER_BUFFER_SIZE], *overflow_buffer;
626 scm_t_port *pt = SCM_PTAB_ENTRY (port);
627 SCM str;
628
629 scm_ungetc (chr, port);
630 overflow = read_complete_token (port, buffer, READER_BUFFER_SIZE,
631 &overflow_buffer, &bytes_read);
632 if (bytes_read > 0)
633 {
634 if (!overflow)
635 ends_with_colon = buffer[bytes_read - 1] == ':';
636 else
637 ends_with_colon = overflow_buffer[bytes_read - 1] == ':';
638 }
639
640 if (postfix && ends_with_colon && (bytes_read > 1))
641 {
642 if (!overflow)
643 str = scm_from_stringn (buffer, bytes_read - 1, pt->encoding, pt->ilseq_handler);
644 else
645 str = scm_from_stringn (overflow_buffer, bytes_read - 1, pt->encoding,
646 pt->ilseq_handler);
647
648 if (SCM_CASE_INSENSITIVE_P)
649 str = scm_string_downcase_x (str);
650 result = scm_symbol_to_keyword (scm_string_to_symbol (str));
651 }
652 else
653 {
654 if (!overflow)
655 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
656 else
657 str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
658 pt->ilseq_handler);
659
660 if (SCM_CASE_INSENSITIVE_P)
661 str = scm_string_downcase_x (str);
662 result = scm_string_to_symbol (str);
663 }
664
665 if (overflow)
666 free (overflow_buffer);
667 SCM_COL (port) += scm_i_string_length (str);
668 return result;
669 }
670
671 static SCM
672 scm_read_number_and_radix (scm_t_wchar chr, SCM port)
673 #define FUNC_NAME "scm_lreadr"
674 {
675 SCM result;
676 size_t read;
677 char buffer[READER_BUFFER_SIZE], *overflow_buffer;
678 int overflow;
679 unsigned int radix;
680 SCM str;
681 scm_t_port *pt;
682
683 switch (chr)
684 {
685 case 'B':
686 case 'b':
687 radix = 2;
688 break;
689
690 case 'o':
691 case 'O':
692 radix = 8;
693 break;
694
695 case 'd':
696 case 'D':
697 radix = 10;
698 break;
699
700 case 'x':
701 case 'X':
702 radix = 16;
703 break;
704
705 default:
706 scm_ungetc (chr, port);
707 scm_ungetc ('#', port);
708 radix = 10;
709 }
710
711 overflow = read_complete_token (port, buffer, sizeof (buffer),
712 &overflow_buffer, &read);
713
714 pt = SCM_PTAB_ENTRY (port);
715 if (!overflow)
716 str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
717 else
718 str = scm_from_stringn (overflow_buffer, read, pt->encoding,
719 pt->ilseq_handler);
720
721 result = scm_string_to_number (str, scm_from_uint (radix));
722
723 if (overflow)
724 free (overflow_buffer);
725
726 SCM_COL (port) += scm_i_string_length (str);
727
728 if (scm_is_true (result))
729 return result;
730
731 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
732
733 return SCM_BOOL_F;
734 }
735 #undef FUNC_NAME
736
737 static SCM
738 scm_read_quote (int chr, SCM port)
739 {
740 SCM p;
741 long line = SCM_LINUM (port);
742 int column = SCM_COL (port) - 1;
743
744 switch (chr)
745 {
746 case '`':
747 p = scm_sym_quasiquote;
748 break;
749
750 case '\'':
751 p = scm_sym_quote;
752 break;
753
754 case ',':
755 {
756 scm_t_wchar c;
757
758 c = scm_getc (port);
759 if ('@' == c)
760 p = scm_sym_uq_splicing;
761 else
762 {
763 scm_ungetc (c, port);
764 p = scm_sym_unquote;
765 }
766 break;
767 }
768
769 default:
770 fprintf (stderr, "%s: unhandled quote character (%i)\n",
771 "scm_read_quote", chr);
772 abort ();
773 }
774
775 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
776 if (SCM_RECORD_POSITIONS_P)
777 scm_whash_insert (scm_source_whash, p,
778 scm_make_srcprops (line, column,
779 SCM_FILENAME (port),
780 SCM_COPY_SOURCE_P
781 ? (scm_cons2 (SCM_CAR (p),
782 SCM_CAR (SCM_CDR (p)),
783 SCM_EOL))
784 : SCM_UNDEFINED,
785 SCM_EOL));
786
787
788 return p;
789 }
790
791 SCM_SYMBOL (sym_syntax, "syntax");
792 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
793 SCM_SYMBOL (sym_unsyntax, "unsyntax");
794 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
795
796 static SCM
797 scm_read_syntax (int chr, SCM port)
798 {
799 SCM p;
800 long line = SCM_LINUM (port);
801 int column = SCM_COL (port) - 1;
802
803 switch (chr)
804 {
805 case '`':
806 p = sym_quasisyntax;
807 break;
808
809 case '\'':
810 p = sym_syntax;
811 break;
812
813 case ',':
814 {
815 int c;
816
817 c = scm_getc (port);
818 if ('@' == c)
819 p = sym_unsyntax_splicing;
820 else
821 {
822 scm_ungetc (c, port);
823 p = sym_unsyntax;
824 }
825 break;
826 }
827
828 default:
829 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
830 "scm_read_syntax", chr);
831 abort ();
832 }
833
834 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
835 if (SCM_RECORD_POSITIONS_P)
836 scm_whash_insert (scm_source_whash, p,
837 scm_make_srcprops (line, column,
838 SCM_FILENAME (port),
839 SCM_COPY_SOURCE_P
840 ? (scm_cons2 (SCM_CAR (p),
841 SCM_CAR (SCM_CDR (p)),
842 SCM_EOL))
843 : SCM_UNDEFINED,
844 SCM_EOL));
845
846
847 return p;
848 }
849
850 static inline SCM
851 scm_read_nil (int chr, SCM port)
852 {
853 SCM id = scm_read_mixed_case_symbol (chr, port);
854
855 if (!scm_is_eq (id, sym_nil))
856 scm_i_input_error ("scm_read_nil", port,
857 "unexpected input while reading #nil: ~a",
858 scm_list_1 (id));
859
860 return SCM_ELISP_NIL;
861 }
862
863 static inline SCM
864 scm_read_semicolon_comment (int chr, SCM port)
865 {
866 int c;
867
868 /* We use the get_byte here because there is no need to get the
869 locale correct with comment input. This presumes that newline
870 always represents itself no matter what the encoding is. */
871 for (c = scm_get_byte_or_eof (port);
872 (c != EOF) && (c != '\n');
873 c = scm_get_byte_or_eof (port));
874
875 return SCM_UNSPECIFIED;
876 }
877
878 \f
879 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
880
881 static SCM
882 scm_read_boolean (int chr, SCM port)
883 {
884 switch (chr)
885 {
886 case 't':
887 case 'T':
888 return SCM_BOOL_T;
889
890 case 'f':
891 case 'F':
892 return SCM_BOOL_F;
893 }
894
895 return SCM_UNSPECIFIED;
896 }
897
898 static SCM
899 scm_read_character (scm_t_wchar chr, SCM port)
900 #define FUNC_NAME "scm_lreadr"
901 {
902 char buffer[READER_CHAR_NAME_MAX_SIZE];
903 SCM charname;
904 size_t charname_len, bytes_read;
905 scm_t_wchar cp;
906 int overflow;
907 scm_t_port *pt;
908
909 overflow = read_token (port, buffer, READER_CHAR_NAME_MAX_SIZE, &bytes_read);
910 if (overflow)
911 goto char_error;
912
913 if (bytes_read == 0)
914 {
915 chr = scm_getc (port);
916 if (chr == EOF)
917 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
918 "while reading character", SCM_EOL);
919
920 /* CHR must be a token delimiter, like a whitespace. */
921 return (SCM_MAKE_CHAR (chr));
922 }
923
924 pt = SCM_PTAB_ENTRY (port);
925
926 /* Simple ASCII characters can be processed immediately. Also, simple
927 ISO-8859-1 characters can be processed immediately if the encoding for this
928 port is ISO-8859-1. */
929 if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
930 {
931 SCM_COL (port) += 1;
932 return SCM_MAKE_CHAR (buffer[0]);
933 }
934
935 /* Otherwise, convert the buffer into a proper scheme string for
936 processing. */
937 charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
938 pt->ilseq_handler);
939 charname_len = scm_i_string_length (charname);
940 SCM_COL (port) += charname_len;
941 cp = scm_i_string_ref (charname, 0);
942 if (charname_len == 1)
943 return SCM_MAKE_CHAR (cp);
944
945 /* Ignore dotted circles, which may be used to keep combining characters from
946 combining with the backslash in #\charname. */
947 if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
948 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
949
950 if (cp >= '0' && cp < '8')
951 {
952 /* Dirk:FIXME:: This type of character syntax is not R5RS
953 * compliant. Further, it should be verified that the constant
954 * does only consist of octal digits. */
955 SCM p = scm_string_to_number (charname, scm_from_uint (8));
956 if (SCM_I_INUMP (p))
957 {
958 scm_t_wchar c = SCM_I_INUM (p);
959 if (SCM_IS_UNICODE_CHAR (c))
960 return SCM_MAKE_CHAR (c);
961 else
962 scm_i_input_error (FUNC_NAME, port,
963 "out-of-range octal character escape: ~a",
964 scm_list_1 (charname));
965 }
966 }
967
968 if (cp == 'x' && (charname_len > 1) && SCM_R6RS_ESCAPES_P)
969 {
970 SCM p;
971
972 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
973 p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
974 scm_from_uint (16));
975 if (SCM_I_INUMP (p))
976 {
977 scm_t_wchar c = SCM_I_INUM (p);
978 if (SCM_IS_UNICODE_CHAR (c))
979 return SCM_MAKE_CHAR (c);
980 else
981 scm_i_input_error (FUNC_NAME, port,
982 "out-of-range hex character escape: ~a",
983 scm_list_1 (charname));
984 }
985 }
986
987 /* The names of characters should never have non-Latin1
988 characters. */
989 if (scm_i_is_narrow_string (charname)
990 || scm_i_try_narrow_string (charname))
991 { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
992 charname_len);
993 if (scm_is_true (ch))
994 return ch;
995 }
996
997 char_error:
998 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
999 scm_list_1 (charname));
1000
1001 return SCM_UNSPECIFIED;
1002 }
1003 #undef FUNC_NAME
1004
1005 static inline SCM
1006 scm_read_keyword (int chr, SCM port)
1007 {
1008 SCM symbol;
1009
1010 /* Read the symbol that comprises the keyword. Doing this instead of
1011 invoking a specific symbol reader function allows `scm_read_keyword ()'
1012 to adapt to the delimiters currently valid of symbols.
1013
1014 XXX: This implementation allows sloppy syntaxes like `#: key'. */
1015 symbol = scm_read_expression (port);
1016 if (!scm_is_symbol (symbol))
1017 scm_i_input_error ("scm_read_keyword", port,
1018 "keyword prefix `~a' not followed by a symbol: ~s",
1019 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1020
1021 return (scm_symbol_to_keyword (symbol));
1022 }
1023
1024 static inline SCM
1025 scm_read_vector (int chr, SCM port)
1026 {
1027 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1028 guarantee that it's going to do what we want. After all, this is an
1029 implementation detail of `scm_read_vector ()', not a desirable
1030 property. */
1031 return (scm_vector (scm_read_sexp (chr, port)));
1032 }
1033
1034 static inline SCM
1035 scm_read_srfi4_vector (int chr, SCM port)
1036 {
1037 return scm_i_read_array (port, chr);
1038 }
1039
1040 static SCM
1041 scm_read_bytevector (scm_t_wchar chr, SCM port)
1042 {
1043 chr = scm_getc (port);
1044 if (chr != 'u')
1045 goto syntax;
1046
1047 chr = scm_getc (port);
1048 if (chr != '8')
1049 goto syntax;
1050
1051 chr = scm_getc (port);
1052 if (chr != '(')
1053 goto syntax;
1054
1055 return scm_u8_list_to_bytevector (scm_read_sexp (chr, port));
1056
1057 syntax:
1058 scm_i_input_error ("read_bytevector", port,
1059 "invalid bytevector prefix",
1060 SCM_MAKE_CHAR (chr));
1061 return SCM_UNSPECIFIED;
1062 }
1063
1064 static SCM
1065 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port)
1066 {
1067 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1068 terribly inefficient but who cares? */
1069 SCM s_bits = SCM_EOL;
1070
1071 for (chr = scm_getc (port);
1072 (chr != EOF) && ((chr == '0') || (chr == '1'));
1073 chr = scm_getc (port))
1074 {
1075 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1076 }
1077
1078 if (chr != EOF)
1079 scm_ungetc (chr, port);
1080
1081 return scm_bitvector (scm_reverse_x (s_bits, SCM_EOL));
1082 }
1083
1084 static inline SCM
1085 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1086 {
1087 int bang_seen = 0;
1088
1089 /* We can use the get_byte here because there is no need to get the
1090 locale correct when reading comments. This presumes that
1091 hash and exclamation points always represent themselves no
1092 matter what the source encoding is.*/
1093 for (;;)
1094 {
1095 int c = scm_get_byte_or_eof (port);
1096
1097 if (c == EOF)
1098 scm_i_input_error ("skip_block_comment", port,
1099 "unterminated `#! ... !#' comment", SCM_EOL);
1100
1101 if (c == '!')
1102 bang_seen = 1;
1103 else if (c == '#' && bang_seen)
1104 break;
1105 else
1106 bang_seen = 0;
1107 }
1108
1109 return SCM_UNSPECIFIED;
1110 }
1111
1112 static SCM
1113 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1114 {
1115 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1116 nested. So care must be taken. */
1117 int nesting_level = 1;
1118 int opening_seen = 0, closing_seen = 0;
1119
1120 while (nesting_level > 0)
1121 {
1122 int c = scm_getc (port);
1123
1124 if (c == EOF)
1125 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1126 "unterminated `#| ... |#' comment", SCM_EOL);
1127
1128 if (opening_seen)
1129 {
1130 if (c == '|')
1131 nesting_level++;
1132 opening_seen = 0;
1133 }
1134 else if (closing_seen)
1135 {
1136 if (c == '#')
1137 nesting_level--;
1138 closing_seen = 0;
1139 }
1140 else if (c == '|')
1141 closing_seen = 1;
1142 else if (c == '#')
1143 opening_seen = 1;
1144 else
1145 opening_seen = closing_seen = 0;
1146 }
1147
1148 return SCM_UNSPECIFIED;
1149 }
1150
1151 static SCM
1152 scm_read_commented_expression (scm_t_wchar chr, SCM port)
1153 {
1154 scm_t_wchar c;
1155
1156 c = flush_ws (port, (char *) NULL);
1157 if (EOF == c)
1158 scm_i_input_error ("read_commented_expression", port,
1159 "no expression after #; comment", SCM_EOL);
1160 scm_ungetc (c, port);
1161 scm_read_expression (port);
1162 return SCM_UNSPECIFIED;
1163 }
1164
1165 static SCM
1166 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1167 {
1168 /* Guile's extended symbol read syntax looks like this:
1169
1170 #{This is all a symbol name}#
1171
1172 So here, CHR is expected to be `{'. */
1173 int saw_brace = 0, finished = 0;
1174 size_t len = 0;
1175 SCM buf = scm_i_make_string (1024, NULL);
1176
1177 buf = scm_i_string_start_writing (buf);
1178
1179 while ((chr = scm_getc (port)) != EOF)
1180 {
1181 if (saw_brace)
1182 {
1183 if (chr == '#')
1184 {
1185 finished = 1;
1186 break;
1187 }
1188 else
1189 {
1190 saw_brace = 0;
1191 scm_i_string_set_x (buf, len++, '}');
1192 scm_i_string_set_x (buf, len++, chr);
1193 }
1194 }
1195 else if (chr == '}')
1196 saw_brace = 1;
1197 else
1198 scm_i_string_set_x (buf, len++, chr);
1199
1200 if (len >= scm_i_string_length (buf) - 2)
1201 {
1202 SCM addy;
1203
1204 scm_i_string_stop_writing ();
1205 addy = scm_i_make_string (1024, NULL);
1206 buf = scm_string_append (scm_list_2 (buf, addy));
1207 len = 0;
1208 buf = scm_i_string_start_writing (buf);
1209 }
1210
1211 if (finished)
1212 break;
1213 }
1214 scm_i_string_stop_writing ();
1215
1216 return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1217 }
1218
1219
1220 \f
1221 /* Top-level token readers, i.e., dispatchers. */
1222
1223 static SCM
1224 scm_read_sharp_extension (int chr, SCM port)
1225 {
1226 SCM proc;
1227
1228 proc = scm_get_hash_procedure (chr);
1229 if (scm_is_true (scm_procedure_p (proc)))
1230 {
1231 long line = SCM_LINUM (port);
1232 int column = SCM_COL (port) - 2;
1233 SCM got;
1234
1235 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1236 if (!scm_is_eq (got, SCM_UNSPECIFIED))
1237 {
1238 if (SCM_RECORD_POSITIONS_P)
1239 return (recsexpr (got, line, column,
1240 SCM_FILENAME (port)));
1241 else
1242 return got;
1243 }
1244 }
1245
1246 return SCM_UNSPECIFIED;
1247 }
1248
1249 /* The reader for the sharp `#' character. It basically dispatches reads
1250 among the above token readers. */
1251 static SCM
1252 scm_read_sharp (scm_t_wchar chr, SCM port)
1253 #define FUNC_NAME "scm_lreadr"
1254 {
1255 SCM result;
1256
1257 chr = scm_getc (port);
1258
1259 result = scm_read_sharp_extension (chr, port);
1260 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1261 return result;
1262
1263 switch (chr)
1264 {
1265 case '\\':
1266 return (scm_read_character (chr, port));
1267 case '(':
1268 return (scm_read_vector (chr, port));
1269 case 's':
1270 case 'u':
1271 case 'f':
1272 /* This one may return either a boolean or an SRFI-4 vector. */
1273 return (scm_read_srfi4_vector (chr, port));
1274 case 'v':
1275 return (scm_read_bytevector (chr, port));
1276 case '*':
1277 return (scm_read_guile_bit_vector (chr, port));
1278 case 't':
1279 case 'T':
1280 case 'F':
1281 /* This one may return either a boolean or an SRFI-4 vector. */
1282 return (scm_read_boolean (chr, port));
1283 case ':':
1284 return (scm_read_keyword (chr, port));
1285 case '0': case '1': case '2': case '3': case '4':
1286 case '5': case '6': case '7': case '8': case '9':
1287 case '@':
1288 #if SCM_ENABLE_DEPRECATED
1289 /* See below for 'i' and 'e'. */
1290 case 'a':
1291 case 'c':
1292 case 'y':
1293 case 'h':
1294 case 'l':
1295 #endif
1296 return (scm_i_read_array (port, chr));
1297
1298 case 'i':
1299 case 'e':
1300 #if SCM_ENABLE_DEPRECATED
1301 {
1302 /* When next char is '(', it really is an old-style
1303 uniform array. */
1304 scm_t_wchar next_c = scm_getc (port);
1305 if (next_c != EOF)
1306 scm_ungetc (next_c, port);
1307 if (next_c == '(')
1308 return scm_i_read_array (port, chr);
1309 /* Fall through. */
1310 }
1311 #endif
1312 case 'b':
1313 case 'B':
1314 case 'o':
1315 case 'O':
1316 case 'd':
1317 case 'D':
1318 case 'x':
1319 case 'X':
1320 case 'I':
1321 case 'E':
1322 return (scm_read_number_and_radix (chr, port));
1323 case '{':
1324 return (scm_read_extended_symbol (chr, port));
1325 case '!':
1326 return (scm_read_scsh_block_comment (chr, port));
1327 case ';':
1328 return (scm_read_commented_expression (chr, port));
1329 case '`':
1330 case '\'':
1331 case ',':
1332 return (scm_read_syntax (chr, port));
1333 case 'n':
1334 return (scm_read_nil (chr, port));
1335 default:
1336 result = scm_read_sharp_extension (chr, port);
1337 if (scm_is_eq (result, SCM_UNSPECIFIED))
1338 {
1339 /* To remain compatible with 1.8 and earlier, the following
1340 characters have lower precedence than `read-hash-extend'
1341 characters. */
1342 switch (chr)
1343 {
1344 case '|':
1345 return scm_read_r6rs_block_comment (chr, port);
1346 default:
1347 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1348 scm_list_1 (SCM_MAKE_CHAR (chr)));
1349 }
1350 }
1351 else
1352 return result;
1353 }
1354
1355 return SCM_UNSPECIFIED;
1356 }
1357 #undef FUNC_NAME
1358
1359 static SCM
1360 scm_read_expression (SCM port)
1361 #define FUNC_NAME "scm_read_expression"
1362 {
1363 while (1)
1364 {
1365 register scm_t_wchar chr;
1366
1367 chr = scm_getc (port);
1368
1369 switch (chr)
1370 {
1371 case SCM_WHITE_SPACES:
1372 case SCM_LINE_INCREMENTORS:
1373 break;
1374 case ';':
1375 (void) scm_read_semicolon_comment (chr, port);
1376 break;
1377 case '[':
1378 if (!SCM_SQUARE_BRACKETS_P)
1379 return (scm_read_mixed_case_symbol (chr, port));
1380 /* otherwise fall through */
1381 case '(':
1382 return (scm_read_sexp (chr, port));
1383 case '"':
1384 return (scm_read_string (chr, port));
1385 case '\'':
1386 case '`':
1387 case ',':
1388 return (scm_read_quote (chr, port));
1389 case '#':
1390 {
1391 SCM result;
1392 result = scm_read_sharp (chr, port);
1393 if (scm_is_eq (result, SCM_UNSPECIFIED))
1394 /* We read a comment or some such. */
1395 break;
1396 else
1397 return result;
1398 }
1399 case ')':
1400 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1401 break;
1402 case EOF:
1403 return SCM_EOF_VAL;
1404 case ':':
1405 if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1406 return scm_symbol_to_keyword (scm_read_expression (port));
1407 /* Fall through. */
1408
1409 default:
1410 {
1411 if (((chr >= '0') && (chr <= '9'))
1412 || (strchr ("+-.", chr)))
1413 return (scm_read_number (chr, port));
1414 else
1415 return (scm_read_mixed_case_symbol (chr, port));
1416 }
1417 }
1418 }
1419 }
1420 #undef FUNC_NAME
1421
1422 \f
1423 /* Actual reader. */
1424
1425 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1426 (SCM port),
1427 "Read an s-expression from the input port @var{port}, or from\n"
1428 "the current input port if @var{port} is not specified.\n"
1429 "Any whitespace before the next token is discarded.")
1430 #define FUNC_NAME s_scm_read
1431 {
1432 int c;
1433
1434 if (SCM_UNBNDP (port))
1435 port = scm_current_input_port ();
1436 SCM_VALIDATE_OPINPORT (1, port);
1437
1438 c = flush_ws (port, (char *) NULL);
1439 if (EOF == c)
1440 return SCM_EOF_VAL;
1441 scm_ungetc (c, port);
1442
1443 return (scm_read_expression (port));
1444 }
1445 #undef FUNC_NAME
1446
1447
1448 \f
1449
1450 /* Used when recording expressions constructed by `scm_read_sharp ()'. */
1451 static SCM
1452 recsexpr (SCM obj, long line, int column, SCM filename)
1453 {
1454 if (!scm_is_pair(obj)) {
1455 return obj;
1456 } else {
1457 SCM tmp = obj, copy;
1458 /* If this sexpr is visible in the read:sharp source, we want to
1459 keep that information, so only record non-constant cons cells
1460 which haven't previously been read by the reader. */
1461 if (scm_is_false (scm_whash_lookup (scm_source_whash, obj)))
1462 {
1463 if (SCM_COPY_SOURCE_P)
1464 {
1465 copy = scm_cons (recsexpr (SCM_CAR (obj), line, column, filename),
1466 SCM_UNDEFINED);
1467 while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1468 {
1469 SCM_SETCDR (copy, scm_cons (recsexpr (SCM_CAR (tmp),
1470 line,
1471 column,
1472 filename),
1473 SCM_UNDEFINED));
1474 copy = SCM_CDR (copy);
1475 }
1476 SCM_SETCDR (copy, tmp);
1477 }
1478 else
1479 {
1480 recsexpr (SCM_CAR (obj), line, column, filename);
1481 while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1482 recsexpr (SCM_CAR (tmp), line, column, filename);
1483 copy = SCM_UNDEFINED;
1484 }
1485 scm_whash_insert (scm_source_whash,
1486 obj,
1487 scm_make_srcprops (line,
1488 column,
1489 filename,
1490 copy,
1491 SCM_EOL));
1492 }
1493 return obj;
1494 }
1495 }
1496
1497 /* Manipulate the read-hash-procedures alist. This could be written in
1498 Scheme, but maybe it will also be used by C code during initialisation. */
1499 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1500 (SCM chr, SCM proc),
1501 "Install the procedure @var{proc} for reading expressions\n"
1502 "starting with the character sequence @code{#} and @var{chr}.\n"
1503 "@var{proc} will be called with two arguments: the character\n"
1504 "@var{chr} and the port to read further data from. The object\n"
1505 "returned will be the return value of @code{read}. \n"
1506 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1507 )
1508 #define FUNC_NAME s_scm_read_hash_extend
1509 {
1510 SCM this;
1511 SCM prev;
1512
1513 SCM_VALIDATE_CHAR (1, chr);
1514 SCM_ASSERT (scm_is_false (proc)
1515 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1516 proc, SCM_ARG2, FUNC_NAME);
1517
1518 /* Check if chr is already in the alist. */
1519 this = *scm_read_hash_procedures;
1520 prev = SCM_BOOL_F;
1521 while (1)
1522 {
1523 if (scm_is_null (this))
1524 {
1525 /* not found, so add it to the beginning. */
1526 if (scm_is_true (proc))
1527 {
1528 *scm_read_hash_procedures =
1529 scm_cons (scm_cons (chr, proc), *scm_read_hash_procedures);
1530 }
1531 break;
1532 }
1533 if (scm_is_eq (chr, SCM_CAAR (this)))
1534 {
1535 /* already in the alist. */
1536 if (scm_is_false (proc))
1537 {
1538 /* remove it. */
1539 if (scm_is_false (prev))
1540 {
1541 *scm_read_hash_procedures =
1542 SCM_CDR (*scm_read_hash_procedures);
1543 }
1544 else
1545 scm_set_cdr_x (prev, SCM_CDR (this));
1546 }
1547 else
1548 {
1549 /* replace it. */
1550 scm_set_cdr_x (SCM_CAR (this), proc);
1551 }
1552 break;
1553 }
1554 prev = this;
1555 this = SCM_CDR (this);
1556 }
1557
1558 return SCM_UNSPECIFIED;
1559 }
1560 #undef FUNC_NAME
1561
1562 /* Recover the read-hash procedure corresponding to char c. */
1563 static SCM
1564 scm_get_hash_procedure (int c)
1565 {
1566 SCM rest = *scm_read_hash_procedures;
1567
1568 while (1)
1569 {
1570 if (scm_is_null (rest))
1571 return SCM_BOOL_F;
1572
1573 if (SCM_CHAR (SCM_CAAR (rest)) == c)
1574 return SCM_CDAR (rest);
1575
1576 rest = SCM_CDR (rest);
1577 }
1578 }
1579
1580 #define SCM_ENCODING_SEARCH_SIZE (500)
1581
1582 /* Search the first few hundred characters of a file for an Emacs-like coding
1583 declaration. Returns either NULL or a string whose storage has been
1584 allocated with `scm_gc_malloc ()'. */
1585 char *
1586 scm_i_scan_for_encoding (SCM port)
1587 {
1588 char header[SCM_ENCODING_SEARCH_SIZE+1];
1589 size_t bytes_read;
1590 char *encoding = NULL;
1591 int utf8_bom = 0;
1592 char *pos;
1593 int i;
1594 int in_comment;
1595
1596 if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1597 /* PORT is a non-seekable file port (e.g., as created by Bash when using
1598 "guile <(echo '(display "hello")')") so bail out. */
1599 return NULL;
1600
1601 bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1602
1603 scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1604
1605 if (bytes_read > 3
1606 && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1607 utf8_bom = 1;
1608
1609 /* search past "coding[:=]" */
1610 pos = header;
1611 while (1)
1612 {
1613 if ((pos = strstr(pos, "coding")) == NULL)
1614 return NULL;
1615
1616 pos += strlen("coding");
1617 if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1618 (*pos == ':' || *pos == '='))
1619 {
1620 pos ++;
1621 break;
1622 }
1623 }
1624
1625 /* skip spaces */
1626 while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1627 (*pos == ' ' || *pos == '\t'))
1628 pos ++;
1629
1630 /* grab the next token */
1631 i = 0;
1632 while (pos + i - header <= SCM_ENCODING_SEARCH_SIZE
1633 && pos + i - header < bytes_read
1634 && (isalnum ((int) pos[i]) || strchr ("_-.:/,+=()", pos[i]) != NULL))
1635 i++;
1636
1637 if (i == 0)
1638 return NULL;
1639
1640 encoding = scm_gc_strndup (pos, i, "encoding");
1641 for (i = 0; i < strlen (encoding); i++)
1642 encoding[i] = toupper ((int) encoding[i]);
1643
1644 /* push backwards to make sure we were in a comment */
1645 in_comment = 0;
1646 while (pos - i - header > 0)
1647 {
1648 if (*(pos - i) == '\n')
1649 {
1650 /* This wasn't in a semicolon comment. Check for a
1651 hash-bang comment. */
1652 char *beg = strstr (header, "#!");
1653 char *end = strstr (header, "!#");
1654 if (beg < pos && pos < end)
1655 in_comment = 1;
1656 break;
1657 }
1658 if (*(pos - i) == ';')
1659 {
1660 in_comment = 1;
1661 break;
1662 }
1663 i ++;
1664 }
1665 if (!in_comment)
1666 /* This wasn't in a comment */
1667 return NULL;
1668
1669 if (utf8_bom && strcmp(encoding, "UTF-8"))
1670 scm_misc_error (NULL,
1671 "the port input declares the encoding ~s but is encoded as UTF-8",
1672 scm_list_1 (scm_from_locale_string (encoding)));
1673
1674 return encoding;
1675 }
1676
1677 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1678 (SCM port),
1679 "Scans the port for an Emacs-like character coding declaration\n"
1680 "near the top of the contents of a port with random-acessible contents.\n"
1681 "The coding declaration is of the form\n"
1682 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1683 "\n"
1684 "Returns a string containing the character encoding of the file\n"
1685 "if a declaration was found, or @code{#f} otherwise.\n")
1686 #define FUNC_NAME s_scm_file_encoding
1687 {
1688 char *enc;
1689 SCM s_enc;
1690
1691 enc = scm_i_scan_for_encoding (port);
1692 if (enc == NULL)
1693 return SCM_BOOL_F;
1694 else
1695 {
1696 s_enc = scm_from_locale_string (enc);
1697 return s_enc;
1698 }
1699
1700 return SCM_BOOL_F;
1701 }
1702 #undef FUNC_NAME
1703
1704 void
1705 scm_init_read ()
1706 {
1707 scm_read_hash_procedures =
1708 SCM_VARIABLE_LOC (scm_c_define ("read-hash-procedures", SCM_EOL));
1709
1710 scm_init_opts (scm_read_options, scm_read_opts);
1711 #include "libguile/read.x"
1712 }
1713
1714 /*
1715 Local Variables:
1716 c-file-style: "gnu"
1717 End:
1718 */