e0f3cf81546384afd1c8411474e9abecce96f81a
[bpt/guile.git] / libguile / read.c
1 /* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
2 * Foundation, Inc.
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
18 */
19
20
21 \f
22
23 #ifdef HAVE_CONFIG_H
24 # include <config.h>
25 #endif
26
27 #include <stdio.h>
28 #include <ctype.h>
29 #include <string.h>
30 #include <unistd.h>
31 #include <unicase.h>
32 #include <unictype.h>
33
34 #include "libguile/_scm.h"
35 #include "libguile/bytevectors.h"
36 #include "libguile/chars.h"
37 #include "libguile/eval.h"
38 #include "libguile/arrays.h"
39 #include "libguile/bitvectors.h"
40 #include "libguile/keywords.h"
41 #include "libguile/alist.h"
42 #include "libguile/srcprop.h"
43 #include "libguile/hashtab.h"
44 #include "libguile/hash.h"
45 #include "libguile/ports.h"
46 #include "libguile/fports.h"
47 #include "libguile/root.h"
48 #include "libguile/strings.h"
49 #include "libguile/strports.h"
50 #include "libguile/vectors.h"
51 #include "libguile/validate.h"
52 #include "libguile/srfi-4.h"
53 #include "libguile/srfi-13.h"
54
55 #include "libguile/read.h"
56 #include "libguile/private-options.h"
57
58
59 \f
60
61 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
62 SCM_SYMBOL (scm_keyword_prefix, "prefix");
63 SCM_SYMBOL (scm_keyword_postfix, "postfix");
64 SCM_SYMBOL (sym_nil, "nil");
65
66 scm_t_option scm_read_opts[] = {
67 { SCM_OPTION_BOOLEAN, "copy", 0,
68 "Copy source code expressions." },
69 { SCM_OPTION_BOOLEAN, "positions", 1,
70 "Record positions of source code expressions." },
71 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
72 "Convert symbols to lower case."},
73 { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F,
74 "Style of keyword recognition: #f, 'prefix or 'postfix."},
75 { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
76 "Use R6RS variable-length character and string hex escapes."},
77 { SCM_OPTION_BOOLEAN, "square-brackets", 1,
78 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
79 { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
80 "In strings, consume leading whitespace after an escaped end-of-line."},
81 { 0, },
82 };
83
84 /*
85 Give meaningful error messages for errors
86
87 We use the format
88
89 FILE:LINE:COL: MESSAGE
90 This happened in ....
91
92 This is not standard GNU format, but the test-suite likes the real
93 message to be in front.
94
95 */
96
97
98 void
99 scm_i_input_error (char const *function,
100 SCM port, const char *message, SCM arg)
101 {
102 SCM fn = (scm_is_string (SCM_FILENAME(port))
103 ? SCM_FILENAME(port)
104 : scm_from_locale_string ("#<unknown port>"));
105
106 SCM string_port = scm_open_output_string ();
107 SCM string = SCM_EOL;
108 scm_simple_format (string_port,
109 scm_from_locale_string ("~A:~S:~S: ~A"),
110 scm_list_4 (fn,
111 scm_from_long (SCM_LINUM (port) + 1),
112 scm_from_int (SCM_COL (port) + 1),
113 scm_from_locale_string (message)));
114
115 string = scm_get_output_string (string_port);
116 scm_close_output_port (string_port);
117 scm_error_scm (scm_from_latin1_symbol ("read-error"),
118 function? scm_from_locale_string (function) : SCM_BOOL_F,
119 string,
120 arg,
121 SCM_BOOL_F);
122 }
123
124
125 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
126 (SCM setting),
127 "Option interface for the read options. Instead of using\n"
128 "this procedure directly, use the procedures @code{read-enable},\n"
129 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
130 #define FUNC_NAME s_scm_read_options
131 {
132 SCM ans = scm_options (setting,
133 scm_read_opts,
134 FUNC_NAME);
135 if (SCM_COPY_SOURCE_P)
136 SCM_RECORD_POSITIONS_P = 1;
137 return ans;
138 }
139 #undef FUNC_NAME
140
141 /* A fluid referring to an association list mapping extra hash
142 characters to procedures. */
143 static SCM *scm_i_read_hash_procedures;
144
145 static inline SCM
146 scm_i_read_hash_procedures_ref (void)
147 {
148 return scm_fluid_ref (*scm_i_read_hash_procedures);
149 }
150
151 static inline void
152 scm_i_read_hash_procedures_set_x (SCM value)
153 {
154 scm_fluid_set_x (*scm_i_read_hash_procedures, value);
155 }
156
157 \f
158 /* Token readers. */
159
160
161 /* Size of the C buffer used to read symbols and numbers. */
162 #define READER_BUFFER_SIZE 128
163
164 /* Size of the C buffer used to read strings. */
165 #define READER_STRING_BUFFER_SIZE 512
166
167 /* The maximum size of Scheme character names. */
168 #define READER_CHAR_NAME_MAX_SIZE 50
169
170
171 /* `isblank' is only in C99. */
172 #define CHAR_IS_BLANK_(_chr) \
173 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
174 || ((_chr) == '\f') || ((_chr) == '\r'))
175
176 #ifdef MSDOS
177 # define CHAR_IS_BLANK(_chr) \
178 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
179 #else
180 # define CHAR_IS_BLANK CHAR_IS_BLANK_
181 #endif
182
183
184 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
185 structure''). */
186 #define CHAR_IS_R5RS_DELIMITER(c) \
187 (CHAR_IS_BLANK (c) \
188 || (c == ')') || (c == '(') || (c == ';') || (c == '"') \
189 || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
190
191 #define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
192
193 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
194 Structure''. */
195 #define CHAR_IS_EXPONENT_MARKER(_chr) \
196 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
197 || ((_chr) == 'd') || ((_chr) == 'l'))
198
199 /* Read an SCSH block comment. */
200 static inline SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
201 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
202 static SCM scm_read_commented_expression (scm_t_wchar, SCM);
203 static SCM scm_read_shebang (scm_t_wchar, SCM);
204 static SCM scm_get_hash_procedure (int);
205
206 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
207 result in the pre-allocated buffer BUF. Return zero if the whole token has
208 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
209 bytes actually read. */
210 static inline int
211 read_token (SCM port, char *buf, const size_t buf_size, size_t *read)
212 {
213 *read = 0;
214
215 while (*read < buf_size)
216 {
217 int chr;
218
219 chr = scm_get_byte_or_eof (port);
220
221 if (chr == EOF)
222 return 0;
223 else if (CHAR_IS_DELIMITER (chr))
224 {
225 scm_unget_byte (chr, port);
226 return 0;
227 }
228 else
229 {
230 *buf = (char) chr;
231 buf++, (*read)++;
232 }
233 }
234
235 return 1;
236 }
237
238 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
239 result in the pre-allocated buffer BUFFER, if the whole token has fewer than
240 BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
241 caller. Return zero if the token fits in BUFFER, non-zero otherwise. READ
242 will be set the number of bytes actually read. */
243 static int
244 read_complete_token (SCM port, char *buffer, const size_t buffer_size,
245 char **overflow_buffer, size_t *read)
246 {
247 int overflow = 0;
248 size_t bytes_read, overflow_size;
249
250 *overflow_buffer = NULL;
251 overflow_size = 0;
252
253 do
254 {
255 overflow = read_token (port, buffer, buffer_size, &bytes_read);
256 if (bytes_read == 0)
257 break;
258 if (overflow || overflow_size != 0)
259 {
260 if (overflow_size == 0)
261 {
262 *overflow_buffer = scm_malloc (bytes_read);
263 memcpy (*overflow_buffer, buffer, bytes_read);
264 overflow_size = bytes_read;
265 }
266 else
267 {
268 *overflow_buffer = scm_realloc (*overflow_buffer, overflow_size + bytes_read);
269 memcpy (*overflow_buffer + overflow_size, buffer, bytes_read);
270 overflow_size += bytes_read;
271 }
272 }
273 }
274 while (overflow);
275
276 if (overflow_size)
277 *read = overflow_size;
278 else
279 *read = bytes_read;
280
281 return (overflow_size != 0);
282 }
283
284 /* Skip whitespace from PORT and return the first non-whitespace character
285 read. Raise an error on end-of-file. */
286 static int
287 flush_ws (SCM port, const char *eoferr)
288 {
289 register scm_t_wchar c;
290 while (1)
291 switch (c = scm_getc (port))
292 {
293 case EOF:
294 goteof:
295 if (eoferr)
296 {
297 scm_i_input_error (eoferr,
298 port,
299 "end of file",
300 SCM_EOL);
301 }
302 return c;
303
304 case ';':
305 lp:
306 switch (c = scm_getc (port))
307 {
308 case EOF:
309 goto goteof;
310 default:
311 goto lp;
312 case SCM_LINE_INCREMENTORS:
313 break;
314 }
315 break;
316
317 case '#':
318 switch (c = scm_getc (port))
319 {
320 case EOF:
321 eoferr = "read_sharp";
322 goto goteof;
323 case '!':
324 scm_read_shebang (c, port);
325 break;
326 case ';':
327 scm_read_commented_expression (c, port);
328 break;
329 case '|':
330 if (scm_is_false (scm_get_hash_procedure (c)))
331 {
332 scm_read_r6rs_block_comment (c, port);
333 break;
334 }
335 /* fall through */
336 default:
337 scm_ungetc (c, port);
338 return '#';
339 }
340 break;
341
342 case SCM_LINE_INCREMENTORS:
343 case SCM_SINGLE_SPACES:
344 case '\t':
345 break;
346
347 default:
348 return c;
349 }
350
351 return 0;
352 }
353
354
355 \f
356 /* Token readers. */
357
358 static SCM scm_read_expression (SCM port);
359 static SCM scm_read_sharp (int chr, SCM port);
360 static SCM recsexpr (SCM obj, long line, int column, SCM filename);
361
362
363 static SCM
364 scm_read_sexp (scm_t_wchar chr, SCM port)
365 #define FUNC_NAME "scm_i_lreadparen"
366 {
367 register int c;
368 register SCM tmp;
369 register SCM tl, ans = SCM_EOL;
370 SCM tl2 = SCM_EOL, ans2 = SCM_EOL, copy = SCM_BOOL_F;
371 const int terminating_char = ((chr == '[') ? ']' : ')');
372
373 /* Need to capture line and column numbers here. */
374 long line = SCM_LINUM (port);
375 int column = SCM_COL (port) - 1;
376
377
378 c = flush_ws (port, FUNC_NAME);
379 if (terminating_char == c)
380 return SCM_EOL;
381
382 scm_ungetc (c, port);
383 if (scm_is_eq (scm_sym_dot,
384 (tmp = scm_read_expression (port))))
385 {
386 ans = scm_read_expression (port);
387 if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
388 scm_i_input_error (FUNC_NAME, port, "missing close paren",
389 SCM_EOL);
390 return ans;
391 }
392
393 /* Build the head of the list structure. */
394 ans = tl = scm_cons (tmp, SCM_EOL);
395
396 if (SCM_COPY_SOURCE_P)
397 ans2 = tl2 = scm_cons (scm_is_pair (tmp)
398 ? copy
399 : tmp,
400 SCM_EOL);
401
402 while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
403 {
404 SCM new_tail;
405
406 if (c == ')' || (SCM_SQUARE_BRACKETS_P && c == ']'))
407 scm_i_input_error (FUNC_NAME, port,
408 "in pair: mismatched close paren: ~A",
409 scm_list_1 (SCM_MAKE_CHAR (c)));
410
411 scm_ungetc (c, port);
412 tmp = scm_read_expression (port);
413
414 if (scm_is_eq (scm_sym_dot, tmp))
415 {
416 SCM_SETCDR (tl, tmp = scm_read_expression (port));
417
418 if (SCM_COPY_SOURCE_P)
419 SCM_SETCDR (tl2, scm_cons (scm_is_pair (tmp) ? copy : tmp,
420 SCM_EOL));
421
422 c = flush_ws (port, FUNC_NAME);
423 if (terminating_char != c)
424 scm_i_input_error (FUNC_NAME, port,
425 "in pair: missing close paren", SCM_EOL);
426 goto exit;
427 }
428
429 new_tail = scm_cons (tmp, SCM_EOL);
430 SCM_SETCDR (tl, new_tail);
431 tl = new_tail;
432
433 if (SCM_COPY_SOURCE_P)
434 {
435 SCM new_tail2 = scm_cons (scm_is_pair (tmp)
436 ? copy
437 : tmp, SCM_EOL);
438 SCM_SETCDR (tl2, new_tail2);
439 tl2 = new_tail2;
440 }
441 }
442
443 exit:
444 if (SCM_RECORD_POSITIONS_P)
445 scm_whash_insert (scm_source_whash,
446 ans,
447 scm_make_srcprops (line, column,
448 SCM_FILENAME (port),
449 SCM_COPY_SOURCE_P
450 ? ans2
451 : SCM_UNDEFINED,
452 SCM_EOL));
453 return ans;
454 }
455 #undef FUNC_NAME
456
457
458 /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
459 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
460 found. */
461 #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
462 do \
463 { \
464 scm_t_wchar a; \
465 size_t i = 0; \
466 c = 0; \
467 while (i < ndigits) \
468 { \
469 a = scm_getc (port); \
470 if (a == EOF) \
471 goto str_eof; \
472 if (terminator \
473 && (a == (scm_t_wchar) terminator) \
474 && (i > 0)) \
475 break; \
476 if ('0' <= a && a <= '9') \
477 a -= '0'; \
478 else if ('A' <= a && a <= 'F') \
479 a = a - 'A' + 10; \
480 else if ('a' <= a && a <= 'f') \
481 a = a - 'a' + 10; \
482 else \
483 { \
484 c = a; \
485 goto bad_escaped; \
486 } \
487 c = c * 16 + a; \
488 i ++; \
489 } \
490 } while (0)
491
492 static void
493 skip_intraline_whitespace (SCM port)
494 {
495 scm_t_wchar c;
496
497 do
498 {
499 c = scm_getc (port);
500 if (c == EOF)
501 return;
502 }
503 while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
504
505 scm_ungetc (c, port);
506 }
507
508 static SCM
509 scm_read_string (int chr, SCM port)
510 #define FUNC_NAME "scm_lreadr"
511 {
512 /* For strings smaller than C_STR, this function creates only one Scheme
513 object (the string returned). */
514
515 SCM str = SCM_BOOL_F;
516 unsigned c_str_len = 0;
517 scm_t_wchar c;
518
519 str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
520 while ('"' != (c = scm_getc (port)))
521 {
522 if (c == EOF)
523 {
524 str_eof:
525 scm_i_input_error (FUNC_NAME, port,
526 "end of file in string constant", SCM_EOL);
527 }
528
529 if (c_str_len + 1 >= scm_i_string_length (str))
530 {
531 SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
532
533 str = scm_string_append (scm_list_2 (str, addy));
534 }
535
536 if (c == '\\')
537 {
538 switch (c = scm_getc (port))
539 {
540 case EOF:
541 goto str_eof;
542 case '"':
543 case '\\':
544 break;
545 case '\n':
546 if (SCM_HUNGRY_EOL_ESCAPES_P)
547 skip_intraline_whitespace (port);
548 continue;
549 case '0':
550 c = '\0';
551 break;
552 case 'f':
553 c = '\f';
554 break;
555 case 'n':
556 c = '\n';
557 break;
558 case 'r':
559 c = '\r';
560 break;
561 case 't':
562 c = '\t';
563 break;
564 case 'a':
565 c = '\007';
566 break;
567 case 'v':
568 c = '\v';
569 break;
570 case 'b':
571 c = '\010';
572 break;
573 case 'x':
574 if (SCM_R6RS_ESCAPES_P)
575 SCM_READ_HEX_ESCAPE (10, ';');
576 else
577 SCM_READ_HEX_ESCAPE (2, '\0');
578 break;
579 case 'u':
580 if (!SCM_R6RS_ESCAPES_P)
581 {
582 SCM_READ_HEX_ESCAPE (4, '\0');
583 break;
584 }
585 case 'U':
586 if (!SCM_R6RS_ESCAPES_P)
587 {
588 SCM_READ_HEX_ESCAPE (6, '\0');
589 break;
590 }
591 default:
592 bad_escaped:
593 scm_i_input_error (FUNC_NAME, port,
594 "illegal character in escape sequence: ~S",
595 scm_list_1 (SCM_MAKE_CHAR (c)));
596 }
597 }
598 str = scm_i_string_start_writing (str);
599 scm_i_string_set_x (str, c_str_len++, c);
600 scm_i_string_stop_writing ();
601 }
602
603 if (c_str_len > 0)
604 {
605 return scm_i_substring_copy (str, 0, c_str_len);
606 }
607
608 return scm_nullstr;
609 }
610 #undef FUNC_NAME
611
612
613 static SCM
614 scm_read_number (scm_t_wchar chr, SCM port)
615 {
616 SCM result, str = SCM_EOL;
617 char buffer[READER_BUFFER_SIZE];
618 char *overflow_buffer = NULL;
619 size_t bytes_read;
620 int overflow;
621 scm_t_port *pt = SCM_PTAB_ENTRY (port);
622
623 scm_ungetc (chr, port);
624 overflow = read_complete_token (port, buffer, sizeof (buffer),
625 &overflow_buffer, &bytes_read);
626
627 if (!overflow)
628 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
629 else
630 str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
631 pt->ilseq_handler);
632
633 result = scm_string_to_number (str, SCM_UNDEFINED);
634 if (!scm_is_true (result))
635 {
636 /* Return a symbol instead of a number */
637 if (SCM_CASE_INSENSITIVE_P)
638 str = scm_string_downcase_x (str);
639 result = scm_string_to_symbol (str);
640 }
641
642 if (overflow)
643 free (overflow_buffer);
644 SCM_COL (port) += scm_i_string_length (str);
645 return result;
646 }
647
648 static SCM
649 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
650 {
651 SCM result;
652 int ends_with_colon = 0;
653 size_t bytes_read;
654 int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
655 int overflow;
656 char buffer[READER_BUFFER_SIZE], *overflow_buffer;
657 scm_t_port *pt = SCM_PTAB_ENTRY (port);
658 SCM str;
659
660 scm_ungetc (chr, port);
661 overflow = read_complete_token (port, buffer, READER_BUFFER_SIZE,
662 &overflow_buffer, &bytes_read);
663 if (bytes_read > 0)
664 {
665 if (!overflow)
666 ends_with_colon = buffer[bytes_read - 1] == ':';
667 else
668 ends_with_colon = overflow_buffer[bytes_read - 1] == ':';
669 }
670
671 if (postfix && ends_with_colon && (bytes_read > 1))
672 {
673 if (!overflow)
674 str = scm_from_stringn (buffer, bytes_read - 1, pt->encoding, pt->ilseq_handler);
675 else
676 str = scm_from_stringn (overflow_buffer, bytes_read - 1, pt->encoding,
677 pt->ilseq_handler);
678
679 if (SCM_CASE_INSENSITIVE_P)
680 str = scm_string_downcase_x (str);
681 result = scm_symbol_to_keyword (scm_string_to_symbol (str));
682 }
683 else
684 {
685 if (!overflow)
686 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
687 else
688 str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
689 pt->ilseq_handler);
690
691 if (SCM_CASE_INSENSITIVE_P)
692 str = scm_string_downcase_x (str);
693 result = scm_string_to_symbol (str);
694 }
695
696 if (overflow)
697 free (overflow_buffer);
698 SCM_COL (port) += scm_i_string_length (str);
699 return result;
700 }
701
702 static SCM
703 scm_read_number_and_radix (scm_t_wchar chr, SCM port)
704 #define FUNC_NAME "scm_lreadr"
705 {
706 SCM result;
707 size_t read;
708 char buffer[READER_BUFFER_SIZE], *overflow_buffer;
709 int overflow;
710 unsigned int radix;
711 SCM str;
712 scm_t_port *pt;
713
714 switch (chr)
715 {
716 case 'B':
717 case 'b':
718 radix = 2;
719 break;
720
721 case 'o':
722 case 'O':
723 radix = 8;
724 break;
725
726 case 'd':
727 case 'D':
728 radix = 10;
729 break;
730
731 case 'x':
732 case 'X':
733 radix = 16;
734 break;
735
736 default:
737 scm_ungetc (chr, port);
738 scm_ungetc ('#', port);
739 radix = 10;
740 }
741
742 overflow = read_complete_token (port, buffer, sizeof (buffer),
743 &overflow_buffer, &read);
744
745 pt = SCM_PTAB_ENTRY (port);
746 if (!overflow)
747 str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
748 else
749 str = scm_from_stringn (overflow_buffer, read, pt->encoding,
750 pt->ilseq_handler);
751
752 result = scm_string_to_number (str, scm_from_uint (radix));
753
754 if (overflow)
755 free (overflow_buffer);
756
757 SCM_COL (port) += scm_i_string_length (str);
758
759 if (scm_is_true (result))
760 return result;
761
762 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
763
764 return SCM_BOOL_F;
765 }
766 #undef FUNC_NAME
767
768 static SCM
769 scm_read_quote (int chr, SCM port)
770 {
771 SCM p;
772 long line = SCM_LINUM (port);
773 int column = SCM_COL (port) - 1;
774
775 switch (chr)
776 {
777 case '`':
778 p = scm_sym_quasiquote;
779 break;
780
781 case '\'':
782 p = scm_sym_quote;
783 break;
784
785 case ',':
786 {
787 scm_t_wchar c;
788
789 c = scm_getc (port);
790 if ('@' == c)
791 p = scm_sym_uq_splicing;
792 else
793 {
794 scm_ungetc (c, port);
795 p = scm_sym_unquote;
796 }
797 break;
798 }
799
800 default:
801 fprintf (stderr, "%s: unhandled quote character (%i)\n",
802 "scm_read_quote", chr);
803 abort ();
804 }
805
806 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
807 if (SCM_RECORD_POSITIONS_P)
808 scm_whash_insert (scm_source_whash, p,
809 scm_make_srcprops (line, column,
810 SCM_FILENAME (port),
811 SCM_COPY_SOURCE_P
812 ? (scm_cons2 (SCM_CAR (p),
813 SCM_CAR (SCM_CDR (p)),
814 SCM_EOL))
815 : SCM_UNDEFINED,
816 SCM_EOL));
817
818
819 return p;
820 }
821
822 SCM_SYMBOL (sym_syntax, "syntax");
823 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
824 SCM_SYMBOL (sym_unsyntax, "unsyntax");
825 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
826
827 static SCM
828 scm_read_syntax (int chr, SCM port)
829 {
830 SCM p;
831 long line = SCM_LINUM (port);
832 int column = SCM_COL (port) - 1;
833
834 switch (chr)
835 {
836 case '`':
837 p = sym_quasisyntax;
838 break;
839
840 case '\'':
841 p = sym_syntax;
842 break;
843
844 case ',':
845 {
846 int c;
847
848 c = scm_getc (port);
849 if ('@' == c)
850 p = sym_unsyntax_splicing;
851 else
852 {
853 scm_ungetc (c, port);
854 p = sym_unsyntax;
855 }
856 break;
857 }
858
859 default:
860 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
861 "scm_read_syntax", chr);
862 abort ();
863 }
864
865 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
866 if (SCM_RECORD_POSITIONS_P)
867 scm_whash_insert (scm_source_whash, p,
868 scm_make_srcprops (line, column,
869 SCM_FILENAME (port),
870 SCM_COPY_SOURCE_P
871 ? (scm_cons2 (SCM_CAR (p),
872 SCM_CAR (SCM_CDR (p)),
873 SCM_EOL))
874 : SCM_UNDEFINED,
875 SCM_EOL));
876
877
878 return p;
879 }
880
881 static inline SCM
882 scm_read_nil (int chr, SCM port)
883 {
884 SCM id = scm_read_mixed_case_symbol (chr, port);
885
886 if (!scm_is_eq (id, sym_nil))
887 scm_i_input_error ("scm_read_nil", port,
888 "unexpected input while reading #nil: ~a",
889 scm_list_1 (id));
890
891 return SCM_ELISP_NIL;
892 }
893
894 static inline SCM
895 scm_read_semicolon_comment (int chr, SCM port)
896 {
897 int c;
898
899 /* We use the get_byte here because there is no need to get the
900 locale correct with comment input. This presumes that newline
901 always represents itself no matter what the encoding is. */
902 for (c = scm_get_byte_or_eof (port);
903 (c != EOF) && (c != '\n');
904 c = scm_get_byte_or_eof (port));
905
906 return SCM_UNSPECIFIED;
907 }
908
909 \f
910 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
911
912 static SCM
913 scm_read_boolean (int chr, SCM port)
914 {
915 switch (chr)
916 {
917 case 't':
918 case 'T':
919 return SCM_BOOL_T;
920
921 case 'f':
922 case 'F':
923 return SCM_BOOL_F;
924 }
925
926 return SCM_UNSPECIFIED;
927 }
928
929 static SCM
930 scm_read_character (scm_t_wchar chr, SCM port)
931 #define FUNC_NAME "scm_lreadr"
932 {
933 char buffer[READER_CHAR_NAME_MAX_SIZE];
934 SCM charname;
935 size_t charname_len, bytes_read;
936 scm_t_wchar cp;
937 int overflow;
938 scm_t_port *pt;
939
940 overflow = read_token (port, buffer, READER_CHAR_NAME_MAX_SIZE, &bytes_read);
941 if (overflow)
942 scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
943
944 if (bytes_read == 0)
945 {
946 chr = scm_getc (port);
947 if (chr == EOF)
948 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
949 "while reading character", SCM_EOL);
950
951 /* CHR must be a token delimiter, like a whitespace. */
952 return (SCM_MAKE_CHAR (chr));
953 }
954
955 pt = SCM_PTAB_ENTRY (port);
956
957 /* Simple ASCII characters can be processed immediately. Also, simple
958 ISO-8859-1 characters can be processed immediately if the encoding for this
959 port is ISO-8859-1. */
960 if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
961 {
962 SCM_COL (port) += 1;
963 return SCM_MAKE_CHAR (buffer[0]);
964 }
965
966 /* Otherwise, convert the buffer into a proper scheme string for
967 processing. */
968 charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
969 pt->ilseq_handler);
970 charname_len = scm_i_string_length (charname);
971 SCM_COL (port) += charname_len;
972 cp = scm_i_string_ref (charname, 0);
973 if (charname_len == 1)
974 return SCM_MAKE_CHAR (cp);
975
976 /* Ignore dotted circles, which may be used to keep combining characters from
977 combining with the backslash in #\charname. */
978 if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
979 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
980
981 if (cp >= '0' && cp < '8')
982 {
983 /* Dirk:FIXME:: This type of character syntax is not R5RS
984 * compliant. Further, it should be verified that the constant
985 * does only consist of octal digits. */
986 SCM p = scm_string_to_number (charname, scm_from_uint (8));
987 if (SCM_I_INUMP (p))
988 {
989 scm_t_wchar c = scm_to_uint32 (p);
990 if (SCM_IS_UNICODE_CHAR (c))
991 return SCM_MAKE_CHAR (c);
992 else
993 scm_i_input_error (FUNC_NAME, port,
994 "out-of-range octal character escape: ~a",
995 scm_list_1 (charname));
996 }
997 }
998
999 if (cp == 'x' && (charname_len > 1))
1000 {
1001 SCM p;
1002
1003 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1004 p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
1005 scm_from_uint (16));
1006 if (SCM_I_INUMP (p))
1007 {
1008 scm_t_wchar c = scm_to_uint32 (p);
1009 if (SCM_IS_UNICODE_CHAR (c))
1010 return SCM_MAKE_CHAR (c);
1011 else
1012 scm_i_input_error (FUNC_NAME, port,
1013 "out-of-range hex character escape: ~a",
1014 scm_list_1 (charname));
1015 }
1016 }
1017
1018 /* The names of characters should never have non-Latin1
1019 characters. */
1020 if (scm_i_is_narrow_string (charname)
1021 || scm_i_try_narrow_string (charname))
1022 { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
1023 charname_len);
1024 if (scm_is_true (ch))
1025 return ch;
1026 }
1027
1028 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
1029 scm_list_1 (charname));
1030
1031 return SCM_UNSPECIFIED;
1032 }
1033 #undef FUNC_NAME
1034
1035 static inline SCM
1036 scm_read_keyword (int chr, SCM port)
1037 {
1038 SCM symbol;
1039
1040 /* Read the symbol that comprises the keyword. Doing this instead of
1041 invoking a specific symbol reader function allows `scm_read_keyword ()'
1042 to adapt to the delimiters currently valid of symbols.
1043
1044 XXX: This implementation allows sloppy syntaxes like `#: key'. */
1045 symbol = scm_read_expression (port);
1046 if (!scm_is_symbol (symbol))
1047 scm_i_input_error ("scm_read_keyword", port,
1048 "keyword prefix `~a' not followed by a symbol: ~s",
1049 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1050
1051 return (scm_symbol_to_keyword (symbol));
1052 }
1053
1054 static inline SCM
1055 scm_read_vector (int chr, SCM port)
1056 {
1057 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1058 guarantee that it's going to do what we want. After all, this is an
1059 implementation detail of `scm_read_vector ()', not a desirable
1060 property. */
1061 return (scm_vector (scm_read_sexp (chr, port)));
1062 }
1063
1064 static inline SCM
1065 scm_read_srfi4_vector (int chr, SCM port)
1066 {
1067 return scm_i_read_array (port, chr);
1068 }
1069
1070 static SCM
1071 scm_read_bytevector (scm_t_wchar chr, SCM port)
1072 {
1073 chr = scm_getc (port);
1074 if (chr != 'u')
1075 goto syntax;
1076
1077 chr = scm_getc (port);
1078 if (chr != '8')
1079 goto syntax;
1080
1081 chr = scm_getc (port);
1082 if (chr != '(')
1083 goto syntax;
1084
1085 return scm_u8_list_to_bytevector (scm_read_sexp (chr, port));
1086
1087 syntax:
1088 scm_i_input_error ("read_bytevector", port,
1089 "invalid bytevector prefix",
1090 SCM_MAKE_CHAR (chr));
1091 return SCM_UNSPECIFIED;
1092 }
1093
1094 static SCM
1095 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port)
1096 {
1097 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1098 terribly inefficient but who cares? */
1099 SCM s_bits = SCM_EOL;
1100
1101 for (chr = scm_getc (port);
1102 (chr != EOF) && ((chr == '0') || (chr == '1'));
1103 chr = scm_getc (port))
1104 {
1105 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1106 }
1107
1108 if (chr != EOF)
1109 scm_ungetc (chr, port);
1110
1111 return scm_bitvector (scm_reverse_x (s_bits, SCM_EOL));
1112 }
1113
1114 static inline SCM
1115 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1116 {
1117 int bang_seen = 0;
1118
1119 for (;;)
1120 {
1121 int c = scm_getc (port);
1122
1123 if (c == EOF)
1124 scm_i_input_error ("skip_block_comment", port,
1125 "unterminated `#! ... !#' comment", SCM_EOL);
1126
1127 if (c == '!')
1128 bang_seen = 1;
1129 else if (c == '#' && bang_seen)
1130 break;
1131 else
1132 bang_seen = 0;
1133 }
1134
1135 return SCM_UNSPECIFIED;
1136 }
1137
1138 static inline SCM
1139 scm_read_shebang (scm_t_wchar chr, SCM port)
1140 {
1141 int c = 0;
1142 if ((c = scm_get_byte_or_eof (port)) != 'r')
1143 {
1144 scm_ungetc (c, port);
1145 return scm_read_scsh_block_comment (chr, port);
1146 }
1147 if ((c = scm_get_byte_or_eof (port)) != '6')
1148 {
1149 scm_ungetc (c, port);
1150 scm_ungetc ('r', port);
1151 return scm_read_scsh_block_comment (chr, port);
1152 }
1153 if ((c = scm_get_byte_or_eof (port)) != 'r')
1154 {
1155 scm_ungetc (c, port);
1156 scm_ungetc ('6', port);
1157 scm_ungetc ('r', port);
1158 return scm_read_scsh_block_comment (chr, port);
1159 }
1160 if ((c = scm_get_byte_or_eof (port)) != 's')
1161 {
1162 scm_ungetc (c, port);
1163 scm_ungetc ('r', port);
1164 scm_ungetc ('6', port);
1165 scm_ungetc ('r', port);
1166 return scm_read_scsh_block_comment (chr, port);
1167 }
1168
1169 return SCM_UNSPECIFIED;
1170 }
1171
1172 static SCM
1173 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1174 {
1175 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1176 nested. So care must be taken. */
1177 int nesting_level = 1;
1178 int opening_seen = 0, closing_seen = 0;
1179
1180 while (nesting_level > 0)
1181 {
1182 int c = scm_getc (port);
1183
1184 if (c == EOF)
1185 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1186 "unterminated `#| ... |#' comment", SCM_EOL);
1187
1188 if (opening_seen)
1189 {
1190 if (c == '|')
1191 nesting_level++;
1192 opening_seen = 0;
1193 }
1194 else if (closing_seen)
1195 {
1196 if (c == '#')
1197 nesting_level--;
1198 closing_seen = 0;
1199 }
1200 else if (c == '|')
1201 closing_seen = 1;
1202 else if (c == '#')
1203 opening_seen = 1;
1204 else
1205 opening_seen = closing_seen = 0;
1206 }
1207
1208 return SCM_UNSPECIFIED;
1209 }
1210
1211 static SCM
1212 scm_read_commented_expression (scm_t_wchar chr, SCM port)
1213 {
1214 scm_t_wchar c;
1215
1216 c = flush_ws (port, (char *) NULL);
1217 if (EOF == c)
1218 scm_i_input_error ("read_commented_expression", port,
1219 "no expression after #; comment", SCM_EOL);
1220 scm_ungetc (c, port);
1221 scm_read_expression (port);
1222 return SCM_UNSPECIFIED;
1223 }
1224
1225 static SCM
1226 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1227 {
1228 /* Guile's extended symbol read syntax looks like this:
1229
1230 #{This is all a symbol name}#
1231
1232 So here, CHR is expected to be `{'. */
1233 int saw_brace = 0, finished = 0;
1234 size_t len = 0;
1235 SCM buf = scm_i_make_string (1024, NULL, 0);
1236
1237 buf = scm_i_string_start_writing (buf);
1238
1239 while ((chr = scm_getc (port)) != EOF)
1240 {
1241 if (saw_brace)
1242 {
1243 if (chr == '#')
1244 {
1245 finished = 1;
1246 break;
1247 }
1248 else
1249 {
1250 saw_brace = 0;
1251 scm_i_string_set_x (buf, len++, '}');
1252 scm_i_string_set_x (buf, len++, chr);
1253 }
1254 }
1255 else if (chr == '}')
1256 saw_brace = 1;
1257 else
1258 scm_i_string_set_x (buf, len++, chr);
1259
1260 if (len >= scm_i_string_length (buf) - 2)
1261 {
1262 SCM addy;
1263
1264 scm_i_string_stop_writing ();
1265 addy = scm_i_make_string (1024, NULL, 0);
1266 buf = scm_string_append (scm_list_2 (buf, addy));
1267 len = 0;
1268 buf = scm_i_string_start_writing (buf);
1269 }
1270
1271 if (finished)
1272 break;
1273 }
1274 scm_i_string_stop_writing ();
1275
1276 return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1277 }
1278
1279
1280 \f
1281 /* Top-level token readers, i.e., dispatchers. */
1282
1283 static SCM
1284 scm_read_sharp_extension (int chr, SCM port)
1285 {
1286 SCM proc;
1287
1288 proc = scm_get_hash_procedure (chr);
1289 if (scm_is_true (scm_procedure_p (proc)))
1290 {
1291 long line = SCM_LINUM (port);
1292 int column = SCM_COL (port) - 2;
1293 SCM got;
1294
1295 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1296 if (!scm_is_eq (got, SCM_UNSPECIFIED))
1297 {
1298 if (SCM_RECORD_POSITIONS_P)
1299 return (recsexpr (got, line, column,
1300 SCM_FILENAME (port)));
1301 else
1302 return got;
1303 }
1304 }
1305
1306 return SCM_UNSPECIFIED;
1307 }
1308
1309 /* The reader for the sharp `#' character. It basically dispatches reads
1310 among the above token readers. */
1311 static SCM
1312 scm_read_sharp (scm_t_wchar chr, SCM port)
1313 #define FUNC_NAME "scm_lreadr"
1314 {
1315 SCM result;
1316
1317 chr = scm_getc (port);
1318
1319 result = scm_read_sharp_extension (chr, port);
1320 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1321 return result;
1322
1323 switch (chr)
1324 {
1325 case '\\':
1326 return (scm_read_character (chr, port));
1327 case '(':
1328 return (scm_read_vector (chr, port));
1329 case 's':
1330 case 'u':
1331 case 'f':
1332 /* This one may return either a boolean or an SRFI-4 vector. */
1333 return (scm_read_srfi4_vector (chr, port));
1334 case 'v':
1335 return (scm_read_bytevector (chr, port));
1336 case '*':
1337 return (scm_read_guile_bit_vector (chr, port));
1338 case 't':
1339 case 'T':
1340 case 'F':
1341 /* This one may return either a boolean or an SRFI-4 vector. */
1342 return (scm_read_boolean (chr, port));
1343 case ':':
1344 return (scm_read_keyword (chr, port));
1345 case '0': case '1': case '2': case '3': case '4':
1346 case '5': case '6': case '7': case '8': case '9':
1347 case '@':
1348 #if SCM_ENABLE_DEPRECATED
1349 /* See below for 'i' and 'e'. */
1350 case 'a':
1351 case 'c':
1352 case 'y':
1353 case 'h':
1354 case 'l':
1355 #endif
1356 return (scm_i_read_array (port, chr));
1357
1358 case 'i':
1359 case 'e':
1360 #if SCM_ENABLE_DEPRECATED
1361 {
1362 /* When next char is '(', it really is an old-style
1363 uniform array. */
1364 scm_t_wchar next_c = scm_getc (port);
1365 if (next_c != EOF)
1366 scm_ungetc (next_c, port);
1367 if (next_c == '(')
1368 return scm_i_read_array (port, chr);
1369 /* Fall through. */
1370 }
1371 #endif
1372 case 'b':
1373 case 'B':
1374 case 'o':
1375 case 'O':
1376 case 'd':
1377 case 'D':
1378 case 'x':
1379 case 'X':
1380 case 'I':
1381 case 'E':
1382 return (scm_read_number_and_radix (chr, port));
1383 case '{':
1384 return (scm_read_extended_symbol (chr, port));
1385 case '!':
1386 return (scm_read_shebang (chr, port));
1387 case ';':
1388 return (scm_read_commented_expression (chr, port));
1389 case '`':
1390 case '\'':
1391 case ',':
1392 return (scm_read_syntax (chr, port));
1393 case 'n':
1394 return (scm_read_nil (chr, port));
1395 default:
1396 result = scm_read_sharp_extension (chr, port);
1397 if (scm_is_eq (result, SCM_UNSPECIFIED))
1398 {
1399 /* To remain compatible with 1.8 and earlier, the following
1400 characters have lower precedence than `read-hash-extend'
1401 characters. */
1402 switch (chr)
1403 {
1404 case '|':
1405 return scm_read_r6rs_block_comment (chr, port);
1406 default:
1407 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1408 scm_list_1 (SCM_MAKE_CHAR (chr)));
1409 }
1410 }
1411 else
1412 return result;
1413 }
1414
1415 return SCM_UNSPECIFIED;
1416 }
1417 #undef FUNC_NAME
1418
1419 static SCM
1420 scm_read_expression (SCM port)
1421 #define FUNC_NAME "scm_read_expression"
1422 {
1423 while (1)
1424 {
1425 register scm_t_wchar chr;
1426
1427 chr = scm_getc (port);
1428
1429 switch (chr)
1430 {
1431 case SCM_WHITE_SPACES:
1432 case SCM_LINE_INCREMENTORS:
1433 break;
1434 case ';':
1435 (void) scm_read_semicolon_comment (chr, port);
1436 break;
1437 case '[':
1438 if (!SCM_SQUARE_BRACKETS_P)
1439 return (scm_read_mixed_case_symbol (chr, port));
1440 /* otherwise fall through */
1441 case '(':
1442 return (scm_read_sexp (chr, port));
1443 case '"':
1444 return (scm_read_string (chr, port));
1445 case '\'':
1446 case '`':
1447 case ',':
1448 return (scm_read_quote (chr, port));
1449 case '#':
1450 {
1451 SCM result;
1452 result = scm_read_sharp (chr, port);
1453 if (scm_is_eq (result, SCM_UNSPECIFIED))
1454 /* We read a comment or some such. */
1455 break;
1456 else
1457 return result;
1458 }
1459 case ')':
1460 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1461 break;
1462 case ']':
1463 if (SCM_SQUARE_BRACKETS_P)
1464 scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1465 /* otherwise fall through */
1466 case EOF:
1467 return SCM_EOF_VAL;
1468 case ':':
1469 if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1470 return scm_symbol_to_keyword (scm_read_expression (port));
1471 /* Fall through. */
1472
1473 default:
1474 {
1475 if (((chr >= '0') && (chr <= '9'))
1476 || (strchr ("+-.", chr)))
1477 return (scm_read_number (chr, port));
1478 else
1479 return (scm_read_mixed_case_symbol (chr, port));
1480 }
1481 }
1482 }
1483 }
1484 #undef FUNC_NAME
1485
1486 \f
1487 /* Actual reader. */
1488
1489 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1490 (SCM port),
1491 "Read an s-expression from the input port @var{port}, or from\n"
1492 "the current input port if @var{port} is not specified.\n"
1493 "Any whitespace before the next token is discarded.")
1494 #define FUNC_NAME s_scm_read
1495 {
1496 int c;
1497
1498 if (SCM_UNBNDP (port))
1499 port = scm_current_input_port ();
1500 SCM_VALIDATE_OPINPORT (1, port);
1501
1502 c = flush_ws (port, (char *) NULL);
1503 if (EOF == c)
1504 return SCM_EOF_VAL;
1505 scm_ungetc (c, port);
1506
1507 return (scm_read_expression (port));
1508 }
1509 #undef FUNC_NAME
1510
1511
1512 \f
1513
1514 /* Used when recording expressions constructed by `scm_read_sharp ()'. */
1515 static SCM
1516 recsexpr (SCM obj, long line, int column, SCM filename)
1517 {
1518 if (!scm_is_pair(obj)) {
1519 return obj;
1520 } else {
1521 SCM tmp = obj, copy;
1522 /* If this sexpr is visible in the read:sharp source, we want to
1523 keep that information, so only record non-constant cons cells
1524 which haven't previously been read by the reader. */
1525 if (scm_is_false (scm_whash_lookup (scm_source_whash, obj)))
1526 {
1527 if (SCM_COPY_SOURCE_P)
1528 {
1529 copy = scm_cons (recsexpr (SCM_CAR (obj), line, column, filename),
1530 SCM_UNDEFINED);
1531 while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1532 {
1533 SCM_SETCDR (copy, scm_cons (recsexpr (SCM_CAR (tmp),
1534 line,
1535 column,
1536 filename),
1537 SCM_UNDEFINED));
1538 copy = SCM_CDR (copy);
1539 }
1540 SCM_SETCDR (copy, tmp);
1541 }
1542 else
1543 {
1544 recsexpr (SCM_CAR (obj), line, column, filename);
1545 while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1546 recsexpr (SCM_CAR (tmp), line, column, filename);
1547 copy = SCM_UNDEFINED;
1548 }
1549 scm_whash_insert (scm_source_whash,
1550 obj,
1551 scm_make_srcprops (line,
1552 column,
1553 filename,
1554 copy,
1555 SCM_EOL));
1556 }
1557 return obj;
1558 }
1559 }
1560
1561 /* Manipulate the read-hash-procedures alist. This could be written in
1562 Scheme, but maybe it will also be used by C code during initialisation. */
1563 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1564 (SCM chr, SCM proc),
1565 "Install the procedure @var{proc} for reading expressions\n"
1566 "starting with the character sequence @code{#} and @var{chr}.\n"
1567 "@var{proc} will be called with two arguments: the character\n"
1568 "@var{chr} and the port to read further data from. The object\n"
1569 "returned will be the return value of @code{read}. \n"
1570 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1571 )
1572 #define FUNC_NAME s_scm_read_hash_extend
1573 {
1574 SCM this;
1575 SCM prev;
1576
1577 SCM_VALIDATE_CHAR (1, chr);
1578 SCM_ASSERT (scm_is_false (proc)
1579 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1580 proc, SCM_ARG2, FUNC_NAME);
1581
1582 /* Check if chr is already in the alist. */
1583 this = scm_i_read_hash_procedures_ref ();
1584 prev = SCM_BOOL_F;
1585 while (1)
1586 {
1587 if (scm_is_null (this))
1588 {
1589 /* not found, so add it to the beginning. */
1590 if (scm_is_true (proc))
1591 {
1592 SCM new = scm_cons (scm_cons (chr, proc),
1593 scm_i_read_hash_procedures_ref ());
1594 scm_i_read_hash_procedures_set_x (new);
1595 }
1596 break;
1597 }
1598 if (scm_is_eq (chr, SCM_CAAR (this)))
1599 {
1600 /* already in the alist. */
1601 if (scm_is_false (proc))
1602 {
1603 /* remove it. */
1604 if (scm_is_false (prev))
1605 {
1606 SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1607 scm_i_read_hash_procedures_set_x (rest);
1608 }
1609 else
1610 scm_set_cdr_x (prev, SCM_CDR (this));
1611 }
1612 else
1613 {
1614 /* replace it. */
1615 scm_set_cdr_x (SCM_CAR (this), proc);
1616 }
1617 break;
1618 }
1619 prev = this;
1620 this = SCM_CDR (this);
1621 }
1622
1623 return SCM_UNSPECIFIED;
1624 }
1625 #undef FUNC_NAME
1626
1627 /* Recover the read-hash procedure corresponding to char c. */
1628 static SCM
1629 scm_get_hash_procedure (int c)
1630 {
1631 SCM rest = scm_i_read_hash_procedures_ref ();
1632
1633 while (1)
1634 {
1635 if (scm_is_null (rest))
1636 return SCM_BOOL_F;
1637
1638 if (SCM_CHAR (SCM_CAAR (rest)) == c)
1639 return SCM_CDAR (rest);
1640
1641 rest = SCM_CDR (rest);
1642 }
1643 }
1644
1645 #define SCM_ENCODING_SEARCH_SIZE (500)
1646
1647 /* Search the first few hundred characters of a file for an Emacs-like coding
1648 declaration. Returns either NULL or a string whose storage has been
1649 allocated with `scm_gc_malloc ()'. */
1650 char *
1651 scm_i_scan_for_encoding (SCM port)
1652 {
1653 scm_t_port *pt;
1654 char header[SCM_ENCODING_SEARCH_SIZE+1];
1655 size_t bytes_read, encoding_length, i;
1656 char *encoding = NULL;
1657 int utf8_bom = 0;
1658 char *pos, *encoding_start;
1659 int in_comment;
1660
1661 pt = SCM_PTAB_ENTRY (port);
1662
1663 if (pt->rw_active == SCM_PORT_WRITE)
1664 scm_flush (port);
1665
1666 if (pt->rw_random)
1667 pt->rw_active = SCM_PORT_READ;
1668
1669 if (pt->read_pos == pt->read_end)
1670 {
1671 /* We can use the read buffer, and thus avoid a seek. */
1672 if (scm_fill_input (port) == EOF)
1673 return NULL;
1674
1675 bytes_read = pt->read_end - pt->read_pos;
1676 if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
1677 bytes_read = SCM_ENCODING_SEARCH_SIZE;
1678
1679 if (bytes_read <= 1)
1680 /* An unbuffered port -- don't scan. */
1681 return NULL;
1682
1683 memcpy (header, pt->read_pos, bytes_read);
1684 header[bytes_read] = '\0';
1685 }
1686 else
1687 {
1688 /* Try to read some bytes and then seek back. Not all ports
1689 support seeking back; and indeed some file ports (like
1690 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
1691 check performed by SCM_FPORT_FDES---but fail to seek
1692 backwards. Hence this block comes second. We prefer to use
1693 the read buffer in-place. */
1694 if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1695 return NULL;
1696
1697 bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1698 header[bytes_read] = '\0';
1699 scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1700 }
1701
1702 if (bytes_read > 3
1703 && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1704 utf8_bom = 1;
1705
1706 /* search past "coding[:=]" */
1707 pos = header;
1708 while (1)
1709 {
1710 if ((pos = strstr(pos, "coding")) == NULL)
1711 return NULL;
1712
1713 pos += strlen("coding");
1714 if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1715 (*pos == ':' || *pos == '='))
1716 {
1717 pos ++;
1718 break;
1719 }
1720 }
1721
1722 /* skip spaces */
1723 while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1724 (*pos == ' ' || *pos == '\t'))
1725 pos ++;
1726
1727 /* grab the next token */
1728 encoding_start = pos;
1729 i = 0;
1730 while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
1731 && encoding_start + i - header < bytes_read
1732 && (isalnum ((int) encoding_start[i])
1733 || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
1734 i++;
1735
1736 encoding_length = i;
1737 if (encoding_length == 0)
1738 return NULL;
1739
1740 encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
1741 for (i = 0; i < encoding_length; i++)
1742 encoding[i] = toupper ((int) encoding[i]);
1743
1744 /* push backwards to make sure we were in a comment */
1745 in_comment = 0;
1746 pos = encoding_start;
1747 while (pos >= header)
1748 {
1749 if (*pos == '\n')
1750 {
1751 /* This wasn't in a semicolon comment. Check for a
1752 hash-bang comment. */
1753 char *beg = strstr (header, "#!");
1754 char *end = strstr (header, "!#");
1755 if (beg < encoding_start && encoding_start + encoding_length < end)
1756 in_comment = 1;
1757 break;
1758 }
1759 if (*pos == ';')
1760 {
1761 in_comment = 1;
1762 break;
1763 }
1764 pos --;
1765 }
1766 if (!in_comment)
1767 /* This wasn't in a comment */
1768 return NULL;
1769
1770 if (utf8_bom && strcmp(encoding, "UTF-8"))
1771 scm_misc_error (NULL,
1772 "the port input declares the encoding ~s but is encoded as UTF-8",
1773 scm_list_1 (scm_from_locale_string (encoding)));
1774
1775 return encoding;
1776 }
1777
1778 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1779 (SCM port),
1780 "Scans the port for an Emacs-like character coding declaration\n"
1781 "near the top of the contents of a port with random-accessible contents.\n"
1782 "The coding declaration is of the form\n"
1783 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1784 "\n"
1785 "Returns a string containing the character encoding of the file\n"
1786 "if a declaration was found, or @code{#f} otherwise.\n")
1787 #define FUNC_NAME s_scm_file_encoding
1788 {
1789 char *enc;
1790 SCM s_enc;
1791
1792 SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
1793
1794 enc = scm_i_scan_for_encoding (port);
1795 if (enc == NULL)
1796 return SCM_BOOL_F;
1797 else
1798 {
1799 s_enc = scm_from_locale_string (enc);
1800 return s_enc;
1801 }
1802
1803 return SCM_BOOL_F;
1804 }
1805 #undef FUNC_NAME
1806
1807 void
1808 scm_init_read ()
1809 {
1810 SCM read_hash_procs;
1811
1812 read_hash_procs = scm_make_fluid ();
1813 scm_fluid_set_x (read_hash_procs, SCM_EOL);
1814
1815 scm_i_read_hash_procedures =
1816 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
1817
1818 scm_init_opts (scm_read_options, scm_read_opts);
1819 #include "libguile/read.x"
1820 }
1821
1822 /*
1823 Local Variables:
1824 c-file-style: "gnu"
1825 End:
1826 */