implement r6rs hungry escaped EOL
[bpt/guile.git] / libguile / read.c
1 /* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
2 * Foundation, Inc.
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
18 */
19
20
21 \f
22
23 #ifdef HAVE_CONFIG_H
24 # include <config.h>
25 #endif
26
27 #include <stdio.h>
28 #include <ctype.h>
29 #include <string.h>
30 #include <unistd.h>
31 #include <unicase.h>
32 #include <unictype.h>
33
34 #include "libguile/_scm.h"
35 #include "libguile/bytevectors.h"
36 #include "libguile/chars.h"
37 #include "libguile/eval.h"
38 #include "libguile/arrays.h"
39 #include "libguile/bitvectors.h"
40 #include "libguile/keywords.h"
41 #include "libguile/alist.h"
42 #include "libguile/srcprop.h"
43 #include "libguile/hashtab.h"
44 #include "libguile/hash.h"
45 #include "libguile/ports.h"
46 #include "libguile/fports.h"
47 #include "libguile/root.h"
48 #include "libguile/strings.h"
49 #include "libguile/strports.h"
50 #include "libguile/vectors.h"
51 #include "libguile/validate.h"
52 #include "libguile/srfi-4.h"
53 #include "libguile/srfi-13.h"
54
55 #include "libguile/read.h"
56 #include "libguile/private-options.h"
57
58
59 \f
60
61 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
62 SCM_SYMBOL (scm_keyword_prefix, "prefix");
63 SCM_SYMBOL (scm_keyword_postfix, "postfix");
64 SCM_SYMBOL (sym_nil, "nil");
65
66 scm_t_option scm_read_opts[] = {
67 { SCM_OPTION_BOOLEAN, "copy", 0,
68 "Copy source code expressions." },
69 { SCM_OPTION_BOOLEAN, "positions", 0,
70 "Record positions of source code expressions." },
71 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
72 "Convert symbols to lower case."},
73 { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F,
74 "Style of keyword recognition: #f, 'prefix or 'postfix."},
75 { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
76 "Use R6RS variable-length character and string hex escapes."},
77 { SCM_OPTION_BOOLEAN, "square-brackets", 1,
78 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
79 { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
80 "In strings, consume leading whitespace after an escaped end-of-line."},
81 { 0, },
82 };
83
84 /*
85 Give meaningful error messages for errors
86
87 We use the format
88
89 FILE:LINE:COL: MESSAGE
90 This happened in ....
91
92 This is not standard GNU format, but the test-suite likes the real
93 message to be in front.
94
95 */
96
97
98 void
99 scm_i_input_error (char const *function,
100 SCM port, const char *message, SCM arg)
101 {
102 SCM fn = (scm_is_string (SCM_FILENAME(port))
103 ? SCM_FILENAME(port)
104 : scm_from_locale_string ("#<unknown port>"));
105
106 SCM string_port = scm_open_output_string ();
107 SCM string = SCM_EOL;
108 scm_simple_format (string_port,
109 scm_from_locale_string ("~A:~S:~S: ~A"),
110 scm_list_4 (fn,
111 scm_from_long (SCM_LINUM (port) + 1),
112 scm_from_int (SCM_COL (port) + 1),
113 scm_from_locale_string (message)));
114
115 string = scm_get_output_string (string_port);
116 scm_close_output_port (string_port);
117 scm_error_scm (scm_from_latin1_symbol ("read-error"),
118 function? scm_from_locale_string (function) : SCM_BOOL_F,
119 string,
120 arg,
121 SCM_BOOL_F);
122 }
123
124
125 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
126 (SCM setting),
127 "Option interface for the read options. Instead of using\n"
128 "this procedure directly, use the procedures @code{read-enable},\n"
129 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
130 #define FUNC_NAME s_scm_read_options
131 {
132 SCM ans = scm_options (setting,
133 scm_read_opts,
134 FUNC_NAME);
135 if (SCM_COPY_SOURCE_P)
136 SCM_RECORD_POSITIONS_P = 1;
137 return ans;
138 }
139 #undef FUNC_NAME
140
141 /* A fluid referring to an association list mapping extra hash
142 characters to procedures. */
143 static SCM *scm_i_read_hash_procedures;
144
145 static inline SCM
146 scm_i_read_hash_procedures_ref (void)
147 {
148 return scm_fluid_ref (*scm_i_read_hash_procedures);
149 }
150
151 static inline void
152 scm_i_read_hash_procedures_set_x (SCM value)
153 {
154 scm_fluid_set_x (*scm_i_read_hash_procedures, value);
155 }
156
157 \f
158 /* Token readers. */
159
160
161 /* Size of the C buffer used to read symbols and numbers. */
162 #define READER_BUFFER_SIZE 128
163
164 /* Size of the C buffer used to read strings. */
165 #define READER_STRING_BUFFER_SIZE 512
166
167 /* The maximum size of Scheme character names. */
168 #define READER_CHAR_NAME_MAX_SIZE 50
169
170
171 /* `isblank' is only in C99. */
172 #define CHAR_IS_BLANK_(_chr) \
173 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
174 || ((_chr) == '\f') || ((_chr) == '\r'))
175
176 #ifdef MSDOS
177 # define CHAR_IS_BLANK(_chr) \
178 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
179 #else
180 # define CHAR_IS_BLANK CHAR_IS_BLANK_
181 #endif
182
183
184 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
185 structure''). */
186 #define CHAR_IS_R5RS_DELIMITER(c) \
187 (CHAR_IS_BLANK (c) \
188 || (c == ')') || (c == '(') || (c == ';') || (c == '"') \
189 || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
190
191 #define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
192
193 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
194 Structure''. */
195 #define CHAR_IS_EXPONENT_MARKER(_chr) \
196 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
197 || ((_chr) == 'd') || ((_chr) == 'l'))
198
199 /* Read an SCSH block comment. */
200 static inline SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
201 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
202 static SCM scm_read_commented_expression (scm_t_wchar, SCM);
203 static SCM scm_read_shebang (scm_t_wchar, SCM);
204 static SCM scm_get_hash_procedure (int);
205
206 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
207 result in the pre-allocated buffer BUF. Return zero if the whole token has
208 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
209 bytes actually read. */
210 static inline int
211 read_token (SCM port, char *buf, const size_t buf_size, size_t *read)
212 {
213 *read = 0;
214
215 while (*read < buf_size)
216 {
217 int chr;
218
219 chr = scm_get_byte_or_eof (port);
220
221 if (chr == EOF)
222 return 0;
223 else if (CHAR_IS_DELIMITER (chr))
224 {
225 scm_unget_byte (chr, port);
226 return 0;
227 }
228 else
229 {
230 *buf = (char) chr;
231 buf++, (*read)++;
232 }
233 }
234
235 return 1;
236 }
237
238 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
239 result in the pre-allocated buffer BUFFER, if the whole token has fewer than
240 BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
241 caller. Return zero if the token fits in BUFFER, non-zero otherwise. READ
242 will be set the number of bytes actually read. */
243 static int
244 read_complete_token (SCM port, char *buffer, const size_t buffer_size,
245 char **overflow_buffer, size_t *read)
246 {
247 int overflow = 0;
248 size_t bytes_read, overflow_size;
249
250 *overflow_buffer = NULL;
251 overflow_size = 0;
252
253 do
254 {
255 overflow = read_token (port, buffer, buffer_size, &bytes_read);
256 if (bytes_read == 0)
257 break;
258 if (overflow || overflow_size != 0)
259 {
260 if (overflow_size == 0)
261 {
262 *overflow_buffer = scm_malloc (bytes_read);
263 memcpy (*overflow_buffer, buffer, bytes_read);
264 overflow_size = bytes_read;
265 }
266 else
267 {
268 *overflow_buffer = scm_realloc (*overflow_buffer, overflow_size + bytes_read);
269 memcpy (*overflow_buffer + overflow_size, buffer, bytes_read);
270 overflow_size += bytes_read;
271 }
272 }
273 }
274 while (overflow);
275
276 if (overflow_size)
277 *read = overflow_size;
278 else
279 *read = bytes_read;
280
281 return (overflow_size != 0);
282 }
283
284 /* Skip whitespace from PORT and return the first non-whitespace character
285 read. Raise an error on end-of-file. */
286 static int
287 flush_ws (SCM port, const char *eoferr)
288 {
289 register scm_t_wchar c;
290 while (1)
291 switch (c = scm_getc (port))
292 {
293 case EOF:
294 goteof:
295 if (eoferr)
296 {
297 scm_i_input_error (eoferr,
298 port,
299 "end of file",
300 SCM_EOL);
301 }
302 return c;
303
304 case ';':
305 lp:
306 switch (c = scm_getc (port))
307 {
308 case EOF:
309 goto goteof;
310 default:
311 goto lp;
312 case SCM_LINE_INCREMENTORS:
313 break;
314 }
315 break;
316
317 case '#':
318 switch (c = scm_getc (port))
319 {
320 case EOF:
321 eoferr = "read_sharp";
322 goto goteof;
323 case '!':
324 scm_read_shebang (c, port);
325 break;
326 case ';':
327 scm_read_commented_expression (c, port);
328 break;
329 case '|':
330 if (scm_is_false (scm_get_hash_procedure (c)))
331 {
332 scm_read_r6rs_block_comment (c, port);
333 break;
334 }
335 /* fall through */
336 default:
337 scm_ungetc (c, port);
338 return '#';
339 }
340 break;
341
342 case SCM_LINE_INCREMENTORS:
343 case SCM_SINGLE_SPACES:
344 case '\t':
345 break;
346
347 default:
348 return c;
349 }
350
351 return 0;
352 }
353
354
355 \f
356 /* Token readers. */
357
358 static SCM scm_read_expression (SCM port);
359 static SCM scm_read_sharp (int chr, SCM port);
360 static SCM recsexpr (SCM obj, long line, int column, SCM filename);
361
362
363 static SCM
364 scm_read_sexp (scm_t_wchar chr, SCM port)
365 #define FUNC_NAME "scm_i_lreadparen"
366 {
367 register int c;
368 register SCM tmp;
369 register SCM tl, ans = SCM_EOL;
370 SCM tl2 = SCM_EOL, ans2 = SCM_EOL, copy = SCM_BOOL_F;
371 const int terminating_char = ((chr == '[') ? ']' : ')');
372
373 /* Need to capture line and column numbers here. */
374 long line = SCM_LINUM (port);
375 int column = SCM_COL (port) - 1;
376
377
378 c = flush_ws (port, FUNC_NAME);
379 if (terminating_char == c)
380 return SCM_EOL;
381
382 scm_ungetc (c, port);
383 if (scm_is_eq (scm_sym_dot,
384 (tmp = scm_read_expression (port))))
385 {
386 ans = scm_read_expression (port);
387 if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
388 scm_i_input_error (FUNC_NAME, port, "missing close paren",
389 SCM_EOL);
390 return ans;
391 }
392
393 /* Build the head of the list structure. */
394 ans = tl = scm_cons (tmp, SCM_EOL);
395
396 if (SCM_COPY_SOURCE_P)
397 ans2 = tl2 = scm_cons (scm_is_pair (tmp)
398 ? copy
399 : tmp,
400 SCM_EOL);
401
402 while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
403 {
404 SCM new_tail;
405
406 if (c == ')' || (SCM_SQUARE_BRACKETS_P && c == ']'))
407 scm_i_input_error (FUNC_NAME, port,
408 "in pair: mismatched close paren: ~A",
409 scm_list_1 (SCM_MAKE_CHAR (c)));
410
411 scm_ungetc (c, port);
412 tmp = scm_read_expression (port);
413
414 if (scm_is_eq (scm_sym_dot, tmp))
415 {
416 SCM_SETCDR (tl, tmp = scm_read_expression (port));
417
418 if (SCM_COPY_SOURCE_P)
419 SCM_SETCDR (tl2, scm_cons (scm_is_pair (tmp) ? copy : tmp,
420 SCM_EOL));
421
422 c = flush_ws (port, FUNC_NAME);
423 if (terminating_char != c)
424 scm_i_input_error (FUNC_NAME, port,
425 "in pair: missing close paren", SCM_EOL);
426 goto exit;
427 }
428
429 new_tail = scm_cons (tmp, SCM_EOL);
430 SCM_SETCDR (tl, new_tail);
431 tl = new_tail;
432
433 if (SCM_COPY_SOURCE_P)
434 {
435 SCM new_tail2 = scm_cons (scm_is_pair (tmp)
436 ? copy
437 : tmp, SCM_EOL);
438 SCM_SETCDR (tl2, new_tail2);
439 tl2 = new_tail2;
440 }
441 }
442
443 exit:
444 if (SCM_RECORD_POSITIONS_P)
445 scm_whash_insert (scm_source_whash,
446 ans,
447 scm_make_srcprops (line, column,
448 SCM_FILENAME (port),
449 SCM_COPY_SOURCE_P
450 ? ans2
451 : SCM_UNDEFINED,
452 SCM_EOL));
453 return ans;
454 }
455 #undef FUNC_NAME
456
457
458 /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
459 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
460 found. */
461 #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
462 do \
463 { \
464 scm_t_wchar a; \
465 size_t i = 0; \
466 c = 0; \
467 while (i < ndigits) \
468 { \
469 a = scm_getc (port); \
470 if (a == EOF) \
471 goto str_eof; \
472 if (terminator \
473 && (a == (scm_t_wchar) terminator) \
474 && (i > 0)) \
475 break; \
476 if ('0' <= a && a <= '9') \
477 a -= '0'; \
478 else if ('A' <= a && a <= 'F') \
479 a = a - 'A' + 10; \
480 else if ('a' <= a && a <= 'f') \
481 a = a - 'a' + 10; \
482 else \
483 { \
484 c = a; \
485 goto bad_escaped; \
486 } \
487 c = c * 16 + a; \
488 i ++; \
489 } \
490 } while (0)
491
492 static void
493 skip_intraline_whitespace (SCM port)
494 {
495 scm_t_wchar c;
496
497 do
498 {
499 c = scm_getc (port);
500 if (c == EOF)
501 return;
502 }
503 while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
504
505 scm_ungetc (c, port);
506 }
507
508 static SCM
509 scm_read_string (int chr, SCM port)
510 #define FUNC_NAME "scm_lreadr"
511 {
512 /* For strings smaller than C_STR, this function creates only one Scheme
513 object (the string returned). */
514
515 SCM str = SCM_BOOL_F;
516 unsigned c_str_len = 0;
517 scm_t_wchar c;
518
519 str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
520 while ('"' != (c = scm_getc (port)))
521 {
522 if (c == EOF)
523 {
524 str_eof:
525 scm_i_input_error (FUNC_NAME, port,
526 "end of file in string constant", SCM_EOL);
527 }
528
529 if (c_str_len + 1 >= scm_i_string_length (str))
530 {
531 SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
532
533 str = scm_string_append (scm_list_2 (str, addy));
534 }
535
536 if (c == '\\')
537 {
538 switch (c = scm_getc (port))
539 {
540 case EOF:
541 goto str_eof;
542 case '"':
543 case '\\':
544 break;
545 case '\n':
546 if (SCM_HUNGRY_EOL_ESCAPES_P)
547 skip_intraline_whitespace (port);
548 continue;
549 case '0':
550 c = '\0';
551 break;
552 case 'f':
553 c = '\f';
554 break;
555 case 'n':
556 c = '\n';
557 break;
558 case 'r':
559 c = '\r';
560 break;
561 case 't':
562 c = '\t';
563 break;
564 case 'a':
565 c = '\007';
566 break;
567 case 'v':
568 c = '\v';
569 break;
570 case 'b':
571 c = '\010';
572 break;
573 case 'x':
574 if (SCM_R6RS_ESCAPES_P)
575 SCM_READ_HEX_ESCAPE (10, ';');
576 else
577 SCM_READ_HEX_ESCAPE (2, '\0');
578 break;
579 case 'u':
580 if (!SCM_R6RS_ESCAPES_P)
581 {
582 SCM_READ_HEX_ESCAPE (4, '\0');
583 break;
584 }
585 case 'U':
586 if (!SCM_R6RS_ESCAPES_P)
587 {
588 SCM_READ_HEX_ESCAPE (6, '\0');
589 break;
590 }
591 default:
592 bad_escaped:
593 scm_i_input_error (FUNC_NAME, port,
594 "illegal character in escape sequence: ~S",
595 scm_list_1 (SCM_MAKE_CHAR (c)));
596 }
597 }
598 str = scm_i_string_start_writing (str);
599 scm_i_string_set_x (str, c_str_len++, c);
600 scm_i_string_stop_writing ();
601 }
602
603 if (c_str_len > 0)
604 {
605 return scm_i_substring_copy (str, 0, c_str_len);
606 }
607
608 return scm_nullstr;
609 }
610 #undef FUNC_NAME
611
612
613 static SCM
614 scm_read_number (scm_t_wchar chr, SCM port)
615 {
616 SCM result, str = SCM_EOL;
617 char buffer[READER_BUFFER_SIZE];
618 char *overflow_buffer = NULL;
619 size_t bytes_read;
620 int overflow;
621 scm_t_port *pt = SCM_PTAB_ENTRY (port);
622
623 scm_ungetc (chr, port);
624 overflow = read_complete_token (port, buffer, sizeof (buffer),
625 &overflow_buffer, &bytes_read);
626
627 if (!overflow)
628 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
629 else
630 str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
631 pt->ilseq_handler);
632
633 result = scm_string_to_number (str, SCM_UNDEFINED);
634 if (!scm_is_true (result))
635 {
636 /* Return a symbol instead of a number */
637 if (SCM_CASE_INSENSITIVE_P)
638 str = scm_string_downcase_x (str);
639 result = scm_string_to_symbol (str);
640 }
641
642 if (overflow)
643 free (overflow_buffer);
644 SCM_COL (port) += scm_i_string_length (str);
645 return result;
646 }
647
648 static SCM
649 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
650 {
651 SCM result;
652 int ends_with_colon = 0;
653 size_t bytes_read;
654 int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
655 int overflow;
656 char buffer[READER_BUFFER_SIZE], *overflow_buffer;
657 scm_t_port *pt = SCM_PTAB_ENTRY (port);
658 SCM str;
659
660 scm_ungetc (chr, port);
661 overflow = read_complete_token (port, buffer, READER_BUFFER_SIZE,
662 &overflow_buffer, &bytes_read);
663 if (bytes_read > 0)
664 {
665 if (!overflow)
666 ends_with_colon = buffer[bytes_read - 1] == ':';
667 else
668 ends_with_colon = overflow_buffer[bytes_read - 1] == ':';
669 }
670
671 if (postfix && ends_with_colon && (bytes_read > 1))
672 {
673 if (!overflow)
674 str = scm_from_stringn (buffer, bytes_read - 1, pt->encoding, pt->ilseq_handler);
675 else
676 str = scm_from_stringn (overflow_buffer, bytes_read - 1, pt->encoding,
677 pt->ilseq_handler);
678
679 if (SCM_CASE_INSENSITIVE_P)
680 str = scm_string_downcase_x (str);
681 result = scm_symbol_to_keyword (scm_string_to_symbol (str));
682 }
683 else
684 {
685 if (!overflow)
686 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
687 else
688 str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
689 pt->ilseq_handler);
690
691 if (SCM_CASE_INSENSITIVE_P)
692 str = scm_string_downcase_x (str);
693 result = scm_string_to_symbol (str);
694 }
695
696 if (overflow)
697 free (overflow_buffer);
698 SCM_COL (port) += scm_i_string_length (str);
699 return result;
700 }
701
702 static SCM
703 scm_read_number_and_radix (scm_t_wchar chr, SCM port)
704 #define FUNC_NAME "scm_lreadr"
705 {
706 SCM result;
707 size_t read;
708 char buffer[READER_BUFFER_SIZE], *overflow_buffer;
709 int overflow;
710 unsigned int radix;
711 SCM str;
712 scm_t_port *pt;
713
714 switch (chr)
715 {
716 case 'B':
717 case 'b':
718 radix = 2;
719 break;
720
721 case 'o':
722 case 'O':
723 radix = 8;
724 break;
725
726 case 'd':
727 case 'D':
728 radix = 10;
729 break;
730
731 case 'x':
732 case 'X':
733 radix = 16;
734 break;
735
736 default:
737 scm_ungetc (chr, port);
738 scm_ungetc ('#', port);
739 radix = 10;
740 }
741
742 overflow = read_complete_token (port, buffer, sizeof (buffer),
743 &overflow_buffer, &read);
744
745 pt = SCM_PTAB_ENTRY (port);
746 if (!overflow)
747 str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
748 else
749 str = scm_from_stringn (overflow_buffer, read, pt->encoding,
750 pt->ilseq_handler);
751
752 result = scm_string_to_number (str, scm_from_uint (radix));
753
754 if (overflow)
755 free (overflow_buffer);
756
757 SCM_COL (port) += scm_i_string_length (str);
758
759 if (scm_is_true (result))
760 return result;
761
762 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
763
764 return SCM_BOOL_F;
765 }
766 #undef FUNC_NAME
767
768 static SCM
769 scm_read_quote (int chr, SCM port)
770 {
771 SCM p;
772 long line = SCM_LINUM (port);
773 int column = SCM_COL (port) - 1;
774
775 switch (chr)
776 {
777 case '`':
778 p = scm_sym_quasiquote;
779 break;
780
781 case '\'':
782 p = scm_sym_quote;
783 break;
784
785 case ',':
786 {
787 scm_t_wchar c;
788
789 c = scm_getc (port);
790 if ('@' == c)
791 p = scm_sym_uq_splicing;
792 else
793 {
794 scm_ungetc (c, port);
795 p = scm_sym_unquote;
796 }
797 break;
798 }
799
800 default:
801 fprintf (stderr, "%s: unhandled quote character (%i)\n",
802 "scm_read_quote", chr);
803 abort ();
804 }
805
806 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
807 if (SCM_RECORD_POSITIONS_P)
808 scm_whash_insert (scm_source_whash, p,
809 scm_make_srcprops (line, column,
810 SCM_FILENAME (port),
811 SCM_COPY_SOURCE_P
812 ? (scm_cons2 (SCM_CAR (p),
813 SCM_CAR (SCM_CDR (p)),
814 SCM_EOL))
815 : SCM_UNDEFINED,
816 SCM_EOL));
817
818
819 return p;
820 }
821
822 SCM_SYMBOL (sym_syntax, "syntax");
823 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
824 SCM_SYMBOL (sym_unsyntax, "unsyntax");
825 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
826
827 static SCM
828 scm_read_syntax (int chr, SCM port)
829 {
830 SCM p;
831 long line = SCM_LINUM (port);
832 int column = SCM_COL (port) - 1;
833
834 switch (chr)
835 {
836 case '`':
837 p = sym_quasisyntax;
838 break;
839
840 case '\'':
841 p = sym_syntax;
842 break;
843
844 case ',':
845 {
846 int c;
847
848 c = scm_getc (port);
849 if ('@' == c)
850 p = sym_unsyntax_splicing;
851 else
852 {
853 scm_ungetc (c, port);
854 p = sym_unsyntax;
855 }
856 break;
857 }
858
859 default:
860 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
861 "scm_read_syntax", chr);
862 abort ();
863 }
864
865 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
866 if (SCM_RECORD_POSITIONS_P)
867 scm_whash_insert (scm_source_whash, p,
868 scm_make_srcprops (line, column,
869 SCM_FILENAME (port),
870 SCM_COPY_SOURCE_P
871 ? (scm_cons2 (SCM_CAR (p),
872 SCM_CAR (SCM_CDR (p)),
873 SCM_EOL))
874 : SCM_UNDEFINED,
875 SCM_EOL));
876
877
878 return p;
879 }
880
881 static inline SCM
882 scm_read_nil (int chr, SCM port)
883 {
884 SCM id = scm_read_mixed_case_symbol (chr, port);
885
886 if (!scm_is_eq (id, sym_nil))
887 scm_i_input_error ("scm_read_nil", port,
888 "unexpected input while reading #nil: ~a",
889 scm_list_1 (id));
890
891 return SCM_ELISP_NIL;
892 }
893
894 static inline SCM
895 scm_read_semicolon_comment (int chr, SCM port)
896 {
897 int c;
898
899 /* We use the get_byte here because there is no need to get the
900 locale correct with comment input. This presumes that newline
901 always represents itself no matter what the encoding is. */
902 for (c = scm_get_byte_or_eof (port);
903 (c != EOF) && (c != '\n');
904 c = scm_get_byte_or_eof (port));
905
906 return SCM_UNSPECIFIED;
907 }
908
909 \f
910 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
911
912 static SCM
913 scm_read_boolean (int chr, SCM port)
914 {
915 switch (chr)
916 {
917 case 't':
918 case 'T':
919 return SCM_BOOL_T;
920
921 case 'f':
922 case 'F':
923 return SCM_BOOL_F;
924 }
925
926 return SCM_UNSPECIFIED;
927 }
928
929 static SCM
930 scm_read_character (scm_t_wchar chr, SCM port)
931 #define FUNC_NAME "scm_lreadr"
932 {
933 char buffer[READER_CHAR_NAME_MAX_SIZE];
934 SCM charname;
935 size_t charname_len, bytes_read;
936 scm_t_wchar cp;
937 int overflow;
938 scm_t_port *pt;
939
940 overflow = read_token (port, buffer, READER_CHAR_NAME_MAX_SIZE, &bytes_read);
941 if (overflow)
942 scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
943
944 if (bytes_read == 0)
945 {
946 chr = scm_getc (port);
947 if (chr == EOF)
948 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
949 "while reading character", SCM_EOL);
950
951 /* CHR must be a token delimiter, like a whitespace. */
952 return (SCM_MAKE_CHAR (chr));
953 }
954
955 pt = SCM_PTAB_ENTRY (port);
956
957 /* Simple ASCII characters can be processed immediately. Also, simple
958 ISO-8859-1 characters can be processed immediately if the encoding for this
959 port is ISO-8859-1. */
960 if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
961 {
962 SCM_COL (port) += 1;
963 return SCM_MAKE_CHAR (buffer[0]);
964 }
965
966 /* Otherwise, convert the buffer into a proper scheme string for
967 processing. */
968 charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
969 pt->ilseq_handler);
970 charname_len = scm_i_string_length (charname);
971 SCM_COL (port) += charname_len;
972 cp = scm_i_string_ref (charname, 0);
973 if (charname_len == 1)
974 return SCM_MAKE_CHAR (cp);
975
976 /* Ignore dotted circles, which may be used to keep combining characters from
977 combining with the backslash in #\charname. */
978 if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
979 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
980
981 if (cp >= '0' && cp < '8')
982 {
983 /* Dirk:FIXME:: This type of character syntax is not R5RS
984 * compliant. Further, it should be verified that the constant
985 * does only consist of octal digits. */
986 SCM p = scm_string_to_number (charname, scm_from_uint (8));
987 if (SCM_I_INUMP (p))
988 {
989 scm_t_wchar c = scm_to_uint32 (p);
990 if (SCM_IS_UNICODE_CHAR (c))
991 return SCM_MAKE_CHAR (c);
992 else
993 scm_i_input_error (FUNC_NAME, port,
994 "out-of-range octal character escape: ~a",
995 scm_list_1 (charname));
996 }
997 }
998
999 if (cp == 'x' && (charname_len > 1))
1000 {
1001 SCM p;
1002
1003 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1004 p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
1005 scm_from_uint (16));
1006 if (SCM_I_INUMP (p))
1007 {
1008 scm_t_wchar c = scm_to_uint32 (p);
1009 if (SCM_IS_UNICODE_CHAR (c))
1010 return SCM_MAKE_CHAR (c);
1011 else
1012 scm_i_input_error (FUNC_NAME, port,
1013 "out-of-range hex character escape: ~a",
1014 scm_list_1 (charname));
1015 }
1016 }
1017
1018 /* The names of characters should never have non-Latin1
1019 characters. */
1020 if (scm_i_is_narrow_string (charname)
1021 || scm_i_try_narrow_string (charname))
1022 { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
1023 charname_len);
1024 if (scm_is_true (ch))
1025 return ch;
1026 }
1027
1028 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
1029 scm_list_1 (charname));
1030
1031 return SCM_UNSPECIFIED;
1032 }
1033 #undef FUNC_NAME
1034
1035 static inline SCM
1036 scm_read_keyword (int chr, SCM port)
1037 {
1038 SCM symbol;
1039
1040 /* Read the symbol that comprises the keyword. Doing this instead of
1041 invoking a specific symbol reader function allows `scm_read_keyword ()'
1042 to adapt to the delimiters currently valid of symbols.
1043
1044 XXX: This implementation allows sloppy syntaxes like `#: key'. */
1045 symbol = scm_read_expression (port);
1046 if (!scm_is_symbol (symbol))
1047 scm_i_input_error ("scm_read_keyword", port,
1048 "keyword prefix `~a' not followed by a symbol: ~s",
1049 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1050
1051 return (scm_symbol_to_keyword (symbol));
1052 }
1053
1054 static inline SCM
1055 scm_read_vector (int chr, SCM port)
1056 {
1057 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1058 guarantee that it's going to do what we want. After all, this is an
1059 implementation detail of `scm_read_vector ()', not a desirable
1060 property. */
1061 return (scm_vector (scm_read_sexp (chr, port)));
1062 }
1063
1064 static inline SCM
1065 scm_read_srfi4_vector (int chr, SCM port)
1066 {
1067 return scm_i_read_array (port, chr);
1068 }
1069
1070 static SCM
1071 scm_read_bytevector (scm_t_wchar chr, SCM port)
1072 {
1073 chr = scm_getc (port);
1074 if (chr != 'u')
1075 goto syntax;
1076
1077 chr = scm_getc (port);
1078 if (chr != '8')
1079 goto syntax;
1080
1081 chr = scm_getc (port);
1082 if (chr != '(')
1083 goto syntax;
1084
1085 return scm_u8_list_to_bytevector (scm_read_sexp (chr, port));
1086
1087 syntax:
1088 scm_i_input_error ("read_bytevector", port,
1089 "invalid bytevector prefix",
1090 SCM_MAKE_CHAR (chr));
1091 return SCM_UNSPECIFIED;
1092 }
1093
1094 static SCM
1095 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port)
1096 {
1097 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1098 terribly inefficient but who cares? */
1099 SCM s_bits = SCM_EOL;
1100
1101 for (chr = scm_getc (port);
1102 (chr != EOF) && ((chr == '0') || (chr == '1'));
1103 chr = scm_getc (port))
1104 {
1105 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1106 }
1107
1108 if (chr != EOF)
1109 scm_ungetc (chr, port);
1110
1111 return scm_bitvector (scm_reverse_x (s_bits, SCM_EOL));
1112 }
1113
1114 static inline SCM
1115 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1116 {
1117 int bang_seen = 0;
1118
1119 /* We can use the get_byte here because there is no need to get the
1120 locale correct when reading comments. This presumes that
1121 hash and exclamation points always represent themselves no
1122 matter what the source encoding is.*/
1123 for (;;)
1124 {
1125 int c = scm_get_byte_or_eof (port);
1126
1127 if (c == EOF)
1128 scm_i_input_error ("skip_block_comment", port,
1129 "unterminated `#! ... !#' comment", SCM_EOL);
1130
1131 if (c == '!')
1132 bang_seen = 1;
1133 else if (c == '#' && bang_seen)
1134 break;
1135 else
1136 bang_seen = 0;
1137 }
1138
1139 return SCM_UNSPECIFIED;
1140 }
1141
1142 static inline SCM
1143 scm_read_shebang (scm_t_wchar chr, SCM port)
1144 {
1145 int c = 0;
1146 if ((c = scm_get_byte_or_eof (port)) != 'r')
1147 {
1148 scm_ungetc (c, port);
1149 return scm_read_scsh_block_comment (chr, port);
1150 }
1151 if ((c = scm_get_byte_or_eof (port)) != '6')
1152 {
1153 scm_ungetc (c, port);
1154 scm_ungetc ('r', port);
1155 return scm_read_scsh_block_comment (chr, port);
1156 }
1157 if ((c = scm_get_byte_or_eof (port)) != 'r')
1158 {
1159 scm_ungetc (c, port);
1160 scm_ungetc ('6', port);
1161 scm_ungetc ('r', port);
1162 return scm_read_scsh_block_comment (chr, port);
1163 }
1164 if ((c = scm_get_byte_or_eof (port)) != 's')
1165 {
1166 scm_ungetc (c, port);
1167 scm_ungetc ('r', port);
1168 scm_ungetc ('6', port);
1169 scm_ungetc ('r', port);
1170 return scm_read_scsh_block_comment (chr, port);
1171 }
1172
1173 return SCM_UNSPECIFIED;
1174 }
1175
1176 static SCM
1177 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1178 {
1179 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1180 nested. So care must be taken. */
1181 int nesting_level = 1;
1182 int opening_seen = 0, closing_seen = 0;
1183
1184 while (nesting_level > 0)
1185 {
1186 int c = scm_getc (port);
1187
1188 if (c == EOF)
1189 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1190 "unterminated `#| ... |#' comment", SCM_EOL);
1191
1192 if (opening_seen)
1193 {
1194 if (c == '|')
1195 nesting_level++;
1196 opening_seen = 0;
1197 }
1198 else if (closing_seen)
1199 {
1200 if (c == '#')
1201 nesting_level--;
1202 closing_seen = 0;
1203 }
1204 else if (c == '|')
1205 closing_seen = 1;
1206 else if (c == '#')
1207 opening_seen = 1;
1208 else
1209 opening_seen = closing_seen = 0;
1210 }
1211
1212 return SCM_UNSPECIFIED;
1213 }
1214
1215 static SCM
1216 scm_read_commented_expression (scm_t_wchar chr, SCM port)
1217 {
1218 scm_t_wchar c;
1219
1220 c = flush_ws (port, (char *) NULL);
1221 if (EOF == c)
1222 scm_i_input_error ("read_commented_expression", port,
1223 "no expression after #; comment", SCM_EOL);
1224 scm_ungetc (c, port);
1225 scm_read_expression (port);
1226 return SCM_UNSPECIFIED;
1227 }
1228
1229 static SCM
1230 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1231 {
1232 /* Guile's extended symbol read syntax looks like this:
1233
1234 #{This is all a symbol name}#
1235
1236 So here, CHR is expected to be `{'. */
1237 int saw_brace = 0, finished = 0;
1238 size_t len = 0;
1239 SCM buf = scm_i_make_string (1024, NULL);
1240
1241 buf = scm_i_string_start_writing (buf);
1242
1243 while ((chr = scm_getc (port)) != EOF)
1244 {
1245 if (saw_brace)
1246 {
1247 if (chr == '#')
1248 {
1249 finished = 1;
1250 break;
1251 }
1252 else
1253 {
1254 saw_brace = 0;
1255 scm_i_string_set_x (buf, len++, '}');
1256 scm_i_string_set_x (buf, len++, chr);
1257 }
1258 }
1259 else if (chr == '}')
1260 saw_brace = 1;
1261 else
1262 scm_i_string_set_x (buf, len++, chr);
1263
1264 if (len >= scm_i_string_length (buf) - 2)
1265 {
1266 SCM addy;
1267
1268 scm_i_string_stop_writing ();
1269 addy = scm_i_make_string (1024, NULL);
1270 buf = scm_string_append (scm_list_2 (buf, addy));
1271 len = 0;
1272 buf = scm_i_string_start_writing (buf);
1273 }
1274
1275 if (finished)
1276 break;
1277 }
1278 scm_i_string_stop_writing ();
1279
1280 return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1281 }
1282
1283
1284 \f
1285 /* Top-level token readers, i.e., dispatchers. */
1286
1287 static SCM
1288 scm_read_sharp_extension (int chr, SCM port)
1289 {
1290 SCM proc;
1291
1292 proc = scm_get_hash_procedure (chr);
1293 if (scm_is_true (scm_procedure_p (proc)))
1294 {
1295 long line = SCM_LINUM (port);
1296 int column = SCM_COL (port) - 2;
1297 SCM got;
1298
1299 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1300 if (!scm_is_eq (got, SCM_UNSPECIFIED))
1301 {
1302 if (SCM_RECORD_POSITIONS_P)
1303 return (recsexpr (got, line, column,
1304 SCM_FILENAME (port)));
1305 else
1306 return got;
1307 }
1308 }
1309
1310 return SCM_UNSPECIFIED;
1311 }
1312
1313 /* The reader for the sharp `#' character. It basically dispatches reads
1314 among the above token readers. */
1315 static SCM
1316 scm_read_sharp (scm_t_wchar chr, SCM port)
1317 #define FUNC_NAME "scm_lreadr"
1318 {
1319 SCM result;
1320
1321 chr = scm_getc (port);
1322
1323 result = scm_read_sharp_extension (chr, port);
1324 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1325 return result;
1326
1327 switch (chr)
1328 {
1329 case '\\':
1330 return (scm_read_character (chr, port));
1331 case '(':
1332 return (scm_read_vector (chr, port));
1333 case 's':
1334 case 'u':
1335 case 'f':
1336 /* This one may return either a boolean or an SRFI-4 vector. */
1337 return (scm_read_srfi4_vector (chr, port));
1338 case 'v':
1339 return (scm_read_bytevector (chr, port));
1340 case '*':
1341 return (scm_read_guile_bit_vector (chr, port));
1342 case 't':
1343 case 'T':
1344 case 'F':
1345 /* This one may return either a boolean or an SRFI-4 vector. */
1346 return (scm_read_boolean (chr, port));
1347 case ':':
1348 return (scm_read_keyword (chr, port));
1349 case '0': case '1': case '2': case '3': case '4':
1350 case '5': case '6': case '7': case '8': case '9':
1351 case '@':
1352 #if SCM_ENABLE_DEPRECATED
1353 /* See below for 'i' and 'e'. */
1354 case 'a':
1355 case 'c':
1356 case 'y':
1357 case 'h':
1358 case 'l':
1359 #endif
1360 return (scm_i_read_array (port, chr));
1361
1362 case 'i':
1363 case 'e':
1364 #if SCM_ENABLE_DEPRECATED
1365 {
1366 /* When next char is '(', it really is an old-style
1367 uniform array. */
1368 scm_t_wchar next_c = scm_getc (port);
1369 if (next_c != EOF)
1370 scm_ungetc (next_c, port);
1371 if (next_c == '(')
1372 return scm_i_read_array (port, chr);
1373 /* Fall through. */
1374 }
1375 #endif
1376 case 'b':
1377 case 'B':
1378 case 'o':
1379 case 'O':
1380 case 'd':
1381 case 'D':
1382 case 'x':
1383 case 'X':
1384 case 'I':
1385 case 'E':
1386 return (scm_read_number_and_radix (chr, port));
1387 case '{':
1388 return (scm_read_extended_symbol (chr, port));
1389 case '!':
1390 return (scm_read_shebang (chr, port));
1391 case ';':
1392 return (scm_read_commented_expression (chr, port));
1393 case '`':
1394 case '\'':
1395 case ',':
1396 return (scm_read_syntax (chr, port));
1397 case 'n':
1398 return (scm_read_nil (chr, port));
1399 default:
1400 result = scm_read_sharp_extension (chr, port);
1401 if (scm_is_eq (result, SCM_UNSPECIFIED))
1402 {
1403 /* To remain compatible with 1.8 and earlier, the following
1404 characters have lower precedence than `read-hash-extend'
1405 characters. */
1406 switch (chr)
1407 {
1408 case '|':
1409 return scm_read_r6rs_block_comment (chr, port);
1410 default:
1411 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1412 scm_list_1 (SCM_MAKE_CHAR (chr)));
1413 }
1414 }
1415 else
1416 return result;
1417 }
1418
1419 return SCM_UNSPECIFIED;
1420 }
1421 #undef FUNC_NAME
1422
1423 static SCM
1424 scm_read_expression (SCM port)
1425 #define FUNC_NAME "scm_read_expression"
1426 {
1427 while (1)
1428 {
1429 register scm_t_wchar chr;
1430
1431 chr = scm_getc (port);
1432
1433 switch (chr)
1434 {
1435 case SCM_WHITE_SPACES:
1436 case SCM_LINE_INCREMENTORS:
1437 break;
1438 case ';':
1439 (void) scm_read_semicolon_comment (chr, port);
1440 break;
1441 case '[':
1442 if (!SCM_SQUARE_BRACKETS_P)
1443 return (scm_read_mixed_case_symbol (chr, port));
1444 /* otherwise fall through */
1445 case '(':
1446 return (scm_read_sexp (chr, port));
1447 case '"':
1448 return (scm_read_string (chr, port));
1449 case '\'':
1450 case '`':
1451 case ',':
1452 return (scm_read_quote (chr, port));
1453 case '#':
1454 {
1455 SCM result;
1456 result = scm_read_sharp (chr, port);
1457 if (scm_is_eq (result, SCM_UNSPECIFIED))
1458 /* We read a comment or some such. */
1459 break;
1460 else
1461 return result;
1462 }
1463 case ')':
1464 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1465 break;
1466 case ']':
1467 if (SCM_SQUARE_BRACKETS_P)
1468 scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1469 /* otherwise fall through */
1470 case EOF:
1471 return SCM_EOF_VAL;
1472 case ':':
1473 if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1474 return scm_symbol_to_keyword (scm_read_expression (port));
1475 /* Fall through. */
1476
1477 default:
1478 {
1479 if (((chr >= '0') && (chr <= '9'))
1480 || (strchr ("+-.", chr)))
1481 return (scm_read_number (chr, port));
1482 else
1483 return (scm_read_mixed_case_symbol (chr, port));
1484 }
1485 }
1486 }
1487 }
1488 #undef FUNC_NAME
1489
1490 \f
1491 /* Actual reader. */
1492
1493 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1494 (SCM port),
1495 "Read an s-expression from the input port @var{port}, or from\n"
1496 "the current input port if @var{port} is not specified.\n"
1497 "Any whitespace before the next token is discarded.")
1498 #define FUNC_NAME s_scm_read
1499 {
1500 int c;
1501
1502 if (SCM_UNBNDP (port))
1503 port = scm_current_input_port ();
1504 SCM_VALIDATE_OPINPORT (1, port);
1505
1506 c = flush_ws (port, (char *) NULL);
1507 if (EOF == c)
1508 return SCM_EOF_VAL;
1509 scm_ungetc (c, port);
1510
1511 return (scm_read_expression (port));
1512 }
1513 #undef FUNC_NAME
1514
1515
1516 \f
1517
1518 /* Used when recording expressions constructed by `scm_read_sharp ()'. */
1519 static SCM
1520 recsexpr (SCM obj, long line, int column, SCM filename)
1521 {
1522 if (!scm_is_pair(obj)) {
1523 return obj;
1524 } else {
1525 SCM tmp = obj, copy;
1526 /* If this sexpr is visible in the read:sharp source, we want to
1527 keep that information, so only record non-constant cons cells
1528 which haven't previously been read by the reader. */
1529 if (scm_is_false (scm_whash_lookup (scm_source_whash, obj)))
1530 {
1531 if (SCM_COPY_SOURCE_P)
1532 {
1533 copy = scm_cons (recsexpr (SCM_CAR (obj), line, column, filename),
1534 SCM_UNDEFINED);
1535 while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1536 {
1537 SCM_SETCDR (copy, scm_cons (recsexpr (SCM_CAR (tmp),
1538 line,
1539 column,
1540 filename),
1541 SCM_UNDEFINED));
1542 copy = SCM_CDR (copy);
1543 }
1544 SCM_SETCDR (copy, tmp);
1545 }
1546 else
1547 {
1548 recsexpr (SCM_CAR (obj), line, column, filename);
1549 while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1550 recsexpr (SCM_CAR (tmp), line, column, filename);
1551 copy = SCM_UNDEFINED;
1552 }
1553 scm_whash_insert (scm_source_whash,
1554 obj,
1555 scm_make_srcprops (line,
1556 column,
1557 filename,
1558 copy,
1559 SCM_EOL));
1560 }
1561 return obj;
1562 }
1563 }
1564
1565 /* Manipulate the read-hash-procedures alist. This could be written in
1566 Scheme, but maybe it will also be used by C code during initialisation. */
1567 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1568 (SCM chr, SCM proc),
1569 "Install the procedure @var{proc} for reading expressions\n"
1570 "starting with the character sequence @code{#} and @var{chr}.\n"
1571 "@var{proc} will be called with two arguments: the character\n"
1572 "@var{chr} and the port to read further data from. The object\n"
1573 "returned will be the return value of @code{read}. \n"
1574 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1575 )
1576 #define FUNC_NAME s_scm_read_hash_extend
1577 {
1578 SCM this;
1579 SCM prev;
1580
1581 SCM_VALIDATE_CHAR (1, chr);
1582 SCM_ASSERT (scm_is_false (proc)
1583 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1584 proc, SCM_ARG2, FUNC_NAME);
1585
1586 /* Check if chr is already in the alist. */
1587 this = scm_i_read_hash_procedures_ref ();
1588 prev = SCM_BOOL_F;
1589 while (1)
1590 {
1591 if (scm_is_null (this))
1592 {
1593 /* not found, so add it to the beginning. */
1594 if (scm_is_true (proc))
1595 {
1596 SCM new = scm_cons (scm_cons (chr, proc),
1597 scm_i_read_hash_procedures_ref ());
1598 scm_i_read_hash_procedures_set_x (new);
1599 }
1600 break;
1601 }
1602 if (scm_is_eq (chr, SCM_CAAR (this)))
1603 {
1604 /* already in the alist. */
1605 if (scm_is_false (proc))
1606 {
1607 /* remove it. */
1608 if (scm_is_false (prev))
1609 {
1610 SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1611 scm_i_read_hash_procedures_set_x (rest);
1612 }
1613 else
1614 scm_set_cdr_x (prev, SCM_CDR (this));
1615 }
1616 else
1617 {
1618 /* replace it. */
1619 scm_set_cdr_x (SCM_CAR (this), proc);
1620 }
1621 break;
1622 }
1623 prev = this;
1624 this = SCM_CDR (this);
1625 }
1626
1627 return SCM_UNSPECIFIED;
1628 }
1629 #undef FUNC_NAME
1630
1631 /* Recover the read-hash procedure corresponding to char c. */
1632 static SCM
1633 scm_get_hash_procedure (int c)
1634 {
1635 SCM rest = scm_i_read_hash_procedures_ref ();
1636
1637 while (1)
1638 {
1639 if (scm_is_null (rest))
1640 return SCM_BOOL_F;
1641
1642 if (SCM_CHAR (SCM_CAAR (rest)) == c)
1643 return SCM_CDAR (rest);
1644
1645 rest = SCM_CDR (rest);
1646 }
1647 }
1648
1649 #define SCM_ENCODING_SEARCH_SIZE (500)
1650
1651 /* Search the first few hundred characters of a file for an Emacs-like coding
1652 declaration. Returns either NULL or a string whose storage has been
1653 allocated with `scm_gc_malloc ()'. */
1654 char *
1655 scm_i_scan_for_encoding (SCM port)
1656 {
1657 char header[SCM_ENCODING_SEARCH_SIZE+1];
1658 size_t bytes_read, encoding_length, i;
1659 char *encoding = NULL;
1660 int utf8_bom = 0;
1661 char *pos, *encoding_start;
1662 int in_comment;
1663
1664 if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1665 /* PORT is a non-seekable file port (e.g., as created by Bash when using
1666 "guile <(echo '(display "hello")')") so bail out. */
1667 return NULL;
1668
1669 bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1670 header[bytes_read] = '\0';
1671
1672 scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1673
1674 if (bytes_read > 3
1675 && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1676 utf8_bom = 1;
1677
1678 /* search past "coding[:=]" */
1679 pos = header;
1680 while (1)
1681 {
1682 if ((pos = strstr(pos, "coding")) == NULL)
1683 return NULL;
1684
1685 pos += strlen("coding");
1686 if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1687 (*pos == ':' || *pos == '='))
1688 {
1689 pos ++;
1690 break;
1691 }
1692 }
1693
1694 /* skip spaces */
1695 while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1696 (*pos == ' ' || *pos == '\t'))
1697 pos ++;
1698
1699 /* grab the next token */
1700 encoding_start = pos;
1701 i = 0;
1702 while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
1703 && encoding_start + i - header < bytes_read
1704 && (isalnum ((int) encoding_start[i])
1705 || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
1706 i++;
1707
1708 encoding_length = i;
1709 if (encoding_length == 0)
1710 return NULL;
1711
1712 encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
1713 for (i = 0; i < encoding_length; i++)
1714 encoding[i] = toupper ((int) encoding[i]);
1715
1716 /* push backwards to make sure we were in a comment */
1717 in_comment = 0;
1718 pos = encoding_start;
1719 while (pos >= header)
1720 {
1721 if (*pos == '\n')
1722 {
1723 /* This wasn't in a semicolon comment. Check for a
1724 hash-bang comment. */
1725 char *beg = strstr (header, "#!");
1726 char *end = strstr (header, "!#");
1727 if (beg < encoding_start && encoding_start + encoding_length < end)
1728 in_comment = 1;
1729 break;
1730 }
1731 if (*pos == ';')
1732 {
1733 in_comment = 1;
1734 break;
1735 }
1736 pos --;
1737 }
1738 if (!in_comment)
1739 /* This wasn't in a comment */
1740 return NULL;
1741
1742 if (utf8_bom && strcmp(encoding, "UTF-8"))
1743 scm_misc_error (NULL,
1744 "the port input declares the encoding ~s but is encoded as UTF-8",
1745 scm_list_1 (scm_from_locale_string (encoding)));
1746
1747 return encoding;
1748 }
1749
1750 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1751 (SCM port),
1752 "Scans the port for an Emacs-like character coding declaration\n"
1753 "near the top of the contents of a port with random-acessible contents.\n"
1754 "The coding declaration is of the form\n"
1755 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1756 "\n"
1757 "Returns a string containing the character encoding of the file\n"
1758 "if a declaration was found, or @code{#f} otherwise.\n")
1759 #define FUNC_NAME s_scm_file_encoding
1760 {
1761 char *enc;
1762 SCM s_enc;
1763
1764 enc = scm_i_scan_for_encoding (port);
1765 if (enc == NULL)
1766 return SCM_BOOL_F;
1767 else
1768 {
1769 s_enc = scm_from_locale_string (enc);
1770 return s_enc;
1771 }
1772
1773 return SCM_BOOL_F;
1774 }
1775 #undef FUNC_NAME
1776
1777 void
1778 scm_init_read ()
1779 {
1780 SCM read_hash_procs;
1781
1782 read_hash_procs = scm_make_fluid ();
1783 scm_fluid_set_x (read_hash_procs, SCM_EOL);
1784
1785 scm_i_read_hash_procedures =
1786 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
1787
1788 scm_init_opts (scm_read_options, scm_read_opts);
1789 #include "libguile/read.x"
1790 }
1791
1792 /*
1793 Local Variables:
1794 c-file-style: "gnu"
1795 End:
1796 */