Support for the #!r6rs lexeme.
[bpt/guile.git] / libguile / read.c
1 /* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009, 2010 Free Software
2 * Foundation, Inc.
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
18 */
19
20
21 \f
22
23 #ifdef HAVE_CONFIG_H
24 # include <config.h>
25 #endif
26
27 #include <stdio.h>
28 #include <ctype.h>
29 #include <string.h>
30 #include <unistd.h>
31 #include <unicase.h>
32
33 #include "libguile/_scm.h"
34 #include "libguile/bytevectors.h"
35 #include "libguile/chars.h"
36 #include "libguile/eval.h"
37 #include "libguile/arrays.h"
38 #include "libguile/bitvectors.h"
39 #include "libguile/keywords.h"
40 #include "libguile/alist.h"
41 #include "libguile/srcprop.h"
42 #include "libguile/hashtab.h"
43 #include "libguile/hash.h"
44 #include "libguile/ports.h"
45 #include "libguile/fports.h"
46 #include "libguile/root.h"
47 #include "libguile/strings.h"
48 #include "libguile/strports.h"
49 #include "libguile/vectors.h"
50 #include "libguile/validate.h"
51 #include "libguile/srfi-4.h"
52 #include "libguile/srfi-13.h"
53
54 #include "libguile/read.h"
55 #include "libguile/private-options.h"
56
57
58 \f
59
60 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
61 SCM_SYMBOL (scm_keyword_prefix, "prefix");
62 SCM_SYMBOL (scm_keyword_postfix, "postfix");
63 SCM_SYMBOL (sym_nil, "nil");
64
65 scm_t_option scm_read_opts[] = {
66 { SCM_OPTION_BOOLEAN, "copy", 0,
67 "Copy source code expressions." },
68 { SCM_OPTION_BOOLEAN, "positions", 0,
69 "Record positions of source code expressions." },
70 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
71 "Convert symbols to lower case."},
72 { SCM_OPTION_SCM, "keywords", (unsigned long) SCM_BOOL_F,
73 "Style of keyword recognition: #f, 'prefix or 'postfix."},
74 { SCM_OPTION_BOOLEAN, "elisp-vectors", 0,
75 "Support Elisp vector syntax, namely `[...]'."},
76 { SCM_OPTION_BOOLEAN, "elisp-strings", 0,
77 "Support `\\(' and `\\)' in strings."},
78 { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
79 "Use R6RS variable-length character and string hex escapes."},
80 { SCM_OPTION_BOOLEAN, "square-brackets", 1,
81 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
82 { 0, },
83 };
84
85 /*
86 Give meaningful error messages for errors
87
88 We use the format
89
90 FILE:LINE:COL: MESSAGE
91 This happened in ....
92
93 This is not standard GNU format, but the test-suite likes the real
94 message to be in front.
95
96 */
97
98
99 void
100 scm_i_input_error (char const *function,
101 SCM port, const char *message, SCM arg)
102 {
103 SCM fn = (scm_is_string (SCM_FILENAME(port))
104 ? SCM_FILENAME(port)
105 : scm_from_locale_string ("#<unknown port>"));
106
107 SCM string_port = scm_open_output_string ();
108 SCM string = SCM_EOL;
109 scm_simple_format (string_port,
110 scm_from_locale_string ("~A:~S:~S: ~A"),
111 scm_list_4 (fn,
112 scm_from_long (SCM_LINUM (port) + 1),
113 scm_from_int (SCM_COL (port) + 1),
114 scm_from_locale_string (message)));
115
116 string = scm_get_output_string (string_port);
117 scm_close_output_port (string_port);
118 scm_error_scm (scm_from_locale_symbol ("read-error"),
119 function? scm_from_locale_string (function) : SCM_BOOL_F,
120 string,
121 arg,
122 SCM_BOOL_F);
123 }
124
125
126 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
127 (SCM setting),
128 "Option interface for the read options. Instead of using\n"
129 "this procedure directly, use the procedures @code{read-enable},\n"
130 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
131 #define FUNC_NAME s_scm_read_options
132 {
133 SCM ans = scm_options (setting,
134 scm_read_opts,
135 FUNC_NAME);
136 if (SCM_COPY_SOURCE_P)
137 SCM_RECORD_POSITIONS_P = 1;
138 return ans;
139 }
140 #undef FUNC_NAME
141
142 /* An association list mapping extra hash characters to procedures. */
143 static SCM *scm_read_hash_procedures;
144
145
146 \f
147 /* Token readers. */
148
149
150 /* Size of the C buffer used to read symbols and numbers. */
151 #define READER_BUFFER_SIZE 128
152
153 /* Size of the C buffer used to read strings. */
154 #define READER_STRING_BUFFER_SIZE 512
155
156 /* The maximum size of Scheme character names. */
157 #define READER_CHAR_NAME_MAX_SIZE 50
158
159
160 /* `isblank' is only in C99. */
161 #define CHAR_IS_BLANK_(_chr) \
162 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
163 || ((_chr) == '\f') || ((_chr) == '\r'))
164
165 #ifdef MSDOS
166 # define CHAR_IS_BLANK(_chr) \
167 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
168 #else
169 # define CHAR_IS_BLANK CHAR_IS_BLANK_
170 #endif
171
172
173 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
174 structure''). */
175 #define CHAR_IS_R5RS_DELIMITER(c) \
176 (CHAR_IS_BLANK (c) \
177 || (c == ')') || (c == '(') || (c == ';') || (c == '"') \
178 || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
179
180 #define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
181
182 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
183 Structure''. */
184 #define CHAR_IS_EXPONENT_MARKER(_chr) \
185 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
186 || ((_chr) == 'd') || ((_chr) == 'l'))
187
188 /* Read an SCSH block comment. */
189 static inline SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
190 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
191 static SCM scm_read_commented_expression (scm_t_wchar, SCM);
192 static SCM scm_read_shebang (scm_t_wchar, SCM);
193 static SCM scm_get_hash_procedure (int);
194
195 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
196 result in the pre-allocated buffer BUF. Return zero if the whole token has
197 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
198 bytes actually read. */
199 static inline int
200 read_token (SCM port, char *buf, const size_t buf_size, size_t *read)
201 {
202 *read = 0;
203
204 while (*read < buf_size)
205 {
206 int chr;
207
208 chr = scm_get_byte_or_eof (port);
209
210 if (chr == EOF)
211 return 0;
212 else if (CHAR_IS_DELIMITER (chr))
213 {
214 scm_unget_byte (chr, port);
215 return 0;
216 }
217 else
218 {
219 *buf = (char) chr;
220 buf++, (*read)++;
221 }
222 }
223
224 return 1;
225 }
226
227 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
228 result in the pre-allocated buffer BUFFER, if the whole token has fewer than
229 BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
230 caller. Return zero if the token fits in BUFFER, non-zero otherwise. READ
231 will be set the number of bytes actually read. */
232 static int
233 read_complete_token (SCM port, char *buffer, const size_t buffer_size,
234 char **overflow_buffer, size_t *read)
235 {
236 int overflow = 0;
237 size_t bytes_read, overflow_size;
238
239 *overflow_buffer = NULL;
240 overflow_size = 0;
241
242 do
243 {
244 overflow = read_token (port, buffer, buffer_size, &bytes_read);
245 if (bytes_read == 0)
246 break;
247 if (overflow || overflow_size != 0)
248 {
249 if (overflow_size == 0)
250 {
251 *overflow_buffer = scm_malloc (bytes_read);
252 memcpy (*overflow_buffer, buffer, bytes_read);
253 overflow_size = bytes_read;
254 }
255 else
256 {
257 *overflow_buffer = scm_realloc (*overflow_buffer, overflow_size + bytes_read);
258 memcpy (*overflow_buffer + overflow_size, buffer, bytes_read);
259 overflow_size += bytes_read;
260 }
261 }
262 }
263 while (overflow);
264
265 if (overflow_size)
266 *read = overflow_size;
267 else
268 *read = bytes_read;
269
270 return (overflow_size != 0);
271 }
272
273 /* Skip whitespace from PORT and return the first non-whitespace character
274 read. Raise an error on end-of-file. */
275 static int
276 flush_ws (SCM port, const char *eoferr)
277 {
278 register scm_t_wchar c;
279 while (1)
280 switch (c = scm_getc (port))
281 {
282 case EOF:
283 goteof:
284 if (eoferr)
285 {
286 scm_i_input_error (eoferr,
287 port,
288 "end of file",
289 SCM_EOL);
290 }
291 return c;
292
293 case ';':
294 lp:
295 switch (c = scm_getc (port))
296 {
297 case EOF:
298 goto goteof;
299 default:
300 goto lp;
301 case SCM_LINE_INCREMENTORS:
302 break;
303 }
304 break;
305
306 case '#':
307 switch (c = scm_getc (port))
308 {
309 case EOF:
310 eoferr = "read_sharp";
311 goto goteof;
312 case '!':
313 scm_read_shebang (c, port);
314 break;
315 case ';':
316 scm_read_commented_expression (c, port);
317 break;
318 case '|':
319 if (scm_is_false (scm_get_hash_procedure (c)))
320 {
321 scm_read_r6rs_block_comment (c, port);
322 break;
323 }
324 /* fall through */
325 default:
326 scm_ungetc (c, port);
327 return '#';
328 }
329 break;
330
331 case SCM_LINE_INCREMENTORS:
332 case SCM_SINGLE_SPACES:
333 case '\t':
334 break;
335
336 default:
337 return c;
338 }
339
340 return 0;
341 }
342
343
344 \f
345 /* Token readers. */
346
347 static SCM scm_read_expression (SCM port);
348 static SCM scm_read_sharp (int chr, SCM port);
349 static SCM recsexpr (SCM obj, long line, int column, SCM filename);
350
351
352 static SCM
353 scm_read_sexp (scm_t_wchar chr, SCM port)
354 #define FUNC_NAME "scm_i_lreadparen"
355 {
356 register int c;
357 register SCM tmp;
358 register SCM tl, ans = SCM_EOL;
359 SCM tl2 = SCM_EOL, ans2 = SCM_EOL, copy = SCM_BOOL_F;
360 const int terminating_char = ((chr == '[') ? ']' : ')');
361
362 /* Need to capture line and column numbers here. */
363 long line = SCM_LINUM (port);
364 int column = SCM_COL (port) - 1;
365
366
367 c = flush_ws (port, FUNC_NAME);
368 if (terminating_char == c)
369 return SCM_EOL;
370
371 scm_ungetc (c, port);
372 if (scm_is_eq (scm_sym_dot,
373 (tmp = scm_read_expression (port))))
374 {
375 ans = scm_read_expression (port);
376 if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
377 scm_i_input_error (FUNC_NAME, port, "missing close paren",
378 SCM_EOL);
379 return ans;
380 }
381
382 /* Build the head of the list structure. */
383 ans = tl = scm_cons (tmp, SCM_EOL);
384
385 if (SCM_COPY_SOURCE_P)
386 ans2 = tl2 = scm_cons (scm_is_pair (tmp)
387 ? copy
388 : tmp,
389 SCM_EOL);
390
391 while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
392 {
393 SCM new_tail;
394
395 scm_ungetc (c, port);
396 if (scm_is_eq (scm_sym_dot,
397 (tmp = scm_read_expression (port))))
398 {
399 SCM_SETCDR (tl, tmp = scm_read_expression (port));
400
401 if (SCM_COPY_SOURCE_P)
402 SCM_SETCDR (tl2, scm_cons (scm_is_pair (tmp) ? copy : tmp,
403 SCM_EOL));
404
405 c = flush_ws (port, FUNC_NAME);
406 if (terminating_char != c)
407 scm_i_input_error (FUNC_NAME, port,
408 "in pair: missing close paren", SCM_EOL);
409 goto exit;
410 }
411
412 new_tail = scm_cons (tmp, SCM_EOL);
413 SCM_SETCDR (tl, new_tail);
414 tl = new_tail;
415
416 if (SCM_COPY_SOURCE_P)
417 {
418 SCM new_tail2 = scm_cons (scm_is_pair (tmp)
419 ? copy
420 : tmp, SCM_EOL);
421 SCM_SETCDR (tl2, new_tail2);
422 tl2 = new_tail2;
423 }
424 }
425
426 exit:
427 if (SCM_RECORD_POSITIONS_P)
428 scm_whash_insert (scm_source_whash,
429 ans,
430 scm_make_srcprops (line, column,
431 SCM_FILENAME (port),
432 SCM_COPY_SOURCE_P
433 ? ans2
434 : SCM_UNDEFINED,
435 SCM_EOL));
436 return ans;
437 }
438 #undef FUNC_NAME
439
440
441 /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
442 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
443 found. */
444 #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
445 do \
446 { \
447 scm_t_wchar a; \
448 size_t i = 0; \
449 c = 0; \
450 while (i < ndigits) \
451 { \
452 a = scm_getc (port); \
453 if (a == EOF) \
454 goto str_eof; \
455 if (terminator \
456 && (a == (scm_t_wchar) terminator) \
457 && (i > 0)) \
458 break; \
459 if ('0' <= a && a <= '9') \
460 a -= '0'; \
461 else if ('A' <= a && a <= 'F') \
462 a = a - 'A' + 10; \
463 else if ('a' <= a && a <= 'f') \
464 a = a - 'a' + 10; \
465 else \
466 { \
467 c = a; \
468 goto bad_escaped; \
469 } \
470 c = c * 16 + a; \
471 i ++; \
472 } \
473 } while (0)
474
475 static SCM
476 scm_read_string (int chr, SCM port)
477 #define FUNC_NAME "scm_lreadr"
478 {
479 /* For strings smaller than C_STR, this function creates only one Scheme
480 object (the string returned). */
481
482 SCM str = SCM_BOOL_F;
483 unsigned c_str_len = 0;
484 scm_t_wchar c;
485
486 str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
487 while ('"' != (c = scm_getc (port)))
488 {
489 if (c == EOF)
490 {
491 str_eof:
492 scm_i_input_error (FUNC_NAME, port,
493 "end of file in string constant", SCM_EOL);
494 }
495
496 if (c_str_len + 1 >= scm_i_string_length (str))
497 {
498 SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
499
500 str = scm_string_append (scm_list_2 (str, addy));
501 }
502
503 if (c == '\\')
504 {
505 switch (c = scm_getc (port))
506 {
507 case EOF:
508 goto str_eof;
509 case '"':
510 case '\\':
511 break;
512 case '(':
513 case ')':
514 if (SCM_ESCAPED_PARENS_P)
515 break;
516 goto bad_escaped;
517 case '\n':
518 continue;
519 case '0':
520 c = '\0';
521 break;
522 case 'f':
523 c = '\f';
524 break;
525 case 'n':
526 c = '\n';
527 break;
528 case 'r':
529 c = '\r';
530 break;
531 case 't':
532 c = '\t';
533 break;
534 case 'a':
535 c = '\007';
536 break;
537 case 'v':
538 c = '\v';
539 break;
540 case 'b':
541 c = '\010';
542 break;
543 case 'x':
544 if (SCM_R6RS_ESCAPES_P)
545 SCM_READ_HEX_ESCAPE (10, ';');
546 else
547 SCM_READ_HEX_ESCAPE (2, '\0');
548 break;
549 case 'u':
550 if (!SCM_R6RS_ESCAPES_P)
551 {
552 SCM_READ_HEX_ESCAPE (4, '\0');
553 break;
554 }
555 case 'U':
556 if (!SCM_R6RS_ESCAPES_P)
557 {
558 SCM_READ_HEX_ESCAPE (6, '\0');
559 break;
560 }
561 default:
562 bad_escaped:
563 scm_i_input_error (FUNC_NAME, port,
564 "illegal character in escape sequence: ~S",
565 scm_list_1 (SCM_MAKE_CHAR (c)));
566 }
567 }
568 str = scm_i_string_start_writing (str);
569 scm_i_string_set_x (str, c_str_len++, c);
570 scm_i_string_stop_writing ();
571 }
572
573 if (c_str_len > 0)
574 {
575 return scm_i_substring_copy (str, 0, c_str_len);
576 }
577
578 return scm_nullstr;
579 }
580 #undef FUNC_NAME
581
582
583 static SCM
584 scm_read_number (scm_t_wchar chr, SCM port)
585 {
586 SCM result, str = SCM_EOL;
587 char buffer[READER_BUFFER_SIZE];
588 char *overflow_buffer = NULL;
589 size_t bytes_read;
590 int overflow;
591 scm_t_port *pt = SCM_PTAB_ENTRY (port);
592
593 scm_ungetc (chr, port);
594 overflow = read_complete_token (port, buffer, sizeof (buffer),
595 &overflow_buffer, &bytes_read);
596
597 if (!overflow)
598 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
599 else
600 str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
601 pt->ilseq_handler);
602
603 result = scm_string_to_number (str, SCM_UNDEFINED);
604 if (!scm_is_true (result))
605 {
606 /* Return a symbol instead of a number */
607 if (SCM_CASE_INSENSITIVE_P)
608 str = scm_string_downcase_x (str);
609 result = scm_string_to_symbol (str);
610 }
611
612 if (overflow)
613 free (overflow_buffer);
614 SCM_COL (port) += scm_i_string_length (str);
615 return result;
616 }
617
618 static SCM
619 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
620 {
621 SCM result;
622 int ends_with_colon = 0;
623 size_t bytes_read;
624 int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
625 int overflow;
626 char buffer[READER_BUFFER_SIZE], *overflow_buffer;
627 scm_t_port *pt = SCM_PTAB_ENTRY (port);
628 SCM str;
629
630 scm_ungetc (chr, port);
631 overflow = read_complete_token (port, buffer, READER_BUFFER_SIZE,
632 &overflow_buffer, &bytes_read);
633 if (bytes_read > 0)
634 {
635 if (!overflow)
636 ends_with_colon = buffer[bytes_read - 1] == ':';
637 else
638 ends_with_colon = overflow_buffer[bytes_read - 1] == ':';
639 }
640
641 if (postfix && ends_with_colon && (bytes_read > 1))
642 {
643 if (!overflow)
644 str = scm_from_stringn (buffer, bytes_read - 1, pt->encoding, pt->ilseq_handler);
645 else
646 str = scm_from_stringn (overflow_buffer, bytes_read - 1, pt->encoding,
647 pt->ilseq_handler);
648
649 if (SCM_CASE_INSENSITIVE_P)
650 str = scm_string_downcase_x (str);
651 result = scm_symbol_to_keyword (scm_string_to_symbol (str));
652 }
653 else
654 {
655 if (!overflow)
656 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
657 else
658 str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
659 pt->ilseq_handler);
660
661 if (SCM_CASE_INSENSITIVE_P)
662 str = scm_string_downcase_x (str);
663 result = scm_string_to_symbol (str);
664 }
665
666 if (overflow)
667 free (overflow_buffer);
668 SCM_COL (port) += scm_i_string_length (str);
669 return result;
670 }
671
672 static SCM
673 scm_read_number_and_radix (scm_t_wchar chr, SCM port)
674 #define FUNC_NAME "scm_lreadr"
675 {
676 SCM result;
677 size_t read;
678 char buffer[READER_BUFFER_SIZE], *overflow_buffer;
679 int overflow;
680 unsigned int radix;
681 SCM str;
682 scm_t_port *pt;
683
684 switch (chr)
685 {
686 case 'B':
687 case 'b':
688 radix = 2;
689 break;
690
691 case 'o':
692 case 'O':
693 radix = 8;
694 break;
695
696 case 'd':
697 case 'D':
698 radix = 10;
699 break;
700
701 case 'x':
702 case 'X':
703 radix = 16;
704 break;
705
706 default:
707 scm_ungetc (chr, port);
708 scm_ungetc ('#', port);
709 radix = 10;
710 }
711
712 overflow = read_complete_token (port, buffer, sizeof (buffer),
713 &overflow_buffer, &read);
714
715 pt = SCM_PTAB_ENTRY (port);
716 if (!overflow)
717 str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
718 else
719 str = scm_from_stringn (overflow_buffer, read, pt->encoding,
720 pt->ilseq_handler);
721
722 result = scm_string_to_number (str, scm_from_uint (radix));
723
724 if (overflow)
725 free (overflow_buffer);
726
727 SCM_COL (port) += scm_i_string_length (str);
728
729 if (scm_is_true (result))
730 return result;
731
732 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
733
734 return SCM_BOOL_F;
735 }
736 #undef FUNC_NAME
737
738 static SCM
739 scm_read_quote (int chr, SCM port)
740 {
741 SCM p;
742 long line = SCM_LINUM (port);
743 int column = SCM_COL (port) - 1;
744
745 switch (chr)
746 {
747 case '`':
748 p = scm_sym_quasiquote;
749 break;
750
751 case '\'':
752 p = scm_sym_quote;
753 break;
754
755 case ',':
756 {
757 scm_t_wchar c;
758
759 c = scm_getc (port);
760 if ('@' == c)
761 p = scm_sym_uq_splicing;
762 else
763 {
764 scm_ungetc (c, port);
765 p = scm_sym_unquote;
766 }
767 break;
768 }
769
770 default:
771 fprintf (stderr, "%s: unhandled quote character (%i)\n",
772 "scm_read_quote", chr);
773 abort ();
774 }
775
776 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
777 if (SCM_RECORD_POSITIONS_P)
778 scm_whash_insert (scm_source_whash, p,
779 scm_make_srcprops (line, column,
780 SCM_FILENAME (port),
781 SCM_COPY_SOURCE_P
782 ? (scm_cons2 (SCM_CAR (p),
783 SCM_CAR (SCM_CDR (p)),
784 SCM_EOL))
785 : SCM_UNDEFINED,
786 SCM_EOL));
787
788
789 return p;
790 }
791
792 SCM_SYMBOL (sym_syntax, "syntax");
793 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
794 SCM_SYMBOL (sym_unsyntax, "unsyntax");
795 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
796
797 static SCM
798 scm_read_syntax (int chr, SCM port)
799 {
800 SCM p;
801 long line = SCM_LINUM (port);
802 int column = SCM_COL (port) - 1;
803
804 switch (chr)
805 {
806 case '`':
807 p = sym_quasisyntax;
808 break;
809
810 case '\'':
811 p = sym_syntax;
812 break;
813
814 case ',':
815 {
816 int c;
817
818 c = scm_getc (port);
819 if ('@' == c)
820 p = sym_unsyntax_splicing;
821 else
822 {
823 scm_ungetc (c, port);
824 p = sym_unsyntax;
825 }
826 break;
827 }
828
829 default:
830 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
831 "scm_read_syntax", chr);
832 abort ();
833 }
834
835 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
836 if (SCM_RECORD_POSITIONS_P)
837 scm_whash_insert (scm_source_whash, p,
838 scm_make_srcprops (line, column,
839 SCM_FILENAME (port),
840 SCM_COPY_SOURCE_P
841 ? (scm_cons2 (SCM_CAR (p),
842 SCM_CAR (SCM_CDR (p)),
843 SCM_EOL))
844 : SCM_UNDEFINED,
845 SCM_EOL));
846
847
848 return p;
849 }
850
851 static inline SCM
852 scm_read_nil (int chr, SCM port)
853 {
854 SCM id = scm_read_mixed_case_symbol (chr, port);
855
856 if (!scm_is_eq (id, sym_nil))
857 scm_i_input_error ("scm_read_nil", port,
858 "unexpected input while reading #nil: ~a",
859 scm_list_1 (id));
860
861 return SCM_ELISP_NIL;
862 }
863
864 static inline SCM
865 scm_read_semicolon_comment (int chr, SCM port)
866 {
867 int c;
868
869 /* We use the get_byte here because there is no need to get the
870 locale correct with comment input. This presumes that newline
871 always represents itself no matter what the encoding is. */
872 for (c = scm_get_byte_or_eof (port);
873 (c != EOF) && (c != '\n');
874 c = scm_get_byte_or_eof (port));
875
876 return SCM_UNSPECIFIED;
877 }
878
879 \f
880 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
881
882 static SCM
883 scm_read_boolean (int chr, SCM port)
884 {
885 switch (chr)
886 {
887 case 't':
888 case 'T':
889 return SCM_BOOL_T;
890
891 case 'f':
892 case 'F':
893 return SCM_BOOL_F;
894 }
895
896 return SCM_UNSPECIFIED;
897 }
898
899 static SCM
900 scm_read_character (scm_t_wchar chr, SCM port)
901 #define FUNC_NAME "scm_lreadr"
902 {
903 char buffer[READER_CHAR_NAME_MAX_SIZE];
904 SCM charname;
905 size_t charname_len, bytes_read;
906 scm_t_wchar cp;
907 int overflow;
908 scm_t_port *pt;
909
910 overflow = read_token (port, buffer, READER_CHAR_NAME_MAX_SIZE, &bytes_read);
911 if (overflow)
912 goto char_error;
913
914 if (bytes_read == 0)
915 {
916 chr = scm_getc (port);
917 if (chr == EOF)
918 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
919 "while reading character", SCM_EOL);
920
921 /* CHR must be a token delimiter, like a whitespace. */
922 return (SCM_MAKE_CHAR (chr));
923 }
924
925 pt = SCM_PTAB_ENTRY (port);
926
927 /* Simple ASCII characters can be processed immediately. Also, simple
928 ISO-8859-1 characters can be processed immediately if the encoding for this
929 port is ISO-8859-1. */
930 if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
931 {
932 SCM_COL (port) += 1;
933 return SCM_MAKE_CHAR (buffer[0]);
934 }
935
936 /* Otherwise, convert the buffer into a proper scheme string for
937 processing. */
938 charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
939 pt->ilseq_handler);
940 charname_len = scm_i_string_length (charname);
941 SCM_COL (port) += charname_len;
942 cp = scm_i_string_ref (charname, 0);
943 if (charname_len == 1)
944 return SCM_MAKE_CHAR (cp);
945
946 /* Ignore dotted circles, which may be used to keep combining characters from
947 combining with the backslash in #\charname. */
948 if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
949 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
950
951 if (cp >= '0' && cp < '8')
952 {
953 /* Dirk:FIXME:: This type of character syntax is not R5RS
954 * compliant. Further, it should be verified that the constant
955 * does only consist of octal digits. */
956 SCM p = scm_string_to_number (charname, scm_from_uint (8));
957 if (SCM_I_INUMP (p))
958 {
959 scm_t_wchar c = SCM_I_INUM (p);
960 if (SCM_IS_UNICODE_CHAR (c))
961 return SCM_MAKE_CHAR (c);
962 else
963 scm_i_input_error (FUNC_NAME, port,
964 "out-of-range octal character escape: ~a",
965 scm_list_1 (charname));
966 }
967 }
968
969 if (cp == 'x' && (charname_len > 1) && SCM_R6RS_ESCAPES_P)
970 {
971 SCM p;
972
973 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
974 p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
975 scm_from_uint (16));
976 if (SCM_I_INUMP (p))
977 {
978 scm_t_wchar c = SCM_I_INUM (p);
979 if (SCM_IS_UNICODE_CHAR (c))
980 return SCM_MAKE_CHAR (c);
981 else
982 scm_i_input_error (FUNC_NAME, port,
983 "out-of-range hex character escape: ~a",
984 scm_list_1 (charname));
985 }
986 }
987
988 /* The names of characters should never have non-Latin1
989 characters. */
990 if (scm_i_is_narrow_string (charname)
991 || scm_i_try_narrow_string (charname))
992 { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
993 charname_len);
994 if (scm_is_true (ch))
995 return ch;
996 }
997
998 char_error:
999 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
1000 scm_list_1 (charname));
1001
1002 return SCM_UNSPECIFIED;
1003 }
1004 #undef FUNC_NAME
1005
1006 static inline SCM
1007 scm_read_keyword (int chr, SCM port)
1008 {
1009 SCM symbol;
1010
1011 /* Read the symbol that comprises the keyword. Doing this instead of
1012 invoking a specific symbol reader function allows `scm_read_keyword ()'
1013 to adapt to the delimiters currently valid of symbols.
1014
1015 XXX: This implementation allows sloppy syntaxes like `#: key'. */
1016 symbol = scm_read_expression (port);
1017 if (!scm_is_symbol (symbol))
1018 scm_i_input_error ("scm_read_keyword", port,
1019 "keyword prefix `~a' not followed by a symbol: ~s",
1020 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1021
1022 return (scm_symbol_to_keyword (symbol));
1023 }
1024
1025 static inline SCM
1026 scm_read_vector (int chr, SCM port)
1027 {
1028 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1029 guarantee that it's going to do what we want. After all, this is an
1030 implementation detail of `scm_read_vector ()', not a desirable
1031 property. */
1032 return (scm_vector (scm_read_sexp (chr, port)));
1033 }
1034
1035 static inline SCM
1036 scm_read_srfi4_vector (int chr, SCM port)
1037 {
1038 return scm_i_read_array (port, chr);
1039 }
1040
1041 static SCM
1042 scm_read_bytevector (scm_t_wchar chr, SCM port)
1043 {
1044 chr = scm_getc (port);
1045 if (chr != 'u')
1046 goto syntax;
1047
1048 chr = scm_getc (port);
1049 if (chr != '8')
1050 goto syntax;
1051
1052 chr = scm_getc (port);
1053 if (chr != '(')
1054 goto syntax;
1055
1056 return scm_u8_list_to_bytevector (scm_read_sexp (chr, port));
1057
1058 syntax:
1059 scm_i_input_error ("read_bytevector", port,
1060 "invalid bytevector prefix",
1061 SCM_MAKE_CHAR (chr));
1062 return SCM_UNSPECIFIED;
1063 }
1064
1065 static SCM
1066 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port)
1067 {
1068 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1069 terribly inefficient but who cares? */
1070 SCM s_bits = SCM_EOL;
1071
1072 for (chr = scm_getc (port);
1073 (chr != EOF) && ((chr == '0') || (chr == '1'));
1074 chr = scm_getc (port))
1075 {
1076 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1077 }
1078
1079 if (chr != EOF)
1080 scm_ungetc (chr, port);
1081
1082 return scm_bitvector (scm_reverse_x (s_bits, SCM_EOL));
1083 }
1084
1085 static inline SCM
1086 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1087 {
1088 int bang_seen = 0;
1089
1090 /* We can use the get_byte here because there is no need to get the
1091 locale correct when reading comments. This presumes that
1092 hash and exclamation points always represent themselves no
1093 matter what the source encoding is.*/
1094 for (;;)
1095 {
1096 int c = scm_get_byte_or_eof (port);
1097
1098 if (c == EOF)
1099 scm_i_input_error ("skip_block_comment", port,
1100 "unterminated `#! ... !#' comment", SCM_EOL);
1101
1102 if (c == '!')
1103 bang_seen = 1;
1104 else if (c == '#' && bang_seen)
1105 break;
1106 else
1107 bang_seen = 0;
1108 }
1109
1110 return SCM_UNSPECIFIED;
1111 }
1112
1113 static inline SCM
1114 scm_read_shebang (scm_t_wchar chr, SCM port)
1115 {
1116 int c = 0;
1117 if ((c = scm_get_byte_or_eof (port)) != 'r')
1118 {
1119 scm_ungetc (c, port);
1120 return scm_read_scsh_block_comment (chr, port);
1121 }
1122 if ((c = scm_get_byte_or_eof (port)) != '6')
1123 {
1124 scm_ungetc (c, port);
1125 scm_ungetc ('r', port);
1126 return scm_read_scsh_block_comment (chr, port);
1127 }
1128 if ((c = scm_get_byte_or_eof (port)) != 'r')
1129 {
1130 scm_ungetc (c, port);
1131 scm_ungetc ('6', port);
1132 scm_ungetc ('r', port);
1133 return scm_read_scsh_block_comment (chr, port);
1134 }
1135 if ((c = scm_get_byte_or_eof (port)) != 's')
1136 {
1137 scm_ungetc (c, port);
1138 scm_ungetc ('r', port);
1139 scm_ungetc ('6', port);
1140 scm_ungetc ('r', port);
1141 return scm_read_scsh_block_comment (chr, port);
1142 }
1143
1144 return SCM_UNSPECIFIED;
1145 }
1146
1147 static SCM
1148 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1149 {
1150 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1151 nested. So care must be taken. */
1152 int nesting_level = 1;
1153 int opening_seen = 0, closing_seen = 0;
1154
1155 while (nesting_level > 0)
1156 {
1157 int c = scm_getc (port);
1158
1159 if (c == EOF)
1160 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1161 "unterminated `#| ... |#' comment", SCM_EOL);
1162
1163 if (opening_seen)
1164 {
1165 if (c == '|')
1166 nesting_level++;
1167 opening_seen = 0;
1168 }
1169 else if (closing_seen)
1170 {
1171 if (c == '#')
1172 nesting_level--;
1173 closing_seen = 0;
1174 }
1175 else if (c == '|')
1176 closing_seen = 1;
1177 else if (c == '#')
1178 opening_seen = 1;
1179 else
1180 opening_seen = closing_seen = 0;
1181 }
1182
1183 return SCM_UNSPECIFIED;
1184 }
1185
1186 static SCM
1187 scm_read_commented_expression (scm_t_wchar chr, SCM port)
1188 {
1189 scm_t_wchar c;
1190
1191 c = flush_ws (port, (char *) NULL);
1192 if (EOF == c)
1193 scm_i_input_error ("read_commented_expression", port,
1194 "no expression after #; comment", SCM_EOL);
1195 scm_ungetc (c, port);
1196 scm_read_expression (port);
1197 return SCM_UNSPECIFIED;
1198 }
1199
1200 static SCM
1201 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1202 {
1203 /* Guile's extended symbol read syntax looks like this:
1204
1205 #{This is all a symbol name}#
1206
1207 So here, CHR is expected to be `{'. */
1208 int saw_brace = 0, finished = 0;
1209 size_t len = 0;
1210 SCM buf = scm_i_make_string (1024, NULL);
1211
1212 buf = scm_i_string_start_writing (buf);
1213
1214 while ((chr = scm_getc (port)) != EOF)
1215 {
1216 if (saw_brace)
1217 {
1218 if (chr == '#')
1219 {
1220 finished = 1;
1221 break;
1222 }
1223 else
1224 {
1225 saw_brace = 0;
1226 scm_i_string_set_x (buf, len++, '}');
1227 scm_i_string_set_x (buf, len++, chr);
1228 }
1229 }
1230 else if (chr == '}')
1231 saw_brace = 1;
1232 else
1233 scm_i_string_set_x (buf, len++, chr);
1234
1235 if (len >= scm_i_string_length (buf) - 2)
1236 {
1237 SCM addy;
1238
1239 scm_i_string_stop_writing ();
1240 addy = scm_i_make_string (1024, NULL);
1241 buf = scm_string_append (scm_list_2 (buf, addy));
1242 len = 0;
1243 buf = scm_i_string_start_writing (buf);
1244 }
1245
1246 if (finished)
1247 break;
1248 }
1249 scm_i_string_stop_writing ();
1250
1251 return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1252 }
1253
1254
1255 \f
1256 /* Top-level token readers, i.e., dispatchers. */
1257
1258 static SCM
1259 scm_read_sharp_extension (int chr, SCM port)
1260 {
1261 SCM proc;
1262
1263 proc = scm_get_hash_procedure (chr);
1264 if (scm_is_true (scm_procedure_p (proc)))
1265 {
1266 long line = SCM_LINUM (port);
1267 int column = SCM_COL (port) - 2;
1268 SCM got;
1269
1270 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1271 if (!scm_is_eq (got, SCM_UNSPECIFIED))
1272 {
1273 if (SCM_RECORD_POSITIONS_P)
1274 return (recsexpr (got, line, column,
1275 SCM_FILENAME (port)));
1276 else
1277 return got;
1278 }
1279 }
1280
1281 return SCM_UNSPECIFIED;
1282 }
1283
1284 /* The reader for the sharp `#' character. It basically dispatches reads
1285 among the above token readers. */
1286 static SCM
1287 scm_read_sharp (scm_t_wchar chr, SCM port)
1288 #define FUNC_NAME "scm_lreadr"
1289 {
1290 SCM result;
1291
1292 chr = scm_getc (port);
1293
1294 result = scm_read_sharp_extension (chr, port);
1295 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1296 return result;
1297
1298 switch (chr)
1299 {
1300 case '\\':
1301 return (scm_read_character (chr, port));
1302 case '(':
1303 return (scm_read_vector (chr, port));
1304 case 's':
1305 case 'u':
1306 case 'f':
1307 /* This one may return either a boolean or an SRFI-4 vector. */
1308 return (scm_read_srfi4_vector (chr, port));
1309 case 'v':
1310 return (scm_read_bytevector (chr, port));
1311 case '*':
1312 return (scm_read_guile_bit_vector (chr, port));
1313 case 't':
1314 case 'T':
1315 case 'F':
1316 /* This one may return either a boolean or an SRFI-4 vector. */
1317 return (scm_read_boolean (chr, port));
1318 case ':':
1319 return (scm_read_keyword (chr, port));
1320 case '0': case '1': case '2': case '3': case '4':
1321 case '5': case '6': case '7': case '8': case '9':
1322 case '@':
1323 #if SCM_ENABLE_DEPRECATED
1324 /* See below for 'i' and 'e'. */
1325 case 'a':
1326 case 'c':
1327 case 'y':
1328 case 'h':
1329 case 'l':
1330 #endif
1331 return (scm_i_read_array (port, chr));
1332
1333 case 'i':
1334 case 'e':
1335 #if SCM_ENABLE_DEPRECATED
1336 {
1337 /* When next char is '(', it really is an old-style
1338 uniform array. */
1339 scm_t_wchar next_c = scm_getc (port);
1340 if (next_c != EOF)
1341 scm_ungetc (next_c, port);
1342 if (next_c == '(')
1343 return scm_i_read_array (port, chr);
1344 /* Fall through. */
1345 }
1346 #endif
1347 case 'b':
1348 case 'B':
1349 case 'o':
1350 case 'O':
1351 case 'd':
1352 case 'D':
1353 case 'x':
1354 case 'X':
1355 case 'I':
1356 case 'E':
1357 return (scm_read_number_and_radix (chr, port));
1358 case '{':
1359 return (scm_read_extended_symbol (chr, port));
1360 case '!':
1361 return (scm_read_shebang (chr, port));
1362 case ';':
1363 return (scm_read_commented_expression (chr, port));
1364 case '`':
1365 case '\'':
1366 case ',':
1367 return (scm_read_syntax (chr, port));
1368 case 'n':
1369 return (scm_read_nil (chr, port));
1370 default:
1371 result = scm_read_sharp_extension (chr, port);
1372 if (scm_is_eq (result, SCM_UNSPECIFIED))
1373 {
1374 /* To remain compatible with 1.8 and earlier, the following
1375 characters have lower precedence than `read-hash-extend'
1376 characters. */
1377 switch (chr)
1378 {
1379 case '|':
1380 return scm_read_r6rs_block_comment (chr, port);
1381 default:
1382 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1383 scm_list_1 (SCM_MAKE_CHAR (chr)));
1384 }
1385 }
1386 else
1387 return result;
1388 }
1389
1390 return SCM_UNSPECIFIED;
1391 }
1392 #undef FUNC_NAME
1393
1394 static SCM
1395 scm_read_expression (SCM port)
1396 #define FUNC_NAME "scm_read_expression"
1397 {
1398 while (1)
1399 {
1400 register scm_t_wchar chr;
1401
1402 chr = scm_getc (port);
1403
1404 switch (chr)
1405 {
1406 case SCM_WHITE_SPACES:
1407 case SCM_LINE_INCREMENTORS:
1408 break;
1409 case ';':
1410 (void) scm_read_semicolon_comment (chr, port);
1411 break;
1412 case '[':
1413 if (!SCM_SQUARE_BRACKETS_P)
1414 return (scm_read_mixed_case_symbol (chr, port));
1415 /* otherwise fall through */
1416 case '(':
1417 return (scm_read_sexp (chr, port));
1418 case '"':
1419 return (scm_read_string (chr, port));
1420 case '\'':
1421 case '`':
1422 case ',':
1423 return (scm_read_quote (chr, port));
1424 case '#':
1425 {
1426 SCM result;
1427 result = scm_read_sharp (chr, port);
1428 if (scm_is_eq (result, SCM_UNSPECIFIED))
1429 /* We read a comment or some such. */
1430 break;
1431 else
1432 return result;
1433 }
1434 case ')':
1435 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1436 break;
1437 case EOF:
1438 return SCM_EOF_VAL;
1439 case ':':
1440 if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1441 return scm_symbol_to_keyword (scm_read_expression (port));
1442 /* Fall through. */
1443
1444 default:
1445 {
1446 if (((chr >= '0') && (chr <= '9'))
1447 || (strchr ("+-.", chr)))
1448 return (scm_read_number (chr, port));
1449 else
1450 return (scm_read_mixed_case_symbol (chr, port));
1451 }
1452 }
1453 }
1454 }
1455 #undef FUNC_NAME
1456
1457 \f
1458 /* Actual reader. */
1459
1460 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1461 (SCM port),
1462 "Read an s-expression from the input port @var{port}, or from\n"
1463 "the current input port if @var{port} is not specified.\n"
1464 "Any whitespace before the next token is discarded.")
1465 #define FUNC_NAME s_scm_read
1466 {
1467 int c;
1468
1469 if (SCM_UNBNDP (port))
1470 port = scm_current_input_port ();
1471 SCM_VALIDATE_OPINPORT (1, port);
1472
1473 c = flush_ws (port, (char *) NULL);
1474 if (EOF == c)
1475 return SCM_EOF_VAL;
1476 scm_ungetc (c, port);
1477
1478 return (scm_read_expression (port));
1479 }
1480 #undef FUNC_NAME
1481
1482
1483 \f
1484
1485 /* Used when recording expressions constructed by `scm_read_sharp ()'. */
1486 static SCM
1487 recsexpr (SCM obj, long line, int column, SCM filename)
1488 {
1489 if (!scm_is_pair(obj)) {
1490 return obj;
1491 } else {
1492 SCM tmp = obj, copy;
1493 /* If this sexpr is visible in the read:sharp source, we want to
1494 keep that information, so only record non-constant cons cells
1495 which haven't previously been read by the reader. */
1496 if (scm_is_false (scm_whash_lookup (scm_source_whash, obj)))
1497 {
1498 if (SCM_COPY_SOURCE_P)
1499 {
1500 copy = scm_cons (recsexpr (SCM_CAR (obj), line, column, filename),
1501 SCM_UNDEFINED);
1502 while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1503 {
1504 SCM_SETCDR (copy, scm_cons (recsexpr (SCM_CAR (tmp),
1505 line,
1506 column,
1507 filename),
1508 SCM_UNDEFINED));
1509 copy = SCM_CDR (copy);
1510 }
1511 SCM_SETCDR (copy, tmp);
1512 }
1513 else
1514 {
1515 recsexpr (SCM_CAR (obj), line, column, filename);
1516 while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1517 recsexpr (SCM_CAR (tmp), line, column, filename);
1518 copy = SCM_UNDEFINED;
1519 }
1520 scm_whash_insert (scm_source_whash,
1521 obj,
1522 scm_make_srcprops (line,
1523 column,
1524 filename,
1525 copy,
1526 SCM_EOL));
1527 }
1528 return obj;
1529 }
1530 }
1531
1532 /* Manipulate the read-hash-procedures alist. This could be written in
1533 Scheme, but maybe it will also be used by C code during initialisation. */
1534 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1535 (SCM chr, SCM proc),
1536 "Install the procedure @var{proc} for reading expressions\n"
1537 "starting with the character sequence @code{#} and @var{chr}.\n"
1538 "@var{proc} will be called with two arguments: the character\n"
1539 "@var{chr} and the port to read further data from. The object\n"
1540 "returned will be the return value of @code{read}. \n"
1541 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1542 )
1543 #define FUNC_NAME s_scm_read_hash_extend
1544 {
1545 SCM this;
1546 SCM prev;
1547
1548 SCM_VALIDATE_CHAR (1, chr);
1549 SCM_ASSERT (scm_is_false (proc)
1550 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1551 proc, SCM_ARG2, FUNC_NAME);
1552
1553 /* Check if chr is already in the alist. */
1554 this = *scm_read_hash_procedures;
1555 prev = SCM_BOOL_F;
1556 while (1)
1557 {
1558 if (scm_is_null (this))
1559 {
1560 /* not found, so add it to the beginning. */
1561 if (scm_is_true (proc))
1562 {
1563 *scm_read_hash_procedures =
1564 scm_cons (scm_cons (chr, proc), *scm_read_hash_procedures);
1565 }
1566 break;
1567 }
1568 if (scm_is_eq (chr, SCM_CAAR (this)))
1569 {
1570 /* already in the alist. */
1571 if (scm_is_false (proc))
1572 {
1573 /* remove it. */
1574 if (scm_is_false (prev))
1575 {
1576 *scm_read_hash_procedures =
1577 SCM_CDR (*scm_read_hash_procedures);
1578 }
1579 else
1580 scm_set_cdr_x (prev, SCM_CDR (this));
1581 }
1582 else
1583 {
1584 /* replace it. */
1585 scm_set_cdr_x (SCM_CAR (this), proc);
1586 }
1587 break;
1588 }
1589 prev = this;
1590 this = SCM_CDR (this);
1591 }
1592
1593 return SCM_UNSPECIFIED;
1594 }
1595 #undef FUNC_NAME
1596
1597 /* Recover the read-hash procedure corresponding to char c. */
1598 static SCM
1599 scm_get_hash_procedure (int c)
1600 {
1601 SCM rest = *scm_read_hash_procedures;
1602
1603 while (1)
1604 {
1605 if (scm_is_null (rest))
1606 return SCM_BOOL_F;
1607
1608 if (SCM_CHAR (SCM_CAAR (rest)) == c)
1609 return SCM_CDAR (rest);
1610
1611 rest = SCM_CDR (rest);
1612 }
1613 }
1614
1615 #define SCM_ENCODING_SEARCH_SIZE (500)
1616
1617 /* Search the first few hundred characters of a file for an Emacs-like coding
1618 declaration. Returns either NULL or a string whose storage has been
1619 allocated with `scm_gc_malloc ()'. */
1620 char *
1621 scm_i_scan_for_encoding (SCM port)
1622 {
1623 char header[SCM_ENCODING_SEARCH_SIZE+1];
1624 size_t bytes_read;
1625 char *encoding = NULL;
1626 int utf8_bom = 0;
1627 char *pos;
1628 int i;
1629 int in_comment;
1630
1631 if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1632 /* PORT is a non-seekable file port (e.g., as created by Bash when using
1633 "guile <(echo '(display "hello")')") so bail out. */
1634 return NULL;
1635
1636 bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1637
1638 scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1639
1640 if (bytes_read > 3
1641 && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1642 utf8_bom = 1;
1643
1644 /* search past "coding[:=]" */
1645 pos = header;
1646 while (1)
1647 {
1648 if ((pos = strstr(pos, "coding")) == NULL)
1649 return NULL;
1650
1651 pos += strlen("coding");
1652 if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1653 (*pos == ':' || *pos == '='))
1654 {
1655 pos ++;
1656 break;
1657 }
1658 }
1659
1660 /* skip spaces */
1661 while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1662 (*pos == ' ' || *pos == '\t'))
1663 pos ++;
1664
1665 /* grab the next token */
1666 i = 0;
1667 while (pos + i - header <= SCM_ENCODING_SEARCH_SIZE
1668 && pos + i - header < bytes_read
1669 && (isalnum ((int) pos[i]) || strchr ("_-.:/,+=()", pos[i]) != NULL))
1670 i++;
1671
1672 if (i == 0)
1673 return NULL;
1674
1675 encoding = scm_gc_strndup (pos, i, "encoding");
1676 for (i = 0; i < strlen (encoding); i++)
1677 encoding[i] = toupper ((int) encoding[i]);
1678
1679 /* push backwards to make sure we were in a comment */
1680 in_comment = 0;
1681 while (pos - i - header > 0)
1682 {
1683 if (*(pos - i) == '\n')
1684 {
1685 /* This wasn't in a semicolon comment. Check for a
1686 hash-bang comment. */
1687 char *beg = strstr (header, "#!");
1688 char *end = strstr (header, "!#");
1689 if (beg < pos && pos < end)
1690 in_comment = 1;
1691 break;
1692 }
1693 if (*(pos - i) == ';')
1694 {
1695 in_comment = 1;
1696 break;
1697 }
1698 i ++;
1699 }
1700 if (!in_comment)
1701 /* This wasn't in a comment */
1702 return NULL;
1703
1704 if (utf8_bom && strcmp(encoding, "UTF-8"))
1705 scm_misc_error (NULL,
1706 "the port input declares the encoding ~s but is encoded as UTF-8",
1707 scm_list_1 (scm_from_locale_string (encoding)));
1708
1709 return encoding;
1710 }
1711
1712 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1713 (SCM port),
1714 "Scans the port for an Emacs-like character coding declaration\n"
1715 "near the top of the contents of a port with random-acessible contents.\n"
1716 "The coding declaration is of the form\n"
1717 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1718 "\n"
1719 "Returns a string containing the character encoding of the file\n"
1720 "if a declaration was found, or @code{#f} otherwise.\n")
1721 #define FUNC_NAME s_scm_file_encoding
1722 {
1723 char *enc;
1724 SCM s_enc;
1725
1726 enc = scm_i_scan_for_encoding (port);
1727 if (enc == NULL)
1728 return SCM_BOOL_F;
1729 else
1730 {
1731 s_enc = scm_from_locale_string (enc);
1732 return s_enc;
1733 }
1734
1735 return SCM_BOOL_F;
1736 }
1737 #undef FUNC_NAME
1738
1739 void
1740 scm_init_read ()
1741 {
1742 scm_read_hash_procedures =
1743 SCM_VARIABLE_LOC (scm_c_define ("read-hash-procedures", SCM_EOL));
1744
1745 scm_init_opts (scm_read_options, scm_read_opts);
1746 #include "libguile/read.x"
1747 }
1748
1749 /*
1750 Local Variables:
1751 c-file-style: "gnu"
1752 End:
1753 */