Enable character hex escapes by default
[bpt/guile.git] / libguile / read.c
1 /* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009, 2010 Free Software
2 * Foundation, Inc.
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
18 */
19
20
21 \f
22
23 #ifdef HAVE_CONFIG_H
24 # include <config.h>
25 #endif
26
27 #include <stdio.h>
28 #include <ctype.h>
29 #include <string.h>
30 #include <unistd.h>
31 #include <unicase.h>
32
33 #include "libguile/_scm.h"
34 #include "libguile/bytevectors.h"
35 #include "libguile/chars.h"
36 #include "libguile/eval.h"
37 #include "libguile/arrays.h"
38 #include "libguile/bitvectors.h"
39 #include "libguile/keywords.h"
40 #include "libguile/alist.h"
41 #include "libguile/srcprop.h"
42 #include "libguile/hashtab.h"
43 #include "libguile/hash.h"
44 #include "libguile/ports.h"
45 #include "libguile/fports.h"
46 #include "libguile/root.h"
47 #include "libguile/strings.h"
48 #include "libguile/strports.h"
49 #include "libguile/vectors.h"
50 #include "libguile/validate.h"
51 #include "libguile/srfi-4.h"
52 #include "libguile/srfi-13.h"
53
54 #include "libguile/read.h"
55 #include "libguile/private-options.h"
56
57
58 \f
59
60 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
61 SCM_SYMBOL (scm_keyword_prefix, "prefix");
62 SCM_SYMBOL (scm_keyword_postfix, "postfix");
63 SCM_SYMBOL (sym_nil, "nil");
64
65 scm_t_option scm_read_opts[] = {
66 { SCM_OPTION_BOOLEAN, "copy", 0,
67 "Copy source code expressions." },
68 { SCM_OPTION_BOOLEAN, "positions", 0,
69 "Record positions of source code expressions." },
70 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
71 "Convert symbols to lower case."},
72 { SCM_OPTION_SCM, "keywords", (unsigned long) SCM_BOOL_F,
73 "Style of keyword recognition: #f, 'prefix or 'postfix."},
74 { SCM_OPTION_BOOLEAN, "elisp-vectors", 0,
75 "Support Elisp vector syntax, namely `[...]'."},
76 { SCM_OPTION_BOOLEAN, "elisp-strings", 0,
77 "Support `\\(' and `\\)' in strings."},
78 { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
79 "Use R6RS variable-length character and string hex escapes."},
80 { SCM_OPTION_BOOLEAN, "square-brackets", 1,
81 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
82 { 0, },
83 };
84
85 /*
86 Give meaningful error messages for errors
87
88 We use the format
89
90 FILE:LINE:COL: MESSAGE
91 This happened in ....
92
93 This is not standard GNU format, but the test-suite likes the real
94 message to be in front.
95
96 */
97
98
99 void
100 scm_i_input_error (char const *function,
101 SCM port, const char *message, SCM arg)
102 {
103 SCM fn = (scm_is_string (SCM_FILENAME(port))
104 ? SCM_FILENAME(port)
105 : scm_from_locale_string ("#<unknown port>"));
106
107 SCM string_port = scm_open_output_string ();
108 SCM string = SCM_EOL;
109 scm_simple_format (string_port,
110 scm_from_locale_string ("~A:~S:~S: ~A"),
111 scm_list_4 (fn,
112 scm_from_long (SCM_LINUM (port) + 1),
113 scm_from_int (SCM_COL (port) + 1),
114 scm_from_locale_string (message)));
115
116 string = scm_get_output_string (string_port);
117 scm_close_output_port (string_port);
118 scm_error_scm (scm_from_locale_symbol ("read-error"),
119 function? scm_from_locale_string (function) : SCM_BOOL_F,
120 string,
121 arg,
122 SCM_BOOL_F);
123 }
124
125
126 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
127 (SCM setting),
128 "Option interface for the read options. Instead of using\n"
129 "this procedure directly, use the procedures @code{read-enable},\n"
130 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
131 #define FUNC_NAME s_scm_read_options
132 {
133 SCM ans = scm_options (setting,
134 scm_read_opts,
135 FUNC_NAME);
136 if (SCM_COPY_SOURCE_P)
137 SCM_RECORD_POSITIONS_P = 1;
138 return ans;
139 }
140 #undef FUNC_NAME
141
142 /* An association list mapping extra hash characters to procedures. */
143 static SCM *scm_read_hash_procedures;
144
145
146 \f
147 /* Token readers. */
148
149
150 /* Size of the C buffer used to read symbols and numbers. */
151 #define READER_BUFFER_SIZE 128
152
153 /* Size of the C buffer used to read strings. */
154 #define READER_STRING_BUFFER_SIZE 512
155
156 /* The maximum size of Scheme character names. */
157 #define READER_CHAR_NAME_MAX_SIZE 50
158
159
160 /* `isblank' is only in C99. */
161 #define CHAR_IS_BLANK_(_chr) \
162 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
163 || ((_chr) == '\f') || ((_chr) == '\r'))
164
165 #ifdef MSDOS
166 # define CHAR_IS_BLANK(_chr) \
167 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
168 #else
169 # define CHAR_IS_BLANK CHAR_IS_BLANK_
170 #endif
171
172
173 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
174 structure''). */
175 #define CHAR_IS_R5RS_DELIMITER(c) \
176 (CHAR_IS_BLANK (c) \
177 || (c == ')') || (c == '(') || (c == ';') || (c == '"') \
178 || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
179
180 #define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
181
182 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
183 Structure''. */
184 #define CHAR_IS_EXPONENT_MARKER(_chr) \
185 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
186 || ((_chr) == 'd') || ((_chr) == 'l'))
187
188 /* Read an SCSH block comment. */
189 static inline SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
190 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
191 static SCM scm_read_commented_expression (scm_t_wchar, SCM);
192 static SCM scm_read_shebang (scm_t_wchar, SCM);
193 static SCM scm_get_hash_procedure (int);
194
195 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
196 result in the pre-allocated buffer BUF. Return zero if the whole token has
197 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
198 bytes actually read. */
199 static inline int
200 read_token (SCM port, char *buf, const size_t buf_size, size_t *read)
201 {
202 *read = 0;
203
204 while (*read < buf_size)
205 {
206 int chr;
207
208 chr = scm_get_byte_or_eof (port);
209
210 if (chr == EOF)
211 return 0;
212 else if (CHAR_IS_DELIMITER (chr))
213 {
214 scm_unget_byte (chr, port);
215 return 0;
216 }
217 else
218 {
219 *buf = (char) chr;
220 buf++, (*read)++;
221 }
222 }
223
224 return 1;
225 }
226
227 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
228 result in the pre-allocated buffer BUFFER, if the whole token has fewer than
229 BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
230 caller. Return zero if the token fits in BUFFER, non-zero otherwise. READ
231 will be set the number of bytes actually read. */
232 static int
233 read_complete_token (SCM port, char *buffer, const size_t buffer_size,
234 char **overflow_buffer, size_t *read)
235 {
236 int overflow = 0;
237 size_t bytes_read, overflow_size;
238
239 *overflow_buffer = NULL;
240 overflow_size = 0;
241
242 do
243 {
244 overflow = read_token (port, buffer, buffer_size, &bytes_read);
245 if (bytes_read == 0)
246 break;
247 if (overflow || overflow_size != 0)
248 {
249 if (overflow_size == 0)
250 {
251 *overflow_buffer = scm_malloc (bytes_read);
252 memcpy (*overflow_buffer, buffer, bytes_read);
253 overflow_size = bytes_read;
254 }
255 else
256 {
257 *overflow_buffer = scm_realloc (*overflow_buffer, overflow_size + bytes_read);
258 memcpy (*overflow_buffer + overflow_size, buffer, bytes_read);
259 overflow_size += bytes_read;
260 }
261 }
262 }
263 while (overflow);
264
265 if (overflow_size)
266 *read = overflow_size;
267 else
268 *read = bytes_read;
269
270 return (overflow_size != 0);
271 }
272
273 /* Skip whitespace from PORT and return the first non-whitespace character
274 read. Raise an error on end-of-file. */
275 static int
276 flush_ws (SCM port, const char *eoferr)
277 {
278 register scm_t_wchar c;
279 while (1)
280 switch (c = scm_getc (port))
281 {
282 case EOF:
283 goteof:
284 if (eoferr)
285 {
286 scm_i_input_error (eoferr,
287 port,
288 "end of file",
289 SCM_EOL);
290 }
291 return c;
292
293 case ';':
294 lp:
295 switch (c = scm_getc (port))
296 {
297 case EOF:
298 goto goteof;
299 default:
300 goto lp;
301 case SCM_LINE_INCREMENTORS:
302 break;
303 }
304 break;
305
306 case '#':
307 switch (c = scm_getc (port))
308 {
309 case EOF:
310 eoferr = "read_sharp";
311 goto goteof;
312 case '!':
313 scm_read_shebang (c, port);
314 break;
315 case ';':
316 scm_read_commented_expression (c, port);
317 break;
318 case '|':
319 if (scm_is_false (scm_get_hash_procedure (c)))
320 {
321 scm_read_r6rs_block_comment (c, port);
322 break;
323 }
324 /* fall through */
325 default:
326 scm_ungetc (c, port);
327 return '#';
328 }
329 break;
330
331 case SCM_LINE_INCREMENTORS:
332 case SCM_SINGLE_SPACES:
333 case '\t':
334 break;
335
336 default:
337 return c;
338 }
339
340 return 0;
341 }
342
343
344 \f
345 /* Token readers. */
346
347 static SCM scm_read_expression (SCM port);
348 static SCM scm_read_sharp (int chr, SCM port);
349 static SCM recsexpr (SCM obj, long line, int column, SCM filename);
350
351
352 static SCM
353 scm_read_sexp (scm_t_wchar chr, SCM port)
354 #define FUNC_NAME "scm_i_lreadparen"
355 {
356 register int c;
357 register SCM tmp;
358 register SCM tl, ans = SCM_EOL;
359 SCM tl2 = SCM_EOL, ans2 = SCM_EOL, copy = SCM_BOOL_F;
360 const int terminating_char = ((chr == '[') ? ']' : ')');
361
362 /* Need to capture line and column numbers here. */
363 long line = SCM_LINUM (port);
364 int column = SCM_COL (port) - 1;
365
366
367 c = flush_ws (port, FUNC_NAME);
368 if (terminating_char == c)
369 return SCM_EOL;
370
371 scm_ungetc (c, port);
372 if (scm_is_eq (scm_sym_dot,
373 (tmp = scm_read_expression (port))))
374 {
375 ans = scm_read_expression (port);
376 if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
377 scm_i_input_error (FUNC_NAME, port, "missing close paren",
378 SCM_EOL);
379 return ans;
380 }
381
382 /* Build the head of the list structure. */
383 ans = tl = scm_cons (tmp, SCM_EOL);
384
385 if (SCM_COPY_SOURCE_P)
386 ans2 = tl2 = scm_cons (scm_is_pair (tmp)
387 ? copy
388 : tmp,
389 SCM_EOL);
390
391 while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
392 {
393 SCM new_tail;
394
395 if (c == ')' || (SCM_SQUARE_BRACKETS_P && c == ']'))
396 scm_i_input_error (FUNC_NAME, port,
397 "in pair: mismatched close paren: ~A",
398 scm_list_1 (SCM_MAKE_CHAR (c)));
399
400 scm_ungetc (c, port);
401 tmp = scm_read_expression (port);
402
403 if (scm_is_eq (scm_sym_dot, tmp))
404 {
405 SCM_SETCDR (tl, tmp = scm_read_expression (port));
406
407 if (SCM_COPY_SOURCE_P)
408 SCM_SETCDR (tl2, scm_cons (scm_is_pair (tmp) ? copy : tmp,
409 SCM_EOL));
410
411 c = flush_ws (port, FUNC_NAME);
412 if (terminating_char != c)
413 scm_i_input_error (FUNC_NAME, port,
414 "in pair: missing close paren", SCM_EOL);
415 goto exit;
416 }
417
418 new_tail = scm_cons (tmp, SCM_EOL);
419 SCM_SETCDR (tl, new_tail);
420 tl = new_tail;
421
422 if (SCM_COPY_SOURCE_P)
423 {
424 SCM new_tail2 = scm_cons (scm_is_pair (tmp)
425 ? copy
426 : tmp, SCM_EOL);
427 SCM_SETCDR (tl2, new_tail2);
428 tl2 = new_tail2;
429 }
430 }
431
432 exit:
433 if (SCM_RECORD_POSITIONS_P)
434 scm_whash_insert (scm_source_whash,
435 ans,
436 scm_make_srcprops (line, column,
437 SCM_FILENAME (port),
438 SCM_COPY_SOURCE_P
439 ? ans2
440 : SCM_UNDEFINED,
441 SCM_EOL));
442 return ans;
443 }
444 #undef FUNC_NAME
445
446
447 /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
448 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
449 found. */
450 #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
451 do \
452 { \
453 scm_t_wchar a; \
454 size_t i = 0; \
455 c = 0; \
456 while (i < ndigits) \
457 { \
458 a = scm_getc (port); \
459 if (a == EOF) \
460 goto str_eof; \
461 if (terminator \
462 && (a == (scm_t_wchar) terminator) \
463 && (i > 0)) \
464 break; \
465 if ('0' <= a && a <= '9') \
466 a -= '0'; \
467 else if ('A' <= a && a <= 'F') \
468 a = a - 'A' + 10; \
469 else if ('a' <= a && a <= 'f') \
470 a = a - 'a' + 10; \
471 else \
472 { \
473 c = a; \
474 goto bad_escaped; \
475 } \
476 c = c * 16 + a; \
477 i ++; \
478 } \
479 } while (0)
480
481 static SCM
482 scm_read_string (int chr, SCM port)
483 #define FUNC_NAME "scm_lreadr"
484 {
485 /* For strings smaller than C_STR, this function creates only one Scheme
486 object (the string returned). */
487
488 SCM str = SCM_BOOL_F;
489 unsigned c_str_len = 0;
490 scm_t_wchar c;
491
492 str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
493 while ('"' != (c = scm_getc (port)))
494 {
495 if (c == EOF)
496 {
497 str_eof:
498 scm_i_input_error (FUNC_NAME, port,
499 "end of file in string constant", SCM_EOL);
500 }
501
502 if (c_str_len + 1 >= scm_i_string_length (str))
503 {
504 SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
505
506 str = scm_string_append (scm_list_2 (str, addy));
507 }
508
509 if (c == '\\')
510 {
511 switch (c = scm_getc (port))
512 {
513 case EOF:
514 goto str_eof;
515 case '"':
516 case '\\':
517 break;
518 case '(':
519 case ')':
520 if (SCM_ESCAPED_PARENS_P)
521 break;
522 goto bad_escaped;
523 case '\n':
524 continue;
525 case '0':
526 c = '\0';
527 break;
528 case 'f':
529 c = '\f';
530 break;
531 case 'n':
532 c = '\n';
533 break;
534 case 'r':
535 c = '\r';
536 break;
537 case 't':
538 c = '\t';
539 break;
540 case 'a':
541 c = '\007';
542 break;
543 case 'v':
544 c = '\v';
545 break;
546 case 'b':
547 c = '\010';
548 break;
549 case 'x':
550 if (SCM_R6RS_ESCAPES_P)
551 SCM_READ_HEX_ESCAPE (10, ';');
552 else
553 SCM_READ_HEX_ESCAPE (2, '\0');
554 break;
555 case 'u':
556 if (!SCM_R6RS_ESCAPES_P)
557 {
558 SCM_READ_HEX_ESCAPE (4, '\0');
559 break;
560 }
561 case 'U':
562 if (!SCM_R6RS_ESCAPES_P)
563 {
564 SCM_READ_HEX_ESCAPE (6, '\0');
565 break;
566 }
567 default:
568 bad_escaped:
569 scm_i_input_error (FUNC_NAME, port,
570 "illegal character in escape sequence: ~S",
571 scm_list_1 (SCM_MAKE_CHAR (c)));
572 }
573 }
574 str = scm_i_string_start_writing (str);
575 scm_i_string_set_x (str, c_str_len++, c);
576 scm_i_string_stop_writing ();
577 }
578
579 if (c_str_len > 0)
580 {
581 return scm_i_substring_copy (str, 0, c_str_len);
582 }
583
584 return scm_nullstr;
585 }
586 #undef FUNC_NAME
587
588
589 static SCM
590 scm_read_number (scm_t_wchar chr, SCM port)
591 {
592 SCM result, str = SCM_EOL;
593 char buffer[READER_BUFFER_SIZE];
594 char *overflow_buffer = NULL;
595 size_t bytes_read;
596 int overflow;
597 scm_t_port *pt = SCM_PTAB_ENTRY (port);
598
599 scm_ungetc (chr, port);
600 overflow = read_complete_token (port, buffer, sizeof (buffer),
601 &overflow_buffer, &bytes_read);
602
603 if (!overflow)
604 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
605 else
606 str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
607 pt->ilseq_handler);
608
609 result = scm_string_to_number (str, SCM_UNDEFINED);
610 if (!scm_is_true (result))
611 {
612 /* Return a symbol instead of a number */
613 if (SCM_CASE_INSENSITIVE_P)
614 str = scm_string_downcase_x (str);
615 result = scm_string_to_symbol (str);
616 }
617
618 if (overflow)
619 free (overflow_buffer);
620 SCM_COL (port) += scm_i_string_length (str);
621 return result;
622 }
623
624 static SCM
625 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
626 {
627 SCM result;
628 int ends_with_colon = 0;
629 size_t bytes_read;
630 int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
631 int overflow;
632 char buffer[READER_BUFFER_SIZE], *overflow_buffer;
633 scm_t_port *pt = SCM_PTAB_ENTRY (port);
634 SCM str;
635
636 scm_ungetc (chr, port);
637 overflow = read_complete_token (port, buffer, READER_BUFFER_SIZE,
638 &overflow_buffer, &bytes_read);
639 if (bytes_read > 0)
640 {
641 if (!overflow)
642 ends_with_colon = buffer[bytes_read - 1] == ':';
643 else
644 ends_with_colon = overflow_buffer[bytes_read - 1] == ':';
645 }
646
647 if (postfix && ends_with_colon && (bytes_read > 1))
648 {
649 if (!overflow)
650 str = scm_from_stringn (buffer, bytes_read - 1, pt->encoding, pt->ilseq_handler);
651 else
652 str = scm_from_stringn (overflow_buffer, bytes_read - 1, pt->encoding,
653 pt->ilseq_handler);
654
655 if (SCM_CASE_INSENSITIVE_P)
656 str = scm_string_downcase_x (str);
657 result = scm_symbol_to_keyword (scm_string_to_symbol (str));
658 }
659 else
660 {
661 if (!overflow)
662 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
663 else
664 str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
665 pt->ilseq_handler);
666
667 if (SCM_CASE_INSENSITIVE_P)
668 str = scm_string_downcase_x (str);
669 result = scm_string_to_symbol (str);
670 }
671
672 if (overflow)
673 free (overflow_buffer);
674 SCM_COL (port) += scm_i_string_length (str);
675 return result;
676 }
677
678 static SCM
679 scm_read_number_and_radix (scm_t_wchar chr, SCM port)
680 #define FUNC_NAME "scm_lreadr"
681 {
682 SCM result;
683 size_t read;
684 char buffer[READER_BUFFER_SIZE], *overflow_buffer;
685 int overflow;
686 unsigned int radix;
687 SCM str;
688 scm_t_port *pt;
689
690 switch (chr)
691 {
692 case 'B':
693 case 'b':
694 radix = 2;
695 break;
696
697 case 'o':
698 case 'O':
699 radix = 8;
700 break;
701
702 case 'd':
703 case 'D':
704 radix = 10;
705 break;
706
707 case 'x':
708 case 'X':
709 radix = 16;
710 break;
711
712 default:
713 scm_ungetc (chr, port);
714 scm_ungetc ('#', port);
715 radix = 10;
716 }
717
718 overflow = read_complete_token (port, buffer, sizeof (buffer),
719 &overflow_buffer, &read);
720
721 pt = SCM_PTAB_ENTRY (port);
722 if (!overflow)
723 str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
724 else
725 str = scm_from_stringn (overflow_buffer, read, pt->encoding,
726 pt->ilseq_handler);
727
728 result = scm_string_to_number (str, scm_from_uint (radix));
729
730 if (overflow)
731 free (overflow_buffer);
732
733 SCM_COL (port) += scm_i_string_length (str);
734
735 if (scm_is_true (result))
736 return result;
737
738 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
739
740 return SCM_BOOL_F;
741 }
742 #undef FUNC_NAME
743
744 static SCM
745 scm_read_quote (int chr, SCM port)
746 {
747 SCM p;
748 long line = SCM_LINUM (port);
749 int column = SCM_COL (port) - 1;
750
751 switch (chr)
752 {
753 case '`':
754 p = scm_sym_quasiquote;
755 break;
756
757 case '\'':
758 p = scm_sym_quote;
759 break;
760
761 case ',':
762 {
763 scm_t_wchar c;
764
765 c = scm_getc (port);
766 if ('@' == c)
767 p = scm_sym_uq_splicing;
768 else
769 {
770 scm_ungetc (c, port);
771 p = scm_sym_unquote;
772 }
773 break;
774 }
775
776 default:
777 fprintf (stderr, "%s: unhandled quote character (%i)\n",
778 "scm_read_quote", chr);
779 abort ();
780 }
781
782 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
783 if (SCM_RECORD_POSITIONS_P)
784 scm_whash_insert (scm_source_whash, p,
785 scm_make_srcprops (line, column,
786 SCM_FILENAME (port),
787 SCM_COPY_SOURCE_P
788 ? (scm_cons2 (SCM_CAR (p),
789 SCM_CAR (SCM_CDR (p)),
790 SCM_EOL))
791 : SCM_UNDEFINED,
792 SCM_EOL));
793
794
795 return p;
796 }
797
798 SCM_SYMBOL (sym_syntax, "syntax");
799 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
800 SCM_SYMBOL (sym_unsyntax, "unsyntax");
801 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
802
803 static SCM
804 scm_read_syntax (int chr, SCM port)
805 {
806 SCM p;
807 long line = SCM_LINUM (port);
808 int column = SCM_COL (port) - 1;
809
810 switch (chr)
811 {
812 case '`':
813 p = sym_quasisyntax;
814 break;
815
816 case '\'':
817 p = sym_syntax;
818 break;
819
820 case ',':
821 {
822 int c;
823
824 c = scm_getc (port);
825 if ('@' == c)
826 p = sym_unsyntax_splicing;
827 else
828 {
829 scm_ungetc (c, port);
830 p = sym_unsyntax;
831 }
832 break;
833 }
834
835 default:
836 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
837 "scm_read_syntax", chr);
838 abort ();
839 }
840
841 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
842 if (SCM_RECORD_POSITIONS_P)
843 scm_whash_insert (scm_source_whash, p,
844 scm_make_srcprops (line, column,
845 SCM_FILENAME (port),
846 SCM_COPY_SOURCE_P
847 ? (scm_cons2 (SCM_CAR (p),
848 SCM_CAR (SCM_CDR (p)),
849 SCM_EOL))
850 : SCM_UNDEFINED,
851 SCM_EOL));
852
853
854 return p;
855 }
856
857 static inline SCM
858 scm_read_nil (int chr, SCM port)
859 {
860 SCM id = scm_read_mixed_case_symbol (chr, port);
861
862 if (!scm_is_eq (id, sym_nil))
863 scm_i_input_error ("scm_read_nil", port,
864 "unexpected input while reading #nil: ~a",
865 scm_list_1 (id));
866
867 return SCM_ELISP_NIL;
868 }
869
870 static inline SCM
871 scm_read_semicolon_comment (int chr, SCM port)
872 {
873 int c;
874
875 /* We use the get_byte here because there is no need to get the
876 locale correct with comment input. This presumes that newline
877 always represents itself no matter what the encoding is. */
878 for (c = scm_get_byte_or_eof (port);
879 (c != EOF) && (c != '\n');
880 c = scm_get_byte_or_eof (port));
881
882 return SCM_UNSPECIFIED;
883 }
884
885 \f
886 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
887
888 static SCM
889 scm_read_boolean (int chr, SCM port)
890 {
891 switch (chr)
892 {
893 case 't':
894 case 'T':
895 return SCM_BOOL_T;
896
897 case 'f':
898 case 'F':
899 return SCM_BOOL_F;
900 }
901
902 return SCM_UNSPECIFIED;
903 }
904
905 static SCM
906 scm_read_character (scm_t_wchar chr, SCM port)
907 #define FUNC_NAME "scm_lreadr"
908 {
909 char buffer[READER_CHAR_NAME_MAX_SIZE];
910 SCM charname;
911 size_t charname_len, bytes_read;
912 scm_t_wchar cp;
913 int overflow;
914 scm_t_port *pt;
915
916 overflow = read_token (port, buffer, READER_CHAR_NAME_MAX_SIZE, &bytes_read);
917 if (overflow)
918 goto char_error;
919
920 if (bytes_read == 0)
921 {
922 chr = scm_getc (port);
923 if (chr == EOF)
924 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
925 "while reading character", SCM_EOL);
926
927 /* CHR must be a token delimiter, like a whitespace. */
928 return (SCM_MAKE_CHAR (chr));
929 }
930
931 pt = SCM_PTAB_ENTRY (port);
932
933 /* Simple ASCII characters can be processed immediately. Also, simple
934 ISO-8859-1 characters can be processed immediately if the encoding for this
935 port is ISO-8859-1. */
936 if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
937 {
938 SCM_COL (port) += 1;
939 return SCM_MAKE_CHAR (buffer[0]);
940 }
941
942 /* Otherwise, convert the buffer into a proper scheme string for
943 processing. */
944 charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
945 pt->ilseq_handler);
946 charname_len = scm_i_string_length (charname);
947 SCM_COL (port) += charname_len;
948 cp = scm_i_string_ref (charname, 0);
949 if (charname_len == 1)
950 return SCM_MAKE_CHAR (cp);
951
952 /* Ignore dotted circles, which may be used to keep combining characters from
953 combining with the backslash in #\charname. */
954 if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
955 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
956
957 if (cp >= '0' && cp < '8')
958 {
959 /* Dirk:FIXME:: This type of character syntax is not R5RS
960 * compliant. Further, it should be verified that the constant
961 * does only consist of octal digits. */
962 SCM p = scm_string_to_number (charname, scm_from_uint (8));
963 if (SCM_I_INUMP (p))
964 {
965 scm_t_wchar c = SCM_I_INUM (p);
966 if (SCM_IS_UNICODE_CHAR (c))
967 return SCM_MAKE_CHAR (c);
968 else
969 scm_i_input_error (FUNC_NAME, port,
970 "out-of-range octal character escape: ~a",
971 scm_list_1 (charname));
972 }
973 }
974
975 if (cp == 'x' && (charname_len > 1))
976 {
977 SCM p;
978
979 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
980 p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
981 scm_from_uint (16));
982 if (SCM_I_INUMP (p))
983 {
984 scm_t_wchar c = SCM_I_INUM (p);
985 if (SCM_IS_UNICODE_CHAR (c))
986 return SCM_MAKE_CHAR (c);
987 else
988 scm_i_input_error (FUNC_NAME, port,
989 "out-of-range hex character escape: ~a",
990 scm_list_1 (charname));
991 }
992 }
993
994 /* The names of characters should never have non-Latin1
995 characters. */
996 if (scm_i_is_narrow_string (charname)
997 || scm_i_try_narrow_string (charname))
998 { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
999 charname_len);
1000 if (scm_is_true (ch))
1001 return ch;
1002 }
1003
1004 char_error:
1005 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
1006 scm_list_1 (charname));
1007
1008 return SCM_UNSPECIFIED;
1009 }
1010 #undef FUNC_NAME
1011
1012 static inline SCM
1013 scm_read_keyword (int chr, SCM port)
1014 {
1015 SCM symbol;
1016
1017 /* Read the symbol that comprises the keyword. Doing this instead of
1018 invoking a specific symbol reader function allows `scm_read_keyword ()'
1019 to adapt to the delimiters currently valid of symbols.
1020
1021 XXX: This implementation allows sloppy syntaxes like `#: key'. */
1022 symbol = scm_read_expression (port);
1023 if (!scm_is_symbol (symbol))
1024 scm_i_input_error ("scm_read_keyword", port,
1025 "keyword prefix `~a' not followed by a symbol: ~s",
1026 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1027
1028 return (scm_symbol_to_keyword (symbol));
1029 }
1030
1031 static inline SCM
1032 scm_read_vector (int chr, SCM port)
1033 {
1034 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1035 guarantee that it's going to do what we want. After all, this is an
1036 implementation detail of `scm_read_vector ()', not a desirable
1037 property. */
1038 return (scm_vector (scm_read_sexp (chr, port)));
1039 }
1040
1041 static inline SCM
1042 scm_read_srfi4_vector (int chr, SCM port)
1043 {
1044 return scm_i_read_array (port, chr);
1045 }
1046
1047 static SCM
1048 scm_read_bytevector (scm_t_wchar chr, SCM port)
1049 {
1050 chr = scm_getc (port);
1051 if (chr != 'u')
1052 goto syntax;
1053
1054 chr = scm_getc (port);
1055 if (chr != '8')
1056 goto syntax;
1057
1058 chr = scm_getc (port);
1059 if (chr != '(')
1060 goto syntax;
1061
1062 return scm_u8_list_to_bytevector (scm_read_sexp (chr, port));
1063
1064 syntax:
1065 scm_i_input_error ("read_bytevector", port,
1066 "invalid bytevector prefix",
1067 SCM_MAKE_CHAR (chr));
1068 return SCM_UNSPECIFIED;
1069 }
1070
1071 static SCM
1072 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port)
1073 {
1074 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1075 terribly inefficient but who cares? */
1076 SCM s_bits = SCM_EOL;
1077
1078 for (chr = scm_getc (port);
1079 (chr != EOF) && ((chr == '0') || (chr == '1'));
1080 chr = scm_getc (port))
1081 {
1082 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1083 }
1084
1085 if (chr != EOF)
1086 scm_ungetc (chr, port);
1087
1088 return scm_bitvector (scm_reverse_x (s_bits, SCM_EOL));
1089 }
1090
1091 static inline SCM
1092 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1093 {
1094 int bang_seen = 0;
1095
1096 /* We can use the get_byte here because there is no need to get the
1097 locale correct when reading comments. This presumes that
1098 hash and exclamation points always represent themselves no
1099 matter what the source encoding is.*/
1100 for (;;)
1101 {
1102 int c = scm_get_byte_or_eof (port);
1103
1104 if (c == EOF)
1105 scm_i_input_error ("skip_block_comment", port,
1106 "unterminated `#! ... !#' comment", SCM_EOL);
1107
1108 if (c == '!')
1109 bang_seen = 1;
1110 else if (c == '#' && bang_seen)
1111 break;
1112 else
1113 bang_seen = 0;
1114 }
1115
1116 return SCM_UNSPECIFIED;
1117 }
1118
1119 static inline SCM
1120 scm_read_shebang (scm_t_wchar chr, SCM port)
1121 {
1122 int c = 0;
1123 if ((c = scm_get_byte_or_eof (port)) != 'r')
1124 {
1125 scm_ungetc (c, port);
1126 return scm_read_scsh_block_comment (chr, port);
1127 }
1128 if ((c = scm_get_byte_or_eof (port)) != '6')
1129 {
1130 scm_ungetc (c, port);
1131 scm_ungetc ('r', port);
1132 return scm_read_scsh_block_comment (chr, port);
1133 }
1134 if ((c = scm_get_byte_or_eof (port)) != 'r')
1135 {
1136 scm_ungetc (c, port);
1137 scm_ungetc ('6', port);
1138 scm_ungetc ('r', port);
1139 return scm_read_scsh_block_comment (chr, port);
1140 }
1141 if ((c = scm_get_byte_or_eof (port)) != 's')
1142 {
1143 scm_ungetc (c, port);
1144 scm_ungetc ('r', port);
1145 scm_ungetc ('6', port);
1146 scm_ungetc ('r', port);
1147 return scm_read_scsh_block_comment (chr, port);
1148 }
1149
1150 return SCM_UNSPECIFIED;
1151 }
1152
1153 static SCM
1154 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1155 {
1156 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1157 nested. So care must be taken. */
1158 int nesting_level = 1;
1159 int opening_seen = 0, closing_seen = 0;
1160
1161 while (nesting_level > 0)
1162 {
1163 int c = scm_getc (port);
1164
1165 if (c == EOF)
1166 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1167 "unterminated `#| ... |#' comment", SCM_EOL);
1168
1169 if (opening_seen)
1170 {
1171 if (c == '|')
1172 nesting_level++;
1173 opening_seen = 0;
1174 }
1175 else if (closing_seen)
1176 {
1177 if (c == '#')
1178 nesting_level--;
1179 closing_seen = 0;
1180 }
1181 else if (c == '|')
1182 closing_seen = 1;
1183 else if (c == '#')
1184 opening_seen = 1;
1185 else
1186 opening_seen = closing_seen = 0;
1187 }
1188
1189 return SCM_UNSPECIFIED;
1190 }
1191
1192 static SCM
1193 scm_read_commented_expression (scm_t_wchar chr, SCM port)
1194 {
1195 scm_t_wchar c;
1196
1197 c = flush_ws (port, (char *) NULL);
1198 if (EOF == c)
1199 scm_i_input_error ("read_commented_expression", port,
1200 "no expression after #; comment", SCM_EOL);
1201 scm_ungetc (c, port);
1202 scm_read_expression (port);
1203 return SCM_UNSPECIFIED;
1204 }
1205
1206 static SCM
1207 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1208 {
1209 /* Guile's extended symbol read syntax looks like this:
1210
1211 #{This is all a symbol name}#
1212
1213 So here, CHR is expected to be `{'. */
1214 int saw_brace = 0, finished = 0;
1215 size_t len = 0;
1216 SCM buf = scm_i_make_string (1024, NULL);
1217
1218 buf = scm_i_string_start_writing (buf);
1219
1220 while ((chr = scm_getc (port)) != EOF)
1221 {
1222 if (saw_brace)
1223 {
1224 if (chr == '#')
1225 {
1226 finished = 1;
1227 break;
1228 }
1229 else
1230 {
1231 saw_brace = 0;
1232 scm_i_string_set_x (buf, len++, '}');
1233 scm_i_string_set_x (buf, len++, chr);
1234 }
1235 }
1236 else if (chr == '}')
1237 saw_brace = 1;
1238 else
1239 scm_i_string_set_x (buf, len++, chr);
1240
1241 if (len >= scm_i_string_length (buf) - 2)
1242 {
1243 SCM addy;
1244
1245 scm_i_string_stop_writing ();
1246 addy = scm_i_make_string (1024, NULL);
1247 buf = scm_string_append (scm_list_2 (buf, addy));
1248 len = 0;
1249 buf = scm_i_string_start_writing (buf);
1250 }
1251
1252 if (finished)
1253 break;
1254 }
1255 scm_i_string_stop_writing ();
1256
1257 return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1258 }
1259
1260
1261 \f
1262 /* Top-level token readers, i.e., dispatchers. */
1263
1264 static SCM
1265 scm_read_sharp_extension (int chr, SCM port)
1266 {
1267 SCM proc;
1268
1269 proc = scm_get_hash_procedure (chr);
1270 if (scm_is_true (scm_procedure_p (proc)))
1271 {
1272 long line = SCM_LINUM (port);
1273 int column = SCM_COL (port) - 2;
1274 SCM got;
1275
1276 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1277 if (!scm_is_eq (got, SCM_UNSPECIFIED))
1278 {
1279 if (SCM_RECORD_POSITIONS_P)
1280 return (recsexpr (got, line, column,
1281 SCM_FILENAME (port)));
1282 else
1283 return got;
1284 }
1285 }
1286
1287 return SCM_UNSPECIFIED;
1288 }
1289
1290 /* The reader for the sharp `#' character. It basically dispatches reads
1291 among the above token readers. */
1292 static SCM
1293 scm_read_sharp (scm_t_wchar chr, SCM port)
1294 #define FUNC_NAME "scm_lreadr"
1295 {
1296 SCM result;
1297
1298 chr = scm_getc (port);
1299
1300 result = scm_read_sharp_extension (chr, port);
1301 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1302 return result;
1303
1304 switch (chr)
1305 {
1306 case '\\':
1307 return (scm_read_character (chr, port));
1308 case '(':
1309 return (scm_read_vector (chr, port));
1310 case 's':
1311 case 'u':
1312 case 'f':
1313 /* This one may return either a boolean or an SRFI-4 vector. */
1314 return (scm_read_srfi4_vector (chr, port));
1315 case 'v':
1316 return (scm_read_bytevector (chr, port));
1317 case '*':
1318 return (scm_read_guile_bit_vector (chr, port));
1319 case 't':
1320 case 'T':
1321 case 'F':
1322 /* This one may return either a boolean or an SRFI-4 vector. */
1323 return (scm_read_boolean (chr, port));
1324 case ':':
1325 return (scm_read_keyword (chr, port));
1326 case '0': case '1': case '2': case '3': case '4':
1327 case '5': case '6': case '7': case '8': case '9':
1328 case '@':
1329 #if SCM_ENABLE_DEPRECATED
1330 /* See below for 'i' and 'e'. */
1331 case 'a':
1332 case 'c':
1333 case 'y':
1334 case 'h':
1335 case 'l':
1336 #endif
1337 return (scm_i_read_array (port, chr));
1338
1339 case 'i':
1340 case 'e':
1341 #if SCM_ENABLE_DEPRECATED
1342 {
1343 /* When next char is '(', it really is an old-style
1344 uniform array. */
1345 scm_t_wchar next_c = scm_getc (port);
1346 if (next_c != EOF)
1347 scm_ungetc (next_c, port);
1348 if (next_c == '(')
1349 return scm_i_read_array (port, chr);
1350 /* Fall through. */
1351 }
1352 #endif
1353 case 'b':
1354 case 'B':
1355 case 'o':
1356 case 'O':
1357 case 'd':
1358 case 'D':
1359 case 'x':
1360 case 'X':
1361 case 'I':
1362 case 'E':
1363 return (scm_read_number_and_radix (chr, port));
1364 case '{':
1365 return (scm_read_extended_symbol (chr, port));
1366 case '!':
1367 return (scm_read_shebang (chr, port));
1368 case ';':
1369 return (scm_read_commented_expression (chr, port));
1370 case '`':
1371 case '\'':
1372 case ',':
1373 return (scm_read_syntax (chr, port));
1374 case 'n':
1375 return (scm_read_nil (chr, port));
1376 default:
1377 result = scm_read_sharp_extension (chr, port);
1378 if (scm_is_eq (result, SCM_UNSPECIFIED))
1379 {
1380 /* To remain compatible with 1.8 and earlier, the following
1381 characters have lower precedence than `read-hash-extend'
1382 characters. */
1383 switch (chr)
1384 {
1385 case '|':
1386 return scm_read_r6rs_block_comment (chr, port);
1387 default:
1388 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1389 scm_list_1 (SCM_MAKE_CHAR (chr)));
1390 }
1391 }
1392 else
1393 return result;
1394 }
1395
1396 return SCM_UNSPECIFIED;
1397 }
1398 #undef FUNC_NAME
1399
1400 static SCM
1401 scm_read_expression (SCM port)
1402 #define FUNC_NAME "scm_read_expression"
1403 {
1404 while (1)
1405 {
1406 register scm_t_wchar chr;
1407
1408 chr = scm_getc (port);
1409
1410 switch (chr)
1411 {
1412 case SCM_WHITE_SPACES:
1413 case SCM_LINE_INCREMENTORS:
1414 break;
1415 case ';':
1416 (void) scm_read_semicolon_comment (chr, port);
1417 break;
1418 case '[':
1419 if (!SCM_SQUARE_BRACKETS_P)
1420 return (scm_read_mixed_case_symbol (chr, port));
1421 /* otherwise fall through */
1422 case '(':
1423 return (scm_read_sexp (chr, port));
1424 case '"':
1425 return (scm_read_string (chr, port));
1426 case '\'':
1427 case '`':
1428 case ',':
1429 return (scm_read_quote (chr, port));
1430 case '#':
1431 {
1432 SCM result;
1433 result = scm_read_sharp (chr, port);
1434 if (scm_is_eq (result, SCM_UNSPECIFIED))
1435 /* We read a comment or some such. */
1436 break;
1437 else
1438 return result;
1439 }
1440 case ')':
1441 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1442 break;
1443 case EOF:
1444 return SCM_EOF_VAL;
1445 case ':':
1446 if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1447 return scm_symbol_to_keyword (scm_read_expression (port));
1448 /* Fall through. */
1449
1450 default:
1451 {
1452 if (((chr >= '0') && (chr <= '9'))
1453 || (strchr ("+-.", chr)))
1454 return (scm_read_number (chr, port));
1455 else
1456 return (scm_read_mixed_case_symbol (chr, port));
1457 }
1458 }
1459 }
1460 }
1461 #undef FUNC_NAME
1462
1463 \f
1464 /* Actual reader. */
1465
1466 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1467 (SCM port),
1468 "Read an s-expression from the input port @var{port}, or from\n"
1469 "the current input port if @var{port} is not specified.\n"
1470 "Any whitespace before the next token is discarded.")
1471 #define FUNC_NAME s_scm_read
1472 {
1473 int c;
1474
1475 if (SCM_UNBNDP (port))
1476 port = scm_current_input_port ();
1477 SCM_VALIDATE_OPINPORT (1, port);
1478
1479 c = flush_ws (port, (char *) NULL);
1480 if (EOF == c)
1481 return SCM_EOF_VAL;
1482 scm_ungetc (c, port);
1483
1484 return (scm_read_expression (port));
1485 }
1486 #undef FUNC_NAME
1487
1488
1489 \f
1490
1491 /* Used when recording expressions constructed by `scm_read_sharp ()'. */
1492 static SCM
1493 recsexpr (SCM obj, long line, int column, SCM filename)
1494 {
1495 if (!scm_is_pair(obj)) {
1496 return obj;
1497 } else {
1498 SCM tmp = obj, copy;
1499 /* If this sexpr is visible in the read:sharp source, we want to
1500 keep that information, so only record non-constant cons cells
1501 which haven't previously been read by the reader. */
1502 if (scm_is_false (scm_whash_lookup (scm_source_whash, obj)))
1503 {
1504 if (SCM_COPY_SOURCE_P)
1505 {
1506 copy = scm_cons (recsexpr (SCM_CAR (obj), line, column, filename),
1507 SCM_UNDEFINED);
1508 while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1509 {
1510 SCM_SETCDR (copy, scm_cons (recsexpr (SCM_CAR (tmp),
1511 line,
1512 column,
1513 filename),
1514 SCM_UNDEFINED));
1515 copy = SCM_CDR (copy);
1516 }
1517 SCM_SETCDR (copy, tmp);
1518 }
1519 else
1520 {
1521 recsexpr (SCM_CAR (obj), line, column, filename);
1522 while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1523 recsexpr (SCM_CAR (tmp), line, column, filename);
1524 copy = SCM_UNDEFINED;
1525 }
1526 scm_whash_insert (scm_source_whash,
1527 obj,
1528 scm_make_srcprops (line,
1529 column,
1530 filename,
1531 copy,
1532 SCM_EOL));
1533 }
1534 return obj;
1535 }
1536 }
1537
1538 /* Manipulate the read-hash-procedures alist. This could be written in
1539 Scheme, but maybe it will also be used by C code during initialisation. */
1540 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1541 (SCM chr, SCM proc),
1542 "Install the procedure @var{proc} for reading expressions\n"
1543 "starting with the character sequence @code{#} and @var{chr}.\n"
1544 "@var{proc} will be called with two arguments: the character\n"
1545 "@var{chr} and the port to read further data from. The object\n"
1546 "returned will be the return value of @code{read}. \n"
1547 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1548 )
1549 #define FUNC_NAME s_scm_read_hash_extend
1550 {
1551 SCM this;
1552 SCM prev;
1553
1554 SCM_VALIDATE_CHAR (1, chr);
1555 SCM_ASSERT (scm_is_false (proc)
1556 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1557 proc, SCM_ARG2, FUNC_NAME);
1558
1559 /* Check if chr is already in the alist. */
1560 this = *scm_read_hash_procedures;
1561 prev = SCM_BOOL_F;
1562 while (1)
1563 {
1564 if (scm_is_null (this))
1565 {
1566 /* not found, so add it to the beginning. */
1567 if (scm_is_true (proc))
1568 {
1569 *scm_read_hash_procedures =
1570 scm_cons (scm_cons (chr, proc), *scm_read_hash_procedures);
1571 }
1572 break;
1573 }
1574 if (scm_is_eq (chr, SCM_CAAR (this)))
1575 {
1576 /* already in the alist. */
1577 if (scm_is_false (proc))
1578 {
1579 /* remove it. */
1580 if (scm_is_false (prev))
1581 {
1582 *scm_read_hash_procedures =
1583 SCM_CDR (*scm_read_hash_procedures);
1584 }
1585 else
1586 scm_set_cdr_x (prev, SCM_CDR (this));
1587 }
1588 else
1589 {
1590 /* replace it. */
1591 scm_set_cdr_x (SCM_CAR (this), proc);
1592 }
1593 break;
1594 }
1595 prev = this;
1596 this = SCM_CDR (this);
1597 }
1598
1599 return SCM_UNSPECIFIED;
1600 }
1601 #undef FUNC_NAME
1602
1603 /* Recover the read-hash procedure corresponding to char c. */
1604 static SCM
1605 scm_get_hash_procedure (int c)
1606 {
1607 SCM rest = *scm_read_hash_procedures;
1608
1609 while (1)
1610 {
1611 if (scm_is_null (rest))
1612 return SCM_BOOL_F;
1613
1614 if (SCM_CHAR (SCM_CAAR (rest)) == c)
1615 return SCM_CDAR (rest);
1616
1617 rest = SCM_CDR (rest);
1618 }
1619 }
1620
1621 #define SCM_ENCODING_SEARCH_SIZE (500)
1622
1623 /* Search the first few hundred characters of a file for an Emacs-like coding
1624 declaration. Returns either NULL or a string whose storage has been
1625 allocated with `scm_gc_malloc ()'. */
1626 char *
1627 scm_i_scan_for_encoding (SCM port)
1628 {
1629 char header[SCM_ENCODING_SEARCH_SIZE+1];
1630 size_t bytes_read, encoding_length, i;
1631 char *encoding = NULL;
1632 int utf8_bom = 0;
1633 char *pos, *encoding_start;
1634 int in_comment;
1635
1636 if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1637 /* PORT is a non-seekable file port (e.g., as created by Bash when using
1638 "guile <(echo '(display "hello")')") so bail out. */
1639 return NULL;
1640
1641 bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1642
1643 scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1644
1645 if (bytes_read > 3
1646 && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1647 utf8_bom = 1;
1648
1649 /* search past "coding[:=]" */
1650 pos = header;
1651 while (1)
1652 {
1653 if ((pos = strstr(pos, "coding")) == NULL)
1654 return NULL;
1655
1656 pos += strlen("coding");
1657 if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1658 (*pos == ':' || *pos == '='))
1659 {
1660 pos ++;
1661 break;
1662 }
1663 }
1664
1665 /* skip spaces */
1666 while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1667 (*pos == ' ' || *pos == '\t'))
1668 pos ++;
1669
1670 /* grab the next token */
1671 encoding_start = pos;
1672 i = 0;
1673 while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
1674 && encoding_start + i - header < bytes_read
1675 && (isalnum ((int) encoding_start[i])
1676 || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
1677 i++;
1678
1679 encoding_length = i;
1680 if (encoding_length == 0)
1681 return NULL;
1682
1683 encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
1684 for (i = 0; i < encoding_length; i++)
1685 encoding[i] = toupper ((int) encoding[i]);
1686
1687 /* push backwards to make sure we were in a comment */
1688 in_comment = 0;
1689 pos = encoding_start;
1690 while (pos >= header)
1691 {
1692 if (*pos == '\n')
1693 {
1694 /* This wasn't in a semicolon comment. Check for a
1695 hash-bang comment. */
1696 char *beg = strstr (header, "#!");
1697 char *end = strstr (header, "!#");
1698 if (beg < encoding_start && encoding_start + encoding_length < end)
1699 in_comment = 1;
1700 break;
1701 }
1702 if (*pos == ';')
1703 {
1704 in_comment = 1;
1705 break;
1706 }
1707 pos --;
1708 }
1709 if (!in_comment)
1710 /* This wasn't in a comment */
1711 return NULL;
1712
1713 if (utf8_bom && strcmp(encoding, "UTF-8"))
1714 scm_misc_error (NULL,
1715 "the port input declares the encoding ~s but is encoded as UTF-8",
1716 scm_list_1 (scm_from_locale_string (encoding)));
1717
1718 return encoding;
1719 }
1720
1721 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1722 (SCM port),
1723 "Scans the port for an Emacs-like character coding declaration\n"
1724 "near the top of the contents of a port with random-acessible contents.\n"
1725 "The coding declaration is of the form\n"
1726 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1727 "\n"
1728 "Returns a string containing the character encoding of the file\n"
1729 "if a declaration was found, or @code{#f} otherwise.\n")
1730 #define FUNC_NAME s_scm_file_encoding
1731 {
1732 char *enc;
1733 SCM s_enc;
1734
1735 enc = scm_i_scan_for_encoding (port);
1736 if (enc == NULL)
1737 return SCM_BOOL_F;
1738 else
1739 {
1740 s_enc = scm_from_locale_string (enc);
1741 return s_enc;
1742 }
1743
1744 return SCM_BOOL_F;
1745 }
1746 #undef FUNC_NAME
1747
1748 void
1749 scm_init_read ()
1750 {
1751 scm_read_hash_procedures =
1752 SCM_VARIABLE_LOC (scm_c_define ("read-hash-procedures", SCM_EOL));
1753
1754 scm_init_opts (scm_read_options, scm_read_opts);
1755 #include "libguile/read.x"
1756 }
1757
1758 /*
1759 Local Variables:
1760 c-file-style: "gnu"
1761 End:
1762 */