lisp nil always enabled
[bpt/guile.git] / libguile / read.c
1 /* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009, 2010 Free Software
2 * Foundation, Inc.
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
18 */
19
20
21 \f
22
23 #ifdef HAVE_CONFIG_H
24 # include <config.h>
25 #endif
26
27 #include <stdio.h>
28 #include <ctype.h>
29 #include <string.h>
30 #include <unistd.h>
31 #include <unicase.h>
32
33 #include "libguile/_scm.h"
34 #include "libguile/bytevectors.h"
35 #include "libguile/chars.h"
36 #include "libguile/eval.h"
37 #include "libguile/arrays.h"
38 #include "libguile/bitvectors.h"
39 #include "libguile/keywords.h"
40 #include "libguile/alist.h"
41 #include "libguile/srcprop.h"
42 #include "libguile/hashtab.h"
43 #include "libguile/hash.h"
44 #include "libguile/ports.h"
45 #include "libguile/fports.h"
46 #include "libguile/root.h"
47 #include "libguile/strings.h"
48 #include "libguile/strports.h"
49 #include "libguile/vectors.h"
50 #include "libguile/validate.h"
51 #include "libguile/srfi-4.h"
52 #include "libguile/srfi-13.h"
53
54 #include "libguile/read.h"
55 #include "libguile/private-options.h"
56
57
58 \f
59
60 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
61 SCM_SYMBOL (scm_keyword_prefix, "prefix");
62 SCM_SYMBOL (scm_keyword_postfix, "postfix");
63
64 scm_t_option scm_read_opts[] = {
65 { SCM_OPTION_BOOLEAN, "copy", 0,
66 "Copy source code expressions." },
67 { SCM_OPTION_BOOLEAN, "positions", 0,
68 "Record positions of source code expressions." },
69 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
70 "Convert symbols to lower case."},
71 { SCM_OPTION_SCM, "keywords", (unsigned long) SCM_BOOL_F,
72 "Style of keyword recognition: #f, 'prefix or 'postfix."},
73 { SCM_OPTION_BOOLEAN, "elisp-vectors", 0,
74 "Support Elisp vector syntax, namely `[...]'."},
75 { SCM_OPTION_BOOLEAN, "elisp-strings", 0,
76 "Support `\\(' and `\\)' in strings."},
77 { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
78 "Use R6RS variable-length character and string hex escapes."},
79 { SCM_OPTION_BOOLEAN, "square-brackets", 1,
80 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
81 { 0, },
82 };
83
84 /*
85 Give meaningful error messages for errors
86
87 We use the format
88
89 FILE:LINE:COL: MESSAGE
90 This happened in ....
91
92 This is not standard GNU format, but the test-suite likes the real
93 message to be in front.
94
95 */
96
97
98 void
99 scm_i_input_error (char const *function,
100 SCM port, const char *message, SCM arg)
101 {
102 SCM fn = (scm_is_string (SCM_FILENAME(port))
103 ? SCM_FILENAME(port)
104 : scm_from_locale_string ("#<unknown port>"));
105
106 SCM string_port = scm_open_output_string ();
107 SCM string = SCM_EOL;
108 scm_simple_format (string_port,
109 scm_from_locale_string ("~A:~S:~S: ~A"),
110 scm_list_4 (fn,
111 scm_from_long (SCM_LINUM (port) + 1),
112 scm_from_int (SCM_COL (port) + 1),
113 scm_from_locale_string (message)));
114
115 string = scm_get_output_string (string_port);
116 scm_close_output_port (string_port);
117 scm_error_scm (scm_from_locale_symbol ("read-error"),
118 function? scm_from_locale_string (function) : SCM_BOOL_F,
119 string,
120 arg,
121 SCM_BOOL_F);
122 }
123
124
125 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
126 (SCM setting),
127 "Option interface for the read options. Instead of using\n"
128 "this procedure directly, use the procedures @code{read-enable},\n"
129 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
130 #define FUNC_NAME s_scm_read_options
131 {
132 SCM ans = scm_options (setting,
133 scm_read_opts,
134 FUNC_NAME);
135 if (SCM_COPY_SOURCE_P)
136 SCM_RECORD_POSITIONS_P = 1;
137 return ans;
138 }
139 #undef FUNC_NAME
140
141 /* An association list mapping extra hash characters to procedures. */
142 static SCM *scm_read_hash_procedures;
143
144
145 \f
146 /* Token readers. */
147
148
149 /* Size of the C buffer used to read symbols and numbers. */
150 #define READER_BUFFER_SIZE 128
151
152 /* Size of the C buffer used to read strings. */
153 #define READER_STRING_BUFFER_SIZE 512
154
155 /* The maximum size of Scheme character names. */
156 #define READER_CHAR_NAME_MAX_SIZE 50
157
158
159 /* `isblank' is only in C99. */
160 #define CHAR_IS_BLANK_(_chr) \
161 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
162 || ((_chr) == '\f') || ((_chr) == '\r'))
163
164 #ifdef MSDOS
165 # define CHAR_IS_BLANK(_chr) \
166 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
167 #else
168 # define CHAR_IS_BLANK CHAR_IS_BLANK_
169 #endif
170
171
172 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
173 structure''). */
174 #define CHAR_IS_R5RS_DELIMITER(c) \
175 (CHAR_IS_BLANK (c) \
176 || (c == ')') || (c == '(') || (c == ';') || (c == '"') \
177 || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
178
179 #define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
180
181 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
182 Structure''. */
183 #define CHAR_IS_EXPONENT_MARKER(_chr) \
184 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
185 || ((_chr) == 'd') || ((_chr) == 'l'))
186
187 /* Read an SCSH block comment. */
188 static inline SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
189 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
190 static SCM scm_read_commented_expression (scm_t_wchar, SCM);
191 static SCM scm_get_hash_procedure (int);
192
193 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
194 result in the pre-allocated buffer BUF. Return zero if the whole token has
195 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
196 bytes actually read. */
197 static inline int
198 read_token (SCM port, char *buf, const size_t buf_size, size_t *read)
199 {
200 *read = 0;
201
202 while (*read < buf_size)
203 {
204 int chr;
205
206 chr = scm_get_byte_or_eof (port);
207
208 if (chr == EOF)
209 return 0;
210 else if (CHAR_IS_DELIMITER (chr))
211 {
212 scm_unget_byte (chr, port);
213 return 0;
214 }
215 else
216 {
217 *buf = (char) chr;
218 buf++, (*read)++;
219 }
220 }
221
222 return 1;
223 }
224
225 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
226 result in the pre-allocated buffer BUFFER, if the whole token has fewer than
227 BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
228 caller. Return zero if the token fits in BUFFER, non-zero otherwise. READ
229 will be set the number of bytes actually read. */
230 static int
231 read_complete_token (SCM port, char *buffer, const size_t buffer_size,
232 char **overflow_buffer, size_t *read)
233 {
234 int overflow = 0;
235 size_t bytes_read, overflow_size;
236
237 *overflow_buffer = NULL;
238 overflow_size = 0;
239
240 do
241 {
242 overflow = read_token (port, buffer, buffer_size, &bytes_read);
243 if (bytes_read == 0)
244 break;
245 if (overflow || overflow_size != 0)
246 {
247 if (overflow_size == 0)
248 {
249 *overflow_buffer = scm_malloc (bytes_read);
250 memcpy (*overflow_buffer, buffer, bytes_read);
251 overflow_size = bytes_read;
252 }
253 else
254 {
255 *overflow_buffer = scm_realloc (*overflow_buffer, overflow_size + bytes_read);
256 memcpy (*overflow_buffer + overflow_size, buffer, bytes_read);
257 overflow_size += bytes_read;
258 }
259 }
260 }
261 while (overflow);
262
263 if (overflow_size)
264 *read = overflow_size;
265 else
266 *read = bytes_read;
267
268 return (overflow_size != 0);
269 }
270
271 /* Skip whitespace from PORT and return the first non-whitespace character
272 read. Raise an error on end-of-file. */
273 static int
274 flush_ws (SCM port, const char *eoferr)
275 {
276 register scm_t_wchar c;
277 while (1)
278 switch (c = scm_getc (port))
279 {
280 case EOF:
281 goteof:
282 if (eoferr)
283 {
284 scm_i_input_error (eoferr,
285 port,
286 "end of file",
287 SCM_EOL);
288 }
289 return c;
290
291 case ';':
292 lp:
293 switch (c = scm_getc (port))
294 {
295 case EOF:
296 goto goteof;
297 default:
298 goto lp;
299 case SCM_LINE_INCREMENTORS:
300 break;
301 }
302 break;
303
304 case '#':
305 switch (c = scm_getc (port))
306 {
307 case EOF:
308 eoferr = "read_sharp";
309 goto goteof;
310 case '!':
311 scm_read_scsh_block_comment (c, port);
312 break;
313 case ';':
314 scm_read_commented_expression (c, port);
315 break;
316 case '|':
317 if (scm_is_false (scm_get_hash_procedure (c)))
318 {
319 scm_read_r6rs_block_comment (c, port);
320 break;
321 }
322 /* fall through */
323 default:
324 scm_ungetc (c, port);
325 return '#';
326 }
327 break;
328
329 case SCM_LINE_INCREMENTORS:
330 case SCM_SINGLE_SPACES:
331 case '\t':
332 break;
333
334 default:
335 return c;
336 }
337
338 return 0;
339 }
340
341
342 \f
343 /* Token readers. */
344
345 static SCM scm_read_expression (SCM port);
346 static SCM scm_read_sharp (int chr, SCM port);
347 static SCM recsexpr (SCM obj, long line, int column, SCM filename);
348
349
350 static SCM
351 scm_read_sexp (scm_t_wchar chr, SCM port)
352 #define FUNC_NAME "scm_i_lreadparen"
353 {
354 register int c;
355 register SCM tmp;
356 register SCM tl, ans = SCM_EOL;
357 SCM tl2 = SCM_EOL, ans2 = SCM_EOL, copy = SCM_BOOL_F;
358 const int terminating_char = ((chr == '[') ? ']' : ')');
359
360 /* Need to capture line and column numbers here. */
361 long line = SCM_LINUM (port);
362 int column = SCM_COL (port) - 1;
363
364
365 c = flush_ws (port, FUNC_NAME);
366 if (terminating_char == c)
367 return SCM_EOL;
368
369 scm_ungetc (c, port);
370 if (scm_is_eq (scm_sym_dot,
371 (tmp = scm_read_expression (port))))
372 {
373 ans = scm_read_expression (port);
374 if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
375 scm_i_input_error (FUNC_NAME, port, "missing close paren",
376 SCM_EOL);
377 return ans;
378 }
379
380 /* Build the head of the list structure. */
381 ans = tl = scm_cons (tmp, SCM_EOL);
382
383 if (SCM_COPY_SOURCE_P)
384 ans2 = tl2 = scm_cons (scm_is_pair (tmp)
385 ? copy
386 : tmp,
387 SCM_EOL);
388
389 while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
390 {
391 SCM new_tail;
392
393 scm_ungetc (c, port);
394 if (scm_is_eq (scm_sym_dot,
395 (tmp = scm_read_expression (port))))
396 {
397 SCM_SETCDR (tl, tmp = scm_read_expression (port));
398
399 if (SCM_COPY_SOURCE_P)
400 SCM_SETCDR (tl2, scm_cons (scm_is_pair (tmp) ? copy : tmp,
401 SCM_EOL));
402
403 c = flush_ws (port, FUNC_NAME);
404 if (terminating_char != c)
405 scm_i_input_error (FUNC_NAME, port,
406 "in pair: missing close paren", SCM_EOL);
407 goto exit;
408 }
409
410 new_tail = scm_cons (tmp, SCM_EOL);
411 SCM_SETCDR (tl, new_tail);
412 tl = new_tail;
413
414 if (SCM_COPY_SOURCE_P)
415 {
416 SCM new_tail2 = scm_cons (scm_is_pair (tmp)
417 ? copy
418 : tmp, SCM_EOL);
419 SCM_SETCDR (tl2, new_tail2);
420 tl2 = new_tail2;
421 }
422 }
423
424 exit:
425 if (SCM_RECORD_POSITIONS_P)
426 scm_whash_insert (scm_source_whash,
427 ans,
428 scm_make_srcprops (line, column,
429 SCM_FILENAME (port),
430 SCM_COPY_SOURCE_P
431 ? ans2
432 : SCM_UNDEFINED,
433 SCM_EOL));
434 return ans;
435 }
436 #undef FUNC_NAME
437
438
439 /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
440 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
441 found. */
442 #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
443 do \
444 { \
445 scm_t_wchar a; \
446 size_t i = 0; \
447 c = 0; \
448 while (i < ndigits) \
449 { \
450 a = scm_getc (port); \
451 if (a == EOF) \
452 goto str_eof; \
453 if (terminator \
454 && (a == (scm_t_wchar) terminator) \
455 && (i > 0)) \
456 break; \
457 if ('0' <= a && a <= '9') \
458 a -= '0'; \
459 else if ('A' <= a && a <= 'F') \
460 a = a - 'A' + 10; \
461 else if ('a' <= a && a <= 'f') \
462 a = a - 'a' + 10; \
463 else \
464 { \
465 c = a; \
466 goto bad_escaped; \
467 } \
468 c = c * 16 + a; \
469 i ++; \
470 } \
471 } while (0)
472
473 static SCM
474 scm_read_string (int chr, SCM port)
475 #define FUNC_NAME "scm_lreadr"
476 {
477 /* For strings smaller than C_STR, this function creates only one Scheme
478 object (the string returned). */
479
480 SCM str = SCM_BOOL_F;
481 unsigned c_str_len = 0;
482 scm_t_wchar c;
483
484 str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
485 while ('"' != (c = scm_getc (port)))
486 {
487 if (c == EOF)
488 {
489 str_eof:
490 scm_i_input_error (FUNC_NAME, port,
491 "end of file in string constant", SCM_EOL);
492 }
493
494 if (c_str_len + 1 >= scm_i_string_length (str))
495 {
496 SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
497
498 str = scm_string_append (scm_list_2 (str, addy));
499 }
500
501 if (c == '\\')
502 {
503 switch (c = scm_getc (port))
504 {
505 case EOF:
506 goto str_eof;
507 case '"':
508 case '\\':
509 break;
510 case '(':
511 case ')':
512 if (SCM_ESCAPED_PARENS_P)
513 break;
514 goto bad_escaped;
515 case '\n':
516 continue;
517 case '0':
518 c = '\0';
519 break;
520 case 'f':
521 c = '\f';
522 break;
523 case 'n':
524 c = '\n';
525 break;
526 case 'r':
527 c = '\r';
528 break;
529 case 't':
530 c = '\t';
531 break;
532 case 'a':
533 c = '\007';
534 break;
535 case 'v':
536 c = '\v';
537 break;
538 case 'b':
539 c = '\010';
540 break;
541 case 'x':
542 if (SCM_R6RS_ESCAPES_P)
543 SCM_READ_HEX_ESCAPE (10, ';');
544 else
545 SCM_READ_HEX_ESCAPE (2, '\0');
546 break;
547 case 'u':
548 if (!SCM_R6RS_ESCAPES_P)
549 {
550 SCM_READ_HEX_ESCAPE (4, '\0');
551 break;
552 }
553 case 'U':
554 if (!SCM_R6RS_ESCAPES_P)
555 {
556 SCM_READ_HEX_ESCAPE (6, '\0');
557 break;
558 }
559 default:
560 bad_escaped:
561 scm_i_input_error (FUNC_NAME, port,
562 "illegal character in escape sequence: ~S",
563 scm_list_1 (SCM_MAKE_CHAR (c)));
564 }
565 }
566 str = scm_i_string_start_writing (str);
567 scm_i_string_set_x (str, c_str_len++, c);
568 scm_i_string_stop_writing ();
569 }
570
571 if (c_str_len > 0)
572 {
573 return scm_i_substring_copy (str, 0, c_str_len);
574 }
575
576 return scm_nullstr;
577 }
578 #undef FUNC_NAME
579
580
581 static SCM
582 scm_read_number (scm_t_wchar chr, SCM port)
583 {
584 SCM result, str = SCM_EOL;
585 char buffer[READER_BUFFER_SIZE];
586 char *overflow_buffer = NULL;
587 size_t bytes_read;
588 int overflow;
589 scm_t_port *pt = SCM_PTAB_ENTRY (port);
590
591 scm_ungetc (chr, port);
592 overflow = read_complete_token (port, buffer, sizeof (buffer),
593 &overflow_buffer, &bytes_read);
594
595 if (!overflow)
596 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
597 else
598 str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
599 pt->ilseq_handler);
600
601 result = scm_string_to_number (str, SCM_UNDEFINED);
602 if (!scm_is_true (result))
603 {
604 /* Return a symbol instead of a number */
605 if (SCM_CASE_INSENSITIVE_P)
606 str = scm_string_downcase_x (str);
607 result = scm_string_to_symbol (str);
608 }
609
610 if (overflow)
611 free (overflow_buffer);
612 SCM_COL (port) += scm_i_string_length (str);
613 return result;
614 }
615
616 static SCM
617 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
618 {
619 SCM result;
620 int ends_with_colon = 0;
621 size_t bytes_read;
622 int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
623 int overflow;
624 char buffer[READER_BUFFER_SIZE], *overflow_buffer;
625 scm_t_port *pt = SCM_PTAB_ENTRY (port);
626 SCM str;
627
628 scm_ungetc (chr, port);
629 overflow = read_complete_token (port, buffer, READER_BUFFER_SIZE,
630 &overflow_buffer, &bytes_read);
631 if (bytes_read > 0)
632 {
633 if (!overflow)
634 ends_with_colon = buffer[bytes_read - 1] == ':';
635 else
636 ends_with_colon = overflow_buffer[bytes_read - 1] == ':';
637 }
638
639 if (postfix && ends_with_colon && (bytes_read > 1))
640 {
641 if (!overflow)
642 str = scm_from_stringn (buffer, bytes_read - 1, pt->encoding, pt->ilseq_handler);
643 else
644 str = scm_from_stringn (overflow_buffer, bytes_read - 1, pt->encoding,
645 pt->ilseq_handler);
646
647 if (SCM_CASE_INSENSITIVE_P)
648 str = scm_string_downcase_x (str);
649 result = scm_symbol_to_keyword (scm_string_to_symbol (str));
650 }
651 else
652 {
653 if (!overflow)
654 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
655 else
656 str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
657 pt->ilseq_handler);
658
659 if (SCM_CASE_INSENSITIVE_P)
660 str = scm_string_downcase_x (str);
661 result = scm_string_to_symbol (str);
662 }
663
664 if (overflow)
665 free (overflow_buffer);
666 SCM_COL (port) += scm_i_string_length (str);
667 return result;
668 }
669
670 static SCM
671 scm_read_number_and_radix (scm_t_wchar chr, SCM port)
672 #define FUNC_NAME "scm_lreadr"
673 {
674 SCM result;
675 size_t read;
676 char buffer[READER_BUFFER_SIZE], *overflow_buffer;
677 int overflow;
678 unsigned int radix;
679 SCM str;
680 scm_t_port *pt;
681
682 switch (chr)
683 {
684 case 'B':
685 case 'b':
686 radix = 2;
687 break;
688
689 case 'o':
690 case 'O':
691 radix = 8;
692 break;
693
694 case 'd':
695 case 'D':
696 radix = 10;
697 break;
698
699 case 'x':
700 case 'X':
701 radix = 16;
702 break;
703
704 default:
705 scm_ungetc (chr, port);
706 scm_ungetc ('#', port);
707 radix = 10;
708 }
709
710 overflow = read_complete_token (port, buffer, sizeof (buffer),
711 &overflow_buffer, &read);
712
713 pt = SCM_PTAB_ENTRY (port);
714 if (!overflow)
715 str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
716 else
717 str = scm_from_stringn (overflow_buffer, read, pt->encoding,
718 pt->ilseq_handler);
719
720 result = scm_string_to_number (str, scm_from_uint (radix));
721
722 if (overflow)
723 free (overflow_buffer);
724
725 SCM_COL (port) += scm_i_string_length (str);
726
727 if (scm_is_true (result))
728 return result;
729
730 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
731
732 return SCM_BOOL_F;
733 }
734 #undef FUNC_NAME
735
736 static SCM
737 scm_read_quote (int chr, SCM port)
738 {
739 SCM p;
740 long line = SCM_LINUM (port);
741 int column = SCM_COL (port) - 1;
742
743 switch (chr)
744 {
745 case '`':
746 p = scm_sym_quasiquote;
747 break;
748
749 case '\'':
750 p = scm_sym_quote;
751 break;
752
753 case ',':
754 {
755 scm_t_wchar c;
756
757 c = scm_getc (port);
758 if ('@' == c)
759 p = scm_sym_uq_splicing;
760 else
761 {
762 scm_ungetc (c, port);
763 p = scm_sym_unquote;
764 }
765 break;
766 }
767
768 default:
769 fprintf (stderr, "%s: unhandled quote character (%i)\n",
770 "scm_read_quote", chr);
771 abort ();
772 }
773
774 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
775 if (SCM_RECORD_POSITIONS_P)
776 scm_whash_insert (scm_source_whash, p,
777 scm_make_srcprops (line, column,
778 SCM_FILENAME (port),
779 SCM_COPY_SOURCE_P
780 ? (scm_cons2 (SCM_CAR (p),
781 SCM_CAR (SCM_CDR (p)),
782 SCM_EOL))
783 : SCM_UNDEFINED,
784 SCM_EOL));
785
786
787 return p;
788 }
789
790 SCM_SYMBOL (sym_syntax, "syntax");
791 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
792 SCM_SYMBOL (sym_unsyntax, "unsyntax");
793 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
794
795 static SCM
796 scm_read_syntax (int chr, SCM port)
797 {
798 SCM p;
799 long line = SCM_LINUM (port);
800 int column = SCM_COL (port) - 1;
801
802 switch (chr)
803 {
804 case '`':
805 p = sym_quasisyntax;
806 break;
807
808 case '\'':
809 p = sym_syntax;
810 break;
811
812 case ',':
813 {
814 int c;
815
816 c = scm_getc (port);
817 if ('@' == c)
818 p = sym_unsyntax_splicing;
819 else
820 {
821 scm_ungetc (c, port);
822 p = sym_unsyntax;
823 }
824 break;
825 }
826
827 default:
828 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
829 "scm_read_syntax", chr);
830 abort ();
831 }
832
833 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
834 if (SCM_RECORD_POSITIONS_P)
835 scm_whash_insert (scm_source_whash, p,
836 scm_make_srcprops (line, column,
837 SCM_FILENAME (port),
838 SCM_COPY_SOURCE_P
839 ? (scm_cons2 (SCM_CAR (p),
840 SCM_CAR (SCM_CDR (p)),
841 SCM_EOL))
842 : SCM_UNDEFINED,
843 SCM_EOL));
844
845
846 return p;
847 }
848
849 static inline SCM
850 scm_read_semicolon_comment (int chr, SCM port)
851 {
852 int c;
853
854 /* We use the get_byte here because there is no need to get the
855 locale correct with comment input. This presumes that newline
856 always represents itself no matter what the encoding is. */
857 for (c = scm_get_byte_or_eof (port);
858 (c != EOF) && (c != '\n');
859 c = scm_get_byte_or_eof (port));
860
861 return SCM_UNSPECIFIED;
862 }
863
864 \f
865 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
866
867 static SCM
868 scm_read_boolean (int chr, SCM port)
869 {
870 switch (chr)
871 {
872 case 't':
873 case 'T':
874 return SCM_BOOL_T;
875
876 case 'f':
877 case 'F':
878 return SCM_BOOL_F;
879 }
880
881 return SCM_UNSPECIFIED;
882 }
883
884 static SCM
885 scm_read_character (scm_t_wchar chr, SCM port)
886 #define FUNC_NAME "scm_lreadr"
887 {
888 char buffer[READER_CHAR_NAME_MAX_SIZE];
889 SCM charname;
890 size_t charname_len, bytes_read;
891 scm_t_wchar cp;
892 int overflow;
893 scm_t_port *pt;
894
895 overflow = read_token (port, buffer, READER_CHAR_NAME_MAX_SIZE, &bytes_read);
896 if (overflow)
897 goto char_error;
898
899 if (bytes_read == 0)
900 {
901 chr = scm_getc (port);
902 if (chr == EOF)
903 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
904 "while reading character", SCM_EOL);
905
906 /* CHR must be a token delimiter, like a whitespace. */
907 return (SCM_MAKE_CHAR (chr));
908 }
909
910 pt = SCM_PTAB_ENTRY (port);
911
912 /* Simple ASCII characters can be processed immediately. Also, simple
913 ISO-8859-1 characters can be processed immediately if the encoding for this
914 port is ISO-8859-1. */
915 if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
916 {
917 SCM_COL (port) += 1;
918 return SCM_MAKE_CHAR (buffer[0]);
919 }
920
921 /* Otherwise, convert the buffer into a proper scheme string for
922 processing. */
923 charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
924 pt->ilseq_handler);
925 charname_len = scm_i_string_length (charname);
926 SCM_COL (port) += charname_len;
927 cp = scm_i_string_ref (charname, 0);
928 if (charname_len == 1)
929 return SCM_MAKE_CHAR (cp);
930
931 /* Ignore dotted circles, which may be used to keep combining characters from
932 combining with the backslash in #\charname. */
933 if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
934 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
935
936 if (cp >= '0' && cp < '8')
937 {
938 /* Dirk:FIXME:: This type of character syntax is not R5RS
939 * compliant. Further, it should be verified that the constant
940 * does only consist of octal digits. */
941 SCM p = scm_string_to_number (charname, scm_from_uint (8));
942 if (SCM_I_INUMP (p))
943 {
944 scm_t_wchar c = SCM_I_INUM (p);
945 if (SCM_IS_UNICODE_CHAR (c))
946 return SCM_MAKE_CHAR (c);
947 else
948 scm_i_input_error (FUNC_NAME, port,
949 "out-of-range octal character escape: ~a",
950 scm_list_1 (charname));
951 }
952 }
953
954 if (cp == 'x' && (charname_len > 1) && SCM_R6RS_ESCAPES_P)
955 {
956 SCM p;
957
958 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
959 p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
960 scm_from_uint (16));
961 if (SCM_I_INUMP (p))
962 {
963 scm_t_wchar c = SCM_I_INUM (p);
964 if (SCM_IS_UNICODE_CHAR (c))
965 return SCM_MAKE_CHAR (c);
966 else
967 scm_i_input_error (FUNC_NAME, port,
968 "out-of-range hex character escape: ~a",
969 scm_list_1 (charname));
970 }
971 }
972
973 /* The names of characters should never have non-Latin1
974 characters. */
975 if (scm_i_is_narrow_string (charname)
976 || scm_i_try_narrow_string (charname))
977 { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
978 charname_len);
979 if (scm_is_true (ch))
980 return ch;
981 }
982
983 char_error:
984 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
985 scm_list_1 (charname));
986
987 return SCM_UNSPECIFIED;
988 }
989 #undef FUNC_NAME
990
991 static inline SCM
992 scm_read_keyword (int chr, SCM port)
993 {
994 SCM symbol;
995
996 /* Read the symbol that comprises the keyword. Doing this instead of
997 invoking a specific symbol reader function allows `scm_read_keyword ()'
998 to adapt to the delimiters currently valid of symbols.
999
1000 XXX: This implementation allows sloppy syntaxes like `#: key'. */
1001 symbol = scm_read_expression (port);
1002 if (!scm_is_symbol (symbol))
1003 scm_i_input_error ("scm_read_keyword", port,
1004 "keyword prefix `~a' not followed by a symbol: ~s",
1005 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1006
1007 return (scm_symbol_to_keyword (symbol));
1008 }
1009
1010 static inline SCM
1011 scm_read_vector (int chr, SCM port)
1012 {
1013 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1014 guarantee that it's going to do what we want. After all, this is an
1015 implementation detail of `scm_read_vector ()', not a desirable
1016 property. */
1017 return (scm_vector (scm_read_sexp (chr, port)));
1018 }
1019
1020 static inline SCM
1021 scm_read_srfi4_vector (int chr, SCM port)
1022 {
1023 return scm_i_read_array (port, chr);
1024 }
1025
1026 static SCM
1027 scm_read_bytevector (scm_t_wchar chr, SCM port)
1028 {
1029 chr = scm_getc (port);
1030 if (chr != 'u')
1031 goto syntax;
1032
1033 chr = scm_getc (port);
1034 if (chr != '8')
1035 goto syntax;
1036
1037 chr = scm_getc (port);
1038 if (chr != '(')
1039 goto syntax;
1040
1041 return scm_u8_list_to_bytevector (scm_read_sexp (chr, port));
1042
1043 syntax:
1044 scm_i_input_error ("read_bytevector", port,
1045 "invalid bytevector prefix",
1046 SCM_MAKE_CHAR (chr));
1047 return SCM_UNSPECIFIED;
1048 }
1049
1050 static SCM
1051 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port)
1052 {
1053 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1054 terribly inefficient but who cares? */
1055 SCM s_bits = SCM_EOL;
1056
1057 for (chr = scm_getc (port);
1058 (chr != EOF) && ((chr == '0') || (chr == '1'));
1059 chr = scm_getc (port))
1060 {
1061 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1062 }
1063
1064 if (chr != EOF)
1065 scm_ungetc (chr, port);
1066
1067 return scm_bitvector (scm_reverse_x (s_bits, SCM_EOL));
1068 }
1069
1070 static inline SCM
1071 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1072 {
1073 int bang_seen = 0;
1074
1075 /* We can use the get_byte here because there is no need to get the
1076 locale correct when reading comments. This presumes that
1077 hash and exclamation points always represent themselves no
1078 matter what the source encoding is.*/
1079 for (;;)
1080 {
1081 int c = scm_get_byte_or_eof (port);
1082
1083 if (c == EOF)
1084 scm_i_input_error ("skip_block_comment", port,
1085 "unterminated `#! ... !#' comment", SCM_EOL);
1086
1087 if (c == '!')
1088 bang_seen = 1;
1089 else if (c == '#' && bang_seen)
1090 break;
1091 else
1092 bang_seen = 0;
1093 }
1094
1095 return SCM_UNSPECIFIED;
1096 }
1097
1098 static SCM
1099 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1100 {
1101 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1102 nested. So care must be taken. */
1103 int nesting_level = 1;
1104 int opening_seen = 0, closing_seen = 0;
1105
1106 while (nesting_level > 0)
1107 {
1108 int c = scm_getc (port);
1109
1110 if (c == EOF)
1111 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1112 "unterminated `#| ... |#' comment", SCM_EOL);
1113
1114 if (opening_seen)
1115 {
1116 if (c == '|')
1117 nesting_level++;
1118 opening_seen = 0;
1119 }
1120 else if (closing_seen)
1121 {
1122 if (c == '#')
1123 nesting_level--;
1124 closing_seen = 0;
1125 }
1126 else if (c == '|')
1127 closing_seen = 1;
1128 else if (c == '#')
1129 opening_seen = 1;
1130 else
1131 opening_seen = closing_seen = 0;
1132 }
1133
1134 return SCM_UNSPECIFIED;
1135 }
1136
1137 static SCM
1138 scm_read_commented_expression (scm_t_wchar chr, SCM port)
1139 {
1140 scm_t_wchar c;
1141
1142 c = flush_ws (port, (char *) NULL);
1143 if (EOF == c)
1144 scm_i_input_error ("read_commented_expression", port,
1145 "no expression after #; comment", SCM_EOL);
1146 scm_ungetc (c, port);
1147 scm_read_expression (port);
1148 return SCM_UNSPECIFIED;
1149 }
1150
1151 static SCM
1152 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1153 {
1154 /* Guile's extended symbol read syntax looks like this:
1155
1156 #{This is all a symbol name}#
1157
1158 So here, CHR is expected to be `{'. */
1159 int saw_brace = 0, finished = 0;
1160 size_t len = 0;
1161 SCM buf = scm_i_make_string (1024, NULL);
1162
1163 buf = scm_i_string_start_writing (buf);
1164
1165 while ((chr = scm_getc (port)) != EOF)
1166 {
1167 if (saw_brace)
1168 {
1169 if (chr == '#')
1170 {
1171 finished = 1;
1172 break;
1173 }
1174 else
1175 {
1176 saw_brace = 0;
1177 scm_i_string_set_x (buf, len++, '}');
1178 scm_i_string_set_x (buf, len++, chr);
1179 }
1180 }
1181 else if (chr == '}')
1182 saw_brace = 1;
1183 else
1184 scm_i_string_set_x (buf, len++, chr);
1185
1186 if (len >= scm_i_string_length (buf) - 2)
1187 {
1188 SCM addy;
1189
1190 scm_i_string_stop_writing ();
1191 addy = scm_i_make_string (1024, NULL);
1192 buf = scm_string_append (scm_list_2 (buf, addy));
1193 len = 0;
1194 buf = scm_i_string_start_writing (buf);
1195 }
1196
1197 if (finished)
1198 break;
1199 }
1200 scm_i_string_stop_writing ();
1201
1202 return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1203 }
1204
1205
1206 \f
1207 /* Top-level token readers, i.e., dispatchers. */
1208
1209 static SCM
1210 scm_read_sharp_extension (int chr, SCM port)
1211 {
1212 SCM proc;
1213
1214 proc = scm_get_hash_procedure (chr);
1215 if (scm_is_true (scm_procedure_p (proc)))
1216 {
1217 long line = SCM_LINUM (port);
1218 int column = SCM_COL (port) - 2;
1219 SCM got;
1220
1221 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1222 if (!scm_is_eq (got, SCM_UNSPECIFIED))
1223 {
1224 if (SCM_RECORD_POSITIONS_P)
1225 return (recsexpr (got, line, column,
1226 SCM_FILENAME (port)));
1227 else
1228 return got;
1229 }
1230 }
1231
1232 return SCM_UNSPECIFIED;
1233 }
1234
1235 /* The reader for the sharp `#' character. It basically dispatches reads
1236 among the above token readers. */
1237 static SCM
1238 scm_read_sharp (scm_t_wchar chr, SCM port)
1239 #define FUNC_NAME "scm_lreadr"
1240 {
1241 SCM result;
1242
1243 chr = scm_getc (port);
1244
1245 result = scm_read_sharp_extension (chr, port);
1246 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1247 return result;
1248
1249 switch (chr)
1250 {
1251 case '\\':
1252 return (scm_read_character (chr, port));
1253 case '(':
1254 return (scm_read_vector (chr, port));
1255 case 's':
1256 case 'u':
1257 case 'f':
1258 /* This one may return either a boolean or an SRFI-4 vector. */
1259 return (scm_read_srfi4_vector (chr, port));
1260 case 'v':
1261 return (scm_read_bytevector (chr, port));
1262 case '*':
1263 return (scm_read_guile_bit_vector (chr, port));
1264 case 't':
1265 case 'T':
1266 case 'F':
1267 /* This one may return either a boolean or an SRFI-4 vector. */
1268 return (scm_read_boolean (chr, port));
1269 case ':':
1270 return (scm_read_keyword (chr, port));
1271 case '0': case '1': case '2': case '3': case '4':
1272 case '5': case '6': case '7': case '8': case '9':
1273 case '@':
1274 #if SCM_ENABLE_DEPRECATED
1275 /* See below for 'i' and 'e'. */
1276 case 'a':
1277 case 'c':
1278 case 'y':
1279 case 'h':
1280 case 'l':
1281 #endif
1282 return (scm_i_read_array (port, chr));
1283
1284 case 'i':
1285 case 'e':
1286 #if SCM_ENABLE_DEPRECATED
1287 {
1288 /* When next char is '(', it really is an old-style
1289 uniform array. */
1290 scm_t_wchar next_c = scm_getc (port);
1291 if (next_c != EOF)
1292 scm_ungetc (next_c, port);
1293 if (next_c == '(')
1294 return scm_i_read_array (port, chr);
1295 /* Fall through. */
1296 }
1297 #endif
1298 case 'b':
1299 case 'B':
1300 case 'o':
1301 case 'O':
1302 case 'd':
1303 case 'D':
1304 case 'x':
1305 case 'X':
1306 case 'I':
1307 case 'E':
1308 return (scm_read_number_and_radix (chr, port));
1309 case '{':
1310 return (scm_read_extended_symbol (chr, port));
1311 case '!':
1312 return (scm_read_scsh_block_comment (chr, port));
1313 case ';':
1314 return (scm_read_commented_expression (chr, port));
1315 case '`':
1316 case '\'':
1317 case ',':
1318 return (scm_read_syntax (chr, port));
1319 default:
1320 result = scm_read_sharp_extension (chr, port);
1321 if (scm_is_eq (result, SCM_UNSPECIFIED))
1322 {
1323 /* To remain compatible with 1.8 and earlier, the following
1324 characters have lower precedence than `read-hash-extend'
1325 characters. */
1326 switch (chr)
1327 {
1328 case '|':
1329 return scm_read_r6rs_block_comment (chr, port);
1330 default:
1331 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1332 scm_list_1 (SCM_MAKE_CHAR (chr)));
1333 }
1334 }
1335 else
1336 return result;
1337 }
1338
1339 return SCM_UNSPECIFIED;
1340 }
1341 #undef FUNC_NAME
1342
1343 static SCM
1344 scm_read_expression (SCM port)
1345 #define FUNC_NAME "scm_read_expression"
1346 {
1347 while (1)
1348 {
1349 register scm_t_wchar chr;
1350
1351 chr = scm_getc (port);
1352
1353 switch (chr)
1354 {
1355 case SCM_WHITE_SPACES:
1356 case SCM_LINE_INCREMENTORS:
1357 break;
1358 case ';':
1359 (void) scm_read_semicolon_comment (chr, port);
1360 break;
1361 case '[':
1362 if (!SCM_SQUARE_BRACKETS_P)
1363 return (scm_read_mixed_case_symbol (chr, port));
1364 /* otherwise fall through */
1365 case '(':
1366 return (scm_read_sexp (chr, port));
1367 case '"':
1368 return (scm_read_string (chr, port));
1369 case '\'':
1370 case '`':
1371 case ',':
1372 return (scm_read_quote (chr, port));
1373 case '#':
1374 {
1375 SCM result;
1376 result = scm_read_sharp (chr, port);
1377 if (scm_is_eq (result, SCM_UNSPECIFIED))
1378 /* We read a comment or some such. */
1379 break;
1380 else
1381 return result;
1382 }
1383 case ')':
1384 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1385 break;
1386 case EOF:
1387 return SCM_EOF_VAL;
1388 case ':':
1389 if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1390 return scm_symbol_to_keyword (scm_read_expression (port));
1391 /* Fall through. */
1392
1393 default:
1394 {
1395 if (((chr >= '0') && (chr <= '9'))
1396 || (strchr ("+-.", chr)))
1397 return (scm_read_number (chr, port));
1398 else
1399 return (scm_read_mixed_case_symbol (chr, port));
1400 }
1401 }
1402 }
1403 }
1404 #undef FUNC_NAME
1405
1406 \f
1407 /* Actual reader. */
1408
1409 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1410 (SCM port),
1411 "Read an s-expression from the input port @var{port}, or from\n"
1412 "the current input port if @var{port} is not specified.\n"
1413 "Any whitespace before the next token is discarded.")
1414 #define FUNC_NAME s_scm_read
1415 {
1416 int c;
1417
1418 if (SCM_UNBNDP (port))
1419 port = scm_current_input_port ();
1420 SCM_VALIDATE_OPINPORT (1, port);
1421
1422 c = flush_ws (port, (char *) NULL);
1423 if (EOF == c)
1424 return SCM_EOF_VAL;
1425 scm_ungetc (c, port);
1426
1427 return (scm_read_expression (port));
1428 }
1429 #undef FUNC_NAME
1430
1431
1432 \f
1433
1434 /* Used when recording expressions constructed by `scm_read_sharp ()'. */
1435 static SCM
1436 recsexpr (SCM obj, long line, int column, SCM filename)
1437 {
1438 if (!scm_is_pair(obj)) {
1439 return obj;
1440 } else {
1441 SCM tmp = obj, copy;
1442 /* If this sexpr is visible in the read:sharp source, we want to
1443 keep that information, so only record non-constant cons cells
1444 which haven't previously been read by the reader. */
1445 if (scm_is_false (scm_whash_lookup (scm_source_whash, obj)))
1446 {
1447 if (SCM_COPY_SOURCE_P)
1448 {
1449 copy = scm_cons (recsexpr (SCM_CAR (obj), line, column, filename),
1450 SCM_UNDEFINED);
1451 while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1452 {
1453 SCM_SETCDR (copy, scm_cons (recsexpr (SCM_CAR (tmp),
1454 line,
1455 column,
1456 filename),
1457 SCM_UNDEFINED));
1458 copy = SCM_CDR (copy);
1459 }
1460 SCM_SETCDR (copy, tmp);
1461 }
1462 else
1463 {
1464 recsexpr (SCM_CAR (obj), line, column, filename);
1465 while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1466 recsexpr (SCM_CAR (tmp), line, column, filename);
1467 copy = SCM_UNDEFINED;
1468 }
1469 scm_whash_insert (scm_source_whash,
1470 obj,
1471 scm_make_srcprops (line,
1472 column,
1473 filename,
1474 copy,
1475 SCM_EOL));
1476 }
1477 return obj;
1478 }
1479 }
1480
1481 /* Manipulate the read-hash-procedures alist. This could be written in
1482 Scheme, but maybe it will also be used by C code during initialisation. */
1483 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1484 (SCM chr, SCM proc),
1485 "Install the procedure @var{proc} for reading expressions\n"
1486 "starting with the character sequence @code{#} and @var{chr}.\n"
1487 "@var{proc} will be called with two arguments: the character\n"
1488 "@var{chr} and the port to read further data from. The object\n"
1489 "returned will be the return value of @code{read}. \n"
1490 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1491 )
1492 #define FUNC_NAME s_scm_read_hash_extend
1493 {
1494 SCM this;
1495 SCM prev;
1496
1497 SCM_VALIDATE_CHAR (1, chr);
1498 SCM_ASSERT (scm_is_false (proc)
1499 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1500 proc, SCM_ARG2, FUNC_NAME);
1501
1502 /* Check if chr is already in the alist. */
1503 this = *scm_read_hash_procedures;
1504 prev = SCM_BOOL_F;
1505 while (1)
1506 {
1507 if (scm_is_null (this))
1508 {
1509 /* not found, so add it to the beginning. */
1510 if (scm_is_true (proc))
1511 {
1512 *scm_read_hash_procedures =
1513 scm_cons (scm_cons (chr, proc), *scm_read_hash_procedures);
1514 }
1515 break;
1516 }
1517 if (scm_is_eq (chr, SCM_CAAR (this)))
1518 {
1519 /* already in the alist. */
1520 if (scm_is_false (proc))
1521 {
1522 /* remove it. */
1523 if (scm_is_false (prev))
1524 {
1525 *scm_read_hash_procedures =
1526 SCM_CDR (*scm_read_hash_procedures);
1527 }
1528 else
1529 scm_set_cdr_x (prev, SCM_CDR (this));
1530 }
1531 else
1532 {
1533 /* replace it. */
1534 scm_set_cdr_x (SCM_CAR (this), proc);
1535 }
1536 break;
1537 }
1538 prev = this;
1539 this = SCM_CDR (this);
1540 }
1541
1542 return SCM_UNSPECIFIED;
1543 }
1544 #undef FUNC_NAME
1545
1546 /* Recover the read-hash procedure corresponding to char c. */
1547 static SCM
1548 scm_get_hash_procedure (int c)
1549 {
1550 SCM rest = *scm_read_hash_procedures;
1551
1552 while (1)
1553 {
1554 if (scm_is_null (rest))
1555 return SCM_BOOL_F;
1556
1557 if (SCM_CHAR (SCM_CAAR (rest)) == c)
1558 return SCM_CDAR (rest);
1559
1560 rest = SCM_CDR (rest);
1561 }
1562 }
1563
1564 #define SCM_ENCODING_SEARCH_SIZE (500)
1565
1566 /* Search the first few hundred characters of a file for an Emacs-like coding
1567 declaration. Returns either NULL or a string whose storage has been
1568 allocated with `scm_gc_malloc ()'. */
1569 char *
1570 scm_i_scan_for_encoding (SCM port)
1571 {
1572 char header[SCM_ENCODING_SEARCH_SIZE+1];
1573 size_t bytes_read;
1574 char *encoding = NULL;
1575 int utf8_bom = 0;
1576 char *pos;
1577 int i;
1578 int in_comment;
1579
1580 if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1581 /* PORT is a non-seekable file port (e.g., as created by Bash when using
1582 "guile <(echo '(display "hello")')") so bail out. */
1583 return NULL;
1584
1585 bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1586
1587 scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1588
1589 if (bytes_read > 3
1590 && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1591 utf8_bom = 1;
1592
1593 /* search past "coding[:=]" */
1594 pos = header;
1595 while (1)
1596 {
1597 if ((pos = strstr(pos, "coding")) == NULL)
1598 return NULL;
1599
1600 pos += strlen("coding");
1601 if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1602 (*pos == ':' || *pos == '='))
1603 {
1604 pos ++;
1605 break;
1606 }
1607 }
1608
1609 /* skip spaces */
1610 while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1611 (*pos == ' ' || *pos == '\t'))
1612 pos ++;
1613
1614 /* grab the next token */
1615 i = 0;
1616 while (pos + i - header <= SCM_ENCODING_SEARCH_SIZE
1617 && pos + i - header < bytes_read
1618 && (isalnum ((int) pos[i]) || strchr ("_-.:/,+=()", pos[i]) != NULL))
1619 i++;
1620
1621 if (i == 0)
1622 return NULL;
1623
1624 encoding = scm_gc_strndup (pos, i, "encoding");
1625 for (i = 0; i < strlen (encoding); i++)
1626 encoding[i] = toupper ((int) encoding[i]);
1627
1628 /* push backwards to make sure we were in a comment */
1629 in_comment = 0;
1630 while (pos - i - header > 0)
1631 {
1632 if (*(pos - i) == '\n')
1633 {
1634 /* This wasn't in a semicolon comment. Check for a
1635 hash-bang comment. */
1636 char *beg = strstr (header, "#!");
1637 char *end = strstr (header, "!#");
1638 if (beg < pos && pos < end)
1639 in_comment = 1;
1640 break;
1641 }
1642 if (*(pos - i) == ';')
1643 {
1644 in_comment = 1;
1645 break;
1646 }
1647 i ++;
1648 }
1649 if (!in_comment)
1650 /* This wasn't in a comment */
1651 return NULL;
1652
1653 if (utf8_bom && strcmp(encoding, "UTF-8"))
1654 scm_misc_error (NULL,
1655 "the port input declares the encoding ~s but is encoded as UTF-8",
1656 scm_list_1 (scm_from_locale_string (encoding)));
1657
1658 return encoding;
1659 }
1660
1661 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1662 (SCM port),
1663 "Scans the port for an Emacs-like character coding declaration\n"
1664 "near the top of the contents of a port with random-acessible contents.\n"
1665 "The coding declaration is of the form\n"
1666 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1667 "\n"
1668 "Returns a string containing the character encoding of the file\n"
1669 "if a declaration was found, or @code{#f} otherwise.\n")
1670 #define FUNC_NAME s_scm_file_encoding
1671 {
1672 char *enc;
1673 SCM s_enc;
1674
1675 enc = scm_i_scan_for_encoding (port);
1676 if (enc == NULL)
1677 return SCM_BOOL_F;
1678 else
1679 {
1680 s_enc = scm_from_locale_string (enc);
1681 return s_enc;
1682 }
1683
1684 return SCM_BOOL_F;
1685 }
1686 #undef FUNC_NAME
1687
1688 void
1689 scm_init_read ()
1690 {
1691 scm_read_hash_procedures =
1692 SCM_VARIABLE_LOC (scm_c_define ("read-hash-procedures", SCM_EOL));
1693
1694 scm_init_opts (scm_read_options, scm_read_opts);
1695 #include "libguile/read.x"
1696 }
1697
1698 /*
1699 Local Variables:
1700 c-file-style: "gnu"
1701 End:
1702 */