Add source properties to many more types of data
[bpt/guile.git] / libguile / read.c
1 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
2 * 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
18 */
19
20
21 \f
22
23 #ifdef HAVE_CONFIG_H
24 # include <config.h>
25 #endif
26
27 #include <stdio.h>
28 #include <ctype.h>
29 #include <string.h>
30 #include <unistd.h>
31 #include <unicase.h>
32 #include <unictype.h>
33
34 #include "libguile/_scm.h"
35 #include "libguile/bytevectors.h"
36 #include "libguile/chars.h"
37 #include "libguile/eval.h"
38 #include "libguile/arrays.h"
39 #include "libguile/bitvectors.h"
40 #include "libguile/keywords.h"
41 #include "libguile/alist.h"
42 #include "libguile/srcprop.h"
43 #include "libguile/hashtab.h"
44 #include "libguile/hash.h"
45 #include "libguile/ports.h"
46 #include "libguile/fports.h"
47 #include "libguile/root.h"
48 #include "libguile/strings.h"
49 #include "libguile/strports.h"
50 #include "libguile/vectors.h"
51 #include "libguile/validate.h"
52 #include "libguile/srfi-4.h"
53 #include "libguile/srfi-13.h"
54
55 #include "libguile/read.h"
56 #include "libguile/private-options.h"
57
58
59 \f
60
61 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
62 SCM_SYMBOL (scm_keyword_prefix, "prefix");
63 SCM_SYMBOL (scm_keyword_postfix, "postfix");
64 SCM_SYMBOL (sym_nil, "nil");
65
66 scm_t_option scm_read_opts[] = {
67 { SCM_OPTION_BOOLEAN, "copy", 0,
68 "Copy source code expressions." },
69 { SCM_OPTION_BOOLEAN, "positions", 1,
70 "Record positions of source code expressions." },
71 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
72 "Convert symbols to lower case."},
73 { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
74 "Style of keyword recognition: #f, 'prefix or 'postfix."},
75 { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
76 "Use R6RS variable-length character and string hex escapes."},
77 { SCM_OPTION_BOOLEAN, "square-brackets", 1,
78 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
79 { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
80 "In strings, consume leading whitespace after an escaped end-of-line."},
81 { 0, },
82 };
83
84 /*
85 Give meaningful error messages for errors
86
87 We use the format
88
89 FILE:LINE:COL: MESSAGE
90 This happened in ....
91
92 This is not standard GNU format, but the test-suite likes the real
93 message to be in front.
94
95 */
96
97
98 void
99 scm_i_input_error (char const *function,
100 SCM port, const char *message, SCM arg)
101 {
102 SCM fn = (scm_is_string (SCM_FILENAME(port))
103 ? SCM_FILENAME(port)
104 : scm_from_locale_string ("#<unknown port>"));
105
106 SCM string_port = scm_open_output_string ();
107 SCM string = SCM_EOL;
108 scm_simple_format (string_port,
109 scm_from_locale_string ("~A:~S:~S: ~A"),
110 scm_list_4 (fn,
111 scm_from_long (SCM_LINUM (port) + 1),
112 scm_from_int (SCM_COL (port) + 1),
113 scm_from_locale_string (message)));
114
115 string = scm_get_output_string (string_port);
116 scm_close_output_port (string_port);
117 scm_error_scm (scm_from_latin1_symbol ("read-error"),
118 function? scm_from_locale_string (function) : SCM_BOOL_F,
119 string,
120 arg,
121 SCM_BOOL_F);
122 }
123
124
125 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
126 (SCM setting),
127 "Option interface for the read options. Instead of using\n"
128 "this procedure directly, use the procedures @code{read-enable},\n"
129 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
130 #define FUNC_NAME s_scm_read_options
131 {
132 SCM ans = scm_options (setting,
133 scm_read_opts,
134 FUNC_NAME);
135 if (SCM_COPY_SOURCE_P)
136 SCM_RECORD_POSITIONS_P = 1;
137 return ans;
138 }
139 #undef FUNC_NAME
140
141 /* A fluid referring to an association list mapping extra hash
142 characters to procedures. */
143 static SCM *scm_i_read_hash_procedures;
144
145 static SCM
146 scm_i_read_hash_procedures_ref (void)
147 {
148 return scm_fluid_ref (*scm_i_read_hash_procedures);
149 }
150
151 static void
152 scm_i_read_hash_procedures_set_x (SCM value)
153 {
154 scm_fluid_set_x (*scm_i_read_hash_procedures, value);
155 }
156
157 \f
158 /* Token readers. */
159
160
161 /* Size of the C buffer used to read symbols and numbers. */
162 #define READER_BUFFER_SIZE 128
163
164 /* Size of the C buffer used to read strings. */
165 #define READER_STRING_BUFFER_SIZE 512
166
167 /* The maximum size of Scheme character names. */
168 #define READER_CHAR_NAME_MAX_SIZE 50
169
170
171 /* `isblank' is only in C99. */
172 #define CHAR_IS_BLANK_(_chr) \
173 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
174 || ((_chr) == '\f') || ((_chr) == '\r'))
175
176 #ifdef MSDOS
177 # define CHAR_IS_BLANK(_chr) \
178 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
179 #else
180 # define CHAR_IS_BLANK CHAR_IS_BLANK_
181 #endif
182
183
184 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
185 structure''). */
186 #define CHAR_IS_R5RS_DELIMITER(c) \
187 (CHAR_IS_BLANK (c) \
188 || (c == ')') || (c == '(') || (c == ';') || (c == '"') \
189 || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
190
191 #define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
192
193 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
194 Structure''. */
195 #define CHAR_IS_EXPONENT_MARKER(_chr) \
196 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
197 || ((_chr) == 'd') || ((_chr) == 'l'))
198
199 /* Read an SCSH block comment. */
200 static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
201 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
202 static SCM scm_read_commented_expression (scm_t_wchar, SCM);
203 static SCM scm_read_shebang (scm_t_wchar, SCM);
204 static SCM scm_get_hash_procedure (int);
205
206 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
207 result in the pre-allocated buffer BUF. Return zero if the whole token has
208 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
209 bytes actually read. */
210 static int
211 read_token (SCM port, char *buf, const size_t buf_size, size_t *read)
212 {
213 *read = 0;
214
215 while (*read < buf_size)
216 {
217 int chr;
218
219 chr = scm_get_byte_or_eof (port);
220
221 if (chr == EOF)
222 return 0;
223 else if (CHAR_IS_DELIMITER (chr))
224 {
225 scm_unget_byte (chr, port);
226 return 0;
227 }
228 else
229 {
230 *buf = (char) chr;
231 buf++, (*read)++;
232 }
233 }
234
235 return 1;
236 }
237
238 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
239 result in the pre-allocated buffer BUFFER, if the whole token has fewer than
240 BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
241 caller. Return zero if the token fits in BUFFER, non-zero otherwise. READ
242 will be set the number of bytes actually read. */
243 static int
244 read_complete_token (SCM port, char *buffer, const size_t buffer_size,
245 char **overflow_buffer, size_t *read)
246 {
247 int overflow = 0;
248 size_t bytes_read, overflow_size;
249
250 *overflow_buffer = NULL;
251 overflow_size = 0;
252
253 do
254 {
255 overflow = read_token (port, buffer, buffer_size, &bytes_read);
256 if (bytes_read == 0)
257 break;
258 if (overflow || overflow_size != 0)
259 {
260 if (overflow_size == 0)
261 {
262 *overflow_buffer = scm_malloc (bytes_read);
263 memcpy (*overflow_buffer, buffer, bytes_read);
264 overflow_size = bytes_read;
265 }
266 else
267 {
268 *overflow_buffer = scm_realloc (*overflow_buffer, overflow_size + bytes_read);
269 memcpy (*overflow_buffer + overflow_size, buffer, bytes_read);
270 overflow_size += bytes_read;
271 }
272 }
273 }
274 while (overflow);
275
276 if (overflow_size)
277 *read = overflow_size;
278 else
279 *read = bytes_read;
280
281 return (overflow_size != 0);
282 }
283
284 /* Skip whitespace from PORT and return the first non-whitespace character
285 read. Raise an error on end-of-file. */
286 static int
287 flush_ws (SCM port, const char *eoferr)
288 {
289 scm_t_wchar c;
290 while (1)
291 switch (c = scm_getc (port))
292 {
293 case EOF:
294 goteof:
295 if (eoferr)
296 {
297 scm_i_input_error (eoferr,
298 port,
299 "end of file",
300 SCM_EOL);
301 }
302 return c;
303
304 case ';':
305 lp:
306 switch (c = scm_getc (port))
307 {
308 case EOF:
309 goto goteof;
310 default:
311 goto lp;
312 case SCM_LINE_INCREMENTORS:
313 break;
314 }
315 break;
316
317 case '#':
318 switch (c = scm_getc (port))
319 {
320 case EOF:
321 eoferr = "read_sharp";
322 goto goteof;
323 case '!':
324 scm_read_shebang (c, port);
325 break;
326 case ';':
327 scm_read_commented_expression (c, port);
328 break;
329 case '|':
330 if (scm_is_false (scm_get_hash_procedure (c)))
331 {
332 scm_read_r6rs_block_comment (c, port);
333 break;
334 }
335 /* fall through */
336 default:
337 scm_ungetc (c, port);
338 return '#';
339 }
340 break;
341
342 case SCM_LINE_INCREMENTORS:
343 case SCM_SINGLE_SPACES:
344 case '\t':
345 break;
346
347 default:
348 return c;
349 }
350
351 return 0;
352 }
353
354
355 \f
356 /* Token readers. */
357
358 static SCM scm_read_expression (SCM port);
359 static SCM scm_read_sharp (int chr, SCM port, long line, int column);
360
361
362 static SCM
363 maybe_annotate_source (SCM x, SCM port, long line, int column)
364 {
365 if (SCM_RECORD_POSITIONS_P)
366 scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
367 return x;
368 }
369
370 static SCM
371 scm_read_sexp (scm_t_wchar chr, SCM port)
372 #define FUNC_NAME "scm_i_lreadparen"
373 {
374 int c;
375 SCM tmp, tl, ans = SCM_EOL;
376 const int terminating_char = ((chr == '[') ? ']' : ')');
377
378 /* Need to capture line and column numbers here. */
379 long line = SCM_LINUM (port);
380 int column = SCM_COL (port) - 1;
381
382 c = flush_ws (port, FUNC_NAME);
383 if (terminating_char == c)
384 return SCM_EOL;
385
386 scm_ungetc (c, port);
387 tmp = scm_read_expression (port);
388
389 /* Note that it is possible for scm_read_expression to return
390 scm_sym_dot, but not as part of a dotted pair: as in #{.}#. So
391 check that it's a real dot by checking `c'. */
392 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
393 {
394 ans = scm_read_expression (port);
395 if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
396 scm_i_input_error (FUNC_NAME, port, "missing close paren",
397 SCM_EOL);
398 return ans;
399 }
400
401 /* Build the head of the list structure. */
402 ans = tl = scm_cons (tmp, SCM_EOL);
403
404 while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
405 {
406 SCM new_tail;
407
408 if (c == ')' || (SCM_SQUARE_BRACKETS_P && c == ']'))
409 scm_i_input_error (FUNC_NAME, port,
410 "in pair: mismatched close paren: ~A",
411 scm_list_1 (SCM_MAKE_CHAR (c)));
412
413 scm_ungetc (c, port);
414 tmp = scm_read_expression (port);
415
416 /* See above note about scm_sym_dot. */
417 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
418 {
419 SCM_SETCDR (tl, scm_read_expression (port));
420
421 c = flush_ws (port, FUNC_NAME);
422 if (terminating_char != c)
423 scm_i_input_error (FUNC_NAME, port,
424 "in pair: missing close paren", SCM_EOL);
425 goto exit;
426 }
427
428 new_tail = scm_cons (tmp, SCM_EOL);
429 SCM_SETCDR (tl, new_tail);
430 tl = new_tail;
431 }
432
433 exit:
434 return maybe_annotate_source (ans, port, line, column);
435 }
436 #undef FUNC_NAME
437
438
439 /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
440 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
441 found. */
442 #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
443 do \
444 { \
445 scm_t_wchar a; \
446 size_t i = 0; \
447 c = 0; \
448 while (i < ndigits) \
449 { \
450 a = scm_getc (port); \
451 if (a == EOF) \
452 goto str_eof; \
453 if (terminator \
454 && (a == (scm_t_wchar) terminator) \
455 && (i > 0)) \
456 break; \
457 if ('0' <= a && a <= '9') \
458 a -= '0'; \
459 else if ('A' <= a && a <= 'F') \
460 a = a - 'A' + 10; \
461 else if ('a' <= a && a <= 'f') \
462 a = a - 'a' + 10; \
463 else \
464 { \
465 c = a; \
466 goto bad_escaped; \
467 } \
468 c = c * 16 + a; \
469 i ++; \
470 } \
471 } while (0)
472
473 static void
474 skip_intraline_whitespace (SCM port)
475 {
476 scm_t_wchar c;
477
478 do
479 {
480 c = scm_getc (port);
481 if (c == EOF)
482 return;
483 }
484 while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
485
486 scm_ungetc (c, port);
487 }
488
489 static SCM
490 scm_read_string (int chr, SCM port)
491 #define FUNC_NAME "scm_lreadr"
492 {
493 /* For strings smaller than C_STR, this function creates only one Scheme
494 object (the string returned). */
495
496 SCM str = SCM_BOOL_F;
497 unsigned c_str_len = 0;
498 scm_t_wchar c;
499
500 /* Need to capture line and column numbers here. */
501 long line = SCM_LINUM (port);
502 int column = SCM_COL (port) - 1;
503
504 str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
505 while ('"' != (c = scm_getc (port)))
506 {
507 if (c == EOF)
508 {
509 str_eof:
510 scm_i_input_error (FUNC_NAME, port,
511 "end of file in string constant", SCM_EOL);
512 }
513
514 if (c_str_len + 1 >= scm_i_string_length (str))
515 {
516 SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
517
518 str = scm_string_append (scm_list_2 (str, addy));
519 }
520
521 if (c == '\\')
522 {
523 switch (c = scm_getc (port))
524 {
525 case EOF:
526 goto str_eof;
527 case '"':
528 case '\\':
529 break;
530 case '\n':
531 if (SCM_HUNGRY_EOL_ESCAPES_P)
532 skip_intraline_whitespace (port);
533 continue;
534 case '0':
535 c = '\0';
536 break;
537 case 'f':
538 c = '\f';
539 break;
540 case 'n':
541 c = '\n';
542 break;
543 case 'r':
544 c = '\r';
545 break;
546 case 't':
547 c = '\t';
548 break;
549 case 'a':
550 c = '\007';
551 break;
552 case 'v':
553 c = '\v';
554 break;
555 case 'b':
556 c = '\010';
557 break;
558 case 'x':
559 if (SCM_R6RS_ESCAPES_P)
560 SCM_READ_HEX_ESCAPE (10, ';');
561 else
562 SCM_READ_HEX_ESCAPE (2, '\0');
563 break;
564 case 'u':
565 if (!SCM_R6RS_ESCAPES_P)
566 {
567 SCM_READ_HEX_ESCAPE (4, '\0');
568 break;
569 }
570 case 'U':
571 if (!SCM_R6RS_ESCAPES_P)
572 {
573 SCM_READ_HEX_ESCAPE (6, '\0');
574 break;
575 }
576 default:
577 bad_escaped:
578 scm_i_input_error (FUNC_NAME, port,
579 "illegal character in escape sequence: ~S",
580 scm_list_1 (SCM_MAKE_CHAR (c)));
581 }
582 }
583 str = scm_i_string_start_writing (str);
584 scm_i_string_set_x (str, c_str_len++, c);
585 scm_i_string_stop_writing ();
586 }
587 return maybe_annotate_source (scm_i_substring_copy (str, 0, c_str_len),
588 port, line, column);
589 }
590 #undef FUNC_NAME
591
592
593 static SCM
594 scm_read_number (scm_t_wchar chr, SCM port)
595 {
596 SCM result, str = SCM_EOL;
597 char buffer[READER_BUFFER_SIZE];
598 char *overflow_buffer = NULL;
599 size_t bytes_read;
600 int overflow;
601 scm_t_port *pt = SCM_PTAB_ENTRY (port);
602
603 scm_ungetc (chr, port);
604 overflow = read_complete_token (port, buffer, sizeof (buffer),
605 &overflow_buffer, &bytes_read);
606
607 if (!overflow)
608 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
609 else
610 str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
611 pt->ilseq_handler);
612
613 result = scm_string_to_number (str, SCM_UNDEFINED);
614 if (!scm_is_true (result))
615 {
616 /* Return a symbol instead of a number */
617 if (SCM_CASE_INSENSITIVE_P)
618 str = scm_string_downcase_x (str);
619 result = scm_string_to_symbol (str);
620 }
621
622 if (overflow)
623 free (overflow_buffer);
624 SCM_COL (port) += scm_i_string_length (str);
625 return result;
626 }
627
628 static SCM
629 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
630 {
631 SCM result;
632 int ends_with_colon = 0;
633 size_t bytes_read;
634 int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
635 int overflow;
636 char buffer[READER_BUFFER_SIZE], *overflow_buffer;
637 scm_t_port *pt = SCM_PTAB_ENTRY (port);
638 SCM str;
639
640 scm_ungetc (chr, port);
641 overflow = read_complete_token (port, buffer, READER_BUFFER_SIZE,
642 &overflow_buffer, &bytes_read);
643 if (bytes_read > 0)
644 {
645 if (!overflow)
646 ends_with_colon = buffer[bytes_read - 1] == ':';
647 else
648 ends_with_colon = overflow_buffer[bytes_read - 1] == ':';
649 }
650
651 if (postfix && ends_with_colon && (bytes_read > 1))
652 {
653 if (!overflow)
654 str = scm_from_stringn (buffer, bytes_read - 1, pt->encoding, pt->ilseq_handler);
655 else
656 str = scm_from_stringn (overflow_buffer, bytes_read - 1, pt->encoding,
657 pt->ilseq_handler);
658
659 if (SCM_CASE_INSENSITIVE_P)
660 str = scm_string_downcase_x (str);
661 result = scm_symbol_to_keyword (scm_string_to_symbol (str));
662 }
663 else
664 {
665 if (!overflow)
666 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
667 else
668 str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
669 pt->ilseq_handler);
670
671 if (SCM_CASE_INSENSITIVE_P)
672 str = scm_string_downcase_x (str);
673 result = scm_string_to_symbol (str);
674 }
675
676 if (overflow)
677 free (overflow_buffer);
678 SCM_COL (port) += scm_i_string_length (str);
679 return result;
680 }
681
682 static SCM
683 scm_read_number_and_radix (scm_t_wchar chr, SCM port)
684 #define FUNC_NAME "scm_lreadr"
685 {
686 SCM result;
687 size_t read;
688 char buffer[READER_BUFFER_SIZE], *overflow_buffer;
689 int overflow;
690 unsigned int radix;
691 SCM str;
692 scm_t_port *pt;
693
694 switch (chr)
695 {
696 case 'B':
697 case 'b':
698 radix = 2;
699 break;
700
701 case 'o':
702 case 'O':
703 radix = 8;
704 break;
705
706 case 'd':
707 case 'D':
708 radix = 10;
709 break;
710
711 case 'x':
712 case 'X':
713 radix = 16;
714 break;
715
716 default:
717 scm_ungetc (chr, port);
718 scm_ungetc ('#', port);
719 radix = 10;
720 }
721
722 overflow = read_complete_token (port, buffer, sizeof (buffer),
723 &overflow_buffer, &read);
724
725 pt = SCM_PTAB_ENTRY (port);
726 if (!overflow)
727 str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
728 else
729 str = scm_from_stringn (overflow_buffer, read, pt->encoding,
730 pt->ilseq_handler);
731
732 result = scm_string_to_number (str, scm_from_uint (radix));
733
734 if (overflow)
735 free (overflow_buffer);
736
737 SCM_COL (port) += scm_i_string_length (str);
738
739 if (scm_is_true (result))
740 return result;
741
742 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
743
744 return SCM_BOOL_F;
745 }
746 #undef FUNC_NAME
747
748 static SCM
749 scm_read_quote (int chr, SCM port)
750 {
751 SCM p;
752 long line = SCM_LINUM (port);
753 int column = SCM_COL (port) - 1;
754
755 switch (chr)
756 {
757 case '`':
758 p = scm_sym_quasiquote;
759 break;
760
761 case '\'':
762 p = scm_sym_quote;
763 break;
764
765 case ',':
766 {
767 scm_t_wchar c;
768
769 c = scm_getc (port);
770 if ('@' == c)
771 p = scm_sym_uq_splicing;
772 else
773 {
774 scm_ungetc (c, port);
775 p = scm_sym_unquote;
776 }
777 break;
778 }
779
780 default:
781 fprintf (stderr, "%s: unhandled quote character (%i)\n",
782 "scm_read_quote", chr);
783 abort ();
784 }
785
786 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
787 return maybe_annotate_source (p, port, line, column);
788 }
789
790 SCM_SYMBOL (sym_syntax, "syntax");
791 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
792 SCM_SYMBOL (sym_unsyntax, "unsyntax");
793 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
794
795 static SCM
796 scm_read_syntax (int chr, SCM port)
797 {
798 SCM p;
799 long line = SCM_LINUM (port);
800 int column = SCM_COL (port) - 1;
801
802 switch (chr)
803 {
804 case '`':
805 p = sym_quasisyntax;
806 break;
807
808 case '\'':
809 p = sym_syntax;
810 break;
811
812 case ',':
813 {
814 int c;
815
816 c = scm_getc (port);
817 if ('@' == c)
818 p = sym_unsyntax_splicing;
819 else
820 {
821 scm_ungetc (c, port);
822 p = sym_unsyntax;
823 }
824 break;
825 }
826
827 default:
828 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
829 "scm_read_syntax", chr);
830 abort ();
831 }
832
833 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
834 return maybe_annotate_source (p, port, line, column);
835 }
836
837 static SCM
838 scm_read_nil (int chr, SCM port)
839 {
840 SCM id = scm_read_mixed_case_symbol (chr, port);
841
842 if (!scm_is_eq (id, sym_nil))
843 scm_i_input_error ("scm_read_nil", port,
844 "unexpected input while reading #nil: ~a",
845 scm_list_1 (id));
846
847 return SCM_ELISP_NIL;
848 }
849
850 static SCM
851 scm_read_semicolon_comment (int chr, SCM port)
852 {
853 int c;
854
855 /* We use the get_byte here because there is no need to get the
856 locale correct with comment input. This presumes that newline
857 always represents itself no matter what the encoding is. */
858 for (c = scm_get_byte_or_eof (port);
859 (c != EOF) && (c != '\n');
860 c = scm_get_byte_or_eof (port));
861
862 return SCM_UNSPECIFIED;
863 }
864
865 \f
866 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
867
868 static SCM
869 scm_read_boolean (int chr, SCM port)
870 {
871 switch (chr)
872 {
873 case 't':
874 case 'T':
875 return SCM_BOOL_T;
876
877 case 'f':
878 case 'F':
879 return SCM_BOOL_F;
880 }
881
882 return SCM_UNSPECIFIED;
883 }
884
885 static SCM
886 scm_read_character (scm_t_wchar chr, SCM port)
887 #define FUNC_NAME "scm_lreadr"
888 {
889 char buffer[READER_CHAR_NAME_MAX_SIZE];
890 SCM charname;
891 size_t charname_len, bytes_read;
892 scm_t_wchar cp;
893 int overflow;
894 scm_t_port *pt;
895
896 overflow = read_token (port, buffer, READER_CHAR_NAME_MAX_SIZE, &bytes_read);
897 if (overflow)
898 scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
899
900 if (bytes_read == 0)
901 {
902 chr = scm_getc (port);
903 if (chr == EOF)
904 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
905 "while reading character", SCM_EOL);
906
907 /* CHR must be a token delimiter, like a whitespace. */
908 return (SCM_MAKE_CHAR (chr));
909 }
910
911 pt = SCM_PTAB_ENTRY (port);
912
913 /* Simple ASCII characters can be processed immediately. Also, simple
914 ISO-8859-1 characters can be processed immediately if the encoding for this
915 port is ISO-8859-1. */
916 if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
917 {
918 SCM_COL (port) += 1;
919 return SCM_MAKE_CHAR (buffer[0]);
920 }
921
922 /* Otherwise, convert the buffer into a proper scheme string for
923 processing. */
924 charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
925 pt->ilseq_handler);
926 charname_len = scm_i_string_length (charname);
927 SCM_COL (port) += charname_len;
928 cp = scm_i_string_ref (charname, 0);
929 if (charname_len == 1)
930 return SCM_MAKE_CHAR (cp);
931
932 /* Ignore dotted circles, which may be used to keep combining characters from
933 combining with the backslash in #\charname. */
934 if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
935 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
936
937 if (cp >= '0' && cp < '8')
938 {
939 /* Dirk:FIXME:: This type of character syntax is not R5RS
940 * compliant. Further, it should be verified that the constant
941 * does only consist of octal digits. */
942 SCM p = scm_string_to_number (charname, scm_from_uint (8));
943 if (SCM_I_INUMP (p))
944 {
945 scm_t_wchar c = scm_to_uint32 (p);
946 if (SCM_IS_UNICODE_CHAR (c))
947 return SCM_MAKE_CHAR (c);
948 else
949 scm_i_input_error (FUNC_NAME, port,
950 "out-of-range octal character escape: ~a",
951 scm_list_1 (charname));
952 }
953 }
954
955 if (cp == 'x' && (charname_len > 1))
956 {
957 SCM p;
958
959 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
960 p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
961 scm_from_uint (16));
962 if (SCM_I_INUMP (p))
963 {
964 scm_t_wchar c = scm_to_uint32 (p);
965 if (SCM_IS_UNICODE_CHAR (c))
966 return SCM_MAKE_CHAR (c);
967 else
968 scm_i_input_error (FUNC_NAME, port,
969 "out-of-range hex character escape: ~a",
970 scm_list_1 (charname));
971 }
972 }
973
974 /* The names of characters should never have non-Latin1
975 characters. */
976 if (scm_i_is_narrow_string (charname)
977 || scm_i_try_narrow_string (charname))
978 { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
979 charname_len);
980 if (scm_is_true (ch))
981 return ch;
982 }
983
984 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
985 scm_list_1 (charname));
986
987 return SCM_UNSPECIFIED;
988 }
989 #undef FUNC_NAME
990
991 static SCM
992 scm_read_keyword (int chr, SCM port)
993 {
994 SCM symbol;
995
996 /* Read the symbol that comprises the keyword. Doing this instead of
997 invoking a specific symbol reader function allows `scm_read_keyword ()'
998 to adapt to the delimiters currently valid of symbols.
999
1000 XXX: This implementation allows sloppy syntaxes like `#: key'. */
1001 symbol = scm_read_expression (port);
1002 if (!scm_is_symbol (symbol))
1003 scm_i_input_error ("scm_read_keyword", port,
1004 "keyword prefix `~a' not followed by a symbol: ~s",
1005 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1006
1007 return (scm_symbol_to_keyword (symbol));
1008 }
1009
1010 static SCM
1011 scm_read_vector (int chr, SCM port, long line, int column)
1012 {
1013 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1014 guarantee that it's going to do what we want. After all, this is an
1015 implementation detail of `scm_read_vector ()', not a desirable
1016 property. */
1017 return maybe_annotate_source (scm_vector (scm_read_sexp (chr, port)),
1018 port, line, column);
1019 }
1020
1021 static SCM
1022 scm_read_array (int chr, SCM port, long line, int column)
1023 {
1024 SCM result = scm_i_read_array (port, chr);
1025 if (scm_is_false (result))
1026 return result;
1027 else
1028 return maybe_annotate_source (result, port, line, column);
1029 }
1030
1031 static SCM
1032 scm_read_srfi4_vector (int chr, SCM port, long line, int column)
1033 {
1034 return scm_read_array (chr, port, line, column);
1035 }
1036
1037 static SCM
1038 scm_read_bytevector (scm_t_wchar chr, SCM port, long line, int column)
1039 {
1040 chr = scm_getc (port);
1041 if (chr != 'u')
1042 goto syntax;
1043
1044 chr = scm_getc (port);
1045 if (chr != '8')
1046 goto syntax;
1047
1048 chr = scm_getc (port);
1049 if (chr != '(')
1050 goto syntax;
1051
1052 return maybe_annotate_source
1053 (scm_u8_list_to_bytevector (scm_read_sexp (chr, port)),
1054 port, line, column);
1055
1056 syntax:
1057 scm_i_input_error ("read_bytevector", port,
1058 "invalid bytevector prefix",
1059 SCM_MAKE_CHAR (chr));
1060 return SCM_UNSPECIFIED;
1061 }
1062
1063 static SCM
1064 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port, long line, int column)
1065 {
1066 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1067 terribly inefficient but who cares? */
1068 SCM s_bits = SCM_EOL;
1069
1070 for (chr = scm_getc (port);
1071 (chr != EOF) && ((chr == '0') || (chr == '1'));
1072 chr = scm_getc (port))
1073 {
1074 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1075 }
1076
1077 if (chr != EOF)
1078 scm_ungetc (chr, port);
1079
1080 return maybe_annotate_source
1081 (scm_bitvector (scm_reverse_x (s_bits, SCM_EOL)),
1082 port, line, column);
1083 }
1084
1085 static SCM
1086 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1087 {
1088 int bang_seen = 0;
1089
1090 for (;;)
1091 {
1092 int c = scm_getc (port);
1093
1094 if (c == EOF)
1095 scm_i_input_error ("skip_block_comment", port,
1096 "unterminated `#! ... !#' comment", SCM_EOL);
1097
1098 if (c == '!')
1099 bang_seen = 1;
1100 else if (c == '#' && bang_seen)
1101 break;
1102 else
1103 bang_seen = 0;
1104 }
1105
1106 return SCM_UNSPECIFIED;
1107 }
1108
1109 static SCM
1110 scm_read_shebang (scm_t_wchar chr, SCM port)
1111 {
1112 int c = 0;
1113 if ((c = scm_get_byte_or_eof (port)) != 'r')
1114 {
1115 scm_ungetc (c, port);
1116 return scm_read_scsh_block_comment (chr, port);
1117 }
1118 if ((c = scm_get_byte_or_eof (port)) != '6')
1119 {
1120 scm_ungetc (c, port);
1121 scm_ungetc ('r', port);
1122 return scm_read_scsh_block_comment (chr, port);
1123 }
1124 if ((c = scm_get_byte_or_eof (port)) != 'r')
1125 {
1126 scm_ungetc (c, port);
1127 scm_ungetc ('6', port);
1128 scm_ungetc ('r', port);
1129 return scm_read_scsh_block_comment (chr, port);
1130 }
1131 if ((c = scm_get_byte_or_eof (port)) != 's')
1132 {
1133 scm_ungetc (c, port);
1134 scm_ungetc ('r', port);
1135 scm_ungetc ('6', port);
1136 scm_ungetc ('r', port);
1137 return scm_read_scsh_block_comment (chr, port);
1138 }
1139
1140 return SCM_UNSPECIFIED;
1141 }
1142
1143 static SCM
1144 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1145 {
1146 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1147 nested. So care must be taken. */
1148 int nesting_level = 1;
1149
1150 int a = scm_getc (port);
1151
1152 if (a == EOF)
1153 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1154 "unterminated `#| ... |#' comment", SCM_EOL);
1155
1156 while (nesting_level > 0)
1157 {
1158 int b = scm_getc (port);
1159
1160 if (b == EOF)
1161 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1162 "unterminated `#| ... |#' comment", SCM_EOL);
1163
1164 if (a == '|' && b == '#')
1165 {
1166 nesting_level--;
1167 b = EOF;
1168 }
1169 else if (a == '#' && b == '|')
1170 {
1171 nesting_level++;
1172 b = EOF;
1173 }
1174
1175 a = b;
1176 }
1177
1178 return SCM_UNSPECIFIED;
1179 }
1180
1181 static SCM
1182 scm_read_commented_expression (scm_t_wchar chr, SCM port)
1183 {
1184 scm_t_wchar c;
1185
1186 c = flush_ws (port, (char *) NULL);
1187 if (EOF == c)
1188 scm_i_input_error ("read_commented_expression", port,
1189 "no expression after #; comment", SCM_EOL);
1190 scm_ungetc (c, port);
1191 scm_read_expression (port);
1192 return SCM_UNSPECIFIED;
1193 }
1194
1195 static SCM
1196 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1197 {
1198 /* Guile's extended symbol read syntax looks like this:
1199
1200 #{This is all a symbol name}#
1201
1202 So here, CHR is expected to be `{'. */
1203 int saw_brace = 0;
1204 size_t len = 0;
1205 SCM buf = scm_i_make_string (1024, NULL, 0);
1206
1207 buf = scm_i_string_start_writing (buf);
1208
1209 while ((chr = scm_getc (port)) != EOF)
1210 {
1211 if (saw_brace)
1212 {
1213 if (chr == '#')
1214 {
1215 break;
1216 }
1217 else
1218 {
1219 saw_brace = 0;
1220 scm_i_string_set_x (buf, len++, '}');
1221 }
1222 }
1223
1224 if (chr == '}')
1225 saw_brace = 1;
1226 else if (chr == '\\')
1227 {
1228 /* It used to be that print.c would print extended-read-syntax
1229 symbols with backslashes before "non-standard" chars, but
1230 this routine wouldn't do anything with those escapes.
1231 Bummer. What we've done is to change print.c to output
1232 R6RS hex escapes for those characters, relying on the fact
1233 that the extended read syntax would never put a `\' before
1234 an `x'. For now, we just ignore other instances of
1235 backslash in the string. */
1236 switch ((chr = scm_getc (port)))
1237 {
1238 case EOF:
1239 goto done;
1240 case 'x':
1241 {
1242 scm_t_wchar c;
1243
1244 SCM_READ_HEX_ESCAPE (10, ';');
1245 scm_i_string_set_x (buf, len++, c);
1246 break;
1247
1248 str_eof:
1249 chr = EOF;
1250 goto done;
1251
1252 bad_escaped:
1253 scm_i_string_stop_writing ();
1254 scm_i_input_error ("scm_read_extended_symbol", port,
1255 "illegal character in escape sequence: ~S",
1256 scm_list_1 (SCM_MAKE_CHAR (c)));
1257 break;
1258 }
1259 default:
1260 scm_i_string_set_x (buf, len++, chr);
1261 break;
1262 }
1263 }
1264 else
1265 scm_i_string_set_x (buf, len++, chr);
1266
1267 if (len >= scm_i_string_length (buf) - 2)
1268 {
1269 SCM addy;
1270
1271 scm_i_string_stop_writing ();
1272 addy = scm_i_make_string (1024, NULL, 0);
1273 buf = scm_string_append (scm_list_2 (buf, addy));
1274 len = 0;
1275 buf = scm_i_string_start_writing (buf);
1276 }
1277 }
1278
1279 done:
1280 scm_i_string_stop_writing ();
1281 if (chr == EOF)
1282 scm_i_input_error ("scm_read_extended_symbol", port,
1283 "end of file while reading symbol", SCM_EOL);
1284
1285 return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1286 }
1287
1288
1289 \f
1290 /* Top-level token readers, i.e., dispatchers. */
1291
1292 static SCM
1293 scm_read_sharp_extension (int chr, SCM port)
1294 {
1295 SCM proc;
1296
1297 proc = scm_get_hash_procedure (chr);
1298 if (scm_is_true (scm_procedure_p (proc)))
1299 {
1300 long line = SCM_LINUM (port);
1301 int column = SCM_COL (port) - 2;
1302 SCM got;
1303
1304 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1305
1306 if (scm_is_pair (got) && !scm_i_has_source_properties (got))
1307 scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1308
1309 return got;
1310 }
1311
1312 return SCM_UNSPECIFIED;
1313 }
1314
1315 /* The reader for the sharp `#' character. It basically dispatches reads
1316 among the above token readers. */
1317 static SCM
1318 scm_read_sharp (scm_t_wchar chr, SCM port, long line, int column)
1319 #define FUNC_NAME "scm_lreadr"
1320 {
1321 SCM result;
1322
1323 chr = scm_getc (port);
1324
1325 result = scm_read_sharp_extension (chr, port);
1326 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1327 return result;
1328
1329 switch (chr)
1330 {
1331 case '\\':
1332 return (scm_read_character (chr, port));
1333 case '(':
1334 return (scm_read_vector (chr, port, line, column));
1335 case 's':
1336 case 'u':
1337 case 'f':
1338 case 'c':
1339 /* This one may return either a boolean or an SRFI-4 vector. */
1340 return (scm_read_srfi4_vector (chr, port, line, column));
1341 case 'v':
1342 return (scm_read_bytevector (chr, port, line, column));
1343 case '*':
1344 return (scm_read_guile_bit_vector (chr, port, line, column));
1345 case 't':
1346 case 'T':
1347 case 'F':
1348 return (scm_read_boolean (chr, port));
1349 case ':':
1350 return (scm_read_keyword (chr, port));
1351 case '0': case '1': case '2': case '3': case '4':
1352 case '5': case '6': case '7': case '8': case '9':
1353 case '@':
1354 #if SCM_ENABLE_DEPRECATED
1355 /* See below for 'i' and 'e'. */
1356 case 'a':
1357 case 'y':
1358 case 'h':
1359 case 'l':
1360 #endif
1361 return (scm_read_array (chr, port, line, column));
1362
1363 case 'i':
1364 case 'e':
1365 #if SCM_ENABLE_DEPRECATED
1366 {
1367 /* When next char is '(', it really is an old-style
1368 uniform array. */
1369 scm_t_wchar next_c = scm_getc (port);
1370 if (next_c != EOF)
1371 scm_ungetc (next_c, port);
1372 if (next_c == '(')
1373 return scm_read_array (chr, port, line, column);
1374 /* Fall through. */
1375 }
1376 #endif
1377 case 'b':
1378 case 'B':
1379 case 'o':
1380 case 'O':
1381 case 'd':
1382 case 'D':
1383 case 'x':
1384 case 'X':
1385 case 'I':
1386 case 'E':
1387 return (scm_read_number_and_radix (chr, port));
1388 case '{':
1389 return (scm_read_extended_symbol (chr, port));
1390 case '!':
1391 return (scm_read_shebang (chr, port));
1392 case ';':
1393 return (scm_read_commented_expression (chr, port));
1394 case '`':
1395 case '\'':
1396 case ',':
1397 return (scm_read_syntax (chr, port));
1398 case 'n':
1399 return (scm_read_nil (chr, port));
1400 default:
1401 result = scm_read_sharp_extension (chr, port);
1402 if (scm_is_eq (result, SCM_UNSPECIFIED))
1403 {
1404 /* To remain compatible with 1.8 and earlier, the following
1405 characters have lower precedence than `read-hash-extend'
1406 characters. */
1407 switch (chr)
1408 {
1409 case '|':
1410 return scm_read_r6rs_block_comment (chr, port);
1411 default:
1412 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1413 scm_list_1 (SCM_MAKE_CHAR (chr)));
1414 }
1415 }
1416 else
1417 return result;
1418 }
1419
1420 return SCM_UNSPECIFIED;
1421 }
1422 #undef FUNC_NAME
1423
1424 static SCM
1425 scm_read_expression (SCM port)
1426 #define FUNC_NAME "scm_read_expression"
1427 {
1428 while (1)
1429 {
1430 scm_t_wchar chr;
1431
1432 chr = scm_getc (port);
1433
1434 switch (chr)
1435 {
1436 case SCM_WHITE_SPACES:
1437 case SCM_LINE_INCREMENTORS:
1438 break;
1439 case ';':
1440 (void) scm_read_semicolon_comment (chr, port);
1441 break;
1442 case '[':
1443 if (!SCM_SQUARE_BRACKETS_P)
1444 return (scm_read_mixed_case_symbol (chr, port));
1445 /* otherwise fall through */
1446 case '(':
1447 return (scm_read_sexp (chr, port));
1448 case '"':
1449 return (scm_read_string (chr, port));
1450 case '\'':
1451 case '`':
1452 case ',':
1453 return (scm_read_quote (chr, port));
1454 case '#':
1455 {
1456 long line = SCM_LINUM (port);
1457 int column = SCM_COL (port) - 1;
1458 SCM result = scm_read_sharp (chr, port, line, column);
1459 if (scm_is_eq (result, SCM_UNSPECIFIED))
1460 /* We read a comment or some such. */
1461 break;
1462 else
1463 return result;
1464 }
1465 case ')':
1466 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1467 break;
1468 case ']':
1469 if (SCM_SQUARE_BRACKETS_P)
1470 scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1471 /* otherwise fall through */
1472 case EOF:
1473 return SCM_EOF_VAL;
1474 case ':':
1475 if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1476 return scm_symbol_to_keyword (scm_read_expression (port));
1477 /* Fall through. */
1478
1479 default:
1480 {
1481 if (((chr >= '0') && (chr <= '9'))
1482 || (strchr ("+-.", chr)))
1483 return (scm_read_number (chr, port));
1484 else
1485 return (scm_read_mixed_case_symbol (chr, port));
1486 }
1487 }
1488 }
1489 }
1490 #undef FUNC_NAME
1491
1492 \f
1493 /* Actual reader. */
1494
1495 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1496 (SCM port),
1497 "Read an s-expression from the input port @var{port}, or from\n"
1498 "the current input port if @var{port} is not specified.\n"
1499 "Any whitespace before the next token is discarded.")
1500 #define FUNC_NAME s_scm_read
1501 {
1502 int c;
1503
1504 if (SCM_UNBNDP (port))
1505 port = scm_current_input_port ();
1506 SCM_VALIDATE_OPINPORT (1, port);
1507
1508 c = flush_ws (port, (char *) NULL);
1509 if (EOF == c)
1510 return SCM_EOF_VAL;
1511 scm_ungetc (c, port);
1512
1513 return (scm_read_expression (port));
1514 }
1515 #undef FUNC_NAME
1516
1517
1518 \f
1519
1520 /* Manipulate the read-hash-procedures alist. This could be written in
1521 Scheme, but maybe it will also be used by C code during initialisation. */
1522 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1523 (SCM chr, SCM proc),
1524 "Install the procedure @var{proc} for reading expressions\n"
1525 "starting with the character sequence @code{#} and @var{chr}.\n"
1526 "@var{proc} will be called with two arguments: the character\n"
1527 "@var{chr} and the port to read further data from. The object\n"
1528 "returned will be the return value of @code{read}. \n"
1529 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1530 )
1531 #define FUNC_NAME s_scm_read_hash_extend
1532 {
1533 SCM this;
1534 SCM prev;
1535
1536 SCM_VALIDATE_CHAR (1, chr);
1537 SCM_ASSERT (scm_is_false (proc)
1538 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1539 proc, SCM_ARG2, FUNC_NAME);
1540
1541 /* Check if chr is already in the alist. */
1542 this = scm_i_read_hash_procedures_ref ();
1543 prev = SCM_BOOL_F;
1544 while (1)
1545 {
1546 if (scm_is_null (this))
1547 {
1548 /* not found, so add it to the beginning. */
1549 if (scm_is_true (proc))
1550 {
1551 SCM new = scm_cons (scm_cons (chr, proc),
1552 scm_i_read_hash_procedures_ref ());
1553 scm_i_read_hash_procedures_set_x (new);
1554 }
1555 break;
1556 }
1557 if (scm_is_eq (chr, SCM_CAAR (this)))
1558 {
1559 /* already in the alist. */
1560 if (scm_is_false (proc))
1561 {
1562 /* remove it. */
1563 if (scm_is_false (prev))
1564 {
1565 SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1566 scm_i_read_hash_procedures_set_x (rest);
1567 }
1568 else
1569 scm_set_cdr_x (prev, SCM_CDR (this));
1570 }
1571 else
1572 {
1573 /* replace it. */
1574 scm_set_cdr_x (SCM_CAR (this), proc);
1575 }
1576 break;
1577 }
1578 prev = this;
1579 this = SCM_CDR (this);
1580 }
1581
1582 return SCM_UNSPECIFIED;
1583 }
1584 #undef FUNC_NAME
1585
1586 /* Recover the read-hash procedure corresponding to char c. */
1587 static SCM
1588 scm_get_hash_procedure (int c)
1589 {
1590 SCM rest = scm_i_read_hash_procedures_ref ();
1591
1592 while (1)
1593 {
1594 if (scm_is_null (rest))
1595 return SCM_BOOL_F;
1596
1597 if (SCM_CHAR (SCM_CAAR (rest)) == c)
1598 return SCM_CDAR (rest);
1599
1600 rest = SCM_CDR (rest);
1601 }
1602 }
1603
1604 #define SCM_ENCODING_SEARCH_SIZE (500)
1605
1606 /* Search the first few hundred characters of a file for an Emacs-like coding
1607 declaration. Returns either NULL or a string whose storage has been
1608 allocated with `scm_gc_malloc ()'. */
1609 char *
1610 scm_i_scan_for_encoding (SCM port)
1611 {
1612 scm_t_port *pt;
1613 char header[SCM_ENCODING_SEARCH_SIZE+1];
1614 size_t bytes_read, encoding_length, i;
1615 char *encoding = NULL;
1616 int utf8_bom = 0;
1617 char *pos, *encoding_start;
1618 int in_comment;
1619
1620 pt = SCM_PTAB_ENTRY (port);
1621
1622 if (pt->rw_active == SCM_PORT_WRITE)
1623 scm_flush (port);
1624
1625 if (pt->rw_random)
1626 pt->rw_active = SCM_PORT_READ;
1627
1628 if (pt->read_pos == pt->read_end)
1629 {
1630 /* We can use the read buffer, and thus avoid a seek. */
1631 if (scm_fill_input (port) == EOF)
1632 return NULL;
1633
1634 bytes_read = pt->read_end - pt->read_pos;
1635 if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
1636 bytes_read = SCM_ENCODING_SEARCH_SIZE;
1637
1638 if (bytes_read <= 1)
1639 /* An unbuffered port -- don't scan. */
1640 return NULL;
1641
1642 memcpy (header, pt->read_pos, bytes_read);
1643 header[bytes_read] = '\0';
1644 }
1645 else
1646 {
1647 /* Try to read some bytes and then seek back. Not all ports
1648 support seeking back; and indeed some file ports (like
1649 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
1650 check performed by SCM_FPORT_FDES---but fail to seek
1651 backwards. Hence this block comes second. We prefer to use
1652 the read buffer in-place. */
1653 if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1654 return NULL;
1655
1656 bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1657 header[bytes_read] = '\0';
1658 scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1659 }
1660
1661 if (bytes_read > 3
1662 && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1663 utf8_bom = 1;
1664
1665 /* search past "coding[:=]" */
1666 pos = header;
1667 while (1)
1668 {
1669 if ((pos = strstr(pos, "coding")) == NULL)
1670 return NULL;
1671
1672 pos += strlen("coding");
1673 if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1674 (*pos == ':' || *pos == '='))
1675 {
1676 pos ++;
1677 break;
1678 }
1679 }
1680
1681 /* skip spaces */
1682 while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1683 (*pos == ' ' || *pos == '\t'))
1684 pos ++;
1685
1686 /* grab the next token */
1687 encoding_start = pos;
1688 i = 0;
1689 while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
1690 && encoding_start + i - header < bytes_read
1691 && (isalnum ((int) encoding_start[i])
1692 || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
1693 i++;
1694
1695 encoding_length = i;
1696 if (encoding_length == 0)
1697 return NULL;
1698
1699 encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
1700 for (i = 0; i < encoding_length; i++)
1701 encoding[i] = toupper ((int) encoding[i]);
1702
1703 /* push backwards to make sure we were in a comment */
1704 in_comment = 0;
1705 pos = encoding_start;
1706 while (pos >= header)
1707 {
1708 if (*pos == ';')
1709 {
1710 in_comment = 1;
1711 break;
1712 }
1713 else if (*pos == '\n' || pos == header)
1714 {
1715 /* This wasn't in a semicolon comment. Check for a
1716 hash-bang comment. */
1717 char *beg = strstr (header, "#!");
1718 char *end = strstr (header, "!#");
1719 if (beg < encoding_start && encoding_start + encoding_length <= end)
1720 in_comment = 1;
1721 break;
1722 }
1723 else
1724 {
1725 pos --;
1726 continue;
1727 }
1728 }
1729 if (!in_comment)
1730 /* This wasn't in a comment */
1731 return NULL;
1732
1733 if (utf8_bom && strcmp(encoding, "UTF-8"))
1734 scm_misc_error (NULL,
1735 "the port input declares the encoding ~s but is encoded as UTF-8",
1736 scm_list_1 (scm_from_locale_string (encoding)));
1737
1738 return encoding;
1739 }
1740
1741 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1742 (SCM port),
1743 "Scans the port for an Emacs-like character coding declaration\n"
1744 "near the top of the contents of a port with random-accessible contents.\n"
1745 "The coding declaration is of the form\n"
1746 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1747 "\n"
1748 "Returns a string containing the character encoding of the file\n"
1749 "if a declaration was found, or @code{#f} otherwise.\n")
1750 #define FUNC_NAME s_scm_file_encoding
1751 {
1752 char *enc;
1753 SCM s_enc;
1754
1755 SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
1756
1757 enc = scm_i_scan_for_encoding (port);
1758 if (enc == NULL)
1759 return SCM_BOOL_F;
1760 else
1761 {
1762 s_enc = scm_from_locale_string (enc);
1763 return s_enc;
1764 }
1765
1766 return SCM_BOOL_F;
1767 }
1768 #undef FUNC_NAME
1769
1770 void
1771 scm_init_read ()
1772 {
1773 SCM read_hash_procs;
1774
1775 read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
1776
1777 scm_i_read_hash_procedures =
1778 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
1779
1780 scm_init_opts (scm_read_options, scm_read_opts);
1781 #include "libguile/read.x"
1782 }
1783
1784 /*
1785 Local Variables:
1786 c-file-style: "gnu"
1787 End:
1788 */