Add missing `SCM_API' for `scm_take_from_input_buffers'.
[bpt/guile.git] / libguile / read.c
1 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
2 * 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
18 */
19
20
21 \f
22
23 #ifdef HAVE_CONFIG_H
24 # include <config.h>
25 #endif
26
27 #include <stdio.h>
28 #include <ctype.h>
29 #include <string.h>
30 #include <unistd.h>
31 #include <unicase.h>
32 #include <unictype.h>
33
34 #include "libguile/_scm.h"
35 #include "libguile/bytevectors.h"
36 #include "libguile/chars.h"
37 #include "libguile/eval.h"
38 #include "libguile/arrays.h"
39 #include "libguile/bitvectors.h"
40 #include "libguile/keywords.h"
41 #include "libguile/alist.h"
42 #include "libguile/srcprop.h"
43 #include "libguile/hashtab.h"
44 #include "libguile/hash.h"
45 #include "libguile/ports.h"
46 #include "libguile/fports.h"
47 #include "libguile/root.h"
48 #include "libguile/strings.h"
49 #include "libguile/strports.h"
50 #include "libguile/vectors.h"
51 #include "libguile/validate.h"
52 #include "libguile/srfi-4.h"
53 #include "libguile/srfi-13.h"
54
55 #include "libguile/read.h"
56 #include "libguile/private-options.h"
57
58
59 \f
60
61 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
62 SCM_SYMBOL (scm_keyword_prefix, "prefix");
63 SCM_SYMBOL (scm_keyword_postfix, "postfix");
64 SCM_SYMBOL (sym_nil, "nil");
65
66 scm_t_option scm_read_opts[] = {
67 { SCM_OPTION_BOOLEAN, "copy", 0,
68 "Copy source code expressions." },
69 { SCM_OPTION_BOOLEAN, "positions", 1,
70 "Record positions of source code expressions." },
71 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
72 "Convert symbols to lower case."},
73 { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
74 "Style of keyword recognition: #f, 'prefix or 'postfix."},
75 { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
76 "Use R6RS variable-length character and string hex escapes."},
77 { SCM_OPTION_BOOLEAN, "square-brackets", 1,
78 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
79 { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
80 "In strings, consume leading whitespace after an escaped end-of-line."},
81 { 0, },
82 };
83
84 /*
85 Give meaningful error messages for errors
86
87 We use the format
88
89 FILE:LINE:COL: MESSAGE
90 This happened in ....
91
92 This is not standard GNU format, but the test-suite likes the real
93 message to be in front.
94
95 */
96
97
98 void
99 scm_i_input_error (char const *function,
100 SCM port, const char *message, SCM arg)
101 {
102 SCM fn = (scm_is_string (SCM_FILENAME(port))
103 ? SCM_FILENAME(port)
104 : scm_from_locale_string ("#<unknown port>"));
105
106 SCM string_port = scm_open_output_string ();
107 SCM string = SCM_EOL;
108 scm_simple_format (string_port,
109 scm_from_locale_string ("~A:~S:~S: ~A"),
110 scm_list_4 (fn,
111 scm_from_long (SCM_LINUM (port) + 1),
112 scm_from_int (SCM_COL (port) + 1),
113 scm_from_locale_string (message)));
114
115 string = scm_get_output_string (string_port);
116 scm_close_output_port (string_port);
117 scm_error_scm (scm_from_latin1_symbol ("read-error"),
118 function? scm_from_locale_string (function) : SCM_BOOL_F,
119 string,
120 arg,
121 SCM_BOOL_F);
122 }
123
124
125 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
126 (SCM setting),
127 "Option interface for the read options. Instead of using\n"
128 "this procedure directly, use the procedures @code{read-enable},\n"
129 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
130 #define FUNC_NAME s_scm_read_options
131 {
132 SCM ans = scm_options (setting,
133 scm_read_opts,
134 FUNC_NAME);
135 if (SCM_COPY_SOURCE_P)
136 SCM_RECORD_POSITIONS_P = 1;
137 return ans;
138 }
139 #undef FUNC_NAME
140
141 /* A fluid referring to an association list mapping extra hash
142 characters to procedures. */
143 static SCM *scm_i_read_hash_procedures;
144
145 static SCM
146 scm_i_read_hash_procedures_ref (void)
147 {
148 return scm_fluid_ref (*scm_i_read_hash_procedures);
149 }
150
151 static void
152 scm_i_read_hash_procedures_set_x (SCM value)
153 {
154 scm_fluid_set_x (*scm_i_read_hash_procedures, value);
155 }
156
157 \f
158 /* Token readers. */
159
160
161 /* Size of the C buffer used to read symbols and numbers. */
162 #define READER_BUFFER_SIZE 128
163
164 /* Number of 32-bit codepoints in the buffer used to read strings. */
165 #define READER_STRING_BUFFER_SIZE 128
166
167 /* The maximum size of Scheme character names. */
168 #define READER_CHAR_NAME_MAX_SIZE 50
169
170
171 /* `isblank' is only in C99. */
172 #define CHAR_IS_BLANK_(_chr) \
173 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
174 || ((_chr) == '\f') || ((_chr) == '\r'))
175
176 #ifdef MSDOS
177 # define CHAR_IS_BLANK(_chr) \
178 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
179 #else
180 # define CHAR_IS_BLANK CHAR_IS_BLANK_
181 #endif
182
183
184 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
185 structure''). */
186 #define CHAR_IS_R5RS_DELIMITER(c) \
187 (CHAR_IS_BLANK (c) \
188 || (c == ')') || (c == '(') || (c == ';') || (c == '"') \
189 || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
190
191 #define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
192
193 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
194 Structure''. */
195 #define CHAR_IS_EXPONENT_MARKER(_chr) \
196 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
197 || ((_chr) == 'd') || ((_chr) == 'l'))
198
199 /* Read an SCSH block comment. */
200 static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
201 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
202 static SCM scm_read_commented_expression (scm_t_wchar, SCM);
203 static SCM scm_read_shebang (scm_t_wchar, SCM);
204 static SCM scm_get_hash_procedure (int);
205
206 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
207 result in the pre-allocated buffer BUF. Return zero if the whole token has
208 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
209 bytes actually read. */
210 static int
211 read_token (SCM port, char *buf, size_t buf_size, size_t *read)
212 {
213 *read = 0;
214
215 while (*read < buf_size)
216 {
217 int chr;
218
219 chr = scm_get_byte_or_eof (port);
220
221 if (chr == EOF)
222 return 0;
223 else if (CHAR_IS_DELIMITER (chr))
224 {
225 scm_unget_byte (chr, port);
226 return 0;
227 }
228 else
229 {
230 *buf = (char) chr;
231 buf++, (*read)++;
232 }
233 }
234
235 return 1;
236 }
237
238 /* Like `read_token', but return either BUFFER, or a GC-allocated buffer
239 if the token doesn't fit in BUFFER_SIZE bytes. */
240 static char *
241 read_complete_token (SCM port, char *buffer, size_t buffer_size,
242 size_t *read)
243 {
244 int overflow = 0;
245 size_t bytes_read, overflow_size = 0;
246 char *overflow_buffer = NULL;
247
248 do
249 {
250 overflow = read_token (port, buffer, buffer_size, &bytes_read);
251 if (bytes_read == 0)
252 break;
253 if (overflow || overflow_size != 0)
254 {
255 if (overflow_size == 0)
256 {
257 overflow_buffer = scm_gc_malloc_pointerless (bytes_read, "read");
258 memcpy (overflow_buffer, buffer, bytes_read);
259 overflow_size = bytes_read;
260 }
261 else
262 {
263 char *new_buf =
264 scm_gc_malloc_pointerless (overflow_size + bytes_read, "read");
265
266 memcpy (new_buf, overflow_buffer, overflow_size);
267 memcpy (new_buf + overflow_size, buffer, bytes_read);
268
269 overflow_buffer = new_buf;
270 overflow_size += bytes_read;
271 }
272 }
273 }
274 while (overflow);
275
276 if (overflow_size)
277 *read = overflow_size;
278 else
279 *read = bytes_read;
280
281 return (overflow_size > 0 ? overflow_buffer : buffer);
282 }
283
284 /* Skip whitespace from PORT and return the first non-whitespace character
285 read. Raise an error on end-of-file. */
286 static int
287 flush_ws (SCM port, const char *eoferr)
288 {
289 scm_t_wchar c;
290 while (1)
291 switch (c = scm_getc (port))
292 {
293 case EOF:
294 goteof:
295 if (eoferr)
296 {
297 scm_i_input_error (eoferr,
298 port,
299 "end of file",
300 SCM_EOL);
301 }
302 return c;
303
304 case ';':
305 lp:
306 switch (c = scm_getc (port))
307 {
308 case EOF:
309 goto goteof;
310 default:
311 goto lp;
312 case SCM_LINE_INCREMENTORS:
313 break;
314 }
315 break;
316
317 case '#':
318 switch (c = scm_getc (port))
319 {
320 case EOF:
321 eoferr = "read_sharp";
322 goto goteof;
323 case '!':
324 scm_read_shebang (c, port);
325 break;
326 case ';':
327 scm_read_commented_expression (c, port);
328 break;
329 case '|':
330 if (scm_is_false (scm_get_hash_procedure (c)))
331 {
332 scm_read_r6rs_block_comment (c, port);
333 break;
334 }
335 /* fall through */
336 default:
337 scm_ungetc (c, port);
338 return '#';
339 }
340 break;
341
342 case SCM_LINE_INCREMENTORS:
343 case SCM_SINGLE_SPACES:
344 case '\t':
345 break;
346
347 default:
348 return c;
349 }
350
351 return 0;
352 }
353
354
355 \f
356 /* Token readers. */
357
358 static SCM scm_read_expression (SCM port);
359 static SCM scm_read_sharp (int chr, SCM port, long line, int column);
360
361
362 static SCM
363 maybe_annotate_source (SCM x, SCM port, long line, int column)
364 {
365 if (SCM_RECORD_POSITIONS_P)
366 scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
367 return x;
368 }
369
370 static SCM
371 scm_read_sexp (scm_t_wchar chr, SCM port)
372 #define FUNC_NAME "scm_i_lreadparen"
373 {
374 int c;
375 SCM tmp, tl, ans = SCM_EOL;
376 const int terminating_char = ((chr == '[') ? ']' : ')');
377
378 /* Need to capture line and column numbers here. */
379 long line = SCM_LINUM (port);
380 int column = SCM_COL (port) - 1;
381
382 c = flush_ws (port, FUNC_NAME);
383 if (terminating_char == c)
384 return SCM_EOL;
385
386 scm_ungetc (c, port);
387 tmp = scm_read_expression (port);
388
389 /* Note that it is possible for scm_read_expression to return
390 scm_sym_dot, but not as part of a dotted pair: as in #{.}#. So
391 check that it's a real dot by checking `c'. */
392 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
393 {
394 ans = scm_read_expression (port);
395 if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
396 scm_i_input_error (FUNC_NAME, port, "missing close paren",
397 SCM_EOL);
398 return ans;
399 }
400
401 /* Build the head of the list structure. */
402 ans = tl = scm_cons (tmp, SCM_EOL);
403
404 while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
405 {
406 SCM new_tail;
407
408 if (c == ')' || (SCM_SQUARE_BRACKETS_P && c == ']'))
409 scm_i_input_error (FUNC_NAME, port,
410 "in pair: mismatched close paren: ~A",
411 scm_list_1 (SCM_MAKE_CHAR (c)));
412
413 scm_ungetc (c, port);
414 tmp = scm_read_expression (port);
415
416 /* See above note about scm_sym_dot. */
417 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
418 {
419 SCM_SETCDR (tl, scm_read_expression (port));
420
421 c = flush_ws (port, FUNC_NAME);
422 if (terminating_char != c)
423 scm_i_input_error (FUNC_NAME, port,
424 "in pair: missing close paren", SCM_EOL);
425 goto exit;
426 }
427
428 new_tail = scm_cons (tmp, SCM_EOL);
429 SCM_SETCDR (tl, new_tail);
430 tl = new_tail;
431 }
432
433 exit:
434 return maybe_annotate_source (ans, port, line, column);
435 }
436 #undef FUNC_NAME
437
438
439 /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
440 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
441 found. */
442 #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
443 do \
444 { \
445 scm_t_wchar a; \
446 size_t i = 0; \
447 c = 0; \
448 while (i < ndigits) \
449 { \
450 a = scm_getc (port); \
451 if (a == EOF) \
452 goto str_eof; \
453 if (terminator \
454 && (a == (scm_t_wchar) terminator) \
455 && (i > 0)) \
456 break; \
457 if ('0' <= a && a <= '9') \
458 a -= '0'; \
459 else if ('A' <= a && a <= 'F') \
460 a = a - 'A' + 10; \
461 else if ('a' <= a && a <= 'f') \
462 a = a - 'a' + 10; \
463 else \
464 { \
465 c = a; \
466 goto bad_escaped; \
467 } \
468 c = c * 16 + a; \
469 i ++; \
470 } \
471 } while (0)
472
473 static void
474 skip_intraline_whitespace (SCM port)
475 {
476 scm_t_wchar c;
477
478 do
479 {
480 c = scm_getc (port);
481 if (c == EOF)
482 return;
483 }
484 while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
485
486 scm_ungetc (c, port);
487 }
488
489 static SCM
490 scm_read_string (int chr, SCM port)
491 #define FUNC_NAME "scm_lreadr"
492 {
493 /* For strings smaller than C_STR, this function creates only one Scheme
494 object (the string returned). */
495
496 SCM str = SCM_EOL;
497 size_t c_str_len = 0;
498 scm_t_wchar c, c_str[READER_STRING_BUFFER_SIZE];
499
500 /* Need to capture line and column numbers here. */
501 long line = SCM_LINUM (port);
502 int column = SCM_COL (port) - 1;
503
504 while ('"' != (c = scm_getc (port)))
505 {
506 if (c == EOF)
507 {
508 str_eof:
509 scm_i_input_error (FUNC_NAME, port,
510 "end of file in string constant", SCM_EOL);
511 }
512
513 if (c_str_len + 1 >= READER_STRING_BUFFER_SIZE)
514 {
515 str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
516 c_str_len = 0;
517 }
518
519 if (c == '\\')
520 {
521 switch (c = scm_getc (port))
522 {
523 case EOF:
524 goto str_eof;
525 case '"':
526 case '\\':
527 break;
528 case '\n':
529 if (SCM_HUNGRY_EOL_ESCAPES_P)
530 skip_intraline_whitespace (port);
531 continue;
532 case '0':
533 c = '\0';
534 break;
535 case 'f':
536 c = '\f';
537 break;
538 case 'n':
539 c = '\n';
540 break;
541 case 'r':
542 c = '\r';
543 break;
544 case 't':
545 c = '\t';
546 break;
547 case 'a':
548 c = '\007';
549 break;
550 case 'v':
551 c = '\v';
552 break;
553 case 'b':
554 c = '\010';
555 break;
556 case 'x':
557 if (SCM_R6RS_ESCAPES_P)
558 SCM_READ_HEX_ESCAPE (10, ';');
559 else
560 SCM_READ_HEX_ESCAPE (2, '\0');
561 break;
562 case 'u':
563 if (!SCM_R6RS_ESCAPES_P)
564 {
565 SCM_READ_HEX_ESCAPE (4, '\0');
566 break;
567 }
568 case 'U':
569 if (!SCM_R6RS_ESCAPES_P)
570 {
571 SCM_READ_HEX_ESCAPE (6, '\0');
572 break;
573 }
574 default:
575 bad_escaped:
576 scm_i_input_error (FUNC_NAME, port,
577 "illegal character in escape sequence: ~S",
578 scm_list_1 (SCM_MAKE_CHAR (c)));
579 }
580 }
581
582 c_str[c_str_len++] = c;
583 }
584
585 if (scm_is_null (str))
586 /* Fast path: we got a string that fits in C_STR. */
587 str = scm_from_utf32_stringn (c_str, c_str_len);
588 else
589 {
590 if (c_str_len > 0)
591 str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
592
593 str = scm_string_concatenate_reverse (str, SCM_UNDEFINED, SCM_UNDEFINED);
594 }
595
596 return maybe_annotate_source (str, port, line, column);
597 }
598 #undef FUNC_NAME
599
600
601 static SCM
602 scm_read_number (scm_t_wchar chr, SCM port)
603 {
604 SCM result, str = SCM_EOL;
605 char local_buffer[READER_BUFFER_SIZE], *buffer;
606 size_t bytes_read;
607 scm_t_port *pt = SCM_PTAB_ENTRY (port);
608
609 /* Need to capture line and column numbers here. */
610 long line = SCM_LINUM (port);
611 int column = SCM_COL (port) - 1;
612
613 scm_ungetc (chr, port);
614 buffer = read_complete_token (port, local_buffer, sizeof local_buffer,
615 &bytes_read);
616
617 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
618
619 result = scm_string_to_number (str, SCM_UNDEFINED);
620 if (scm_is_false (result))
621 {
622 /* Return a symbol instead of a number */
623 if (SCM_CASE_INSENSITIVE_P)
624 str = scm_string_downcase_x (str);
625 result = scm_string_to_symbol (str);
626 }
627 else if (SCM_NIMP (result))
628 result = maybe_annotate_source (result, port, line, column);
629
630 SCM_COL (port) += scm_i_string_length (str);
631 return result;
632 }
633
634 static SCM
635 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
636 {
637 SCM result;
638 int ends_with_colon = 0;
639 size_t bytes_read;
640 int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
641 char local_buffer[READER_BUFFER_SIZE], *buffer;
642 scm_t_port *pt = SCM_PTAB_ENTRY (port);
643 SCM str;
644
645 scm_ungetc (chr, port);
646 buffer = read_complete_token (port, local_buffer, sizeof local_buffer,
647 &bytes_read);
648 if (bytes_read > 0)
649 ends_with_colon = buffer[bytes_read - 1] == ':';
650
651 if (postfix && ends_with_colon && (bytes_read > 1))
652 {
653 str = scm_from_stringn (buffer, bytes_read - 1,
654 pt->encoding, pt->ilseq_handler);
655
656 if (SCM_CASE_INSENSITIVE_P)
657 str = scm_string_downcase_x (str);
658 result = scm_symbol_to_keyword (scm_string_to_symbol (str));
659 }
660 else
661 {
662 str = scm_from_stringn (buffer, bytes_read,
663 pt->encoding, pt->ilseq_handler);
664
665 if (SCM_CASE_INSENSITIVE_P)
666 str = scm_string_downcase_x (str);
667 result = scm_string_to_symbol (str);
668 }
669
670 SCM_COL (port) += scm_i_string_length (str);
671 return result;
672 }
673
674 static SCM
675 scm_read_number_and_radix (scm_t_wchar chr, SCM port)
676 #define FUNC_NAME "scm_lreadr"
677 {
678 SCM result;
679 size_t read;
680 char local_buffer[READER_BUFFER_SIZE], *buffer;
681 unsigned int radix;
682 SCM str;
683 scm_t_port *pt;
684
685 switch (chr)
686 {
687 case 'B':
688 case 'b':
689 radix = 2;
690 break;
691
692 case 'o':
693 case 'O':
694 radix = 8;
695 break;
696
697 case 'd':
698 case 'D':
699 radix = 10;
700 break;
701
702 case 'x':
703 case 'X':
704 radix = 16;
705 break;
706
707 default:
708 scm_ungetc (chr, port);
709 scm_ungetc ('#', port);
710 radix = 10;
711 }
712
713 buffer = read_complete_token (port, local_buffer, sizeof local_buffer,
714 &read);
715
716 pt = SCM_PTAB_ENTRY (port);
717 str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
718
719 result = scm_string_to_number (str, scm_from_uint (radix));
720
721 SCM_COL (port) += scm_i_string_length (str);
722
723 if (scm_is_true (result))
724 return result;
725
726 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
727
728 return SCM_BOOL_F;
729 }
730 #undef FUNC_NAME
731
732 static SCM
733 scm_read_quote (int chr, SCM port)
734 {
735 SCM p;
736 long line = SCM_LINUM (port);
737 int column = SCM_COL (port) - 1;
738
739 switch (chr)
740 {
741 case '`':
742 p = scm_sym_quasiquote;
743 break;
744
745 case '\'':
746 p = scm_sym_quote;
747 break;
748
749 case ',':
750 {
751 scm_t_wchar c;
752
753 c = scm_getc (port);
754 if ('@' == c)
755 p = scm_sym_uq_splicing;
756 else
757 {
758 scm_ungetc (c, port);
759 p = scm_sym_unquote;
760 }
761 break;
762 }
763
764 default:
765 fprintf (stderr, "%s: unhandled quote character (%i)\n",
766 "scm_read_quote", chr);
767 abort ();
768 }
769
770 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
771 return maybe_annotate_source (p, port, line, column);
772 }
773
774 SCM_SYMBOL (sym_syntax, "syntax");
775 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
776 SCM_SYMBOL (sym_unsyntax, "unsyntax");
777 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
778
779 static SCM
780 scm_read_syntax (int chr, SCM port)
781 {
782 SCM p;
783 long line = SCM_LINUM (port);
784 int column = SCM_COL (port) - 1;
785
786 switch (chr)
787 {
788 case '`':
789 p = sym_quasisyntax;
790 break;
791
792 case '\'':
793 p = sym_syntax;
794 break;
795
796 case ',':
797 {
798 int c;
799
800 c = scm_getc (port);
801 if ('@' == c)
802 p = sym_unsyntax_splicing;
803 else
804 {
805 scm_ungetc (c, port);
806 p = sym_unsyntax;
807 }
808 break;
809 }
810
811 default:
812 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
813 "scm_read_syntax", chr);
814 abort ();
815 }
816
817 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
818 return maybe_annotate_source (p, port, line, column);
819 }
820
821 static SCM
822 scm_read_nil (int chr, SCM port)
823 {
824 SCM id = scm_read_mixed_case_symbol (chr, port);
825
826 if (!scm_is_eq (id, sym_nil))
827 scm_i_input_error ("scm_read_nil", port,
828 "unexpected input while reading #nil: ~a",
829 scm_list_1 (id));
830
831 return SCM_ELISP_NIL;
832 }
833
834 static SCM
835 scm_read_semicolon_comment (int chr, SCM port)
836 {
837 int c;
838
839 /* We use the get_byte here because there is no need to get the
840 locale correct with comment input. This presumes that newline
841 always represents itself no matter what the encoding is. */
842 for (c = scm_get_byte_or_eof (port);
843 (c != EOF) && (c != '\n');
844 c = scm_get_byte_or_eof (port));
845
846 return SCM_UNSPECIFIED;
847 }
848
849 \f
850 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
851
852 static SCM
853 scm_read_boolean (int chr, SCM port)
854 {
855 switch (chr)
856 {
857 case 't':
858 case 'T':
859 return SCM_BOOL_T;
860
861 case 'f':
862 case 'F':
863 return SCM_BOOL_F;
864 }
865
866 return SCM_UNSPECIFIED;
867 }
868
869 static SCM
870 scm_read_character (scm_t_wchar chr, SCM port)
871 #define FUNC_NAME "scm_lreadr"
872 {
873 char buffer[READER_CHAR_NAME_MAX_SIZE];
874 SCM charname;
875 size_t charname_len, bytes_read;
876 scm_t_wchar cp;
877 int overflow;
878 scm_t_port *pt;
879
880 overflow = read_token (port, buffer, READER_CHAR_NAME_MAX_SIZE, &bytes_read);
881 if (overflow)
882 scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
883
884 if (bytes_read == 0)
885 {
886 chr = scm_getc (port);
887 if (chr == EOF)
888 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
889 "while reading character", SCM_EOL);
890
891 /* CHR must be a token delimiter, like a whitespace. */
892 return (SCM_MAKE_CHAR (chr));
893 }
894
895 pt = SCM_PTAB_ENTRY (port);
896
897 /* Simple ASCII characters can be processed immediately. Also, simple
898 ISO-8859-1 characters can be processed immediately if the encoding for this
899 port is ISO-8859-1. */
900 if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
901 {
902 SCM_COL (port) += 1;
903 return SCM_MAKE_CHAR (buffer[0]);
904 }
905
906 /* Otherwise, convert the buffer into a proper scheme string for
907 processing. */
908 charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
909 pt->ilseq_handler);
910 charname_len = scm_i_string_length (charname);
911 SCM_COL (port) += charname_len;
912 cp = scm_i_string_ref (charname, 0);
913 if (charname_len == 1)
914 return SCM_MAKE_CHAR (cp);
915
916 /* Ignore dotted circles, which may be used to keep combining characters from
917 combining with the backslash in #\charname. */
918 if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
919 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
920
921 if (cp >= '0' && cp < '8')
922 {
923 /* Dirk:FIXME:: This type of character syntax is not R5RS
924 * compliant. Further, it should be verified that the constant
925 * does only consist of octal digits. */
926 SCM p = scm_string_to_number (charname, scm_from_uint (8));
927 if (SCM_I_INUMP (p))
928 {
929 scm_t_wchar c = scm_to_uint32 (p);
930 if (SCM_IS_UNICODE_CHAR (c))
931 return SCM_MAKE_CHAR (c);
932 else
933 scm_i_input_error (FUNC_NAME, port,
934 "out-of-range octal character escape: ~a",
935 scm_list_1 (charname));
936 }
937 }
938
939 if (cp == 'x' && (charname_len > 1))
940 {
941 SCM p;
942
943 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
944 p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
945 scm_from_uint (16));
946 if (SCM_I_INUMP (p))
947 {
948 scm_t_wchar c = scm_to_uint32 (p);
949 if (SCM_IS_UNICODE_CHAR (c))
950 return SCM_MAKE_CHAR (c);
951 else
952 scm_i_input_error (FUNC_NAME, port,
953 "out-of-range hex character escape: ~a",
954 scm_list_1 (charname));
955 }
956 }
957
958 /* The names of characters should never have non-Latin1
959 characters. */
960 if (scm_i_is_narrow_string (charname)
961 || scm_i_try_narrow_string (charname))
962 { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
963 charname_len);
964 if (scm_is_true (ch))
965 return ch;
966 }
967
968 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
969 scm_list_1 (charname));
970
971 return SCM_UNSPECIFIED;
972 }
973 #undef FUNC_NAME
974
975 static SCM
976 scm_read_keyword (int chr, SCM port)
977 {
978 SCM symbol;
979
980 /* Read the symbol that comprises the keyword. Doing this instead of
981 invoking a specific symbol reader function allows `scm_read_keyword ()'
982 to adapt to the delimiters currently valid of symbols.
983
984 XXX: This implementation allows sloppy syntaxes like `#: key'. */
985 symbol = scm_read_expression (port);
986 if (!scm_is_symbol (symbol))
987 scm_i_input_error ("scm_read_keyword", port,
988 "keyword prefix `~a' not followed by a symbol: ~s",
989 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
990
991 return (scm_symbol_to_keyword (symbol));
992 }
993
994 static SCM
995 scm_read_vector (int chr, SCM port, long line, int column)
996 {
997 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
998 guarantee that it's going to do what we want. After all, this is an
999 implementation detail of `scm_read_vector ()', not a desirable
1000 property. */
1001 return maybe_annotate_source (scm_vector (scm_read_sexp (chr, port)),
1002 port, line, column);
1003 }
1004
1005 static SCM
1006 scm_read_array (int chr, SCM port, long line, int column)
1007 {
1008 SCM result = scm_i_read_array (port, chr);
1009 if (scm_is_false (result))
1010 return result;
1011 else
1012 return maybe_annotate_source (result, port, line, column);
1013 }
1014
1015 static SCM
1016 scm_read_srfi4_vector (int chr, SCM port, long line, int column)
1017 {
1018 return scm_read_array (chr, port, line, column);
1019 }
1020
1021 static SCM
1022 scm_read_bytevector (scm_t_wchar chr, SCM port, long line, int column)
1023 {
1024 chr = scm_getc (port);
1025 if (chr != 'u')
1026 goto syntax;
1027
1028 chr = scm_getc (port);
1029 if (chr != '8')
1030 goto syntax;
1031
1032 chr = scm_getc (port);
1033 if (chr != '(')
1034 goto syntax;
1035
1036 return maybe_annotate_source
1037 (scm_u8_list_to_bytevector (scm_read_sexp (chr, port)),
1038 port, line, column);
1039
1040 syntax:
1041 scm_i_input_error ("read_bytevector", port,
1042 "invalid bytevector prefix",
1043 SCM_MAKE_CHAR (chr));
1044 return SCM_UNSPECIFIED;
1045 }
1046
1047 static SCM
1048 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port, long line, int column)
1049 {
1050 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1051 terribly inefficient but who cares? */
1052 SCM s_bits = SCM_EOL;
1053
1054 for (chr = scm_getc (port);
1055 (chr != EOF) && ((chr == '0') || (chr == '1'));
1056 chr = scm_getc (port))
1057 {
1058 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1059 }
1060
1061 if (chr != EOF)
1062 scm_ungetc (chr, port);
1063
1064 return maybe_annotate_source
1065 (scm_bitvector (scm_reverse_x (s_bits, SCM_EOL)),
1066 port, line, column);
1067 }
1068
1069 static SCM
1070 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1071 {
1072 int bang_seen = 0;
1073
1074 for (;;)
1075 {
1076 int c = scm_getc (port);
1077
1078 if (c == EOF)
1079 scm_i_input_error ("skip_block_comment", port,
1080 "unterminated `#! ... !#' comment", SCM_EOL);
1081
1082 if (c == '!')
1083 bang_seen = 1;
1084 else if (c == '#' && bang_seen)
1085 break;
1086 else
1087 bang_seen = 0;
1088 }
1089
1090 return SCM_UNSPECIFIED;
1091 }
1092
1093 static SCM
1094 scm_read_shebang (scm_t_wchar chr, SCM port)
1095 {
1096 int c = 0;
1097 if ((c = scm_get_byte_or_eof (port)) != 'r')
1098 {
1099 scm_ungetc (c, port);
1100 return scm_read_scsh_block_comment (chr, port);
1101 }
1102 if ((c = scm_get_byte_or_eof (port)) != '6')
1103 {
1104 scm_ungetc (c, port);
1105 scm_ungetc ('r', port);
1106 return scm_read_scsh_block_comment (chr, port);
1107 }
1108 if ((c = scm_get_byte_or_eof (port)) != 'r')
1109 {
1110 scm_ungetc (c, port);
1111 scm_ungetc ('6', port);
1112 scm_ungetc ('r', port);
1113 return scm_read_scsh_block_comment (chr, port);
1114 }
1115 if ((c = scm_get_byte_or_eof (port)) != 's')
1116 {
1117 scm_ungetc (c, port);
1118 scm_ungetc ('r', port);
1119 scm_ungetc ('6', port);
1120 scm_ungetc ('r', port);
1121 return scm_read_scsh_block_comment (chr, port);
1122 }
1123
1124 return SCM_UNSPECIFIED;
1125 }
1126
1127 static SCM
1128 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1129 {
1130 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1131 nested. So care must be taken. */
1132 int nesting_level = 1;
1133
1134 int a = scm_getc (port);
1135
1136 if (a == EOF)
1137 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1138 "unterminated `#| ... |#' comment", SCM_EOL);
1139
1140 while (nesting_level > 0)
1141 {
1142 int b = scm_getc (port);
1143
1144 if (b == EOF)
1145 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1146 "unterminated `#| ... |#' comment", SCM_EOL);
1147
1148 if (a == '|' && b == '#')
1149 {
1150 nesting_level--;
1151 b = EOF;
1152 }
1153 else if (a == '#' && b == '|')
1154 {
1155 nesting_level++;
1156 b = EOF;
1157 }
1158
1159 a = b;
1160 }
1161
1162 return SCM_UNSPECIFIED;
1163 }
1164
1165 static SCM
1166 scm_read_commented_expression (scm_t_wchar chr, SCM port)
1167 {
1168 scm_t_wchar c;
1169
1170 c = flush_ws (port, (char *) NULL);
1171 if (EOF == c)
1172 scm_i_input_error ("read_commented_expression", port,
1173 "no expression after #; comment", SCM_EOL);
1174 scm_ungetc (c, port);
1175 scm_read_expression (port);
1176 return SCM_UNSPECIFIED;
1177 }
1178
1179 static SCM
1180 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1181 {
1182 /* Guile's extended symbol read syntax looks like this:
1183
1184 #{This is all a symbol name}#
1185
1186 So here, CHR is expected to be `{'. */
1187 int saw_brace = 0;
1188 size_t len = 0;
1189 SCM buf = scm_i_make_string (1024, NULL, 0);
1190
1191 buf = scm_i_string_start_writing (buf);
1192
1193 while ((chr = scm_getc (port)) != EOF)
1194 {
1195 if (saw_brace)
1196 {
1197 if (chr == '#')
1198 {
1199 break;
1200 }
1201 else
1202 {
1203 saw_brace = 0;
1204 scm_i_string_set_x (buf, len++, '}');
1205 }
1206 }
1207
1208 if (chr == '}')
1209 saw_brace = 1;
1210 else if (chr == '\\')
1211 {
1212 /* It used to be that print.c would print extended-read-syntax
1213 symbols with backslashes before "non-standard" chars, but
1214 this routine wouldn't do anything with those escapes.
1215 Bummer. What we've done is to change print.c to output
1216 R6RS hex escapes for those characters, relying on the fact
1217 that the extended read syntax would never put a `\' before
1218 an `x'. For now, we just ignore other instances of
1219 backslash in the string. */
1220 switch ((chr = scm_getc (port)))
1221 {
1222 case EOF:
1223 goto done;
1224 case 'x':
1225 {
1226 scm_t_wchar c;
1227
1228 SCM_READ_HEX_ESCAPE (10, ';');
1229 scm_i_string_set_x (buf, len++, c);
1230 break;
1231
1232 str_eof:
1233 chr = EOF;
1234 goto done;
1235
1236 bad_escaped:
1237 scm_i_string_stop_writing ();
1238 scm_i_input_error ("scm_read_extended_symbol", port,
1239 "illegal character in escape sequence: ~S",
1240 scm_list_1 (SCM_MAKE_CHAR (c)));
1241 break;
1242 }
1243 default:
1244 scm_i_string_set_x (buf, len++, chr);
1245 break;
1246 }
1247 }
1248 else
1249 scm_i_string_set_x (buf, len++, chr);
1250
1251 if (len >= scm_i_string_length (buf) - 2)
1252 {
1253 SCM addy;
1254
1255 scm_i_string_stop_writing ();
1256 addy = scm_i_make_string (1024, NULL, 0);
1257 buf = scm_string_append (scm_list_2 (buf, addy));
1258 len = 0;
1259 buf = scm_i_string_start_writing (buf);
1260 }
1261 }
1262
1263 done:
1264 scm_i_string_stop_writing ();
1265 if (chr == EOF)
1266 scm_i_input_error ("scm_read_extended_symbol", port,
1267 "end of file while reading symbol", SCM_EOL);
1268
1269 return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1270 }
1271
1272
1273 \f
1274 /* Top-level token readers, i.e., dispatchers. */
1275
1276 static SCM
1277 scm_read_sharp_extension (int chr, SCM port)
1278 {
1279 SCM proc;
1280
1281 proc = scm_get_hash_procedure (chr);
1282 if (scm_is_true (scm_procedure_p (proc)))
1283 {
1284 long line = SCM_LINUM (port);
1285 int column = SCM_COL (port) - 2;
1286 SCM got;
1287
1288 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1289
1290 if (scm_is_pair (got) && !scm_i_has_source_properties (got))
1291 scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1292
1293 return got;
1294 }
1295
1296 return SCM_UNSPECIFIED;
1297 }
1298
1299 /* The reader for the sharp `#' character. It basically dispatches reads
1300 among the above token readers. */
1301 static SCM
1302 scm_read_sharp (scm_t_wchar chr, SCM port, long line, int column)
1303 #define FUNC_NAME "scm_lreadr"
1304 {
1305 SCM result;
1306
1307 chr = scm_getc (port);
1308
1309 result = scm_read_sharp_extension (chr, port);
1310 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1311 return result;
1312
1313 switch (chr)
1314 {
1315 case '\\':
1316 return (scm_read_character (chr, port));
1317 case '(':
1318 return (scm_read_vector (chr, port, line, column));
1319 case 's':
1320 case 'u':
1321 case 'f':
1322 case 'c':
1323 /* This one may return either a boolean or an SRFI-4 vector. */
1324 return (scm_read_srfi4_vector (chr, port, line, column));
1325 case 'v':
1326 return (scm_read_bytevector (chr, port, line, column));
1327 case '*':
1328 return (scm_read_guile_bit_vector (chr, port, line, column));
1329 case 't':
1330 case 'T':
1331 case 'F':
1332 return (scm_read_boolean (chr, port));
1333 case ':':
1334 return (scm_read_keyword (chr, port));
1335 case '0': case '1': case '2': case '3': case '4':
1336 case '5': case '6': case '7': case '8': case '9':
1337 case '@':
1338 #if SCM_ENABLE_DEPRECATED
1339 /* See below for 'i' and 'e'. */
1340 case 'a':
1341 case 'y':
1342 case 'h':
1343 case 'l':
1344 #endif
1345 return (scm_read_array (chr, port, line, column));
1346
1347 case 'i':
1348 case 'e':
1349 #if SCM_ENABLE_DEPRECATED
1350 {
1351 /* When next char is '(', it really is an old-style
1352 uniform array. */
1353 scm_t_wchar next_c = scm_getc (port);
1354 if (next_c != EOF)
1355 scm_ungetc (next_c, port);
1356 if (next_c == '(')
1357 return scm_read_array (chr, port, line, column);
1358 /* Fall through. */
1359 }
1360 #endif
1361 case 'b':
1362 case 'B':
1363 case 'o':
1364 case 'O':
1365 case 'd':
1366 case 'D':
1367 case 'x':
1368 case 'X':
1369 case 'I':
1370 case 'E':
1371 return (scm_read_number_and_radix (chr, port));
1372 case '{':
1373 return (scm_read_extended_symbol (chr, port));
1374 case '!':
1375 return (scm_read_shebang (chr, port));
1376 case ';':
1377 return (scm_read_commented_expression (chr, port));
1378 case '`':
1379 case '\'':
1380 case ',':
1381 return (scm_read_syntax (chr, port));
1382 case 'n':
1383 return (scm_read_nil (chr, port));
1384 default:
1385 result = scm_read_sharp_extension (chr, port);
1386 if (scm_is_eq (result, SCM_UNSPECIFIED))
1387 {
1388 /* To remain compatible with 1.8 and earlier, the following
1389 characters have lower precedence than `read-hash-extend'
1390 characters. */
1391 switch (chr)
1392 {
1393 case '|':
1394 return scm_read_r6rs_block_comment (chr, port);
1395 default:
1396 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1397 scm_list_1 (SCM_MAKE_CHAR (chr)));
1398 }
1399 }
1400 else
1401 return result;
1402 }
1403
1404 return SCM_UNSPECIFIED;
1405 }
1406 #undef FUNC_NAME
1407
1408 static SCM
1409 scm_read_expression (SCM port)
1410 #define FUNC_NAME "scm_read_expression"
1411 {
1412 while (1)
1413 {
1414 scm_t_wchar chr;
1415
1416 chr = scm_getc (port);
1417
1418 switch (chr)
1419 {
1420 case SCM_WHITE_SPACES:
1421 case SCM_LINE_INCREMENTORS:
1422 break;
1423 case ';':
1424 (void) scm_read_semicolon_comment (chr, port);
1425 break;
1426 case '[':
1427 if (!SCM_SQUARE_BRACKETS_P)
1428 return (scm_read_mixed_case_symbol (chr, port));
1429 /* otherwise fall through */
1430 case '(':
1431 return (scm_read_sexp (chr, port));
1432 case '"':
1433 return (scm_read_string (chr, port));
1434 case '\'':
1435 case '`':
1436 case ',':
1437 return (scm_read_quote (chr, port));
1438 case '#':
1439 {
1440 long line = SCM_LINUM (port);
1441 int column = SCM_COL (port) - 1;
1442 SCM result = scm_read_sharp (chr, port, line, column);
1443 if (scm_is_eq (result, SCM_UNSPECIFIED))
1444 /* We read a comment or some such. */
1445 break;
1446 else
1447 return result;
1448 }
1449 case ')':
1450 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1451 break;
1452 case ']':
1453 if (SCM_SQUARE_BRACKETS_P)
1454 scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1455 /* otherwise fall through */
1456 case EOF:
1457 return SCM_EOF_VAL;
1458 case ':':
1459 if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1460 return scm_symbol_to_keyword (scm_read_expression (port));
1461 /* Fall through. */
1462
1463 default:
1464 {
1465 if (((chr >= '0') && (chr <= '9'))
1466 || (strchr ("+-.", chr)))
1467 return (scm_read_number (chr, port));
1468 else
1469 return (scm_read_mixed_case_symbol (chr, port));
1470 }
1471 }
1472 }
1473 }
1474 #undef FUNC_NAME
1475
1476 \f
1477 /* Actual reader. */
1478
1479 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1480 (SCM port),
1481 "Read an s-expression from the input port @var{port}, or from\n"
1482 "the current input port if @var{port} is not specified.\n"
1483 "Any whitespace before the next token is discarded.")
1484 #define FUNC_NAME s_scm_read
1485 {
1486 int c;
1487
1488 if (SCM_UNBNDP (port))
1489 port = scm_current_input_port ();
1490 SCM_VALIDATE_OPINPORT (1, port);
1491
1492 c = flush_ws (port, (char *) NULL);
1493 if (EOF == c)
1494 return SCM_EOF_VAL;
1495 scm_ungetc (c, port);
1496
1497 return (scm_read_expression (port));
1498 }
1499 #undef FUNC_NAME
1500
1501
1502 \f
1503
1504 /* Manipulate the read-hash-procedures alist. This could be written in
1505 Scheme, but maybe it will also be used by C code during initialisation. */
1506 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1507 (SCM chr, SCM proc),
1508 "Install the procedure @var{proc} for reading expressions\n"
1509 "starting with the character sequence @code{#} and @var{chr}.\n"
1510 "@var{proc} will be called with two arguments: the character\n"
1511 "@var{chr} and the port to read further data from. The object\n"
1512 "returned will be the return value of @code{read}. \n"
1513 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1514 )
1515 #define FUNC_NAME s_scm_read_hash_extend
1516 {
1517 SCM this;
1518 SCM prev;
1519
1520 SCM_VALIDATE_CHAR (1, chr);
1521 SCM_ASSERT (scm_is_false (proc)
1522 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1523 proc, SCM_ARG2, FUNC_NAME);
1524
1525 /* Check if chr is already in the alist. */
1526 this = scm_i_read_hash_procedures_ref ();
1527 prev = SCM_BOOL_F;
1528 while (1)
1529 {
1530 if (scm_is_null (this))
1531 {
1532 /* not found, so add it to the beginning. */
1533 if (scm_is_true (proc))
1534 {
1535 SCM new = scm_cons (scm_cons (chr, proc),
1536 scm_i_read_hash_procedures_ref ());
1537 scm_i_read_hash_procedures_set_x (new);
1538 }
1539 break;
1540 }
1541 if (scm_is_eq (chr, SCM_CAAR (this)))
1542 {
1543 /* already in the alist. */
1544 if (scm_is_false (proc))
1545 {
1546 /* remove it. */
1547 if (scm_is_false (prev))
1548 {
1549 SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1550 scm_i_read_hash_procedures_set_x (rest);
1551 }
1552 else
1553 scm_set_cdr_x (prev, SCM_CDR (this));
1554 }
1555 else
1556 {
1557 /* replace it. */
1558 scm_set_cdr_x (SCM_CAR (this), proc);
1559 }
1560 break;
1561 }
1562 prev = this;
1563 this = SCM_CDR (this);
1564 }
1565
1566 return SCM_UNSPECIFIED;
1567 }
1568 #undef FUNC_NAME
1569
1570 /* Recover the read-hash procedure corresponding to char c. */
1571 static SCM
1572 scm_get_hash_procedure (int c)
1573 {
1574 SCM rest = scm_i_read_hash_procedures_ref ();
1575
1576 while (1)
1577 {
1578 if (scm_is_null (rest))
1579 return SCM_BOOL_F;
1580
1581 if (SCM_CHAR (SCM_CAAR (rest)) == c)
1582 return SCM_CDAR (rest);
1583
1584 rest = SCM_CDR (rest);
1585 }
1586 }
1587
1588 #define SCM_ENCODING_SEARCH_SIZE (500)
1589
1590 /* Search the first few hundred characters of a file for an Emacs-like coding
1591 declaration. Returns either NULL or a string whose storage has been
1592 allocated with `scm_gc_malloc ()'. */
1593 char *
1594 scm_i_scan_for_encoding (SCM port)
1595 {
1596 scm_t_port *pt;
1597 char header[SCM_ENCODING_SEARCH_SIZE+1];
1598 size_t bytes_read, encoding_length, i;
1599 char *encoding = NULL;
1600 int utf8_bom = 0;
1601 char *pos, *encoding_start;
1602 int in_comment;
1603
1604 pt = SCM_PTAB_ENTRY (port);
1605
1606 if (pt->rw_active == SCM_PORT_WRITE)
1607 scm_flush (port);
1608
1609 if (pt->rw_random)
1610 pt->rw_active = SCM_PORT_READ;
1611
1612 if (pt->read_pos == pt->read_end)
1613 {
1614 /* We can use the read buffer, and thus avoid a seek. */
1615 if (scm_fill_input (port) == EOF)
1616 return NULL;
1617
1618 bytes_read = pt->read_end - pt->read_pos;
1619 if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
1620 bytes_read = SCM_ENCODING_SEARCH_SIZE;
1621
1622 if (bytes_read <= 1)
1623 /* An unbuffered port -- don't scan. */
1624 return NULL;
1625
1626 memcpy (header, pt->read_pos, bytes_read);
1627 header[bytes_read] = '\0';
1628 }
1629 else
1630 {
1631 /* Try to read some bytes and then seek back. Not all ports
1632 support seeking back; and indeed some file ports (like
1633 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
1634 check performed by SCM_FPORT_FDES---but fail to seek
1635 backwards. Hence this block comes second. We prefer to use
1636 the read buffer in-place. */
1637 if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1638 return NULL;
1639
1640 bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1641 header[bytes_read] = '\0';
1642 scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1643 }
1644
1645 if (bytes_read > 3
1646 && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1647 utf8_bom = 1;
1648
1649 /* search past "coding[:=]" */
1650 pos = header;
1651 while (1)
1652 {
1653 if ((pos = strstr(pos, "coding")) == NULL)
1654 return NULL;
1655
1656 pos += strlen("coding");
1657 if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1658 (*pos == ':' || *pos == '='))
1659 {
1660 pos ++;
1661 break;
1662 }
1663 }
1664
1665 /* skip spaces */
1666 while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1667 (*pos == ' ' || *pos == '\t'))
1668 pos ++;
1669
1670 /* grab the next token */
1671 encoding_start = pos;
1672 i = 0;
1673 while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
1674 && encoding_start + i - header < bytes_read
1675 && (isalnum ((int) encoding_start[i])
1676 || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
1677 i++;
1678
1679 encoding_length = i;
1680 if (encoding_length == 0)
1681 return NULL;
1682
1683 encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
1684 for (i = 0; i < encoding_length; i++)
1685 encoding[i] = toupper ((int) encoding[i]);
1686
1687 /* push backwards to make sure we were in a comment */
1688 in_comment = 0;
1689 pos = encoding_start;
1690 while (pos >= header)
1691 {
1692 if (*pos == ';')
1693 {
1694 in_comment = 1;
1695 break;
1696 }
1697 else if (*pos == '\n' || pos == header)
1698 {
1699 /* This wasn't in a semicolon comment. Check for a
1700 hash-bang comment. */
1701 char *beg = strstr (header, "#!");
1702 char *end = strstr (header, "!#");
1703 if (beg < encoding_start && encoding_start + encoding_length <= end)
1704 in_comment = 1;
1705 break;
1706 }
1707 else
1708 {
1709 pos --;
1710 continue;
1711 }
1712 }
1713 if (!in_comment)
1714 /* This wasn't in a comment */
1715 return NULL;
1716
1717 if (utf8_bom && strcmp(encoding, "UTF-8"))
1718 scm_misc_error (NULL,
1719 "the port input declares the encoding ~s but is encoded as UTF-8",
1720 scm_list_1 (scm_from_locale_string (encoding)));
1721
1722 return encoding;
1723 }
1724
1725 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1726 (SCM port),
1727 "Scans the port for an Emacs-like character coding declaration\n"
1728 "near the top of the contents of a port with random-accessible contents.\n"
1729 "The coding declaration is of the form\n"
1730 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1731 "\n"
1732 "Returns a string containing the character encoding of the file\n"
1733 "if a declaration was found, or @code{#f} otherwise.\n")
1734 #define FUNC_NAME s_scm_file_encoding
1735 {
1736 char *enc;
1737 SCM s_enc;
1738
1739 SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
1740
1741 enc = scm_i_scan_for_encoding (port);
1742 if (enc == NULL)
1743 return SCM_BOOL_F;
1744 else
1745 {
1746 s_enc = scm_from_locale_string (enc);
1747 return s_enc;
1748 }
1749
1750 return SCM_BOOL_F;
1751 }
1752 #undef FUNC_NAME
1753
1754 void
1755 scm_init_read ()
1756 {
1757 SCM read_hash_procs;
1758
1759 read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
1760
1761 scm_i_read_hash_procedures =
1762 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
1763
1764 scm_init_opts (scm_read_options, scm_read_opts);
1765 #include "libguile/read.x"
1766 }
1767
1768 /*
1769 Local Variables:
1770 c-file-style: "gnu"
1771 End:
1772 */