add guild docs
[bpt/guile.git] / libguile / read.c
1 /* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
2 * Foundation, Inc.
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
18 */
19
20
21 \f
22
23 #ifdef HAVE_CONFIG_H
24 # include <config.h>
25 #endif
26
27 #include <stdio.h>
28 #include <ctype.h>
29 #include <string.h>
30 #include <unistd.h>
31 #include <unicase.h>
32 #include <unictype.h>
33
34 #include "libguile/_scm.h"
35 #include "libguile/bytevectors.h"
36 #include "libguile/chars.h"
37 #include "libguile/eval.h"
38 #include "libguile/arrays.h"
39 #include "libguile/bitvectors.h"
40 #include "libguile/keywords.h"
41 #include "libguile/alist.h"
42 #include "libguile/srcprop.h"
43 #include "libguile/hashtab.h"
44 #include "libguile/hash.h"
45 #include "libguile/ports.h"
46 #include "libguile/fports.h"
47 #include "libguile/root.h"
48 #include "libguile/strings.h"
49 #include "libguile/strports.h"
50 #include "libguile/vectors.h"
51 #include "libguile/validate.h"
52 #include "libguile/srfi-4.h"
53 #include "libguile/srfi-13.h"
54
55 #include "libguile/read.h"
56 #include "libguile/private-options.h"
57
58
59 \f
60
61 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
62 SCM_SYMBOL (scm_keyword_prefix, "prefix");
63 SCM_SYMBOL (scm_keyword_postfix, "postfix");
64 SCM_SYMBOL (sym_nil, "nil");
65
66 scm_t_option scm_read_opts[] = {
67 { SCM_OPTION_BOOLEAN, "copy", 0,
68 "Copy source code expressions." },
69 { SCM_OPTION_BOOLEAN, "positions", 1,
70 "Record positions of source code expressions." },
71 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
72 "Convert symbols to lower case."},
73 { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
74 "Style of keyword recognition: #f, 'prefix or 'postfix."},
75 { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
76 "Use R6RS variable-length character and string hex escapes."},
77 { SCM_OPTION_BOOLEAN, "square-brackets", 1,
78 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
79 { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
80 "In strings, consume leading whitespace after an escaped end-of-line."},
81 { 0, },
82 };
83
84 /*
85 Give meaningful error messages for errors
86
87 We use the format
88
89 FILE:LINE:COL: MESSAGE
90 This happened in ....
91
92 This is not standard GNU format, but the test-suite likes the real
93 message to be in front.
94
95 */
96
97
98 void
99 scm_i_input_error (char const *function,
100 SCM port, const char *message, SCM arg)
101 {
102 SCM fn = (scm_is_string (SCM_FILENAME(port))
103 ? SCM_FILENAME(port)
104 : scm_from_locale_string ("#<unknown port>"));
105
106 SCM string_port = scm_open_output_string ();
107 SCM string = SCM_EOL;
108 scm_simple_format (string_port,
109 scm_from_locale_string ("~A:~S:~S: ~A"),
110 scm_list_4 (fn,
111 scm_from_long (SCM_LINUM (port) + 1),
112 scm_from_int (SCM_COL (port) + 1),
113 scm_from_locale_string (message)));
114
115 string = scm_get_output_string (string_port);
116 scm_close_output_port (string_port);
117 scm_error_scm (scm_from_latin1_symbol ("read-error"),
118 function? scm_from_locale_string (function) : SCM_BOOL_F,
119 string,
120 arg,
121 SCM_BOOL_F);
122 }
123
124
125 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
126 (SCM setting),
127 "Option interface for the read options. Instead of using\n"
128 "this procedure directly, use the procedures @code{read-enable},\n"
129 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
130 #define FUNC_NAME s_scm_read_options
131 {
132 SCM ans = scm_options (setting,
133 scm_read_opts,
134 FUNC_NAME);
135 if (SCM_COPY_SOURCE_P)
136 SCM_RECORD_POSITIONS_P = 1;
137 return ans;
138 }
139 #undef FUNC_NAME
140
141 /* A fluid referring to an association list mapping extra hash
142 characters to procedures. */
143 static SCM *scm_i_read_hash_procedures;
144
145 static inline SCM
146 scm_i_read_hash_procedures_ref (void)
147 {
148 return scm_fluid_ref (*scm_i_read_hash_procedures);
149 }
150
151 static inline void
152 scm_i_read_hash_procedures_set_x (SCM value)
153 {
154 scm_fluid_set_x (*scm_i_read_hash_procedures, value);
155 }
156
157 \f
158 /* Token readers. */
159
160
161 /* Size of the C buffer used to read symbols and numbers. */
162 #define READER_BUFFER_SIZE 128
163
164 /* Size of the C buffer used to read strings. */
165 #define READER_STRING_BUFFER_SIZE 512
166
167 /* The maximum size of Scheme character names. */
168 #define READER_CHAR_NAME_MAX_SIZE 50
169
170
171 /* `isblank' is only in C99. */
172 #define CHAR_IS_BLANK_(_chr) \
173 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
174 || ((_chr) == '\f') || ((_chr) == '\r'))
175
176 #ifdef MSDOS
177 # define CHAR_IS_BLANK(_chr) \
178 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
179 #else
180 # define CHAR_IS_BLANK CHAR_IS_BLANK_
181 #endif
182
183
184 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
185 structure''). */
186 #define CHAR_IS_R5RS_DELIMITER(c) \
187 (CHAR_IS_BLANK (c) \
188 || (c == ')') || (c == '(') || (c == ';') || (c == '"') \
189 || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
190
191 #define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
192
193 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
194 Structure''. */
195 #define CHAR_IS_EXPONENT_MARKER(_chr) \
196 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
197 || ((_chr) == 'd') || ((_chr) == 'l'))
198
199 /* Read an SCSH block comment. */
200 static inline SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
201 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
202 static SCM scm_read_commented_expression (scm_t_wchar, SCM);
203 static SCM scm_read_shebang (scm_t_wchar, SCM);
204 static SCM scm_get_hash_procedure (int);
205
206 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
207 result in the pre-allocated buffer BUF. Return zero if the whole token has
208 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
209 bytes actually read. */
210 static inline int
211 read_token (SCM port, char *buf, const size_t buf_size, size_t *read)
212 {
213 *read = 0;
214
215 while (*read < buf_size)
216 {
217 int chr;
218
219 chr = scm_get_byte_or_eof (port);
220
221 if (chr == EOF)
222 return 0;
223 else if (CHAR_IS_DELIMITER (chr))
224 {
225 scm_unget_byte (chr, port);
226 return 0;
227 }
228 else
229 {
230 *buf = (char) chr;
231 buf++, (*read)++;
232 }
233 }
234
235 return 1;
236 }
237
238 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
239 result in the pre-allocated buffer BUFFER, if the whole token has fewer than
240 BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
241 caller. Return zero if the token fits in BUFFER, non-zero otherwise. READ
242 will be set the number of bytes actually read. */
243 static int
244 read_complete_token (SCM port, char *buffer, const size_t buffer_size,
245 char **overflow_buffer, size_t *read)
246 {
247 int overflow = 0;
248 size_t bytes_read, overflow_size;
249
250 *overflow_buffer = NULL;
251 overflow_size = 0;
252
253 do
254 {
255 overflow = read_token (port, buffer, buffer_size, &bytes_read);
256 if (bytes_read == 0)
257 break;
258 if (overflow || overflow_size != 0)
259 {
260 if (overflow_size == 0)
261 {
262 *overflow_buffer = scm_malloc (bytes_read);
263 memcpy (*overflow_buffer, buffer, bytes_read);
264 overflow_size = bytes_read;
265 }
266 else
267 {
268 *overflow_buffer = scm_realloc (*overflow_buffer, overflow_size + bytes_read);
269 memcpy (*overflow_buffer + overflow_size, buffer, bytes_read);
270 overflow_size += bytes_read;
271 }
272 }
273 }
274 while (overflow);
275
276 if (overflow_size)
277 *read = overflow_size;
278 else
279 *read = bytes_read;
280
281 return (overflow_size != 0);
282 }
283
284 /* Skip whitespace from PORT and return the first non-whitespace character
285 read. Raise an error on end-of-file. */
286 static int
287 flush_ws (SCM port, const char *eoferr)
288 {
289 register scm_t_wchar c;
290 while (1)
291 switch (c = scm_getc (port))
292 {
293 case EOF:
294 goteof:
295 if (eoferr)
296 {
297 scm_i_input_error (eoferr,
298 port,
299 "end of file",
300 SCM_EOL);
301 }
302 return c;
303
304 case ';':
305 lp:
306 switch (c = scm_getc (port))
307 {
308 case EOF:
309 goto goteof;
310 default:
311 goto lp;
312 case SCM_LINE_INCREMENTORS:
313 break;
314 }
315 break;
316
317 case '#':
318 switch (c = scm_getc (port))
319 {
320 case EOF:
321 eoferr = "read_sharp";
322 goto goteof;
323 case '!':
324 scm_read_shebang (c, port);
325 break;
326 case ';':
327 scm_read_commented_expression (c, port);
328 break;
329 case '|':
330 if (scm_is_false (scm_get_hash_procedure (c)))
331 {
332 scm_read_r6rs_block_comment (c, port);
333 break;
334 }
335 /* fall through */
336 default:
337 scm_ungetc (c, port);
338 return '#';
339 }
340 break;
341
342 case SCM_LINE_INCREMENTORS:
343 case SCM_SINGLE_SPACES:
344 case '\t':
345 break;
346
347 default:
348 return c;
349 }
350
351 return 0;
352 }
353
354
355 \f
356 /* Token readers. */
357
358 static SCM scm_read_expression (SCM port);
359 static SCM scm_read_sharp (int chr, SCM port);
360
361
362 static SCM
363 scm_read_sexp (scm_t_wchar chr, SCM port)
364 #define FUNC_NAME "scm_i_lreadparen"
365 {
366 int c;
367 SCM tmp, tl, ans = SCM_EOL;
368 const int terminating_char = ((chr == '[') ? ']' : ')');
369
370 /* Need to capture line and column numbers here. */
371 long line = SCM_LINUM (port);
372 int column = SCM_COL (port) - 1;
373
374 c = flush_ws (port, FUNC_NAME);
375 if (terminating_char == c)
376 return SCM_EOL;
377
378 scm_ungetc (c, port);
379 if (scm_is_eq (scm_sym_dot,
380 (tmp = scm_read_expression (port))))
381 {
382 ans = scm_read_expression (port);
383 if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
384 scm_i_input_error (FUNC_NAME, port, "missing close paren",
385 SCM_EOL);
386 return ans;
387 }
388
389 /* Build the head of the list structure. */
390 ans = tl = scm_cons (tmp, SCM_EOL);
391
392 while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
393 {
394 SCM new_tail;
395
396 if (c == ')' || (SCM_SQUARE_BRACKETS_P && c == ']'))
397 scm_i_input_error (FUNC_NAME, port,
398 "in pair: mismatched close paren: ~A",
399 scm_list_1 (SCM_MAKE_CHAR (c)));
400
401 scm_ungetc (c, port);
402 tmp = scm_read_expression (port);
403
404 if (scm_is_eq (scm_sym_dot, tmp))
405 {
406 SCM_SETCDR (tl, tmp = scm_read_expression (port));
407
408 c = flush_ws (port, FUNC_NAME);
409 if (terminating_char != c)
410 scm_i_input_error (FUNC_NAME, port,
411 "in pair: missing close paren", SCM_EOL);
412 goto exit;
413 }
414
415 new_tail = scm_cons (tmp, SCM_EOL);
416 SCM_SETCDR (tl, new_tail);
417 tl = new_tail;
418 }
419
420 exit:
421 if (SCM_RECORD_POSITIONS_P)
422 scm_i_set_source_properties_x (ans, line, column, SCM_FILENAME (port));
423
424 return ans;
425 }
426 #undef FUNC_NAME
427
428
429 /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
430 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
431 found. */
432 #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
433 do \
434 { \
435 scm_t_wchar a; \
436 size_t i = 0; \
437 c = 0; \
438 while (i < ndigits) \
439 { \
440 a = scm_getc (port); \
441 if (a == EOF) \
442 goto str_eof; \
443 if (terminator \
444 && (a == (scm_t_wchar) terminator) \
445 && (i > 0)) \
446 break; \
447 if ('0' <= a && a <= '9') \
448 a -= '0'; \
449 else if ('A' <= a && a <= 'F') \
450 a = a - 'A' + 10; \
451 else if ('a' <= a && a <= 'f') \
452 a = a - 'a' + 10; \
453 else \
454 { \
455 c = a; \
456 goto bad_escaped; \
457 } \
458 c = c * 16 + a; \
459 i ++; \
460 } \
461 } while (0)
462
463 static void
464 skip_intraline_whitespace (SCM port)
465 {
466 scm_t_wchar c;
467
468 do
469 {
470 c = scm_getc (port);
471 if (c == EOF)
472 return;
473 }
474 while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
475
476 scm_ungetc (c, port);
477 }
478
479 static SCM
480 scm_read_string (int chr, SCM port)
481 #define FUNC_NAME "scm_lreadr"
482 {
483 /* For strings smaller than C_STR, this function creates only one Scheme
484 object (the string returned). */
485
486 SCM str = SCM_BOOL_F;
487 unsigned c_str_len = 0;
488 scm_t_wchar c;
489
490 str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
491 while ('"' != (c = scm_getc (port)))
492 {
493 if (c == EOF)
494 {
495 str_eof:
496 scm_i_input_error (FUNC_NAME, port,
497 "end of file in string constant", SCM_EOL);
498 }
499
500 if (c_str_len + 1 >= scm_i_string_length (str))
501 {
502 SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
503
504 str = scm_string_append (scm_list_2 (str, addy));
505 }
506
507 if (c == '\\')
508 {
509 switch (c = scm_getc (port))
510 {
511 case EOF:
512 goto str_eof;
513 case '"':
514 case '\\':
515 break;
516 case '\n':
517 if (SCM_HUNGRY_EOL_ESCAPES_P)
518 skip_intraline_whitespace (port);
519 continue;
520 case '0':
521 c = '\0';
522 break;
523 case 'f':
524 c = '\f';
525 break;
526 case 'n':
527 c = '\n';
528 break;
529 case 'r':
530 c = '\r';
531 break;
532 case 't':
533 c = '\t';
534 break;
535 case 'a':
536 c = '\007';
537 break;
538 case 'v':
539 c = '\v';
540 break;
541 case 'b':
542 c = '\010';
543 break;
544 case 'x':
545 if (SCM_R6RS_ESCAPES_P)
546 SCM_READ_HEX_ESCAPE (10, ';');
547 else
548 SCM_READ_HEX_ESCAPE (2, '\0');
549 break;
550 case 'u':
551 if (!SCM_R6RS_ESCAPES_P)
552 {
553 SCM_READ_HEX_ESCAPE (4, '\0');
554 break;
555 }
556 case 'U':
557 if (!SCM_R6RS_ESCAPES_P)
558 {
559 SCM_READ_HEX_ESCAPE (6, '\0');
560 break;
561 }
562 default:
563 bad_escaped:
564 scm_i_input_error (FUNC_NAME, port,
565 "illegal character in escape sequence: ~S",
566 scm_list_1 (SCM_MAKE_CHAR (c)));
567 }
568 }
569 str = scm_i_string_start_writing (str);
570 scm_i_string_set_x (str, c_str_len++, c);
571 scm_i_string_stop_writing ();
572 }
573
574 if (c_str_len > 0)
575 {
576 return scm_i_substring_copy (str, 0, c_str_len);
577 }
578
579 return scm_nullstr;
580 }
581 #undef FUNC_NAME
582
583
584 static SCM
585 scm_read_number (scm_t_wchar chr, SCM port)
586 {
587 SCM result, str = SCM_EOL;
588 char buffer[READER_BUFFER_SIZE];
589 char *overflow_buffer = NULL;
590 size_t bytes_read;
591 int overflow;
592 scm_t_port *pt = SCM_PTAB_ENTRY (port);
593
594 scm_ungetc (chr, port);
595 overflow = read_complete_token (port, buffer, sizeof (buffer),
596 &overflow_buffer, &bytes_read);
597
598 if (!overflow)
599 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
600 else
601 str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
602 pt->ilseq_handler);
603
604 result = scm_string_to_number (str, SCM_UNDEFINED);
605 if (!scm_is_true (result))
606 {
607 /* Return a symbol instead of a number */
608 if (SCM_CASE_INSENSITIVE_P)
609 str = scm_string_downcase_x (str);
610 result = scm_string_to_symbol (str);
611 }
612
613 if (overflow)
614 free (overflow_buffer);
615 SCM_COL (port) += scm_i_string_length (str);
616 return result;
617 }
618
619 static SCM
620 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
621 {
622 SCM result;
623 int ends_with_colon = 0;
624 size_t bytes_read;
625 int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
626 int overflow;
627 char buffer[READER_BUFFER_SIZE], *overflow_buffer;
628 scm_t_port *pt = SCM_PTAB_ENTRY (port);
629 SCM str;
630
631 scm_ungetc (chr, port);
632 overflow = read_complete_token (port, buffer, READER_BUFFER_SIZE,
633 &overflow_buffer, &bytes_read);
634 if (bytes_read > 0)
635 {
636 if (!overflow)
637 ends_with_colon = buffer[bytes_read - 1] == ':';
638 else
639 ends_with_colon = overflow_buffer[bytes_read - 1] == ':';
640 }
641
642 if (postfix && ends_with_colon && (bytes_read > 1))
643 {
644 if (!overflow)
645 str = scm_from_stringn (buffer, bytes_read - 1, pt->encoding, pt->ilseq_handler);
646 else
647 str = scm_from_stringn (overflow_buffer, bytes_read - 1, pt->encoding,
648 pt->ilseq_handler);
649
650 if (SCM_CASE_INSENSITIVE_P)
651 str = scm_string_downcase_x (str);
652 result = scm_symbol_to_keyword (scm_string_to_symbol (str));
653 }
654 else
655 {
656 if (!overflow)
657 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
658 else
659 str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
660 pt->ilseq_handler);
661
662 if (SCM_CASE_INSENSITIVE_P)
663 str = scm_string_downcase_x (str);
664 result = scm_string_to_symbol (str);
665 }
666
667 if (overflow)
668 free (overflow_buffer);
669 SCM_COL (port) += scm_i_string_length (str);
670 return result;
671 }
672
673 static SCM
674 scm_read_number_and_radix (scm_t_wchar chr, SCM port)
675 #define FUNC_NAME "scm_lreadr"
676 {
677 SCM result;
678 size_t read;
679 char buffer[READER_BUFFER_SIZE], *overflow_buffer;
680 int overflow;
681 unsigned int radix;
682 SCM str;
683 scm_t_port *pt;
684
685 switch (chr)
686 {
687 case 'B':
688 case 'b':
689 radix = 2;
690 break;
691
692 case 'o':
693 case 'O':
694 radix = 8;
695 break;
696
697 case 'd':
698 case 'D':
699 radix = 10;
700 break;
701
702 case 'x':
703 case 'X':
704 radix = 16;
705 break;
706
707 default:
708 scm_ungetc (chr, port);
709 scm_ungetc ('#', port);
710 radix = 10;
711 }
712
713 overflow = read_complete_token (port, buffer, sizeof (buffer),
714 &overflow_buffer, &read);
715
716 pt = SCM_PTAB_ENTRY (port);
717 if (!overflow)
718 str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
719 else
720 str = scm_from_stringn (overflow_buffer, read, pt->encoding,
721 pt->ilseq_handler);
722
723 result = scm_string_to_number (str, scm_from_uint (radix));
724
725 if (overflow)
726 free (overflow_buffer);
727
728 SCM_COL (port) += scm_i_string_length (str);
729
730 if (scm_is_true (result))
731 return result;
732
733 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
734
735 return SCM_BOOL_F;
736 }
737 #undef FUNC_NAME
738
739 static SCM
740 scm_read_quote (int chr, SCM port)
741 {
742 SCM p;
743 long line = SCM_LINUM (port);
744 int column = SCM_COL (port) - 1;
745
746 switch (chr)
747 {
748 case '`':
749 p = scm_sym_quasiquote;
750 break;
751
752 case '\'':
753 p = scm_sym_quote;
754 break;
755
756 case ',':
757 {
758 scm_t_wchar c;
759
760 c = scm_getc (port);
761 if ('@' == c)
762 p = scm_sym_uq_splicing;
763 else
764 {
765 scm_ungetc (c, port);
766 p = scm_sym_unquote;
767 }
768 break;
769 }
770
771 default:
772 fprintf (stderr, "%s: unhandled quote character (%i)\n",
773 "scm_read_quote", chr);
774 abort ();
775 }
776
777 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
778 if (SCM_RECORD_POSITIONS_P)
779 scm_i_set_source_properties_x (p, line, column, SCM_FILENAME (port));
780
781 return p;
782 }
783
784 SCM_SYMBOL (sym_syntax, "syntax");
785 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
786 SCM_SYMBOL (sym_unsyntax, "unsyntax");
787 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
788
789 static SCM
790 scm_read_syntax (int chr, SCM port)
791 {
792 SCM p;
793 long line = SCM_LINUM (port);
794 int column = SCM_COL (port) - 1;
795
796 switch (chr)
797 {
798 case '`':
799 p = sym_quasisyntax;
800 break;
801
802 case '\'':
803 p = sym_syntax;
804 break;
805
806 case ',':
807 {
808 int c;
809
810 c = scm_getc (port);
811 if ('@' == c)
812 p = sym_unsyntax_splicing;
813 else
814 {
815 scm_ungetc (c, port);
816 p = sym_unsyntax;
817 }
818 break;
819 }
820
821 default:
822 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
823 "scm_read_syntax", chr);
824 abort ();
825 }
826
827 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
828 if (SCM_RECORD_POSITIONS_P)
829 scm_i_set_source_properties_x (p, line, column, SCM_FILENAME (port));
830
831 return p;
832 }
833
834 static inline SCM
835 scm_read_nil (int chr, SCM port)
836 {
837 SCM id = scm_read_mixed_case_symbol (chr, port);
838
839 if (!scm_is_eq (id, sym_nil))
840 scm_i_input_error ("scm_read_nil", port,
841 "unexpected input while reading #nil: ~a",
842 scm_list_1 (id));
843
844 return SCM_ELISP_NIL;
845 }
846
847 static inline SCM
848 scm_read_semicolon_comment (int chr, SCM port)
849 {
850 int c;
851
852 /* We use the get_byte here because there is no need to get the
853 locale correct with comment input. This presumes that newline
854 always represents itself no matter what the encoding is. */
855 for (c = scm_get_byte_or_eof (port);
856 (c != EOF) && (c != '\n');
857 c = scm_get_byte_or_eof (port));
858
859 return SCM_UNSPECIFIED;
860 }
861
862 \f
863 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
864
865 static SCM
866 scm_read_boolean (int chr, SCM port)
867 {
868 switch (chr)
869 {
870 case 't':
871 case 'T':
872 return SCM_BOOL_T;
873
874 case 'f':
875 case 'F':
876 return SCM_BOOL_F;
877 }
878
879 return SCM_UNSPECIFIED;
880 }
881
882 static SCM
883 scm_read_character (scm_t_wchar chr, SCM port)
884 #define FUNC_NAME "scm_lreadr"
885 {
886 char buffer[READER_CHAR_NAME_MAX_SIZE];
887 SCM charname;
888 size_t charname_len, bytes_read;
889 scm_t_wchar cp;
890 int overflow;
891 scm_t_port *pt;
892
893 overflow = read_token (port, buffer, READER_CHAR_NAME_MAX_SIZE, &bytes_read);
894 if (overflow)
895 scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
896
897 if (bytes_read == 0)
898 {
899 chr = scm_getc (port);
900 if (chr == EOF)
901 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
902 "while reading character", SCM_EOL);
903
904 /* CHR must be a token delimiter, like a whitespace. */
905 return (SCM_MAKE_CHAR (chr));
906 }
907
908 pt = SCM_PTAB_ENTRY (port);
909
910 /* Simple ASCII characters can be processed immediately. Also, simple
911 ISO-8859-1 characters can be processed immediately if the encoding for this
912 port is ISO-8859-1. */
913 if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
914 {
915 SCM_COL (port) += 1;
916 return SCM_MAKE_CHAR (buffer[0]);
917 }
918
919 /* Otherwise, convert the buffer into a proper scheme string for
920 processing. */
921 charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
922 pt->ilseq_handler);
923 charname_len = scm_i_string_length (charname);
924 SCM_COL (port) += charname_len;
925 cp = scm_i_string_ref (charname, 0);
926 if (charname_len == 1)
927 return SCM_MAKE_CHAR (cp);
928
929 /* Ignore dotted circles, which may be used to keep combining characters from
930 combining with the backslash in #\charname. */
931 if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
932 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
933
934 if (cp >= '0' && cp < '8')
935 {
936 /* Dirk:FIXME:: This type of character syntax is not R5RS
937 * compliant. Further, it should be verified that the constant
938 * does only consist of octal digits. */
939 SCM p = scm_string_to_number (charname, scm_from_uint (8));
940 if (SCM_I_INUMP (p))
941 {
942 scm_t_wchar c = scm_to_uint32 (p);
943 if (SCM_IS_UNICODE_CHAR (c))
944 return SCM_MAKE_CHAR (c);
945 else
946 scm_i_input_error (FUNC_NAME, port,
947 "out-of-range octal character escape: ~a",
948 scm_list_1 (charname));
949 }
950 }
951
952 if (cp == 'x' && (charname_len > 1))
953 {
954 SCM p;
955
956 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
957 p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
958 scm_from_uint (16));
959 if (SCM_I_INUMP (p))
960 {
961 scm_t_wchar c = scm_to_uint32 (p);
962 if (SCM_IS_UNICODE_CHAR (c))
963 return SCM_MAKE_CHAR (c);
964 else
965 scm_i_input_error (FUNC_NAME, port,
966 "out-of-range hex character escape: ~a",
967 scm_list_1 (charname));
968 }
969 }
970
971 /* The names of characters should never have non-Latin1
972 characters. */
973 if (scm_i_is_narrow_string (charname)
974 || scm_i_try_narrow_string (charname))
975 { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
976 charname_len);
977 if (scm_is_true (ch))
978 return ch;
979 }
980
981 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
982 scm_list_1 (charname));
983
984 return SCM_UNSPECIFIED;
985 }
986 #undef FUNC_NAME
987
988 static inline SCM
989 scm_read_keyword (int chr, SCM port)
990 {
991 SCM symbol;
992
993 /* Read the symbol that comprises the keyword. Doing this instead of
994 invoking a specific symbol reader function allows `scm_read_keyword ()'
995 to adapt to the delimiters currently valid of symbols.
996
997 XXX: This implementation allows sloppy syntaxes like `#: key'. */
998 symbol = scm_read_expression (port);
999 if (!scm_is_symbol (symbol))
1000 scm_i_input_error ("scm_read_keyword", port,
1001 "keyword prefix `~a' not followed by a symbol: ~s",
1002 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1003
1004 return (scm_symbol_to_keyword (symbol));
1005 }
1006
1007 static inline SCM
1008 scm_read_vector (int chr, SCM port)
1009 {
1010 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1011 guarantee that it's going to do what we want. After all, this is an
1012 implementation detail of `scm_read_vector ()', not a desirable
1013 property. */
1014 return (scm_vector (scm_read_sexp (chr, port)));
1015 }
1016
1017 static inline SCM
1018 scm_read_srfi4_vector (int chr, SCM port)
1019 {
1020 return scm_i_read_array (port, chr);
1021 }
1022
1023 static SCM
1024 scm_read_bytevector (scm_t_wchar chr, SCM port)
1025 {
1026 chr = scm_getc (port);
1027 if (chr != 'u')
1028 goto syntax;
1029
1030 chr = scm_getc (port);
1031 if (chr != '8')
1032 goto syntax;
1033
1034 chr = scm_getc (port);
1035 if (chr != '(')
1036 goto syntax;
1037
1038 return scm_u8_list_to_bytevector (scm_read_sexp (chr, port));
1039
1040 syntax:
1041 scm_i_input_error ("read_bytevector", port,
1042 "invalid bytevector prefix",
1043 SCM_MAKE_CHAR (chr));
1044 return SCM_UNSPECIFIED;
1045 }
1046
1047 static SCM
1048 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port)
1049 {
1050 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1051 terribly inefficient but who cares? */
1052 SCM s_bits = SCM_EOL;
1053
1054 for (chr = scm_getc (port);
1055 (chr != EOF) && ((chr == '0') || (chr == '1'));
1056 chr = scm_getc (port))
1057 {
1058 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1059 }
1060
1061 if (chr != EOF)
1062 scm_ungetc (chr, port);
1063
1064 return scm_bitvector (scm_reverse_x (s_bits, SCM_EOL));
1065 }
1066
1067 static inline SCM
1068 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1069 {
1070 int bang_seen = 0;
1071
1072 for (;;)
1073 {
1074 int c = scm_getc (port);
1075
1076 if (c == EOF)
1077 scm_i_input_error ("skip_block_comment", port,
1078 "unterminated `#! ... !#' comment", SCM_EOL);
1079
1080 if (c == '!')
1081 bang_seen = 1;
1082 else if (c == '#' && bang_seen)
1083 break;
1084 else
1085 bang_seen = 0;
1086 }
1087
1088 return SCM_UNSPECIFIED;
1089 }
1090
1091 static SCM
1092 scm_read_shebang (scm_t_wchar chr, SCM port)
1093 {
1094 int c = 0;
1095 if ((c = scm_get_byte_or_eof (port)) != 'r')
1096 {
1097 scm_ungetc (c, port);
1098 return scm_read_scsh_block_comment (chr, port);
1099 }
1100 if ((c = scm_get_byte_or_eof (port)) != '6')
1101 {
1102 scm_ungetc (c, port);
1103 scm_ungetc ('r', port);
1104 return scm_read_scsh_block_comment (chr, port);
1105 }
1106 if ((c = scm_get_byte_or_eof (port)) != 'r')
1107 {
1108 scm_ungetc (c, port);
1109 scm_ungetc ('6', port);
1110 scm_ungetc ('r', port);
1111 return scm_read_scsh_block_comment (chr, port);
1112 }
1113 if ((c = scm_get_byte_or_eof (port)) != 's')
1114 {
1115 scm_ungetc (c, port);
1116 scm_ungetc ('r', port);
1117 scm_ungetc ('6', port);
1118 scm_ungetc ('r', port);
1119 return scm_read_scsh_block_comment (chr, port);
1120 }
1121
1122 return SCM_UNSPECIFIED;
1123 }
1124
1125 static SCM
1126 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1127 {
1128 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1129 nested. So care must be taken. */
1130 int nesting_level = 1;
1131 int opening_seen = 0, closing_seen = 0;
1132
1133 while (nesting_level > 0)
1134 {
1135 int c = scm_getc (port);
1136
1137 if (c == EOF)
1138 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1139 "unterminated `#| ... |#' comment", SCM_EOL);
1140
1141 if (opening_seen)
1142 {
1143 if (c == '|')
1144 nesting_level++;
1145 opening_seen = 0;
1146 }
1147 else if (closing_seen)
1148 {
1149 if (c == '#')
1150 nesting_level--;
1151 closing_seen = 0;
1152 }
1153 else if (c == '|')
1154 closing_seen = 1;
1155 else if (c == '#')
1156 opening_seen = 1;
1157 else
1158 opening_seen = closing_seen = 0;
1159 }
1160
1161 return SCM_UNSPECIFIED;
1162 }
1163
1164 static SCM
1165 scm_read_commented_expression (scm_t_wchar chr, SCM port)
1166 {
1167 scm_t_wchar c;
1168
1169 c = flush_ws (port, (char *) NULL);
1170 if (EOF == c)
1171 scm_i_input_error ("read_commented_expression", port,
1172 "no expression after #; comment", SCM_EOL);
1173 scm_ungetc (c, port);
1174 scm_read_expression (port);
1175 return SCM_UNSPECIFIED;
1176 }
1177
1178 static SCM
1179 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1180 {
1181 /* Guile's extended symbol read syntax looks like this:
1182
1183 #{This is all a symbol name}#
1184
1185 So here, CHR is expected to be `{'. */
1186 int saw_brace = 0;
1187 size_t len = 0;
1188 SCM buf = scm_i_make_string (1024, NULL, 0);
1189
1190 buf = scm_i_string_start_writing (buf);
1191
1192 while ((chr = scm_getc (port)) != EOF)
1193 {
1194 if (saw_brace)
1195 {
1196 if (chr == '#')
1197 {
1198 break;
1199 }
1200 else
1201 {
1202 saw_brace = 0;
1203 scm_i_string_set_x (buf, len++, '}');
1204 }
1205 }
1206
1207 if (chr == '}')
1208 saw_brace = 1;
1209 else if (chr == '\\')
1210 {
1211 /* It used to be that print.c would print extended-read-syntax
1212 symbols with backslashes before "non-standard" chars, but
1213 this routine wouldn't do anything with those escapes.
1214 Bummer. What we've done is to change print.c to output
1215 R6RS hex escapes for those characters, relying on the fact
1216 that the extended read syntax would never put a `\' before
1217 an `x'. For now, we just ignore other instances of
1218 backslash in the string. */
1219 switch ((chr = scm_getc (port)))
1220 {
1221 case EOF:
1222 goto done;
1223 case 'x':
1224 {
1225 scm_t_wchar c;
1226
1227 SCM_READ_HEX_ESCAPE (10, ';');
1228 scm_i_string_set_x (buf, len++, c);
1229 break;
1230
1231 str_eof:
1232 chr = EOF;
1233 goto done;
1234
1235 bad_escaped:
1236 scm_i_string_stop_writing ();
1237 scm_i_input_error ("scm_read_extended_symbol", port,
1238 "illegal character in escape sequence: ~S",
1239 scm_list_1 (SCM_MAKE_CHAR (c)));
1240 break;
1241 }
1242 default:
1243 scm_i_string_set_x (buf, len++, chr);
1244 break;
1245 }
1246 }
1247 else
1248 scm_i_string_set_x (buf, len++, chr);
1249
1250 if (len >= scm_i_string_length (buf) - 2)
1251 {
1252 SCM addy;
1253
1254 scm_i_string_stop_writing ();
1255 addy = scm_i_make_string (1024, NULL, 0);
1256 buf = scm_string_append (scm_list_2 (buf, addy));
1257 len = 0;
1258 buf = scm_i_string_start_writing (buf);
1259 }
1260 }
1261
1262 done:
1263 scm_i_string_stop_writing ();
1264 if (chr == EOF)
1265 scm_i_input_error ("scm_read_extended_symbol", port,
1266 "end of file while reading symbol", SCM_EOL);
1267
1268 return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1269 }
1270
1271
1272 \f
1273 /* Top-level token readers, i.e., dispatchers. */
1274
1275 static SCM
1276 scm_read_sharp_extension (int chr, SCM port)
1277 {
1278 SCM proc;
1279
1280 proc = scm_get_hash_procedure (chr);
1281 if (scm_is_true (scm_procedure_p (proc)))
1282 {
1283 long line = SCM_LINUM (port);
1284 int column = SCM_COL (port) - 2;
1285 SCM got;
1286
1287 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1288
1289 if (scm_is_pair (got) && !scm_i_has_source_properties (got))
1290 scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1291
1292 return got;
1293 }
1294
1295 return SCM_UNSPECIFIED;
1296 }
1297
1298 /* The reader for the sharp `#' character. It basically dispatches reads
1299 among the above token readers. */
1300 static SCM
1301 scm_read_sharp (scm_t_wchar chr, SCM port)
1302 #define FUNC_NAME "scm_lreadr"
1303 {
1304 SCM result;
1305
1306 chr = scm_getc (port);
1307
1308 result = scm_read_sharp_extension (chr, port);
1309 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1310 return result;
1311
1312 switch (chr)
1313 {
1314 case '\\':
1315 return (scm_read_character (chr, port));
1316 case '(':
1317 return (scm_read_vector (chr, port));
1318 case 's':
1319 case 'u':
1320 case 'f':
1321 case 'c':
1322 /* This one may return either a boolean or an SRFI-4 vector. */
1323 return (scm_read_srfi4_vector (chr, port));
1324 case 'v':
1325 return (scm_read_bytevector (chr, port));
1326 case '*':
1327 return (scm_read_guile_bit_vector (chr, port));
1328 case 't':
1329 case 'T':
1330 case 'F':
1331 /* This one may return either a boolean or an SRFI-4 vector. */
1332 return (scm_read_boolean (chr, port));
1333 case ':':
1334 return (scm_read_keyword (chr, port));
1335 case '0': case '1': case '2': case '3': case '4':
1336 case '5': case '6': case '7': case '8': case '9':
1337 case '@':
1338 #if SCM_ENABLE_DEPRECATED
1339 /* See below for 'i' and 'e'. */
1340 case 'a':
1341 case 'y':
1342 case 'h':
1343 case 'l':
1344 #endif
1345 return (scm_i_read_array (port, chr));
1346
1347 case 'i':
1348 case 'e':
1349 #if SCM_ENABLE_DEPRECATED
1350 {
1351 /* When next char is '(', it really is an old-style
1352 uniform array. */
1353 scm_t_wchar next_c = scm_getc (port);
1354 if (next_c != EOF)
1355 scm_ungetc (next_c, port);
1356 if (next_c == '(')
1357 return scm_i_read_array (port, chr);
1358 /* Fall through. */
1359 }
1360 #endif
1361 case 'b':
1362 case 'B':
1363 case 'o':
1364 case 'O':
1365 case 'd':
1366 case 'D':
1367 case 'x':
1368 case 'X':
1369 case 'I':
1370 case 'E':
1371 return (scm_read_number_and_radix (chr, port));
1372 case '{':
1373 return (scm_read_extended_symbol (chr, port));
1374 case '!':
1375 return (scm_read_shebang (chr, port));
1376 case ';':
1377 return (scm_read_commented_expression (chr, port));
1378 case '`':
1379 case '\'':
1380 case ',':
1381 return (scm_read_syntax (chr, port));
1382 case 'n':
1383 return (scm_read_nil (chr, port));
1384 default:
1385 result = scm_read_sharp_extension (chr, port);
1386 if (scm_is_eq (result, SCM_UNSPECIFIED))
1387 {
1388 /* To remain compatible with 1.8 and earlier, the following
1389 characters have lower precedence than `read-hash-extend'
1390 characters. */
1391 switch (chr)
1392 {
1393 case '|':
1394 return scm_read_r6rs_block_comment (chr, port);
1395 default:
1396 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1397 scm_list_1 (SCM_MAKE_CHAR (chr)));
1398 }
1399 }
1400 else
1401 return result;
1402 }
1403
1404 return SCM_UNSPECIFIED;
1405 }
1406 #undef FUNC_NAME
1407
1408 static SCM
1409 scm_read_expression (SCM port)
1410 #define FUNC_NAME "scm_read_expression"
1411 {
1412 while (1)
1413 {
1414 register scm_t_wchar chr;
1415
1416 chr = scm_getc (port);
1417
1418 switch (chr)
1419 {
1420 case SCM_WHITE_SPACES:
1421 case SCM_LINE_INCREMENTORS:
1422 break;
1423 case ';':
1424 (void) scm_read_semicolon_comment (chr, port);
1425 break;
1426 case '[':
1427 if (!SCM_SQUARE_BRACKETS_P)
1428 return (scm_read_mixed_case_symbol (chr, port));
1429 /* otherwise fall through */
1430 case '(':
1431 return (scm_read_sexp (chr, port));
1432 case '"':
1433 return (scm_read_string (chr, port));
1434 case '\'':
1435 case '`':
1436 case ',':
1437 return (scm_read_quote (chr, port));
1438 case '#':
1439 {
1440 SCM result;
1441 result = scm_read_sharp (chr, port);
1442 if (scm_is_eq (result, SCM_UNSPECIFIED))
1443 /* We read a comment or some such. */
1444 break;
1445 else
1446 return result;
1447 }
1448 case ')':
1449 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1450 break;
1451 case ']':
1452 if (SCM_SQUARE_BRACKETS_P)
1453 scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1454 /* otherwise fall through */
1455 case EOF:
1456 return SCM_EOF_VAL;
1457 case ':':
1458 if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1459 return scm_symbol_to_keyword (scm_read_expression (port));
1460 /* Fall through. */
1461
1462 default:
1463 {
1464 if (((chr >= '0') && (chr <= '9'))
1465 || (strchr ("+-.", chr)))
1466 return (scm_read_number (chr, port));
1467 else
1468 return (scm_read_mixed_case_symbol (chr, port));
1469 }
1470 }
1471 }
1472 }
1473 #undef FUNC_NAME
1474
1475 \f
1476 /* Actual reader. */
1477
1478 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1479 (SCM port),
1480 "Read an s-expression from the input port @var{port}, or from\n"
1481 "the current input port if @var{port} is not specified.\n"
1482 "Any whitespace before the next token is discarded.")
1483 #define FUNC_NAME s_scm_read
1484 {
1485 int c;
1486
1487 if (SCM_UNBNDP (port))
1488 port = scm_current_input_port ();
1489 SCM_VALIDATE_OPINPORT (1, port);
1490
1491 c = flush_ws (port, (char *) NULL);
1492 if (EOF == c)
1493 return SCM_EOF_VAL;
1494 scm_ungetc (c, port);
1495
1496 return (scm_read_expression (port));
1497 }
1498 #undef FUNC_NAME
1499
1500
1501 \f
1502
1503 /* Manipulate the read-hash-procedures alist. This could be written in
1504 Scheme, but maybe it will also be used by C code during initialisation. */
1505 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1506 (SCM chr, SCM proc),
1507 "Install the procedure @var{proc} for reading expressions\n"
1508 "starting with the character sequence @code{#} and @var{chr}.\n"
1509 "@var{proc} will be called with two arguments: the character\n"
1510 "@var{chr} and the port to read further data from. The object\n"
1511 "returned will be the return value of @code{read}. \n"
1512 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1513 )
1514 #define FUNC_NAME s_scm_read_hash_extend
1515 {
1516 SCM this;
1517 SCM prev;
1518
1519 SCM_VALIDATE_CHAR (1, chr);
1520 SCM_ASSERT (scm_is_false (proc)
1521 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1522 proc, SCM_ARG2, FUNC_NAME);
1523
1524 /* Check if chr is already in the alist. */
1525 this = scm_i_read_hash_procedures_ref ();
1526 prev = SCM_BOOL_F;
1527 while (1)
1528 {
1529 if (scm_is_null (this))
1530 {
1531 /* not found, so add it to the beginning. */
1532 if (scm_is_true (proc))
1533 {
1534 SCM new = scm_cons (scm_cons (chr, proc),
1535 scm_i_read_hash_procedures_ref ());
1536 scm_i_read_hash_procedures_set_x (new);
1537 }
1538 break;
1539 }
1540 if (scm_is_eq (chr, SCM_CAAR (this)))
1541 {
1542 /* already in the alist. */
1543 if (scm_is_false (proc))
1544 {
1545 /* remove it. */
1546 if (scm_is_false (prev))
1547 {
1548 SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1549 scm_i_read_hash_procedures_set_x (rest);
1550 }
1551 else
1552 scm_set_cdr_x (prev, SCM_CDR (this));
1553 }
1554 else
1555 {
1556 /* replace it. */
1557 scm_set_cdr_x (SCM_CAR (this), proc);
1558 }
1559 break;
1560 }
1561 prev = this;
1562 this = SCM_CDR (this);
1563 }
1564
1565 return SCM_UNSPECIFIED;
1566 }
1567 #undef FUNC_NAME
1568
1569 /* Recover the read-hash procedure corresponding to char c. */
1570 static SCM
1571 scm_get_hash_procedure (int c)
1572 {
1573 SCM rest = scm_i_read_hash_procedures_ref ();
1574
1575 while (1)
1576 {
1577 if (scm_is_null (rest))
1578 return SCM_BOOL_F;
1579
1580 if (SCM_CHAR (SCM_CAAR (rest)) == c)
1581 return SCM_CDAR (rest);
1582
1583 rest = SCM_CDR (rest);
1584 }
1585 }
1586
1587 #define SCM_ENCODING_SEARCH_SIZE (500)
1588
1589 /* Search the first few hundred characters of a file for an Emacs-like coding
1590 declaration. Returns either NULL or a string whose storage has been
1591 allocated with `scm_gc_malloc ()'. */
1592 char *
1593 scm_i_scan_for_encoding (SCM port)
1594 {
1595 scm_t_port *pt;
1596 char header[SCM_ENCODING_SEARCH_SIZE+1];
1597 size_t bytes_read, encoding_length, i;
1598 char *encoding = NULL;
1599 int utf8_bom = 0;
1600 char *pos, *encoding_start;
1601 int in_comment;
1602
1603 pt = SCM_PTAB_ENTRY (port);
1604
1605 if (pt->rw_active == SCM_PORT_WRITE)
1606 scm_flush (port);
1607
1608 if (pt->rw_random)
1609 pt->rw_active = SCM_PORT_READ;
1610
1611 if (pt->read_pos == pt->read_end)
1612 {
1613 /* We can use the read buffer, and thus avoid a seek. */
1614 if (scm_fill_input (port) == EOF)
1615 return NULL;
1616
1617 bytes_read = pt->read_end - pt->read_pos;
1618 if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
1619 bytes_read = SCM_ENCODING_SEARCH_SIZE;
1620
1621 if (bytes_read <= 1)
1622 /* An unbuffered port -- don't scan. */
1623 return NULL;
1624
1625 memcpy (header, pt->read_pos, bytes_read);
1626 header[bytes_read] = '\0';
1627 }
1628 else
1629 {
1630 /* Try to read some bytes and then seek back. Not all ports
1631 support seeking back; and indeed some file ports (like
1632 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
1633 check performed by SCM_FPORT_FDES---but fail to seek
1634 backwards. Hence this block comes second. We prefer to use
1635 the read buffer in-place. */
1636 if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1637 return NULL;
1638
1639 bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1640 header[bytes_read] = '\0';
1641 scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1642 }
1643
1644 if (bytes_read > 3
1645 && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1646 utf8_bom = 1;
1647
1648 /* search past "coding[:=]" */
1649 pos = header;
1650 while (1)
1651 {
1652 if ((pos = strstr(pos, "coding")) == NULL)
1653 return NULL;
1654
1655 pos += strlen("coding");
1656 if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1657 (*pos == ':' || *pos == '='))
1658 {
1659 pos ++;
1660 break;
1661 }
1662 }
1663
1664 /* skip spaces */
1665 while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1666 (*pos == ' ' || *pos == '\t'))
1667 pos ++;
1668
1669 /* grab the next token */
1670 encoding_start = pos;
1671 i = 0;
1672 while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
1673 && encoding_start + i - header < bytes_read
1674 && (isalnum ((int) encoding_start[i])
1675 || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
1676 i++;
1677
1678 encoding_length = i;
1679 if (encoding_length == 0)
1680 return NULL;
1681
1682 encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
1683 for (i = 0; i < encoding_length; i++)
1684 encoding[i] = toupper ((int) encoding[i]);
1685
1686 /* push backwards to make sure we were in a comment */
1687 in_comment = 0;
1688 pos = encoding_start;
1689 while (pos >= header)
1690 {
1691 if (*pos == ';')
1692 {
1693 in_comment = 1;
1694 break;
1695 }
1696 else if (*pos == '\n' || pos == header)
1697 {
1698 /* This wasn't in a semicolon comment. Check for a
1699 hash-bang comment. */
1700 char *beg = strstr (header, "#!");
1701 char *end = strstr (header, "!#");
1702 if (beg < encoding_start && encoding_start + encoding_length <= end)
1703 in_comment = 1;
1704 break;
1705 }
1706 else
1707 {
1708 pos --;
1709 continue;
1710 }
1711 }
1712 if (!in_comment)
1713 /* This wasn't in a comment */
1714 return NULL;
1715
1716 if (utf8_bom && strcmp(encoding, "UTF-8"))
1717 scm_misc_error (NULL,
1718 "the port input declares the encoding ~s but is encoded as UTF-8",
1719 scm_list_1 (scm_from_locale_string (encoding)));
1720
1721 return encoding;
1722 }
1723
1724 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1725 (SCM port),
1726 "Scans the port for an Emacs-like character coding declaration\n"
1727 "near the top of the contents of a port with random-accessible contents.\n"
1728 "The coding declaration is of the form\n"
1729 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1730 "\n"
1731 "Returns a string containing the character encoding of the file\n"
1732 "if a declaration was found, or @code{#f} otherwise.\n")
1733 #define FUNC_NAME s_scm_file_encoding
1734 {
1735 char *enc;
1736 SCM s_enc;
1737
1738 SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
1739
1740 enc = scm_i_scan_for_encoding (port);
1741 if (enc == NULL)
1742 return SCM_BOOL_F;
1743 else
1744 {
1745 s_enc = scm_from_locale_string (enc);
1746 return s_enc;
1747 }
1748
1749 return SCM_BOOL_F;
1750 }
1751 #undef FUNC_NAME
1752
1753 void
1754 scm_init_read ()
1755 {
1756 SCM read_hash_procs;
1757
1758 read_hash_procs = scm_make_fluid ();
1759 scm_fluid_set_x (read_hash_procs, SCM_EOL);
1760
1761 scm_i_read_hash_procedures =
1762 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
1763
1764 scm_init_opts (scm_read_options, scm_read_opts);
1765 #include "libguile/read.x"
1766 }
1767
1768 /*
1769 Local Variables:
1770 c-file-style: "gnu"
1771 End:
1772 */