read: Avoid `void *' pointer arithmetic.
[bpt/guile.git] / libguile / read.c
1 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
2 * 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
18 */
19
20
21 \f
22
23 #ifdef HAVE_CONFIG_H
24 # include <config.h>
25 #endif
26
27 #include <stdio.h>
28 #include <ctype.h>
29 #include <string.h>
30 #include <unistd.h>
31 #include <unicase.h>
32 #include <unictype.h>
33
34 #include "libguile/_scm.h"
35 #include "libguile/bytevectors.h"
36 #include "libguile/chars.h"
37 #include "libguile/eval.h"
38 #include "libguile/arrays.h"
39 #include "libguile/bitvectors.h"
40 #include "libguile/keywords.h"
41 #include "libguile/alist.h"
42 #include "libguile/srcprop.h"
43 #include "libguile/hashtab.h"
44 #include "libguile/hash.h"
45 #include "libguile/ports.h"
46 #include "libguile/fports.h"
47 #include "libguile/root.h"
48 #include "libguile/strings.h"
49 #include "libguile/strports.h"
50 #include "libguile/vectors.h"
51 #include "libguile/validate.h"
52 #include "libguile/srfi-4.h"
53 #include "libguile/srfi-13.h"
54
55 #include "libguile/read.h"
56 #include "libguile/private-options.h"
57
58
59 \f
60
61 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
62 SCM_SYMBOL (scm_keyword_prefix, "prefix");
63 SCM_SYMBOL (scm_keyword_postfix, "postfix");
64 SCM_SYMBOL (sym_nil, "nil");
65
66 scm_t_option scm_read_opts[] = {
67 { SCM_OPTION_BOOLEAN, "copy", 0,
68 "Copy source code expressions." },
69 { SCM_OPTION_BOOLEAN, "positions", 1,
70 "Record positions of source code expressions." },
71 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
72 "Convert symbols to lower case."},
73 { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
74 "Style of keyword recognition: #f, 'prefix or 'postfix."},
75 { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
76 "Use R6RS variable-length character and string hex escapes."},
77 { SCM_OPTION_BOOLEAN, "square-brackets", 1,
78 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
79 { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
80 "In strings, consume leading whitespace after an escaped end-of-line."},
81 { 0, },
82 };
83
84 /*
85 Give meaningful error messages for errors
86
87 We use the format
88
89 FILE:LINE:COL: MESSAGE
90 This happened in ....
91
92 This is not standard GNU format, but the test-suite likes the real
93 message to be in front.
94
95 */
96
97
98 void
99 scm_i_input_error (char const *function,
100 SCM port, const char *message, SCM arg)
101 {
102 SCM fn = (scm_is_string (SCM_FILENAME(port))
103 ? SCM_FILENAME(port)
104 : scm_from_locale_string ("#<unknown port>"));
105
106 SCM string_port = scm_open_output_string ();
107 SCM string = SCM_EOL;
108 scm_simple_format (string_port,
109 scm_from_locale_string ("~A:~S:~S: ~A"),
110 scm_list_4 (fn,
111 scm_from_long (SCM_LINUM (port) + 1),
112 scm_from_int (SCM_COL (port) + 1),
113 scm_from_locale_string (message)));
114
115 string = scm_get_output_string (string_port);
116 scm_close_output_port (string_port);
117 scm_error_scm (scm_from_latin1_symbol ("read-error"),
118 function? scm_from_locale_string (function) : SCM_BOOL_F,
119 string,
120 arg,
121 SCM_BOOL_F);
122 }
123
124
125 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
126 (SCM setting),
127 "Option interface for the read options. Instead of using\n"
128 "this procedure directly, use the procedures @code{read-enable},\n"
129 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
130 #define FUNC_NAME s_scm_read_options
131 {
132 SCM ans = scm_options (setting,
133 scm_read_opts,
134 FUNC_NAME);
135 if (SCM_COPY_SOURCE_P)
136 SCM_RECORD_POSITIONS_P = 1;
137 return ans;
138 }
139 #undef FUNC_NAME
140
141 /* A fluid referring to an association list mapping extra hash
142 characters to procedures. */
143 static SCM *scm_i_read_hash_procedures;
144
145 static SCM
146 scm_i_read_hash_procedures_ref (void)
147 {
148 return scm_fluid_ref (*scm_i_read_hash_procedures);
149 }
150
151 static void
152 scm_i_read_hash_procedures_set_x (SCM value)
153 {
154 scm_fluid_set_x (*scm_i_read_hash_procedures, value);
155 }
156
157 \f
158 /* Token readers. */
159
160
161 /* Size of the C buffer used to read symbols and numbers. */
162 #define READER_BUFFER_SIZE 128
163
164 /* Size of the C buffer used to read strings. */
165 #define READER_STRING_BUFFER_SIZE 512
166
167 /* The maximum size of Scheme character names. */
168 #define READER_CHAR_NAME_MAX_SIZE 50
169
170
171 /* `isblank' is only in C99. */
172 #define CHAR_IS_BLANK_(_chr) \
173 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
174 || ((_chr) == '\f') || ((_chr) == '\r'))
175
176 #ifdef MSDOS
177 # define CHAR_IS_BLANK(_chr) \
178 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
179 #else
180 # define CHAR_IS_BLANK CHAR_IS_BLANK_
181 #endif
182
183
184 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
185 structure''). */
186 #define CHAR_IS_R5RS_DELIMITER(c) \
187 (CHAR_IS_BLANK (c) \
188 || (c == ')') || (c == '(') || (c == ';') || (c == '"') \
189 || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
190
191 #define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
192
193 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
194 Structure''. */
195 #define CHAR_IS_EXPONENT_MARKER(_chr) \
196 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
197 || ((_chr) == 'd') || ((_chr) == 'l'))
198
199 /* Read an SCSH block comment. */
200 static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
201 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
202 static SCM scm_read_commented_expression (scm_t_wchar, SCM);
203 static SCM scm_read_shebang (scm_t_wchar, SCM);
204 static SCM scm_get_hash_procedure (int);
205
206 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
207 result in the pre-allocated buffer BUF. Return zero if the whole token has
208 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
209 bytes actually read. */
210 static int
211 read_token (SCM port, char *buf, size_t buf_size, size_t *read)
212 {
213 *read = 0;
214
215 while (*read < buf_size)
216 {
217 int chr;
218
219 chr = scm_get_byte_or_eof (port);
220
221 if (chr == EOF)
222 return 0;
223 else if (CHAR_IS_DELIMITER (chr))
224 {
225 scm_unget_byte (chr, port);
226 return 0;
227 }
228 else
229 {
230 *buf = (char) chr;
231 buf++, (*read)++;
232 }
233 }
234
235 return 1;
236 }
237
238 /* Like `read_token', but return either BUFFER, or a GC-allocated buffer
239 if the token doesn't fit in BUFFER_SIZE bytes. */
240 static char *
241 read_complete_token (SCM port, char *buffer, size_t buffer_size,
242 size_t *read)
243 {
244 int overflow = 0;
245 size_t bytes_read, overflow_size = 0;
246 char *overflow_buffer = NULL;
247
248 do
249 {
250 overflow = read_token (port, buffer, buffer_size, &bytes_read);
251 if (bytes_read == 0)
252 break;
253 if (overflow || overflow_size != 0)
254 {
255 if (overflow_size == 0)
256 {
257 overflow_buffer = scm_gc_malloc_pointerless (bytes_read, "read");
258 memcpy (overflow_buffer, buffer, bytes_read);
259 overflow_size = bytes_read;
260 }
261 else
262 {
263 char *new_buf =
264 scm_gc_malloc_pointerless (overflow_size + bytes_read, "read");
265
266 memcpy (new_buf, overflow_buffer, overflow_size);
267 memcpy (new_buf + overflow_size, buffer, bytes_read);
268
269 overflow_buffer = new_buf;
270 overflow_size += bytes_read;
271 }
272 }
273 }
274 while (overflow);
275
276 if (overflow_size)
277 *read = overflow_size;
278 else
279 *read = bytes_read;
280
281 return (overflow_size > 0 ? overflow_buffer : buffer);
282 }
283
284 /* Skip whitespace from PORT and return the first non-whitespace character
285 read. Raise an error on end-of-file. */
286 static int
287 flush_ws (SCM port, const char *eoferr)
288 {
289 scm_t_wchar c;
290 while (1)
291 switch (c = scm_getc (port))
292 {
293 case EOF:
294 goteof:
295 if (eoferr)
296 {
297 scm_i_input_error (eoferr,
298 port,
299 "end of file",
300 SCM_EOL);
301 }
302 return c;
303
304 case ';':
305 lp:
306 switch (c = scm_getc (port))
307 {
308 case EOF:
309 goto goteof;
310 default:
311 goto lp;
312 case SCM_LINE_INCREMENTORS:
313 break;
314 }
315 break;
316
317 case '#':
318 switch (c = scm_getc (port))
319 {
320 case EOF:
321 eoferr = "read_sharp";
322 goto goteof;
323 case '!':
324 scm_read_shebang (c, port);
325 break;
326 case ';':
327 scm_read_commented_expression (c, port);
328 break;
329 case '|':
330 if (scm_is_false (scm_get_hash_procedure (c)))
331 {
332 scm_read_r6rs_block_comment (c, port);
333 break;
334 }
335 /* fall through */
336 default:
337 scm_ungetc (c, port);
338 return '#';
339 }
340 break;
341
342 case SCM_LINE_INCREMENTORS:
343 case SCM_SINGLE_SPACES:
344 case '\t':
345 break;
346
347 default:
348 return c;
349 }
350
351 return 0;
352 }
353
354
355 \f
356 /* Token readers. */
357
358 static SCM scm_read_expression (SCM port);
359 static SCM scm_read_sharp (int chr, SCM port, long line, int column);
360
361
362 static SCM
363 maybe_annotate_source (SCM x, SCM port, long line, int column)
364 {
365 if (SCM_RECORD_POSITIONS_P)
366 scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
367 return x;
368 }
369
370 static SCM
371 scm_read_sexp (scm_t_wchar chr, SCM port)
372 #define FUNC_NAME "scm_i_lreadparen"
373 {
374 int c;
375 SCM tmp, tl, ans = SCM_EOL;
376 const int terminating_char = ((chr == '[') ? ']' : ')');
377
378 /* Need to capture line and column numbers here. */
379 long line = SCM_LINUM (port);
380 int column = SCM_COL (port) - 1;
381
382 c = flush_ws (port, FUNC_NAME);
383 if (terminating_char == c)
384 return SCM_EOL;
385
386 scm_ungetc (c, port);
387 tmp = scm_read_expression (port);
388
389 /* Note that it is possible for scm_read_expression to return
390 scm_sym_dot, but not as part of a dotted pair: as in #{.}#. So
391 check that it's a real dot by checking `c'. */
392 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
393 {
394 ans = scm_read_expression (port);
395 if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
396 scm_i_input_error (FUNC_NAME, port, "missing close paren",
397 SCM_EOL);
398 return ans;
399 }
400
401 /* Build the head of the list structure. */
402 ans = tl = scm_cons (tmp, SCM_EOL);
403
404 while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
405 {
406 SCM new_tail;
407
408 if (c == ')' || (SCM_SQUARE_BRACKETS_P && c == ']'))
409 scm_i_input_error (FUNC_NAME, port,
410 "in pair: mismatched close paren: ~A",
411 scm_list_1 (SCM_MAKE_CHAR (c)));
412
413 scm_ungetc (c, port);
414 tmp = scm_read_expression (port);
415
416 /* See above note about scm_sym_dot. */
417 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
418 {
419 SCM_SETCDR (tl, scm_read_expression (port));
420
421 c = flush_ws (port, FUNC_NAME);
422 if (terminating_char != c)
423 scm_i_input_error (FUNC_NAME, port,
424 "in pair: missing close paren", SCM_EOL);
425 goto exit;
426 }
427
428 new_tail = scm_cons (tmp, SCM_EOL);
429 SCM_SETCDR (tl, new_tail);
430 tl = new_tail;
431 }
432
433 exit:
434 return maybe_annotate_source (ans, port, line, column);
435 }
436 #undef FUNC_NAME
437
438
439 /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
440 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
441 found. */
442 #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
443 do \
444 { \
445 scm_t_wchar a; \
446 size_t i = 0; \
447 c = 0; \
448 while (i < ndigits) \
449 { \
450 a = scm_getc (port); \
451 if (a == EOF) \
452 goto str_eof; \
453 if (terminator \
454 && (a == (scm_t_wchar) terminator) \
455 && (i > 0)) \
456 break; \
457 if ('0' <= a && a <= '9') \
458 a -= '0'; \
459 else if ('A' <= a && a <= 'F') \
460 a = a - 'A' + 10; \
461 else if ('a' <= a && a <= 'f') \
462 a = a - 'a' + 10; \
463 else \
464 { \
465 c = a; \
466 goto bad_escaped; \
467 } \
468 c = c * 16 + a; \
469 i ++; \
470 } \
471 } while (0)
472
473 static void
474 skip_intraline_whitespace (SCM port)
475 {
476 scm_t_wchar c;
477
478 do
479 {
480 c = scm_getc (port);
481 if (c == EOF)
482 return;
483 }
484 while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
485
486 scm_ungetc (c, port);
487 }
488
489 static SCM
490 scm_read_string (int chr, SCM port)
491 #define FUNC_NAME "scm_lreadr"
492 {
493 /* For strings smaller than C_STR, this function creates only one Scheme
494 object (the string returned). */
495
496 SCM str = SCM_BOOL_F;
497 unsigned c_str_len = 0;
498 scm_t_wchar c;
499
500 /* Need to capture line and column numbers here. */
501 long line = SCM_LINUM (port);
502 int column = SCM_COL (port) - 1;
503
504 str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
505 while ('"' != (c = scm_getc (port)))
506 {
507 if (c == EOF)
508 {
509 str_eof:
510 scm_i_input_error (FUNC_NAME, port,
511 "end of file in string constant", SCM_EOL);
512 }
513
514 if (c_str_len + 1 >= scm_i_string_length (str))
515 {
516 SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
517
518 str = scm_string_append (scm_list_2 (str, addy));
519 }
520
521 if (c == '\\')
522 {
523 switch (c = scm_getc (port))
524 {
525 case EOF:
526 goto str_eof;
527 case '"':
528 case '\\':
529 break;
530 case '\n':
531 if (SCM_HUNGRY_EOL_ESCAPES_P)
532 skip_intraline_whitespace (port);
533 continue;
534 case '0':
535 c = '\0';
536 break;
537 case 'f':
538 c = '\f';
539 break;
540 case 'n':
541 c = '\n';
542 break;
543 case 'r':
544 c = '\r';
545 break;
546 case 't':
547 c = '\t';
548 break;
549 case 'a':
550 c = '\007';
551 break;
552 case 'v':
553 c = '\v';
554 break;
555 case 'b':
556 c = '\010';
557 break;
558 case 'x':
559 if (SCM_R6RS_ESCAPES_P)
560 SCM_READ_HEX_ESCAPE (10, ';');
561 else
562 SCM_READ_HEX_ESCAPE (2, '\0');
563 break;
564 case 'u':
565 if (!SCM_R6RS_ESCAPES_P)
566 {
567 SCM_READ_HEX_ESCAPE (4, '\0');
568 break;
569 }
570 case 'U':
571 if (!SCM_R6RS_ESCAPES_P)
572 {
573 SCM_READ_HEX_ESCAPE (6, '\0');
574 break;
575 }
576 default:
577 bad_escaped:
578 scm_i_input_error (FUNC_NAME, port,
579 "illegal character in escape sequence: ~S",
580 scm_list_1 (SCM_MAKE_CHAR (c)));
581 }
582 }
583 str = scm_i_string_start_writing (str);
584 scm_i_string_set_x (str, c_str_len++, c);
585 scm_i_string_stop_writing ();
586 }
587 return maybe_annotate_source (scm_i_substring_copy (str, 0, c_str_len),
588 port, line, column);
589 }
590 #undef FUNC_NAME
591
592
593 static SCM
594 scm_read_number (scm_t_wchar chr, SCM port)
595 {
596 SCM result, str = SCM_EOL;
597 char local_buffer[READER_BUFFER_SIZE], *buffer;
598 size_t bytes_read;
599 scm_t_port *pt = SCM_PTAB_ENTRY (port);
600
601 /* Need to capture line and column numbers here. */
602 long line = SCM_LINUM (port);
603 int column = SCM_COL (port) - 1;
604
605 scm_ungetc (chr, port);
606 buffer = read_complete_token (port, local_buffer, sizeof local_buffer,
607 &bytes_read);
608
609 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
610
611 result = scm_string_to_number (str, SCM_UNDEFINED);
612 if (scm_is_false (result))
613 {
614 /* Return a symbol instead of a number */
615 if (SCM_CASE_INSENSITIVE_P)
616 str = scm_string_downcase_x (str);
617 result = scm_string_to_symbol (str);
618 }
619 else if (SCM_NIMP (result))
620 result = maybe_annotate_source (result, port, line, column);
621
622 SCM_COL (port) += scm_i_string_length (str);
623 return result;
624 }
625
626 static SCM
627 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
628 {
629 SCM result;
630 int ends_with_colon = 0;
631 size_t bytes_read;
632 int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
633 char local_buffer[READER_BUFFER_SIZE], *buffer;
634 scm_t_port *pt = SCM_PTAB_ENTRY (port);
635 SCM str;
636
637 scm_ungetc (chr, port);
638 buffer = read_complete_token (port, local_buffer, sizeof local_buffer,
639 &bytes_read);
640 if (bytes_read > 0)
641 ends_with_colon = buffer[bytes_read - 1] == ':';
642
643 if (postfix && ends_with_colon && (bytes_read > 1))
644 {
645 str = scm_from_stringn (buffer, bytes_read - 1,
646 pt->encoding, pt->ilseq_handler);
647
648 if (SCM_CASE_INSENSITIVE_P)
649 str = scm_string_downcase_x (str);
650 result = scm_symbol_to_keyword (scm_string_to_symbol (str));
651 }
652 else
653 {
654 str = scm_from_stringn (buffer, bytes_read,
655 pt->encoding, pt->ilseq_handler);
656
657 if (SCM_CASE_INSENSITIVE_P)
658 str = scm_string_downcase_x (str);
659 result = scm_string_to_symbol (str);
660 }
661
662 SCM_COL (port) += scm_i_string_length (str);
663 return result;
664 }
665
666 static SCM
667 scm_read_number_and_radix (scm_t_wchar chr, SCM port)
668 #define FUNC_NAME "scm_lreadr"
669 {
670 SCM result;
671 size_t read;
672 char local_buffer[READER_BUFFER_SIZE], *buffer;
673 unsigned int radix;
674 SCM str;
675 scm_t_port *pt;
676
677 switch (chr)
678 {
679 case 'B':
680 case 'b':
681 radix = 2;
682 break;
683
684 case 'o':
685 case 'O':
686 radix = 8;
687 break;
688
689 case 'd':
690 case 'D':
691 radix = 10;
692 break;
693
694 case 'x':
695 case 'X':
696 radix = 16;
697 break;
698
699 default:
700 scm_ungetc (chr, port);
701 scm_ungetc ('#', port);
702 radix = 10;
703 }
704
705 buffer = read_complete_token (port, local_buffer, sizeof local_buffer,
706 &read);
707
708 pt = SCM_PTAB_ENTRY (port);
709 str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
710
711 result = scm_string_to_number (str, scm_from_uint (radix));
712
713 SCM_COL (port) += scm_i_string_length (str);
714
715 if (scm_is_true (result))
716 return result;
717
718 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
719
720 return SCM_BOOL_F;
721 }
722 #undef FUNC_NAME
723
724 static SCM
725 scm_read_quote (int chr, SCM port)
726 {
727 SCM p;
728 long line = SCM_LINUM (port);
729 int column = SCM_COL (port) - 1;
730
731 switch (chr)
732 {
733 case '`':
734 p = scm_sym_quasiquote;
735 break;
736
737 case '\'':
738 p = scm_sym_quote;
739 break;
740
741 case ',':
742 {
743 scm_t_wchar c;
744
745 c = scm_getc (port);
746 if ('@' == c)
747 p = scm_sym_uq_splicing;
748 else
749 {
750 scm_ungetc (c, port);
751 p = scm_sym_unquote;
752 }
753 break;
754 }
755
756 default:
757 fprintf (stderr, "%s: unhandled quote character (%i)\n",
758 "scm_read_quote", chr);
759 abort ();
760 }
761
762 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
763 return maybe_annotate_source (p, port, line, column);
764 }
765
766 SCM_SYMBOL (sym_syntax, "syntax");
767 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
768 SCM_SYMBOL (sym_unsyntax, "unsyntax");
769 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
770
771 static SCM
772 scm_read_syntax (int chr, SCM port)
773 {
774 SCM p;
775 long line = SCM_LINUM (port);
776 int column = SCM_COL (port) - 1;
777
778 switch (chr)
779 {
780 case '`':
781 p = sym_quasisyntax;
782 break;
783
784 case '\'':
785 p = sym_syntax;
786 break;
787
788 case ',':
789 {
790 int c;
791
792 c = scm_getc (port);
793 if ('@' == c)
794 p = sym_unsyntax_splicing;
795 else
796 {
797 scm_ungetc (c, port);
798 p = sym_unsyntax;
799 }
800 break;
801 }
802
803 default:
804 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
805 "scm_read_syntax", chr);
806 abort ();
807 }
808
809 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
810 return maybe_annotate_source (p, port, line, column);
811 }
812
813 static SCM
814 scm_read_nil (int chr, SCM port)
815 {
816 SCM id = scm_read_mixed_case_symbol (chr, port);
817
818 if (!scm_is_eq (id, sym_nil))
819 scm_i_input_error ("scm_read_nil", port,
820 "unexpected input while reading #nil: ~a",
821 scm_list_1 (id));
822
823 return SCM_ELISP_NIL;
824 }
825
826 static SCM
827 scm_read_semicolon_comment (int chr, SCM port)
828 {
829 int c;
830
831 /* We use the get_byte here because there is no need to get the
832 locale correct with comment input. This presumes that newline
833 always represents itself no matter what the encoding is. */
834 for (c = scm_get_byte_or_eof (port);
835 (c != EOF) && (c != '\n');
836 c = scm_get_byte_or_eof (port));
837
838 return SCM_UNSPECIFIED;
839 }
840
841 \f
842 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
843
844 static SCM
845 scm_read_boolean (int chr, SCM port)
846 {
847 switch (chr)
848 {
849 case 't':
850 case 'T':
851 return SCM_BOOL_T;
852
853 case 'f':
854 case 'F':
855 return SCM_BOOL_F;
856 }
857
858 return SCM_UNSPECIFIED;
859 }
860
861 static SCM
862 scm_read_character (scm_t_wchar chr, SCM port)
863 #define FUNC_NAME "scm_lreadr"
864 {
865 char buffer[READER_CHAR_NAME_MAX_SIZE];
866 SCM charname;
867 size_t charname_len, bytes_read;
868 scm_t_wchar cp;
869 int overflow;
870 scm_t_port *pt;
871
872 overflow = read_token (port, buffer, READER_CHAR_NAME_MAX_SIZE, &bytes_read);
873 if (overflow)
874 scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
875
876 if (bytes_read == 0)
877 {
878 chr = scm_getc (port);
879 if (chr == EOF)
880 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
881 "while reading character", SCM_EOL);
882
883 /* CHR must be a token delimiter, like a whitespace. */
884 return (SCM_MAKE_CHAR (chr));
885 }
886
887 pt = SCM_PTAB_ENTRY (port);
888
889 /* Simple ASCII characters can be processed immediately. Also, simple
890 ISO-8859-1 characters can be processed immediately if the encoding for this
891 port is ISO-8859-1. */
892 if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
893 {
894 SCM_COL (port) += 1;
895 return SCM_MAKE_CHAR (buffer[0]);
896 }
897
898 /* Otherwise, convert the buffer into a proper scheme string for
899 processing. */
900 charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
901 pt->ilseq_handler);
902 charname_len = scm_i_string_length (charname);
903 SCM_COL (port) += charname_len;
904 cp = scm_i_string_ref (charname, 0);
905 if (charname_len == 1)
906 return SCM_MAKE_CHAR (cp);
907
908 /* Ignore dotted circles, which may be used to keep combining characters from
909 combining with the backslash in #\charname. */
910 if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
911 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
912
913 if (cp >= '0' && cp < '8')
914 {
915 /* Dirk:FIXME:: This type of character syntax is not R5RS
916 * compliant. Further, it should be verified that the constant
917 * does only consist of octal digits. */
918 SCM p = scm_string_to_number (charname, scm_from_uint (8));
919 if (SCM_I_INUMP (p))
920 {
921 scm_t_wchar c = scm_to_uint32 (p);
922 if (SCM_IS_UNICODE_CHAR (c))
923 return SCM_MAKE_CHAR (c);
924 else
925 scm_i_input_error (FUNC_NAME, port,
926 "out-of-range octal character escape: ~a",
927 scm_list_1 (charname));
928 }
929 }
930
931 if (cp == 'x' && (charname_len > 1))
932 {
933 SCM p;
934
935 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
936 p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
937 scm_from_uint (16));
938 if (SCM_I_INUMP (p))
939 {
940 scm_t_wchar c = scm_to_uint32 (p);
941 if (SCM_IS_UNICODE_CHAR (c))
942 return SCM_MAKE_CHAR (c);
943 else
944 scm_i_input_error (FUNC_NAME, port,
945 "out-of-range hex character escape: ~a",
946 scm_list_1 (charname));
947 }
948 }
949
950 /* The names of characters should never have non-Latin1
951 characters. */
952 if (scm_i_is_narrow_string (charname)
953 || scm_i_try_narrow_string (charname))
954 { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
955 charname_len);
956 if (scm_is_true (ch))
957 return ch;
958 }
959
960 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
961 scm_list_1 (charname));
962
963 return SCM_UNSPECIFIED;
964 }
965 #undef FUNC_NAME
966
967 static SCM
968 scm_read_keyword (int chr, SCM port)
969 {
970 SCM symbol;
971
972 /* Read the symbol that comprises the keyword. Doing this instead of
973 invoking a specific symbol reader function allows `scm_read_keyword ()'
974 to adapt to the delimiters currently valid of symbols.
975
976 XXX: This implementation allows sloppy syntaxes like `#: key'. */
977 symbol = scm_read_expression (port);
978 if (!scm_is_symbol (symbol))
979 scm_i_input_error ("scm_read_keyword", port,
980 "keyword prefix `~a' not followed by a symbol: ~s",
981 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
982
983 return (scm_symbol_to_keyword (symbol));
984 }
985
986 static SCM
987 scm_read_vector (int chr, SCM port, long line, int column)
988 {
989 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
990 guarantee that it's going to do what we want. After all, this is an
991 implementation detail of `scm_read_vector ()', not a desirable
992 property. */
993 return maybe_annotate_source (scm_vector (scm_read_sexp (chr, port)),
994 port, line, column);
995 }
996
997 static SCM
998 scm_read_array (int chr, SCM port, long line, int column)
999 {
1000 SCM result = scm_i_read_array (port, chr);
1001 if (scm_is_false (result))
1002 return result;
1003 else
1004 return maybe_annotate_source (result, port, line, column);
1005 }
1006
1007 static SCM
1008 scm_read_srfi4_vector (int chr, SCM port, long line, int column)
1009 {
1010 return scm_read_array (chr, port, line, column);
1011 }
1012
1013 static SCM
1014 scm_read_bytevector (scm_t_wchar chr, SCM port, long line, int column)
1015 {
1016 chr = scm_getc (port);
1017 if (chr != 'u')
1018 goto syntax;
1019
1020 chr = scm_getc (port);
1021 if (chr != '8')
1022 goto syntax;
1023
1024 chr = scm_getc (port);
1025 if (chr != '(')
1026 goto syntax;
1027
1028 return maybe_annotate_source
1029 (scm_u8_list_to_bytevector (scm_read_sexp (chr, port)),
1030 port, line, column);
1031
1032 syntax:
1033 scm_i_input_error ("read_bytevector", port,
1034 "invalid bytevector prefix",
1035 SCM_MAKE_CHAR (chr));
1036 return SCM_UNSPECIFIED;
1037 }
1038
1039 static SCM
1040 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port, long line, int column)
1041 {
1042 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1043 terribly inefficient but who cares? */
1044 SCM s_bits = SCM_EOL;
1045
1046 for (chr = scm_getc (port);
1047 (chr != EOF) && ((chr == '0') || (chr == '1'));
1048 chr = scm_getc (port))
1049 {
1050 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1051 }
1052
1053 if (chr != EOF)
1054 scm_ungetc (chr, port);
1055
1056 return maybe_annotate_source
1057 (scm_bitvector (scm_reverse_x (s_bits, SCM_EOL)),
1058 port, line, column);
1059 }
1060
1061 static SCM
1062 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1063 {
1064 int bang_seen = 0;
1065
1066 for (;;)
1067 {
1068 int c = scm_getc (port);
1069
1070 if (c == EOF)
1071 scm_i_input_error ("skip_block_comment", port,
1072 "unterminated `#! ... !#' comment", SCM_EOL);
1073
1074 if (c == '!')
1075 bang_seen = 1;
1076 else if (c == '#' && bang_seen)
1077 break;
1078 else
1079 bang_seen = 0;
1080 }
1081
1082 return SCM_UNSPECIFIED;
1083 }
1084
1085 static SCM
1086 scm_read_shebang (scm_t_wchar chr, SCM port)
1087 {
1088 int c = 0;
1089 if ((c = scm_get_byte_or_eof (port)) != 'r')
1090 {
1091 scm_ungetc (c, port);
1092 return scm_read_scsh_block_comment (chr, port);
1093 }
1094 if ((c = scm_get_byte_or_eof (port)) != '6')
1095 {
1096 scm_ungetc (c, port);
1097 scm_ungetc ('r', port);
1098 return scm_read_scsh_block_comment (chr, port);
1099 }
1100 if ((c = scm_get_byte_or_eof (port)) != 'r')
1101 {
1102 scm_ungetc (c, port);
1103 scm_ungetc ('6', port);
1104 scm_ungetc ('r', port);
1105 return scm_read_scsh_block_comment (chr, port);
1106 }
1107 if ((c = scm_get_byte_or_eof (port)) != 's')
1108 {
1109 scm_ungetc (c, port);
1110 scm_ungetc ('r', port);
1111 scm_ungetc ('6', port);
1112 scm_ungetc ('r', port);
1113 return scm_read_scsh_block_comment (chr, port);
1114 }
1115
1116 return SCM_UNSPECIFIED;
1117 }
1118
1119 static SCM
1120 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1121 {
1122 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1123 nested. So care must be taken. */
1124 int nesting_level = 1;
1125
1126 int a = scm_getc (port);
1127
1128 if (a == EOF)
1129 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1130 "unterminated `#| ... |#' comment", SCM_EOL);
1131
1132 while (nesting_level > 0)
1133 {
1134 int b = scm_getc (port);
1135
1136 if (b == EOF)
1137 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1138 "unterminated `#| ... |#' comment", SCM_EOL);
1139
1140 if (a == '|' && b == '#')
1141 {
1142 nesting_level--;
1143 b = EOF;
1144 }
1145 else if (a == '#' && b == '|')
1146 {
1147 nesting_level++;
1148 b = EOF;
1149 }
1150
1151 a = b;
1152 }
1153
1154 return SCM_UNSPECIFIED;
1155 }
1156
1157 static SCM
1158 scm_read_commented_expression (scm_t_wchar chr, SCM port)
1159 {
1160 scm_t_wchar c;
1161
1162 c = flush_ws (port, (char *) NULL);
1163 if (EOF == c)
1164 scm_i_input_error ("read_commented_expression", port,
1165 "no expression after #; comment", SCM_EOL);
1166 scm_ungetc (c, port);
1167 scm_read_expression (port);
1168 return SCM_UNSPECIFIED;
1169 }
1170
1171 static SCM
1172 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1173 {
1174 /* Guile's extended symbol read syntax looks like this:
1175
1176 #{This is all a symbol name}#
1177
1178 So here, CHR is expected to be `{'. */
1179 int saw_brace = 0;
1180 size_t len = 0;
1181 SCM buf = scm_i_make_string (1024, NULL, 0);
1182
1183 buf = scm_i_string_start_writing (buf);
1184
1185 while ((chr = scm_getc (port)) != EOF)
1186 {
1187 if (saw_brace)
1188 {
1189 if (chr == '#')
1190 {
1191 break;
1192 }
1193 else
1194 {
1195 saw_brace = 0;
1196 scm_i_string_set_x (buf, len++, '}');
1197 }
1198 }
1199
1200 if (chr == '}')
1201 saw_brace = 1;
1202 else if (chr == '\\')
1203 {
1204 /* It used to be that print.c would print extended-read-syntax
1205 symbols with backslashes before "non-standard" chars, but
1206 this routine wouldn't do anything with those escapes.
1207 Bummer. What we've done is to change print.c to output
1208 R6RS hex escapes for those characters, relying on the fact
1209 that the extended read syntax would never put a `\' before
1210 an `x'. For now, we just ignore other instances of
1211 backslash in the string. */
1212 switch ((chr = scm_getc (port)))
1213 {
1214 case EOF:
1215 goto done;
1216 case 'x':
1217 {
1218 scm_t_wchar c;
1219
1220 SCM_READ_HEX_ESCAPE (10, ';');
1221 scm_i_string_set_x (buf, len++, c);
1222 break;
1223
1224 str_eof:
1225 chr = EOF;
1226 goto done;
1227
1228 bad_escaped:
1229 scm_i_string_stop_writing ();
1230 scm_i_input_error ("scm_read_extended_symbol", port,
1231 "illegal character in escape sequence: ~S",
1232 scm_list_1 (SCM_MAKE_CHAR (c)));
1233 break;
1234 }
1235 default:
1236 scm_i_string_set_x (buf, len++, chr);
1237 break;
1238 }
1239 }
1240 else
1241 scm_i_string_set_x (buf, len++, chr);
1242
1243 if (len >= scm_i_string_length (buf) - 2)
1244 {
1245 SCM addy;
1246
1247 scm_i_string_stop_writing ();
1248 addy = scm_i_make_string (1024, NULL, 0);
1249 buf = scm_string_append (scm_list_2 (buf, addy));
1250 len = 0;
1251 buf = scm_i_string_start_writing (buf);
1252 }
1253 }
1254
1255 done:
1256 scm_i_string_stop_writing ();
1257 if (chr == EOF)
1258 scm_i_input_error ("scm_read_extended_symbol", port,
1259 "end of file while reading symbol", SCM_EOL);
1260
1261 return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1262 }
1263
1264
1265 \f
1266 /* Top-level token readers, i.e., dispatchers. */
1267
1268 static SCM
1269 scm_read_sharp_extension (int chr, SCM port)
1270 {
1271 SCM proc;
1272
1273 proc = scm_get_hash_procedure (chr);
1274 if (scm_is_true (scm_procedure_p (proc)))
1275 {
1276 long line = SCM_LINUM (port);
1277 int column = SCM_COL (port) - 2;
1278 SCM got;
1279
1280 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1281
1282 if (scm_is_pair (got) && !scm_i_has_source_properties (got))
1283 scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1284
1285 return got;
1286 }
1287
1288 return SCM_UNSPECIFIED;
1289 }
1290
1291 /* The reader for the sharp `#' character. It basically dispatches reads
1292 among the above token readers. */
1293 static SCM
1294 scm_read_sharp (scm_t_wchar chr, SCM port, long line, int column)
1295 #define FUNC_NAME "scm_lreadr"
1296 {
1297 SCM result;
1298
1299 chr = scm_getc (port);
1300
1301 result = scm_read_sharp_extension (chr, port);
1302 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1303 return result;
1304
1305 switch (chr)
1306 {
1307 case '\\':
1308 return (scm_read_character (chr, port));
1309 case '(':
1310 return (scm_read_vector (chr, port, line, column));
1311 case 's':
1312 case 'u':
1313 case 'f':
1314 case 'c':
1315 /* This one may return either a boolean or an SRFI-4 vector. */
1316 return (scm_read_srfi4_vector (chr, port, line, column));
1317 case 'v':
1318 return (scm_read_bytevector (chr, port, line, column));
1319 case '*':
1320 return (scm_read_guile_bit_vector (chr, port, line, column));
1321 case 't':
1322 case 'T':
1323 case 'F':
1324 return (scm_read_boolean (chr, port));
1325 case ':':
1326 return (scm_read_keyword (chr, port));
1327 case '0': case '1': case '2': case '3': case '4':
1328 case '5': case '6': case '7': case '8': case '9':
1329 case '@':
1330 #if SCM_ENABLE_DEPRECATED
1331 /* See below for 'i' and 'e'. */
1332 case 'a':
1333 case 'y':
1334 case 'h':
1335 case 'l':
1336 #endif
1337 return (scm_read_array (chr, port, line, column));
1338
1339 case 'i':
1340 case 'e':
1341 #if SCM_ENABLE_DEPRECATED
1342 {
1343 /* When next char is '(', it really is an old-style
1344 uniform array. */
1345 scm_t_wchar next_c = scm_getc (port);
1346 if (next_c != EOF)
1347 scm_ungetc (next_c, port);
1348 if (next_c == '(')
1349 return scm_read_array (chr, port, line, column);
1350 /* Fall through. */
1351 }
1352 #endif
1353 case 'b':
1354 case 'B':
1355 case 'o':
1356 case 'O':
1357 case 'd':
1358 case 'D':
1359 case 'x':
1360 case 'X':
1361 case 'I':
1362 case 'E':
1363 return (scm_read_number_and_radix (chr, port));
1364 case '{':
1365 return (scm_read_extended_symbol (chr, port));
1366 case '!':
1367 return (scm_read_shebang (chr, port));
1368 case ';':
1369 return (scm_read_commented_expression (chr, port));
1370 case '`':
1371 case '\'':
1372 case ',':
1373 return (scm_read_syntax (chr, port));
1374 case 'n':
1375 return (scm_read_nil (chr, port));
1376 default:
1377 result = scm_read_sharp_extension (chr, port);
1378 if (scm_is_eq (result, SCM_UNSPECIFIED))
1379 {
1380 /* To remain compatible with 1.8 and earlier, the following
1381 characters have lower precedence than `read-hash-extend'
1382 characters. */
1383 switch (chr)
1384 {
1385 case '|':
1386 return scm_read_r6rs_block_comment (chr, port);
1387 default:
1388 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1389 scm_list_1 (SCM_MAKE_CHAR (chr)));
1390 }
1391 }
1392 else
1393 return result;
1394 }
1395
1396 return SCM_UNSPECIFIED;
1397 }
1398 #undef FUNC_NAME
1399
1400 static SCM
1401 scm_read_expression (SCM port)
1402 #define FUNC_NAME "scm_read_expression"
1403 {
1404 while (1)
1405 {
1406 scm_t_wchar chr;
1407
1408 chr = scm_getc (port);
1409
1410 switch (chr)
1411 {
1412 case SCM_WHITE_SPACES:
1413 case SCM_LINE_INCREMENTORS:
1414 break;
1415 case ';':
1416 (void) scm_read_semicolon_comment (chr, port);
1417 break;
1418 case '[':
1419 if (!SCM_SQUARE_BRACKETS_P)
1420 return (scm_read_mixed_case_symbol (chr, port));
1421 /* otherwise fall through */
1422 case '(':
1423 return (scm_read_sexp (chr, port));
1424 case '"':
1425 return (scm_read_string (chr, port));
1426 case '\'':
1427 case '`':
1428 case ',':
1429 return (scm_read_quote (chr, port));
1430 case '#':
1431 {
1432 long line = SCM_LINUM (port);
1433 int column = SCM_COL (port) - 1;
1434 SCM result = scm_read_sharp (chr, port, line, column);
1435 if (scm_is_eq (result, SCM_UNSPECIFIED))
1436 /* We read a comment or some such. */
1437 break;
1438 else
1439 return result;
1440 }
1441 case ')':
1442 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1443 break;
1444 case ']':
1445 if (SCM_SQUARE_BRACKETS_P)
1446 scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1447 /* otherwise fall through */
1448 case EOF:
1449 return SCM_EOF_VAL;
1450 case ':':
1451 if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1452 return scm_symbol_to_keyword (scm_read_expression (port));
1453 /* Fall through. */
1454
1455 default:
1456 {
1457 if (((chr >= '0') && (chr <= '9'))
1458 || (strchr ("+-.", chr)))
1459 return (scm_read_number (chr, port));
1460 else
1461 return (scm_read_mixed_case_symbol (chr, port));
1462 }
1463 }
1464 }
1465 }
1466 #undef FUNC_NAME
1467
1468 \f
1469 /* Actual reader. */
1470
1471 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1472 (SCM port),
1473 "Read an s-expression from the input port @var{port}, or from\n"
1474 "the current input port if @var{port} is not specified.\n"
1475 "Any whitespace before the next token is discarded.")
1476 #define FUNC_NAME s_scm_read
1477 {
1478 int c;
1479
1480 if (SCM_UNBNDP (port))
1481 port = scm_current_input_port ();
1482 SCM_VALIDATE_OPINPORT (1, port);
1483
1484 c = flush_ws (port, (char *) NULL);
1485 if (EOF == c)
1486 return SCM_EOF_VAL;
1487 scm_ungetc (c, port);
1488
1489 return (scm_read_expression (port));
1490 }
1491 #undef FUNC_NAME
1492
1493
1494 \f
1495
1496 /* Manipulate the read-hash-procedures alist. This could be written in
1497 Scheme, but maybe it will also be used by C code during initialisation. */
1498 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1499 (SCM chr, SCM proc),
1500 "Install the procedure @var{proc} for reading expressions\n"
1501 "starting with the character sequence @code{#} and @var{chr}.\n"
1502 "@var{proc} will be called with two arguments: the character\n"
1503 "@var{chr} and the port to read further data from. The object\n"
1504 "returned will be the return value of @code{read}. \n"
1505 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1506 )
1507 #define FUNC_NAME s_scm_read_hash_extend
1508 {
1509 SCM this;
1510 SCM prev;
1511
1512 SCM_VALIDATE_CHAR (1, chr);
1513 SCM_ASSERT (scm_is_false (proc)
1514 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1515 proc, SCM_ARG2, FUNC_NAME);
1516
1517 /* Check if chr is already in the alist. */
1518 this = scm_i_read_hash_procedures_ref ();
1519 prev = SCM_BOOL_F;
1520 while (1)
1521 {
1522 if (scm_is_null (this))
1523 {
1524 /* not found, so add it to the beginning. */
1525 if (scm_is_true (proc))
1526 {
1527 SCM new = scm_cons (scm_cons (chr, proc),
1528 scm_i_read_hash_procedures_ref ());
1529 scm_i_read_hash_procedures_set_x (new);
1530 }
1531 break;
1532 }
1533 if (scm_is_eq (chr, SCM_CAAR (this)))
1534 {
1535 /* already in the alist. */
1536 if (scm_is_false (proc))
1537 {
1538 /* remove it. */
1539 if (scm_is_false (prev))
1540 {
1541 SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1542 scm_i_read_hash_procedures_set_x (rest);
1543 }
1544 else
1545 scm_set_cdr_x (prev, SCM_CDR (this));
1546 }
1547 else
1548 {
1549 /* replace it. */
1550 scm_set_cdr_x (SCM_CAR (this), proc);
1551 }
1552 break;
1553 }
1554 prev = this;
1555 this = SCM_CDR (this);
1556 }
1557
1558 return SCM_UNSPECIFIED;
1559 }
1560 #undef FUNC_NAME
1561
1562 /* Recover the read-hash procedure corresponding to char c. */
1563 static SCM
1564 scm_get_hash_procedure (int c)
1565 {
1566 SCM rest = scm_i_read_hash_procedures_ref ();
1567
1568 while (1)
1569 {
1570 if (scm_is_null (rest))
1571 return SCM_BOOL_F;
1572
1573 if (SCM_CHAR (SCM_CAAR (rest)) == c)
1574 return SCM_CDAR (rest);
1575
1576 rest = SCM_CDR (rest);
1577 }
1578 }
1579
1580 #define SCM_ENCODING_SEARCH_SIZE (500)
1581
1582 /* Search the first few hundred characters of a file for an Emacs-like coding
1583 declaration. Returns either NULL or a string whose storage has been
1584 allocated with `scm_gc_malloc ()'. */
1585 char *
1586 scm_i_scan_for_encoding (SCM port)
1587 {
1588 scm_t_port *pt;
1589 char header[SCM_ENCODING_SEARCH_SIZE+1];
1590 size_t bytes_read, encoding_length, i;
1591 char *encoding = NULL;
1592 int utf8_bom = 0;
1593 char *pos, *encoding_start;
1594 int in_comment;
1595
1596 pt = SCM_PTAB_ENTRY (port);
1597
1598 if (pt->rw_active == SCM_PORT_WRITE)
1599 scm_flush (port);
1600
1601 if (pt->rw_random)
1602 pt->rw_active = SCM_PORT_READ;
1603
1604 if (pt->read_pos == pt->read_end)
1605 {
1606 /* We can use the read buffer, and thus avoid a seek. */
1607 if (scm_fill_input (port) == EOF)
1608 return NULL;
1609
1610 bytes_read = pt->read_end - pt->read_pos;
1611 if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
1612 bytes_read = SCM_ENCODING_SEARCH_SIZE;
1613
1614 if (bytes_read <= 1)
1615 /* An unbuffered port -- don't scan. */
1616 return NULL;
1617
1618 memcpy (header, pt->read_pos, bytes_read);
1619 header[bytes_read] = '\0';
1620 }
1621 else
1622 {
1623 /* Try to read some bytes and then seek back. Not all ports
1624 support seeking back; and indeed some file ports (like
1625 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
1626 check performed by SCM_FPORT_FDES---but fail to seek
1627 backwards. Hence this block comes second. We prefer to use
1628 the read buffer in-place. */
1629 if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1630 return NULL;
1631
1632 bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1633 header[bytes_read] = '\0';
1634 scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1635 }
1636
1637 if (bytes_read > 3
1638 && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1639 utf8_bom = 1;
1640
1641 /* search past "coding[:=]" */
1642 pos = header;
1643 while (1)
1644 {
1645 if ((pos = strstr(pos, "coding")) == NULL)
1646 return NULL;
1647
1648 pos += strlen("coding");
1649 if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1650 (*pos == ':' || *pos == '='))
1651 {
1652 pos ++;
1653 break;
1654 }
1655 }
1656
1657 /* skip spaces */
1658 while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1659 (*pos == ' ' || *pos == '\t'))
1660 pos ++;
1661
1662 /* grab the next token */
1663 encoding_start = pos;
1664 i = 0;
1665 while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
1666 && encoding_start + i - header < bytes_read
1667 && (isalnum ((int) encoding_start[i])
1668 || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
1669 i++;
1670
1671 encoding_length = i;
1672 if (encoding_length == 0)
1673 return NULL;
1674
1675 encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
1676 for (i = 0; i < encoding_length; i++)
1677 encoding[i] = toupper ((int) encoding[i]);
1678
1679 /* push backwards to make sure we were in a comment */
1680 in_comment = 0;
1681 pos = encoding_start;
1682 while (pos >= header)
1683 {
1684 if (*pos == ';')
1685 {
1686 in_comment = 1;
1687 break;
1688 }
1689 else if (*pos == '\n' || pos == header)
1690 {
1691 /* This wasn't in a semicolon comment. Check for a
1692 hash-bang comment. */
1693 char *beg = strstr (header, "#!");
1694 char *end = strstr (header, "!#");
1695 if (beg < encoding_start && encoding_start + encoding_length <= end)
1696 in_comment = 1;
1697 break;
1698 }
1699 else
1700 {
1701 pos --;
1702 continue;
1703 }
1704 }
1705 if (!in_comment)
1706 /* This wasn't in a comment */
1707 return NULL;
1708
1709 if (utf8_bom && strcmp(encoding, "UTF-8"))
1710 scm_misc_error (NULL,
1711 "the port input declares the encoding ~s but is encoded as UTF-8",
1712 scm_list_1 (scm_from_locale_string (encoding)));
1713
1714 return encoding;
1715 }
1716
1717 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1718 (SCM port),
1719 "Scans the port for an Emacs-like character coding declaration\n"
1720 "near the top of the contents of a port with random-accessible contents.\n"
1721 "The coding declaration is of the form\n"
1722 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1723 "\n"
1724 "Returns a string containing the character encoding of the file\n"
1725 "if a declaration was found, or @code{#f} otherwise.\n")
1726 #define FUNC_NAME s_scm_file_encoding
1727 {
1728 char *enc;
1729 SCM s_enc;
1730
1731 SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
1732
1733 enc = scm_i_scan_for_encoding (port);
1734 if (enc == NULL)
1735 return SCM_BOOL_F;
1736 else
1737 {
1738 s_enc = scm_from_locale_string (enc);
1739 return s_enc;
1740 }
1741
1742 return SCM_BOOL_F;
1743 }
1744 #undef FUNC_NAME
1745
1746 void
1747 scm_init_read ()
1748 {
1749 SCM read_hash_procs;
1750
1751 read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
1752
1753 scm_i_read_hash_procedures =
1754 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
1755
1756 scm_init_opts (scm_read_options, scm_read_opts);
1757 #include "libguile/read.x"
1758 }
1759
1760 /*
1761 Local Variables:
1762 c-file-style: "gnu"
1763 End:
1764 */