Remove inline and register attributes from read.c
[bpt/guile.git] / libguile / read.c
1 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
2 * 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
18 */
19
20
21 \f
22
23 #ifdef HAVE_CONFIG_H
24 # include <config.h>
25 #endif
26
27 #include <stdio.h>
28 #include <ctype.h>
29 #include <string.h>
30 #include <unistd.h>
31 #include <unicase.h>
32 #include <unictype.h>
33
34 #include "libguile/_scm.h"
35 #include "libguile/bytevectors.h"
36 #include "libguile/chars.h"
37 #include "libguile/eval.h"
38 #include "libguile/arrays.h"
39 #include "libguile/bitvectors.h"
40 #include "libguile/keywords.h"
41 #include "libguile/alist.h"
42 #include "libguile/srcprop.h"
43 #include "libguile/hashtab.h"
44 #include "libguile/hash.h"
45 #include "libguile/ports.h"
46 #include "libguile/fports.h"
47 #include "libguile/root.h"
48 #include "libguile/strings.h"
49 #include "libguile/strports.h"
50 #include "libguile/vectors.h"
51 #include "libguile/validate.h"
52 #include "libguile/srfi-4.h"
53 #include "libguile/srfi-13.h"
54
55 #include "libguile/read.h"
56 #include "libguile/private-options.h"
57
58
59 \f
60
61 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
62 SCM_SYMBOL (scm_keyword_prefix, "prefix");
63 SCM_SYMBOL (scm_keyword_postfix, "postfix");
64 SCM_SYMBOL (sym_nil, "nil");
65
66 scm_t_option scm_read_opts[] = {
67 { SCM_OPTION_BOOLEAN, "copy", 0,
68 "Copy source code expressions." },
69 { SCM_OPTION_BOOLEAN, "positions", 1,
70 "Record positions of source code expressions." },
71 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
72 "Convert symbols to lower case."},
73 { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
74 "Style of keyword recognition: #f, 'prefix or 'postfix."},
75 { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
76 "Use R6RS variable-length character and string hex escapes."},
77 { SCM_OPTION_BOOLEAN, "square-brackets", 1,
78 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
79 { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
80 "In strings, consume leading whitespace after an escaped end-of-line."},
81 { 0, },
82 };
83
84 /*
85 Give meaningful error messages for errors
86
87 We use the format
88
89 FILE:LINE:COL: MESSAGE
90 This happened in ....
91
92 This is not standard GNU format, but the test-suite likes the real
93 message to be in front.
94
95 */
96
97
98 void
99 scm_i_input_error (char const *function,
100 SCM port, const char *message, SCM arg)
101 {
102 SCM fn = (scm_is_string (SCM_FILENAME(port))
103 ? SCM_FILENAME(port)
104 : scm_from_locale_string ("#<unknown port>"));
105
106 SCM string_port = scm_open_output_string ();
107 SCM string = SCM_EOL;
108 scm_simple_format (string_port,
109 scm_from_locale_string ("~A:~S:~S: ~A"),
110 scm_list_4 (fn,
111 scm_from_long (SCM_LINUM (port) + 1),
112 scm_from_int (SCM_COL (port) + 1),
113 scm_from_locale_string (message)));
114
115 string = scm_get_output_string (string_port);
116 scm_close_output_port (string_port);
117 scm_error_scm (scm_from_latin1_symbol ("read-error"),
118 function? scm_from_locale_string (function) : SCM_BOOL_F,
119 string,
120 arg,
121 SCM_BOOL_F);
122 }
123
124
125 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
126 (SCM setting),
127 "Option interface for the read options. Instead of using\n"
128 "this procedure directly, use the procedures @code{read-enable},\n"
129 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
130 #define FUNC_NAME s_scm_read_options
131 {
132 SCM ans = scm_options (setting,
133 scm_read_opts,
134 FUNC_NAME);
135 if (SCM_COPY_SOURCE_P)
136 SCM_RECORD_POSITIONS_P = 1;
137 return ans;
138 }
139 #undef FUNC_NAME
140
141 /* A fluid referring to an association list mapping extra hash
142 characters to procedures. */
143 static SCM *scm_i_read_hash_procedures;
144
145 static SCM
146 scm_i_read_hash_procedures_ref (void)
147 {
148 return scm_fluid_ref (*scm_i_read_hash_procedures);
149 }
150
151 static void
152 scm_i_read_hash_procedures_set_x (SCM value)
153 {
154 scm_fluid_set_x (*scm_i_read_hash_procedures, value);
155 }
156
157 \f
158 /* Token readers. */
159
160
161 /* Size of the C buffer used to read symbols and numbers. */
162 #define READER_BUFFER_SIZE 128
163
164 /* Size of the C buffer used to read strings. */
165 #define READER_STRING_BUFFER_SIZE 512
166
167 /* The maximum size of Scheme character names. */
168 #define READER_CHAR_NAME_MAX_SIZE 50
169
170
171 /* `isblank' is only in C99. */
172 #define CHAR_IS_BLANK_(_chr) \
173 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
174 || ((_chr) == '\f') || ((_chr) == '\r'))
175
176 #ifdef MSDOS
177 # define CHAR_IS_BLANK(_chr) \
178 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
179 #else
180 # define CHAR_IS_BLANK CHAR_IS_BLANK_
181 #endif
182
183
184 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
185 structure''). */
186 #define CHAR_IS_R5RS_DELIMITER(c) \
187 (CHAR_IS_BLANK (c) \
188 || (c == ')') || (c == '(') || (c == ';') || (c == '"') \
189 || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
190
191 #define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
192
193 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
194 Structure''. */
195 #define CHAR_IS_EXPONENT_MARKER(_chr) \
196 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
197 || ((_chr) == 'd') || ((_chr) == 'l'))
198
199 /* Read an SCSH block comment. */
200 static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
201 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
202 static SCM scm_read_commented_expression (scm_t_wchar, SCM);
203 static SCM scm_read_shebang (scm_t_wchar, SCM);
204 static SCM scm_get_hash_procedure (int);
205
206 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
207 result in the pre-allocated buffer BUF. Return zero if the whole token has
208 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
209 bytes actually read. */
210 static int
211 read_token (SCM port, char *buf, const size_t buf_size, size_t *read)
212 {
213 *read = 0;
214
215 while (*read < buf_size)
216 {
217 int chr;
218
219 chr = scm_get_byte_or_eof (port);
220
221 if (chr == EOF)
222 return 0;
223 else if (CHAR_IS_DELIMITER (chr))
224 {
225 scm_unget_byte (chr, port);
226 return 0;
227 }
228 else
229 {
230 *buf = (char) chr;
231 buf++, (*read)++;
232 }
233 }
234
235 return 1;
236 }
237
238 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
239 result in the pre-allocated buffer BUFFER, if the whole token has fewer than
240 BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
241 caller. Return zero if the token fits in BUFFER, non-zero otherwise. READ
242 will be set the number of bytes actually read. */
243 static int
244 read_complete_token (SCM port, char *buffer, const size_t buffer_size,
245 char **overflow_buffer, size_t *read)
246 {
247 int overflow = 0;
248 size_t bytes_read, overflow_size;
249
250 *overflow_buffer = NULL;
251 overflow_size = 0;
252
253 do
254 {
255 overflow = read_token (port, buffer, buffer_size, &bytes_read);
256 if (bytes_read == 0)
257 break;
258 if (overflow || overflow_size != 0)
259 {
260 if (overflow_size == 0)
261 {
262 *overflow_buffer = scm_malloc (bytes_read);
263 memcpy (*overflow_buffer, buffer, bytes_read);
264 overflow_size = bytes_read;
265 }
266 else
267 {
268 *overflow_buffer = scm_realloc (*overflow_buffer, overflow_size + bytes_read);
269 memcpy (*overflow_buffer + overflow_size, buffer, bytes_read);
270 overflow_size += bytes_read;
271 }
272 }
273 }
274 while (overflow);
275
276 if (overflow_size)
277 *read = overflow_size;
278 else
279 *read = bytes_read;
280
281 return (overflow_size != 0);
282 }
283
284 /* Skip whitespace from PORT and return the first non-whitespace character
285 read. Raise an error on end-of-file. */
286 static int
287 flush_ws (SCM port, const char *eoferr)
288 {
289 scm_t_wchar c;
290 while (1)
291 switch (c = scm_getc (port))
292 {
293 case EOF:
294 goteof:
295 if (eoferr)
296 {
297 scm_i_input_error (eoferr,
298 port,
299 "end of file",
300 SCM_EOL);
301 }
302 return c;
303
304 case ';':
305 lp:
306 switch (c = scm_getc (port))
307 {
308 case EOF:
309 goto goteof;
310 default:
311 goto lp;
312 case SCM_LINE_INCREMENTORS:
313 break;
314 }
315 break;
316
317 case '#':
318 switch (c = scm_getc (port))
319 {
320 case EOF:
321 eoferr = "read_sharp";
322 goto goteof;
323 case '!':
324 scm_read_shebang (c, port);
325 break;
326 case ';':
327 scm_read_commented_expression (c, port);
328 break;
329 case '|':
330 if (scm_is_false (scm_get_hash_procedure (c)))
331 {
332 scm_read_r6rs_block_comment (c, port);
333 break;
334 }
335 /* fall through */
336 default:
337 scm_ungetc (c, port);
338 return '#';
339 }
340 break;
341
342 case SCM_LINE_INCREMENTORS:
343 case SCM_SINGLE_SPACES:
344 case '\t':
345 break;
346
347 default:
348 return c;
349 }
350
351 return 0;
352 }
353
354
355 \f
356 /* Token readers. */
357
358 static SCM scm_read_expression (SCM port);
359 static SCM scm_read_sharp (int chr, SCM port);
360
361
362 static SCM
363 scm_read_sexp (scm_t_wchar chr, SCM port)
364 #define FUNC_NAME "scm_i_lreadparen"
365 {
366 int c;
367 SCM tmp, tl, ans = SCM_EOL;
368 const int terminating_char = ((chr == '[') ? ']' : ')');
369
370 /* Need to capture line and column numbers here. */
371 long line = SCM_LINUM (port);
372 int column = SCM_COL (port) - 1;
373
374 c = flush_ws (port, FUNC_NAME);
375 if (terminating_char == c)
376 return SCM_EOL;
377
378 scm_ungetc (c, port);
379 tmp = scm_read_expression (port);
380
381 /* Note that it is possible for scm_read_expression to return
382 scm_sym_dot, but not as part of a dotted pair: as in #{.}#. So
383 check that it's a real dot by checking `c'. */
384 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
385 {
386 ans = scm_read_expression (port);
387 if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
388 scm_i_input_error (FUNC_NAME, port, "missing close paren",
389 SCM_EOL);
390 return ans;
391 }
392
393 /* Build the head of the list structure. */
394 ans = tl = scm_cons (tmp, SCM_EOL);
395
396 while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
397 {
398 SCM new_tail;
399
400 if (c == ')' || (SCM_SQUARE_BRACKETS_P && c == ']'))
401 scm_i_input_error (FUNC_NAME, port,
402 "in pair: mismatched close paren: ~A",
403 scm_list_1 (SCM_MAKE_CHAR (c)));
404
405 scm_ungetc (c, port);
406 tmp = scm_read_expression (port);
407
408 /* See above note about scm_sym_dot. */
409 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
410 {
411 SCM_SETCDR (tl, scm_read_expression (port));
412
413 c = flush_ws (port, FUNC_NAME);
414 if (terminating_char != c)
415 scm_i_input_error (FUNC_NAME, port,
416 "in pair: missing close paren", SCM_EOL);
417 goto exit;
418 }
419
420 new_tail = scm_cons (tmp, SCM_EOL);
421 SCM_SETCDR (tl, new_tail);
422 tl = new_tail;
423 }
424
425 exit:
426 if (SCM_RECORD_POSITIONS_P)
427 scm_i_set_source_properties_x (ans, line, column, SCM_FILENAME (port));
428
429 return ans;
430 }
431 #undef FUNC_NAME
432
433
434 /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
435 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
436 found. */
437 #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
438 do \
439 { \
440 scm_t_wchar a; \
441 size_t i = 0; \
442 c = 0; \
443 while (i < ndigits) \
444 { \
445 a = scm_getc (port); \
446 if (a == EOF) \
447 goto str_eof; \
448 if (terminator \
449 && (a == (scm_t_wchar) terminator) \
450 && (i > 0)) \
451 break; \
452 if ('0' <= a && a <= '9') \
453 a -= '0'; \
454 else if ('A' <= a && a <= 'F') \
455 a = a - 'A' + 10; \
456 else if ('a' <= a && a <= 'f') \
457 a = a - 'a' + 10; \
458 else \
459 { \
460 c = a; \
461 goto bad_escaped; \
462 } \
463 c = c * 16 + a; \
464 i ++; \
465 } \
466 } while (0)
467
468 static void
469 skip_intraline_whitespace (SCM port)
470 {
471 scm_t_wchar c;
472
473 do
474 {
475 c = scm_getc (port);
476 if (c == EOF)
477 return;
478 }
479 while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
480
481 scm_ungetc (c, port);
482 }
483
484 static SCM
485 scm_read_string (int chr, SCM port)
486 #define FUNC_NAME "scm_lreadr"
487 {
488 /* For strings smaller than C_STR, this function creates only one Scheme
489 object (the string returned). */
490
491 SCM str = SCM_BOOL_F;
492 unsigned c_str_len = 0;
493 scm_t_wchar c;
494
495 str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
496 while ('"' != (c = scm_getc (port)))
497 {
498 if (c == EOF)
499 {
500 str_eof:
501 scm_i_input_error (FUNC_NAME, port,
502 "end of file in string constant", SCM_EOL);
503 }
504
505 if (c_str_len + 1 >= scm_i_string_length (str))
506 {
507 SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
508
509 str = scm_string_append (scm_list_2 (str, addy));
510 }
511
512 if (c == '\\')
513 {
514 switch (c = scm_getc (port))
515 {
516 case EOF:
517 goto str_eof;
518 case '"':
519 case '\\':
520 break;
521 case '\n':
522 if (SCM_HUNGRY_EOL_ESCAPES_P)
523 skip_intraline_whitespace (port);
524 continue;
525 case '0':
526 c = '\0';
527 break;
528 case 'f':
529 c = '\f';
530 break;
531 case 'n':
532 c = '\n';
533 break;
534 case 'r':
535 c = '\r';
536 break;
537 case 't':
538 c = '\t';
539 break;
540 case 'a':
541 c = '\007';
542 break;
543 case 'v':
544 c = '\v';
545 break;
546 case 'b':
547 c = '\010';
548 break;
549 case 'x':
550 if (SCM_R6RS_ESCAPES_P)
551 SCM_READ_HEX_ESCAPE (10, ';');
552 else
553 SCM_READ_HEX_ESCAPE (2, '\0');
554 break;
555 case 'u':
556 if (!SCM_R6RS_ESCAPES_P)
557 {
558 SCM_READ_HEX_ESCAPE (4, '\0');
559 break;
560 }
561 case 'U':
562 if (!SCM_R6RS_ESCAPES_P)
563 {
564 SCM_READ_HEX_ESCAPE (6, '\0');
565 break;
566 }
567 default:
568 bad_escaped:
569 scm_i_input_error (FUNC_NAME, port,
570 "illegal character in escape sequence: ~S",
571 scm_list_1 (SCM_MAKE_CHAR (c)));
572 }
573 }
574 str = scm_i_string_start_writing (str);
575 scm_i_string_set_x (str, c_str_len++, c);
576 scm_i_string_stop_writing ();
577 }
578
579 if (c_str_len > 0)
580 {
581 return scm_i_substring_copy (str, 0, c_str_len);
582 }
583
584 return scm_nullstr;
585 }
586 #undef FUNC_NAME
587
588
589 static SCM
590 scm_read_number (scm_t_wchar chr, SCM port)
591 {
592 SCM result, str = SCM_EOL;
593 char buffer[READER_BUFFER_SIZE];
594 char *overflow_buffer = NULL;
595 size_t bytes_read;
596 int overflow;
597 scm_t_port *pt = SCM_PTAB_ENTRY (port);
598
599 scm_ungetc (chr, port);
600 overflow = read_complete_token (port, buffer, sizeof (buffer),
601 &overflow_buffer, &bytes_read);
602
603 if (!overflow)
604 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
605 else
606 str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
607 pt->ilseq_handler);
608
609 result = scm_string_to_number (str, SCM_UNDEFINED);
610 if (!scm_is_true (result))
611 {
612 /* Return a symbol instead of a number */
613 if (SCM_CASE_INSENSITIVE_P)
614 str = scm_string_downcase_x (str);
615 result = scm_string_to_symbol (str);
616 }
617
618 if (overflow)
619 free (overflow_buffer);
620 SCM_COL (port) += scm_i_string_length (str);
621 return result;
622 }
623
624 static SCM
625 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
626 {
627 SCM result;
628 int ends_with_colon = 0;
629 size_t bytes_read;
630 int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
631 int overflow;
632 char buffer[READER_BUFFER_SIZE], *overflow_buffer;
633 scm_t_port *pt = SCM_PTAB_ENTRY (port);
634 SCM str;
635
636 scm_ungetc (chr, port);
637 overflow = read_complete_token (port, buffer, READER_BUFFER_SIZE,
638 &overflow_buffer, &bytes_read);
639 if (bytes_read > 0)
640 {
641 if (!overflow)
642 ends_with_colon = buffer[bytes_read - 1] == ':';
643 else
644 ends_with_colon = overflow_buffer[bytes_read - 1] == ':';
645 }
646
647 if (postfix && ends_with_colon && (bytes_read > 1))
648 {
649 if (!overflow)
650 str = scm_from_stringn (buffer, bytes_read - 1, pt->encoding, pt->ilseq_handler);
651 else
652 str = scm_from_stringn (overflow_buffer, bytes_read - 1, pt->encoding,
653 pt->ilseq_handler);
654
655 if (SCM_CASE_INSENSITIVE_P)
656 str = scm_string_downcase_x (str);
657 result = scm_symbol_to_keyword (scm_string_to_symbol (str));
658 }
659 else
660 {
661 if (!overflow)
662 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
663 else
664 str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
665 pt->ilseq_handler);
666
667 if (SCM_CASE_INSENSITIVE_P)
668 str = scm_string_downcase_x (str);
669 result = scm_string_to_symbol (str);
670 }
671
672 if (overflow)
673 free (overflow_buffer);
674 SCM_COL (port) += scm_i_string_length (str);
675 return result;
676 }
677
678 static SCM
679 scm_read_number_and_radix (scm_t_wchar chr, SCM port)
680 #define FUNC_NAME "scm_lreadr"
681 {
682 SCM result;
683 size_t read;
684 char buffer[READER_BUFFER_SIZE], *overflow_buffer;
685 int overflow;
686 unsigned int radix;
687 SCM str;
688 scm_t_port *pt;
689
690 switch (chr)
691 {
692 case 'B':
693 case 'b':
694 radix = 2;
695 break;
696
697 case 'o':
698 case 'O':
699 radix = 8;
700 break;
701
702 case 'd':
703 case 'D':
704 radix = 10;
705 break;
706
707 case 'x':
708 case 'X':
709 radix = 16;
710 break;
711
712 default:
713 scm_ungetc (chr, port);
714 scm_ungetc ('#', port);
715 radix = 10;
716 }
717
718 overflow = read_complete_token (port, buffer, sizeof (buffer),
719 &overflow_buffer, &read);
720
721 pt = SCM_PTAB_ENTRY (port);
722 if (!overflow)
723 str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
724 else
725 str = scm_from_stringn (overflow_buffer, read, pt->encoding,
726 pt->ilseq_handler);
727
728 result = scm_string_to_number (str, scm_from_uint (radix));
729
730 if (overflow)
731 free (overflow_buffer);
732
733 SCM_COL (port) += scm_i_string_length (str);
734
735 if (scm_is_true (result))
736 return result;
737
738 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
739
740 return SCM_BOOL_F;
741 }
742 #undef FUNC_NAME
743
744 static SCM
745 scm_read_quote (int chr, SCM port)
746 {
747 SCM p;
748 long line = SCM_LINUM (port);
749 int column = SCM_COL (port) - 1;
750
751 switch (chr)
752 {
753 case '`':
754 p = scm_sym_quasiquote;
755 break;
756
757 case '\'':
758 p = scm_sym_quote;
759 break;
760
761 case ',':
762 {
763 scm_t_wchar c;
764
765 c = scm_getc (port);
766 if ('@' == c)
767 p = scm_sym_uq_splicing;
768 else
769 {
770 scm_ungetc (c, port);
771 p = scm_sym_unquote;
772 }
773 break;
774 }
775
776 default:
777 fprintf (stderr, "%s: unhandled quote character (%i)\n",
778 "scm_read_quote", chr);
779 abort ();
780 }
781
782 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
783 if (SCM_RECORD_POSITIONS_P)
784 scm_i_set_source_properties_x (p, line, column, SCM_FILENAME (port));
785
786 return p;
787 }
788
789 SCM_SYMBOL (sym_syntax, "syntax");
790 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
791 SCM_SYMBOL (sym_unsyntax, "unsyntax");
792 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
793
794 static SCM
795 scm_read_syntax (int chr, SCM port)
796 {
797 SCM p;
798 long line = SCM_LINUM (port);
799 int column = SCM_COL (port) - 1;
800
801 switch (chr)
802 {
803 case '`':
804 p = sym_quasisyntax;
805 break;
806
807 case '\'':
808 p = sym_syntax;
809 break;
810
811 case ',':
812 {
813 int c;
814
815 c = scm_getc (port);
816 if ('@' == c)
817 p = sym_unsyntax_splicing;
818 else
819 {
820 scm_ungetc (c, port);
821 p = sym_unsyntax;
822 }
823 break;
824 }
825
826 default:
827 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
828 "scm_read_syntax", chr);
829 abort ();
830 }
831
832 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
833 if (SCM_RECORD_POSITIONS_P)
834 scm_i_set_source_properties_x (p, line, column, SCM_FILENAME (port));
835
836 return p;
837 }
838
839 static SCM
840 scm_read_nil (int chr, SCM port)
841 {
842 SCM id = scm_read_mixed_case_symbol (chr, port);
843
844 if (!scm_is_eq (id, sym_nil))
845 scm_i_input_error ("scm_read_nil", port,
846 "unexpected input while reading #nil: ~a",
847 scm_list_1 (id));
848
849 return SCM_ELISP_NIL;
850 }
851
852 static SCM
853 scm_read_semicolon_comment (int chr, SCM port)
854 {
855 int c;
856
857 /* We use the get_byte here because there is no need to get the
858 locale correct with comment input. This presumes that newline
859 always represents itself no matter what the encoding is. */
860 for (c = scm_get_byte_or_eof (port);
861 (c != EOF) && (c != '\n');
862 c = scm_get_byte_or_eof (port));
863
864 return SCM_UNSPECIFIED;
865 }
866
867 \f
868 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
869
870 static SCM
871 scm_read_boolean (int chr, SCM port)
872 {
873 switch (chr)
874 {
875 case 't':
876 case 'T':
877 return SCM_BOOL_T;
878
879 case 'f':
880 case 'F':
881 return SCM_BOOL_F;
882 }
883
884 return SCM_UNSPECIFIED;
885 }
886
887 static SCM
888 scm_read_character (scm_t_wchar chr, SCM port)
889 #define FUNC_NAME "scm_lreadr"
890 {
891 char buffer[READER_CHAR_NAME_MAX_SIZE];
892 SCM charname;
893 size_t charname_len, bytes_read;
894 scm_t_wchar cp;
895 int overflow;
896 scm_t_port *pt;
897
898 overflow = read_token (port, buffer, READER_CHAR_NAME_MAX_SIZE, &bytes_read);
899 if (overflow)
900 scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
901
902 if (bytes_read == 0)
903 {
904 chr = scm_getc (port);
905 if (chr == EOF)
906 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
907 "while reading character", SCM_EOL);
908
909 /* CHR must be a token delimiter, like a whitespace. */
910 return (SCM_MAKE_CHAR (chr));
911 }
912
913 pt = SCM_PTAB_ENTRY (port);
914
915 /* Simple ASCII characters can be processed immediately. Also, simple
916 ISO-8859-1 characters can be processed immediately if the encoding for this
917 port is ISO-8859-1. */
918 if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
919 {
920 SCM_COL (port) += 1;
921 return SCM_MAKE_CHAR (buffer[0]);
922 }
923
924 /* Otherwise, convert the buffer into a proper scheme string for
925 processing. */
926 charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
927 pt->ilseq_handler);
928 charname_len = scm_i_string_length (charname);
929 SCM_COL (port) += charname_len;
930 cp = scm_i_string_ref (charname, 0);
931 if (charname_len == 1)
932 return SCM_MAKE_CHAR (cp);
933
934 /* Ignore dotted circles, which may be used to keep combining characters from
935 combining with the backslash in #\charname. */
936 if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
937 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
938
939 if (cp >= '0' && cp < '8')
940 {
941 /* Dirk:FIXME:: This type of character syntax is not R5RS
942 * compliant. Further, it should be verified that the constant
943 * does only consist of octal digits. */
944 SCM p = scm_string_to_number (charname, scm_from_uint (8));
945 if (SCM_I_INUMP (p))
946 {
947 scm_t_wchar c = scm_to_uint32 (p);
948 if (SCM_IS_UNICODE_CHAR (c))
949 return SCM_MAKE_CHAR (c);
950 else
951 scm_i_input_error (FUNC_NAME, port,
952 "out-of-range octal character escape: ~a",
953 scm_list_1 (charname));
954 }
955 }
956
957 if (cp == 'x' && (charname_len > 1))
958 {
959 SCM p;
960
961 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
962 p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
963 scm_from_uint (16));
964 if (SCM_I_INUMP (p))
965 {
966 scm_t_wchar c = scm_to_uint32 (p);
967 if (SCM_IS_UNICODE_CHAR (c))
968 return SCM_MAKE_CHAR (c);
969 else
970 scm_i_input_error (FUNC_NAME, port,
971 "out-of-range hex character escape: ~a",
972 scm_list_1 (charname));
973 }
974 }
975
976 /* The names of characters should never have non-Latin1
977 characters. */
978 if (scm_i_is_narrow_string (charname)
979 || scm_i_try_narrow_string (charname))
980 { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
981 charname_len);
982 if (scm_is_true (ch))
983 return ch;
984 }
985
986 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
987 scm_list_1 (charname));
988
989 return SCM_UNSPECIFIED;
990 }
991 #undef FUNC_NAME
992
993 static SCM
994 scm_read_keyword (int chr, SCM port)
995 {
996 SCM symbol;
997
998 /* Read the symbol that comprises the keyword. Doing this instead of
999 invoking a specific symbol reader function allows `scm_read_keyword ()'
1000 to adapt to the delimiters currently valid of symbols.
1001
1002 XXX: This implementation allows sloppy syntaxes like `#: key'. */
1003 symbol = scm_read_expression (port);
1004 if (!scm_is_symbol (symbol))
1005 scm_i_input_error ("scm_read_keyword", port,
1006 "keyword prefix `~a' not followed by a symbol: ~s",
1007 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1008
1009 return (scm_symbol_to_keyword (symbol));
1010 }
1011
1012 static SCM
1013 scm_read_vector (int chr, SCM port)
1014 {
1015 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1016 guarantee that it's going to do what we want. After all, this is an
1017 implementation detail of `scm_read_vector ()', not a desirable
1018 property. */
1019 return (scm_vector (scm_read_sexp (chr, port)));
1020 }
1021
1022 static SCM
1023 scm_read_srfi4_vector (int chr, SCM port)
1024 {
1025 return scm_i_read_array (port, chr);
1026 }
1027
1028 static SCM
1029 scm_read_bytevector (scm_t_wchar chr, SCM port)
1030 {
1031 chr = scm_getc (port);
1032 if (chr != 'u')
1033 goto syntax;
1034
1035 chr = scm_getc (port);
1036 if (chr != '8')
1037 goto syntax;
1038
1039 chr = scm_getc (port);
1040 if (chr != '(')
1041 goto syntax;
1042
1043 return scm_u8_list_to_bytevector (scm_read_sexp (chr, port));
1044
1045 syntax:
1046 scm_i_input_error ("read_bytevector", port,
1047 "invalid bytevector prefix",
1048 SCM_MAKE_CHAR (chr));
1049 return SCM_UNSPECIFIED;
1050 }
1051
1052 static SCM
1053 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port)
1054 {
1055 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1056 terribly inefficient but who cares? */
1057 SCM s_bits = SCM_EOL;
1058
1059 for (chr = scm_getc (port);
1060 (chr != EOF) && ((chr == '0') || (chr == '1'));
1061 chr = scm_getc (port))
1062 {
1063 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1064 }
1065
1066 if (chr != EOF)
1067 scm_ungetc (chr, port);
1068
1069 return scm_bitvector (scm_reverse_x (s_bits, SCM_EOL));
1070 }
1071
1072 static SCM
1073 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1074 {
1075 int bang_seen = 0;
1076
1077 for (;;)
1078 {
1079 int c = scm_getc (port);
1080
1081 if (c == EOF)
1082 scm_i_input_error ("skip_block_comment", port,
1083 "unterminated `#! ... !#' comment", SCM_EOL);
1084
1085 if (c == '!')
1086 bang_seen = 1;
1087 else if (c == '#' && bang_seen)
1088 break;
1089 else
1090 bang_seen = 0;
1091 }
1092
1093 return SCM_UNSPECIFIED;
1094 }
1095
1096 static SCM
1097 scm_read_shebang (scm_t_wchar chr, SCM port)
1098 {
1099 int c = 0;
1100 if ((c = scm_get_byte_or_eof (port)) != 'r')
1101 {
1102 scm_ungetc (c, port);
1103 return scm_read_scsh_block_comment (chr, port);
1104 }
1105 if ((c = scm_get_byte_or_eof (port)) != '6')
1106 {
1107 scm_ungetc (c, port);
1108 scm_ungetc ('r', port);
1109 return scm_read_scsh_block_comment (chr, port);
1110 }
1111 if ((c = scm_get_byte_or_eof (port)) != 'r')
1112 {
1113 scm_ungetc (c, port);
1114 scm_ungetc ('6', port);
1115 scm_ungetc ('r', port);
1116 return scm_read_scsh_block_comment (chr, port);
1117 }
1118 if ((c = scm_get_byte_or_eof (port)) != 's')
1119 {
1120 scm_ungetc (c, port);
1121 scm_ungetc ('r', port);
1122 scm_ungetc ('6', port);
1123 scm_ungetc ('r', port);
1124 return scm_read_scsh_block_comment (chr, port);
1125 }
1126
1127 return SCM_UNSPECIFIED;
1128 }
1129
1130 static SCM
1131 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1132 {
1133 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1134 nested. So care must be taken. */
1135 int nesting_level = 1;
1136
1137 int a = scm_getc (port);
1138
1139 if (a == EOF)
1140 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1141 "unterminated `#| ... |#' comment", SCM_EOL);
1142
1143 while (nesting_level > 0)
1144 {
1145 int b = scm_getc (port);
1146
1147 if (b == EOF)
1148 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1149 "unterminated `#| ... |#' comment", SCM_EOL);
1150
1151 if (a == '|' && b == '#')
1152 {
1153 nesting_level--;
1154 b = EOF;
1155 }
1156 else if (a == '#' && b == '|')
1157 {
1158 nesting_level++;
1159 b = EOF;
1160 }
1161
1162 a = b;
1163 }
1164
1165 return SCM_UNSPECIFIED;
1166 }
1167
1168 static SCM
1169 scm_read_commented_expression (scm_t_wchar chr, SCM port)
1170 {
1171 scm_t_wchar c;
1172
1173 c = flush_ws (port, (char *) NULL);
1174 if (EOF == c)
1175 scm_i_input_error ("read_commented_expression", port,
1176 "no expression after #; comment", SCM_EOL);
1177 scm_ungetc (c, port);
1178 scm_read_expression (port);
1179 return SCM_UNSPECIFIED;
1180 }
1181
1182 static SCM
1183 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1184 {
1185 /* Guile's extended symbol read syntax looks like this:
1186
1187 #{This is all a symbol name}#
1188
1189 So here, CHR is expected to be `{'. */
1190 int saw_brace = 0;
1191 size_t len = 0;
1192 SCM buf = scm_i_make_string (1024, NULL, 0);
1193
1194 buf = scm_i_string_start_writing (buf);
1195
1196 while ((chr = scm_getc (port)) != EOF)
1197 {
1198 if (saw_brace)
1199 {
1200 if (chr == '#')
1201 {
1202 break;
1203 }
1204 else
1205 {
1206 saw_brace = 0;
1207 scm_i_string_set_x (buf, len++, '}');
1208 }
1209 }
1210
1211 if (chr == '}')
1212 saw_brace = 1;
1213 else if (chr == '\\')
1214 {
1215 /* It used to be that print.c would print extended-read-syntax
1216 symbols with backslashes before "non-standard" chars, but
1217 this routine wouldn't do anything with those escapes.
1218 Bummer. What we've done is to change print.c to output
1219 R6RS hex escapes for those characters, relying on the fact
1220 that the extended read syntax would never put a `\' before
1221 an `x'. For now, we just ignore other instances of
1222 backslash in the string. */
1223 switch ((chr = scm_getc (port)))
1224 {
1225 case EOF:
1226 goto done;
1227 case 'x':
1228 {
1229 scm_t_wchar c;
1230
1231 SCM_READ_HEX_ESCAPE (10, ';');
1232 scm_i_string_set_x (buf, len++, c);
1233 break;
1234
1235 str_eof:
1236 chr = EOF;
1237 goto done;
1238
1239 bad_escaped:
1240 scm_i_string_stop_writing ();
1241 scm_i_input_error ("scm_read_extended_symbol", port,
1242 "illegal character in escape sequence: ~S",
1243 scm_list_1 (SCM_MAKE_CHAR (c)));
1244 break;
1245 }
1246 default:
1247 scm_i_string_set_x (buf, len++, chr);
1248 break;
1249 }
1250 }
1251 else
1252 scm_i_string_set_x (buf, len++, chr);
1253
1254 if (len >= scm_i_string_length (buf) - 2)
1255 {
1256 SCM addy;
1257
1258 scm_i_string_stop_writing ();
1259 addy = scm_i_make_string (1024, NULL, 0);
1260 buf = scm_string_append (scm_list_2 (buf, addy));
1261 len = 0;
1262 buf = scm_i_string_start_writing (buf);
1263 }
1264 }
1265
1266 done:
1267 scm_i_string_stop_writing ();
1268 if (chr == EOF)
1269 scm_i_input_error ("scm_read_extended_symbol", port,
1270 "end of file while reading symbol", SCM_EOL);
1271
1272 return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1273 }
1274
1275
1276 \f
1277 /* Top-level token readers, i.e., dispatchers. */
1278
1279 static SCM
1280 scm_read_sharp_extension (int chr, SCM port)
1281 {
1282 SCM proc;
1283
1284 proc = scm_get_hash_procedure (chr);
1285 if (scm_is_true (scm_procedure_p (proc)))
1286 {
1287 long line = SCM_LINUM (port);
1288 int column = SCM_COL (port) - 2;
1289 SCM got;
1290
1291 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1292
1293 if (scm_is_pair (got) && !scm_i_has_source_properties (got))
1294 scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1295
1296 return got;
1297 }
1298
1299 return SCM_UNSPECIFIED;
1300 }
1301
1302 /* The reader for the sharp `#' character. It basically dispatches reads
1303 among the above token readers. */
1304 static SCM
1305 scm_read_sharp (scm_t_wchar chr, SCM port)
1306 #define FUNC_NAME "scm_lreadr"
1307 {
1308 SCM result;
1309
1310 chr = scm_getc (port);
1311
1312 result = scm_read_sharp_extension (chr, port);
1313 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1314 return result;
1315
1316 switch (chr)
1317 {
1318 case '\\':
1319 return (scm_read_character (chr, port));
1320 case '(':
1321 return (scm_read_vector (chr, port));
1322 case 's':
1323 case 'u':
1324 case 'f':
1325 case 'c':
1326 /* This one may return either a boolean or an SRFI-4 vector. */
1327 return (scm_read_srfi4_vector (chr, port));
1328 case 'v':
1329 return (scm_read_bytevector (chr, port));
1330 case '*':
1331 return (scm_read_guile_bit_vector (chr, port));
1332 case 't':
1333 case 'T':
1334 case 'F':
1335 return (scm_read_boolean (chr, port));
1336 case ':':
1337 return (scm_read_keyword (chr, port));
1338 case '0': case '1': case '2': case '3': case '4':
1339 case '5': case '6': case '7': case '8': case '9':
1340 case '@':
1341 #if SCM_ENABLE_DEPRECATED
1342 /* See below for 'i' and 'e'. */
1343 case 'a':
1344 case 'y':
1345 case 'h':
1346 case 'l':
1347 #endif
1348 return (scm_i_read_array (port, chr));
1349
1350 case 'i':
1351 case 'e':
1352 #if SCM_ENABLE_DEPRECATED
1353 {
1354 /* When next char is '(', it really is an old-style
1355 uniform array. */
1356 scm_t_wchar next_c = scm_getc (port);
1357 if (next_c != EOF)
1358 scm_ungetc (next_c, port);
1359 if (next_c == '(')
1360 return scm_i_read_array (port, chr);
1361 /* Fall through. */
1362 }
1363 #endif
1364 case 'b':
1365 case 'B':
1366 case 'o':
1367 case 'O':
1368 case 'd':
1369 case 'D':
1370 case 'x':
1371 case 'X':
1372 case 'I':
1373 case 'E':
1374 return (scm_read_number_and_radix (chr, port));
1375 case '{':
1376 return (scm_read_extended_symbol (chr, port));
1377 case '!':
1378 return (scm_read_shebang (chr, port));
1379 case ';':
1380 return (scm_read_commented_expression (chr, port));
1381 case '`':
1382 case '\'':
1383 case ',':
1384 return (scm_read_syntax (chr, port));
1385 case 'n':
1386 return (scm_read_nil (chr, port));
1387 default:
1388 result = scm_read_sharp_extension (chr, port);
1389 if (scm_is_eq (result, SCM_UNSPECIFIED))
1390 {
1391 /* To remain compatible with 1.8 and earlier, the following
1392 characters have lower precedence than `read-hash-extend'
1393 characters. */
1394 switch (chr)
1395 {
1396 case '|':
1397 return scm_read_r6rs_block_comment (chr, port);
1398 default:
1399 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1400 scm_list_1 (SCM_MAKE_CHAR (chr)));
1401 }
1402 }
1403 else
1404 return result;
1405 }
1406
1407 return SCM_UNSPECIFIED;
1408 }
1409 #undef FUNC_NAME
1410
1411 static SCM
1412 scm_read_expression (SCM port)
1413 #define FUNC_NAME "scm_read_expression"
1414 {
1415 while (1)
1416 {
1417 scm_t_wchar chr;
1418
1419 chr = scm_getc (port);
1420
1421 switch (chr)
1422 {
1423 case SCM_WHITE_SPACES:
1424 case SCM_LINE_INCREMENTORS:
1425 break;
1426 case ';':
1427 (void) scm_read_semicolon_comment (chr, port);
1428 break;
1429 case '[':
1430 if (!SCM_SQUARE_BRACKETS_P)
1431 return (scm_read_mixed_case_symbol (chr, port));
1432 /* otherwise fall through */
1433 case '(':
1434 return (scm_read_sexp (chr, port));
1435 case '"':
1436 return (scm_read_string (chr, port));
1437 case '\'':
1438 case '`':
1439 case ',':
1440 return (scm_read_quote (chr, port));
1441 case '#':
1442 {
1443 SCM result;
1444 result = scm_read_sharp (chr, port);
1445 if (scm_is_eq (result, SCM_UNSPECIFIED))
1446 /* We read a comment or some such. */
1447 break;
1448 else
1449 return result;
1450 }
1451 case ')':
1452 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1453 break;
1454 case ']':
1455 if (SCM_SQUARE_BRACKETS_P)
1456 scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1457 /* otherwise fall through */
1458 case EOF:
1459 return SCM_EOF_VAL;
1460 case ':':
1461 if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1462 return scm_symbol_to_keyword (scm_read_expression (port));
1463 /* Fall through. */
1464
1465 default:
1466 {
1467 if (((chr >= '0') && (chr <= '9'))
1468 || (strchr ("+-.", chr)))
1469 return (scm_read_number (chr, port));
1470 else
1471 return (scm_read_mixed_case_symbol (chr, port));
1472 }
1473 }
1474 }
1475 }
1476 #undef FUNC_NAME
1477
1478 \f
1479 /* Actual reader. */
1480
1481 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1482 (SCM port),
1483 "Read an s-expression from the input port @var{port}, or from\n"
1484 "the current input port if @var{port} is not specified.\n"
1485 "Any whitespace before the next token is discarded.")
1486 #define FUNC_NAME s_scm_read
1487 {
1488 int c;
1489
1490 if (SCM_UNBNDP (port))
1491 port = scm_current_input_port ();
1492 SCM_VALIDATE_OPINPORT (1, port);
1493
1494 c = flush_ws (port, (char *) NULL);
1495 if (EOF == c)
1496 return SCM_EOF_VAL;
1497 scm_ungetc (c, port);
1498
1499 return (scm_read_expression (port));
1500 }
1501 #undef FUNC_NAME
1502
1503
1504 \f
1505
1506 /* Manipulate the read-hash-procedures alist. This could be written in
1507 Scheme, but maybe it will also be used by C code during initialisation. */
1508 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1509 (SCM chr, SCM proc),
1510 "Install the procedure @var{proc} for reading expressions\n"
1511 "starting with the character sequence @code{#} and @var{chr}.\n"
1512 "@var{proc} will be called with two arguments: the character\n"
1513 "@var{chr} and the port to read further data from. The object\n"
1514 "returned will be the return value of @code{read}. \n"
1515 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1516 )
1517 #define FUNC_NAME s_scm_read_hash_extend
1518 {
1519 SCM this;
1520 SCM prev;
1521
1522 SCM_VALIDATE_CHAR (1, chr);
1523 SCM_ASSERT (scm_is_false (proc)
1524 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1525 proc, SCM_ARG2, FUNC_NAME);
1526
1527 /* Check if chr is already in the alist. */
1528 this = scm_i_read_hash_procedures_ref ();
1529 prev = SCM_BOOL_F;
1530 while (1)
1531 {
1532 if (scm_is_null (this))
1533 {
1534 /* not found, so add it to the beginning. */
1535 if (scm_is_true (proc))
1536 {
1537 SCM new = scm_cons (scm_cons (chr, proc),
1538 scm_i_read_hash_procedures_ref ());
1539 scm_i_read_hash_procedures_set_x (new);
1540 }
1541 break;
1542 }
1543 if (scm_is_eq (chr, SCM_CAAR (this)))
1544 {
1545 /* already in the alist. */
1546 if (scm_is_false (proc))
1547 {
1548 /* remove it. */
1549 if (scm_is_false (prev))
1550 {
1551 SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1552 scm_i_read_hash_procedures_set_x (rest);
1553 }
1554 else
1555 scm_set_cdr_x (prev, SCM_CDR (this));
1556 }
1557 else
1558 {
1559 /* replace it. */
1560 scm_set_cdr_x (SCM_CAR (this), proc);
1561 }
1562 break;
1563 }
1564 prev = this;
1565 this = SCM_CDR (this);
1566 }
1567
1568 return SCM_UNSPECIFIED;
1569 }
1570 #undef FUNC_NAME
1571
1572 /* Recover the read-hash procedure corresponding to char c. */
1573 static SCM
1574 scm_get_hash_procedure (int c)
1575 {
1576 SCM rest = scm_i_read_hash_procedures_ref ();
1577
1578 while (1)
1579 {
1580 if (scm_is_null (rest))
1581 return SCM_BOOL_F;
1582
1583 if (SCM_CHAR (SCM_CAAR (rest)) == c)
1584 return SCM_CDAR (rest);
1585
1586 rest = SCM_CDR (rest);
1587 }
1588 }
1589
1590 #define SCM_ENCODING_SEARCH_SIZE (500)
1591
1592 /* Search the first few hundred characters of a file for an Emacs-like coding
1593 declaration. Returns either NULL or a string whose storage has been
1594 allocated with `scm_gc_malloc ()'. */
1595 char *
1596 scm_i_scan_for_encoding (SCM port)
1597 {
1598 scm_t_port *pt;
1599 char header[SCM_ENCODING_SEARCH_SIZE+1];
1600 size_t bytes_read, encoding_length, i;
1601 char *encoding = NULL;
1602 int utf8_bom = 0;
1603 char *pos, *encoding_start;
1604 int in_comment;
1605
1606 pt = SCM_PTAB_ENTRY (port);
1607
1608 if (pt->rw_active == SCM_PORT_WRITE)
1609 scm_flush (port);
1610
1611 if (pt->rw_random)
1612 pt->rw_active = SCM_PORT_READ;
1613
1614 if (pt->read_pos == pt->read_end)
1615 {
1616 /* We can use the read buffer, and thus avoid a seek. */
1617 if (scm_fill_input (port) == EOF)
1618 return NULL;
1619
1620 bytes_read = pt->read_end - pt->read_pos;
1621 if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
1622 bytes_read = SCM_ENCODING_SEARCH_SIZE;
1623
1624 if (bytes_read <= 1)
1625 /* An unbuffered port -- don't scan. */
1626 return NULL;
1627
1628 memcpy (header, pt->read_pos, bytes_read);
1629 header[bytes_read] = '\0';
1630 }
1631 else
1632 {
1633 /* Try to read some bytes and then seek back. Not all ports
1634 support seeking back; and indeed some file ports (like
1635 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
1636 check performed by SCM_FPORT_FDES---but fail to seek
1637 backwards. Hence this block comes second. We prefer to use
1638 the read buffer in-place. */
1639 if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1640 return NULL;
1641
1642 bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1643 header[bytes_read] = '\0';
1644 scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1645 }
1646
1647 if (bytes_read > 3
1648 && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1649 utf8_bom = 1;
1650
1651 /* search past "coding[:=]" */
1652 pos = header;
1653 while (1)
1654 {
1655 if ((pos = strstr(pos, "coding")) == NULL)
1656 return NULL;
1657
1658 pos += strlen("coding");
1659 if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1660 (*pos == ':' || *pos == '='))
1661 {
1662 pos ++;
1663 break;
1664 }
1665 }
1666
1667 /* skip spaces */
1668 while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1669 (*pos == ' ' || *pos == '\t'))
1670 pos ++;
1671
1672 /* grab the next token */
1673 encoding_start = pos;
1674 i = 0;
1675 while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
1676 && encoding_start + i - header < bytes_read
1677 && (isalnum ((int) encoding_start[i])
1678 || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
1679 i++;
1680
1681 encoding_length = i;
1682 if (encoding_length == 0)
1683 return NULL;
1684
1685 encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
1686 for (i = 0; i < encoding_length; i++)
1687 encoding[i] = toupper ((int) encoding[i]);
1688
1689 /* push backwards to make sure we were in a comment */
1690 in_comment = 0;
1691 pos = encoding_start;
1692 while (pos >= header)
1693 {
1694 if (*pos == ';')
1695 {
1696 in_comment = 1;
1697 break;
1698 }
1699 else if (*pos == '\n' || pos == header)
1700 {
1701 /* This wasn't in a semicolon comment. Check for a
1702 hash-bang comment. */
1703 char *beg = strstr (header, "#!");
1704 char *end = strstr (header, "!#");
1705 if (beg < encoding_start && encoding_start + encoding_length <= end)
1706 in_comment = 1;
1707 break;
1708 }
1709 else
1710 {
1711 pos --;
1712 continue;
1713 }
1714 }
1715 if (!in_comment)
1716 /* This wasn't in a comment */
1717 return NULL;
1718
1719 if (utf8_bom && strcmp(encoding, "UTF-8"))
1720 scm_misc_error (NULL,
1721 "the port input declares the encoding ~s but is encoded as UTF-8",
1722 scm_list_1 (scm_from_locale_string (encoding)));
1723
1724 return encoding;
1725 }
1726
1727 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1728 (SCM port),
1729 "Scans the port for an Emacs-like character coding declaration\n"
1730 "near the top of the contents of a port with random-accessible contents.\n"
1731 "The coding declaration is of the form\n"
1732 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1733 "\n"
1734 "Returns a string containing the character encoding of the file\n"
1735 "if a declaration was found, or @code{#f} otherwise.\n")
1736 #define FUNC_NAME s_scm_file_encoding
1737 {
1738 char *enc;
1739 SCM s_enc;
1740
1741 SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
1742
1743 enc = scm_i_scan_for_encoding (port);
1744 if (enc == NULL)
1745 return SCM_BOOL_F;
1746 else
1747 {
1748 s_enc = scm_from_locale_string (enc);
1749 return s_enc;
1750 }
1751
1752 return SCM_BOOL_F;
1753 }
1754 #undef FUNC_NAME
1755
1756 void
1757 scm_init_read ()
1758 {
1759 SCM read_hash_procs;
1760
1761 read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
1762
1763 scm_i_read_hash_procedures =
1764 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
1765
1766 scm_init_opts (scm_read_options, scm_read_opts);
1767 #include "libguile/read.x"
1768 }
1769
1770 /*
1771 Local Variables:
1772 c-file-style: "gnu"
1773 End:
1774 */