Unoptimize 'read' to return freshly allocated empty strings
[bpt/guile.git] / libguile / read.c
1 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
2 * 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
18 */
19
20
21 \f
22
23 #ifdef HAVE_CONFIG_H
24 # include <config.h>
25 #endif
26
27 #include <stdio.h>
28 #include <ctype.h>
29 #include <string.h>
30 #include <unistd.h>
31 #include <unicase.h>
32 #include <unictype.h>
33
34 #include "libguile/_scm.h"
35 #include "libguile/bytevectors.h"
36 #include "libguile/chars.h"
37 #include "libguile/eval.h"
38 #include "libguile/arrays.h"
39 #include "libguile/bitvectors.h"
40 #include "libguile/keywords.h"
41 #include "libguile/alist.h"
42 #include "libguile/srcprop.h"
43 #include "libguile/hashtab.h"
44 #include "libguile/hash.h"
45 #include "libguile/ports.h"
46 #include "libguile/fports.h"
47 #include "libguile/root.h"
48 #include "libguile/strings.h"
49 #include "libguile/strports.h"
50 #include "libguile/vectors.h"
51 #include "libguile/validate.h"
52 #include "libguile/srfi-4.h"
53 #include "libguile/srfi-13.h"
54
55 #include "libguile/read.h"
56 #include "libguile/private-options.h"
57
58
59 \f
60
61 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
62 SCM_SYMBOL (scm_keyword_prefix, "prefix");
63 SCM_SYMBOL (scm_keyword_postfix, "postfix");
64 SCM_SYMBOL (sym_nil, "nil");
65
66 scm_t_option scm_read_opts[] = {
67 { SCM_OPTION_BOOLEAN, "copy", 0,
68 "Copy source code expressions." },
69 { SCM_OPTION_BOOLEAN, "positions", 1,
70 "Record positions of source code expressions." },
71 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
72 "Convert symbols to lower case."},
73 { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
74 "Style of keyword recognition: #f, 'prefix or 'postfix."},
75 { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
76 "Use R6RS variable-length character and string hex escapes."},
77 { SCM_OPTION_BOOLEAN, "square-brackets", 1,
78 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
79 { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
80 "In strings, consume leading whitespace after an escaped end-of-line."},
81 { 0, },
82 };
83
84 /*
85 Give meaningful error messages for errors
86
87 We use the format
88
89 FILE:LINE:COL: MESSAGE
90 This happened in ....
91
92 This is not standard GNU format, but the test-suite likes the real
93 message to be in front.
94
95 */
96
97
98 void
99 scm_i_input_error (char const *function,
100 SCM port, const char *message, SCM arg)
101 {
102 SCM fn = (scm_is_string (SCM_FILENAME(port))
103 ? SCM_FILENAME(port)
104 : scm_from_locale_string ("#<unknown port>"));
105
106 SCM string_port = scm_open_output_string ();
107 SCM string = SCM_EOL;
108 scm_simple_format (string_port,
109 scm_from_locale_string ("~A:~S:~S: ~A"),
110 scm_list_4 (fn,
111 scm_from_long (SCM_LINUM (port) + 1),
112 scm_from_int (SCM_COL (port) + 1),
113 scm_from_locale_string (message)));
114
115 string = scm_get_output_string (string_port);
116 scm_close_output_port (string_port);
117 scm_error_scm (scm_from_latin1_symbol ("read-error"),
118 function? scm_from_locale_string (function) : SCM_BOOL_F,
119 string,
120 arg,
121 SCM_BOOL_F);
122 }
123
124
125 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
126 (SCM setting),
127 "Option interface for the read options. Instead of using\n"
128 "this procedure directly, use the procedures @code{read-enable},\n"
129 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
130 #define FUNC_NAME s_scm_read_options
131 {
132 SCM ans = scm_options (setting,
133 scm_read_opts,
134 FUNC_NAME);
135 if (SCM_COPY_SOURCE_P)
136 SCM_RECORD_POSITIONS_P = 1;
137 return ans;
138 }
139 #undef FUNC_NAME
140
141 /* A fluid referring to an association list mapping extra hash
142 characters to procedures. */
143 static SCM *scm_i_read_hash_procedures;
144
145 static SCM
146 scm_i_read_hash_procedures_ref (void)
147 {
148 return scm_fluid_ref (*scm_i_read_hash_procedures);
149 }
150
151 static void
152 scm_i_read_hash_procedures_set_x (SCM value)
153 {
154 scm_fluid_set_x (*scm_i_read_hash_procedures, value);
155 }
156
157 \f
158 /* Token readers. */
159
160
161 /* Size of the C buffer used to read symbols and numbers. */
162 #define READER_BUFFER_SIZE 128
163
164 /* Size of the C buffer used to read strings. */
165 #define READER_STRING_BUFFER_SIZE 512
166
167 /* The maximum size of Scheme character names. */
168 #define READER_CHAR_NAME_MAX_SIZE 50
169
170
171 /* `isblank' is only in C99. */
172 #define CHAR_IS_BLANK_(_chr) \
173 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
174 || ((_chr) == '\f') || ((_chr) == '\r'))
175
176 #ifdef MSDOS
177 # define CHAR_IS_BLANK(_chr) \
178 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
179 #else
180 # define CHAR_IS_BLANK CHAR_IS_BLANK_
181 #endif
182
183
184 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
185 structure''). */
186 #define CHAR_IS_R5RS_DELIMITER(c) \
187 (CHAR_IS_BLANK (c) \
188 || (c == ')') || (c == '(') || (c == ';') || (c == '"') \
189 || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
190
191 #define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
192
193 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
194 Structure''. */
195 #define CHAR_IS_EXPONENT_MARKER(_chr) \
196 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
197 || ((_chr) == 'd') || ((_chr) == 'l'))
198
199 /* Read an SCSH block comment. */
200 static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
201 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
202 static SCM scm_read_commented_expression (scm_t_wchar, SCM);
203 static SCM scm_read_shebang (scm_t_wchar, SCM);
204 static SCM scm_get_hash_procedure (int);
205
206 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
207 result in the pre-allocated buffer BUF. Return zero if the whole token has
208 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
209 bytes actually read. */
210 static int
211 read_token (SCM port, char *buf, const size_t buf_size, size_t *read)
212 {
213 *read = 0;
214
215 while (*read < buf_size)
216 {
217 int chr;
218
219 chr = scm_get_byte_or_eof (port);
220
221 if (chr == EOF)
222 return 0;
223 else if (CHAR_IS_DELIMITER (chr))
224 {
225 scm_unget_byte (chr, port);
226 return 0;
227 }
228 else
229 {
230 *buf = (char) chr;
231 buf++, (*read)++;
232 }
233 }
234
235 return 1;
236 }
237
238 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
239 result in the pre-allocated buffer BUFFER, if the whole token has fewer than
240 BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
241 caller. Return zero if the token fits in BUFFER, non-zero otherwise. READ
242 will be set the number of bytes actually read. */
243 static int
244 read_complete_token (SCM port, char *buffer, const size_t buffer_size,
245 char **overflow_buffer, size_t *read)
246 {
247 int overflow = 0;
248 size_t bytes_read, overflow_size;
249
250 *overflow_buffer = NULL;
251 overflow_size = 0;
252
253 do
254 {
255 overflow = read_token (port, buffer, buffer_size, &bytes_read);
256 if (bytes_read == 0)
257 break;
258 if (overflow || overflow_size != 0)
259 {
260 if (overflow_size == 0)
261 {
262 *overflow_buffer = scm_malloc (bytes_read);
263 memcpy (*overflow_buffer, buffer, bytes_read);
264 overflow_size = bytes_read;
265 }
266 else
267 {
268 *overflow_buffer = scm_realloc (*overflow_buffer, overflow_size + bytes_read);
269 memcpy (*overflow_buffer + overflow_size, buffer, bytes_read);
270 overflow_size += bytes_read;
271 }
272 }
273 }
274 while (overflow);
275
276 if (overflow_size)
277 *read = overflow_size;
278 else
279 *read = bytes_read;
280
281 return (overflow_size != 0);
282 }
283
284 /* Skip whitespace from PORT and return the first non-whitespace character
285 read. Raise an error on end-of-file. */
286 static int
287 flush_ws (SCM port, const char *eoferr)
288 {
289 scm_t_wchar c;
290 while (1)
291 switch (c = scm_getc (port))
292 {
293 case EOF:
294 goteof:
295 if (eoferr)
296 {
297 scm_i_input_error (eoferr,
298 port,
299 "end of file",
300 SCM_EOL);
301 }
302 return c;
303
304 case ';':
305 lp:
306 switch (c = scm_getc (port))
307 {
308 case EOF:
309 goto goteof;
310 default:
311 goto lp;
312 case SCM_LINE_INCREMENTORS:
313 break;
314 }
315 break;
316
317 case '#':
318 switch (c = scm_getc (port))
319 {
320 case EOF:
321 eoferr = "read_sharp";
322 goto goteof;
323 case '!':
324 scm_read_shebang (c, port);
325 break;
326 case ';':
327 scm_read_commented_expression (c, port);
328 break;
329 case '|':
330 if (scm_is_false (scm_get_hash_procedure (c)))
331 {
332 scm_read_r6rs_block_comment (c, port);
333 break;
334 }
335 /* fall through */
336 default:
337 scm_ungetc (c, port);
338 return '#';
339 }
340 break;
341
342 case SCM_LINE_INCREMENTORS:
343 case SCM_SINGLE_SPACES:
344 case '\t':
345 break;
346
347 default:
348 return c;
349 }
350
351 return 0;
352 }
353
354
355 \f
356 /* Token readers. */
357
358 static SCM scm_read_expression (SCM port);
359 static SCM scm_read_sharp (int chr, SCM port);
360
361
362 static SCM
363 maybe_annotate_source (SCM x, SCM port, long line, int column)
364 {
365 if (SCM_RECORD_POSITIONS_P)
366 scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
367 return x;
368 }
369
370 static SCM
371 scm_read_sexp (scm_t_wchar chr, SCM port)
372 #define FUNC_NAME "scm_i_lreadparen"
373 {
374 int c;
375 SCM tmp, tl, ans = SCM_EOL;
376 const int terminating_char = ((chr == '[') ? ']' : ')');
377
378 /* Need to capture line and column numbers here. */
379 long line = SCM_LINUM (port);
380 int column = SCM_COL (port) - 1;
381
382 c = flush_ws (port, FUNC_NAME);
383 if (terminating_char == c)
384 return SCM_EOL;
385
386 scm_ungetc (c, port);
387 tmp = scm_read_expression (port);
388
389 /* Note that it is possible for scm_read_expression to return
390 scm_sym_dot, but not as part of a dotted pair: as in #{.}#. So
391 check that it's a real dot by checking `c'. */
392 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
393 {
394 ans = scm_read_expression (port);
395 if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
396 scm_i_input_error (FUNC_NAME, port, "missing close paren",
397 SCM_EOL);
398 return ans;
399 }
400
401 /* Build the head of the list structure. */
402 ans = tl = scm_cons (tmp, SCM_EOL);
403
404 while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
405 {
406 SCM new_tail;
407
408 if (c == ')' || (SCM_SQUARE_BRACKETS_P && c == ']'))
409 scm_i_input_error (FUNC_NAME, port,
410 "in pair: mismatched close paren: ~A",
411 scm_list_1 (SCM_MAKE_CHAR (c)));
412
413 scm_ungetc (c, port);
414 tmp = scm_read_expression (port);
415
416 /* See above note about scm_sym_dot. */
417 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
418 {
419 SCM_SETCDR (tl, scm_read_expression (port));
420
421 c = flush_ws (port, FUNC_NAME);
422 if (terminating_char != c)
423 scm_i_input_error (FUNC_NAME, port,
424 "in pair: missing close paren", SCM_EOL);
425 goto exit;
426 }
427
428 new_tail = scm_cons (tmp, SCM_EOL);
429 SCM_SETCDR (tl, new_tail);
430 tl = new_tail;
431 }
432
433 exit:
434 return maybe_annotate_source (ans, port, line, column);
435 }
436 #undef FUNC_NAME
437
438
439 /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
440 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
441 found. */
442 #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
443 do \
444 { \
445 scm_t_wchar a; \
446 size_t i = 0; \
447 c = 0; \
448 while (i < ndigits) \
449 { \
450 a = scm_getc (port); \
451 if (a == EOF) \
452 goto str_eof; \
453 if (terminator \
454 && (a == (scm_t_wchar) terminator) \
455 && (i > 0)) \
456 break; \
457 if ('0' <= a && a <= '9') \
458 a -= '0'; \
459 else if ('A' <= a && a <= 'F') \
460 a = a - 'A' + 10; \
461 else if ('a' <= a && a <= 'f') \
462 a = a - 'a' + 10; \
463 else \
464 { \
465 c = a; \
466 goto bad_escaped; \
467 } \
468 c = c * 16 + a; \
469 i ++; \
470 } \
471 } while (0)
472
473 static void
474 skip_intraline_whitespace (SCM port)
475 {
476 scm_t_wchar c;
477
478 do
479 {
480 c = scm_getc (port);
481 if (c == EOF)
482 return;
483 }
484 while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
485
486 scm_ungetc (c, port);
487 }
488
489 static SCM
490 scm_read_string (int chr, SCM port)
491 #define FUNC_NAME "scm_lreadr"
492 {
493 /* For strings smaller than C_STR, this function creates only one Scheme
494 object (the string returned). */
495
496 SCM str = SCM_BOOL_F;
497 unsigned c_str_len = 0;
498 scm_t_wchar c;
499
500 str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
501 while ('"' != (c = scm_getc (port)))
502 {
503 if (c == EOF)
504 {
505 str_eof:
506 scm_i_input_error (FUNC_NAME, port,
507 "end of file in string constant", SCM_EOL);
508 }
509
510 if (c_str_len + 1 >= scm_i_string_length (str))
511 {
512 SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
513
514 str = scm_string_append (scm_list_2 (str, addy));
515 }
516
517 if (c == '\\')
518 {
519 switch (c = scm_getc (port))
520 {
521 case EOF:
522 goto str_eof;
523 case '"':
524 case '\\':
525 break;
526 case '\n':
527 if (SCM_HUNGRY_EOL_ESCAPES_P)
528 skip_intraline_whitespace (port);
529 continue;
530 case '0':
531 c = '\0';
532 break;
533 case 'f':
534 c = '\f';
535 break;
536 case 'n':
537 c = '\n';
538 break;
539 case 'r':
540 c = '\r';
541 break;
542 case 't':
543 c = '\t';
544 break;
545 case 'a':
546 c = '\007';
547 break;
548 case 'v':
549 c = '\v';
550 break;
551 case 'b':
552 c = '\010';
553 break;
554 case 'x':
555 if (SCM_R6RS_ESCAPES_P)
556 SCM_READ_HEX_ESCAPE (10, ';');
557 else
558 SCM_READ_HEX_ESCAPE (2, '\0');
559 break;
560 case 'u':
561 if (!SCM_R6RS_ESCAPES_P)
562 {
563 SCM_READ_HEX_ESCAPE (4, '\0');
564 break;
565 }
566 case 'U':
567 if (!SCM_R6RS_ESCAPES_P)
568 {
569 SCM_READ_HEX_ESCAPE (6, '\0');
570 break;
571 }
572 default:
573 bad_escaped:
574 scm_i_input_error (FUNC_NAME, port,
575 "illegal character in escape sequence: ~S",
576 scm_list_1 (SCM_MAKE_CHAR (c)));
577 }
578 }
579 str = scm_i_string_start_writing (str);
580 scm_i_string_set_x (str, c_str_len++, c);
581 scm_i_string_stop_writing ();
582 }
583 return scm_i_substring_copy (str, 0, c_str_len);
584 }
585 #undef FUNC_NAME
586
587
588 static SCM
589 scm_read_number (scm_t_wchar chr, SCM port)
590 {
591 SCM result, str = SCM_EOL;
592 char buffer[READER_BUFFER_SIZE];
593 char *overflow_buffer = NULL;
594 size_t bytes_read;
595 int overflow;
596 scm_t_port *pt = SCM_PTAB_ENTRY (port);
597
598 scm_ungetc (chr, port);
599 overflow = read_complete_token (port, buffer, sizeof (buffer),
600 &overflow_buffer, &bytes_read);
601
602 if (!overflow)
603 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
604 else
605 str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
606 pt->ilseq_handler);
607
608 result = scm_string_to_number (str, SCM_UNDEFINED);
609 if (!scm_is_true (result))
610 {
611 /* Return a symbol instead of a number */
612 if (SCM_CASE_INSENSITIVE_P)
613 str = scm_string_downcase_x (str);
614 result = scm_string_to_symbol (str);
615 }
616
617 if (overflow)
618 free (overflow_buffer);
619 SCM_COL (port) += scm_i_string_length (str);
620 return result;
621 }
622
623 static SCM
624 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
625 {
626 SCM result;
627 int ends_with_colon = 0;
628 size_t bytes_read;
629 int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
630 int overflow;
631 char buffer[READER_BUFFER_SIZE], *overflow_buffer;
632 scm_t_port *pt = SCM_PTAB_ENTRY (port);
633 SCM str;
634
635 scm_ungetc (chr, port);
636 overflow = read_complete_token (port, buffer, READER_BUFFER_SIZE,
637 &overflow_buffer, &bytes_read);
638 if (bytes_read > 0)
639 {
640 if (!overflow)
641 ends_with_colon = buffer[bytes_read - 1] == ':';
642 else
643 ends_with_colon = overflow_buffer[bytes_read - 1] == ':';
644 }
645
646 if (postfix && ends_with_colon && (bytes_read > 1))
647 {
648 if (!overflow)
649 str = scm_from_stringn (buffer, bytes_read - 1, pt->encoding, pt->ilseq_handler);
650 else
651 str = scm_from_stringn (overflow_buffer, bytes_read - 1, pt->encoding,
652 pt->ilseq_handler);
653
654 if (SCM_CASE_INSENSITIVE_P)
655 str = scm_string_downcase_x (str);
656 result = scm_symbol_to_keyword (scm_string_to_symbol (str));
657 }
658 else
659 {
660 if (!overflow)
661 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
662 else
663 str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
664 pt->ilseq_handler);
665
666 if (SCM_CASE_INSENSITIVE_P)
667 str = scm_string_downcase_x (str);
668 result = scm_string_to_symbol (str);
669 }
670
671 if (overflow)
672 free (overflow_buffer);
673 SCM_COL (port) += scm_i_string_length (str);
674 return result;
675 }
676
677 static SCM
678 scm_read_number_and_radix (scm_t_wchar chr, SCM port)
679 #define FUNC_NAME "scm_lreadr"
680 {
681 SCM result;
682 size_t read;
683 char buffer[READER_BUFFER_SIZE], *overflow_buffer;
684 int overflow;
685 unsigned int radix;
686 SCM str;
687 scm_t_port *pt;
688
689 switch (chr)
690 {
691 case 'B':
692 case 'b':
693 radix = 2;
694 break;
695
696 case 'o':
697 case 'O':
698 radix = 8;
699 break;
700
701 case 'd':
702 case 'D':
703 radix = 10;
704 break;
705
706 case 'x':
707 case 'X':
708 radix = 16;
709 break;
710
711 default:
712 scm_ungetc (chr, port);
713 scm_ungetc ('#', port);
714 radix = 10;
715 }
716
717 overflow = read_complete_token (port, buffer, sizeof (buffer),
718 &overflow_buffer, &read);
719
720 pt = SCM_PTAB_ENTRY (port);
721 if (!overflow)
722 str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
723 else
724 str = scm_from_stringn (overflow_buffer, read, pt->encoding,
725 pt->ilseq_handler);
726
727 result = scm_string_to_number (str, scm_from_uint (radix));
728
729 if (overflow)
730 free (overflow_buffer);
731
732 SCM_COL (port) += scm_i_string_length (str);
733
734 if (scm_is_true (result))
735 return result;
736
737 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
738
739 return SCM_BOOL_F;
740 }
741 #undef FUNC_NAME
742
743 static SCM
744 scm_read_quote (int chr, SCM port)
745 {
746 SCM p;
747 long line = SCM_LINUM (port);
748 int column = SCM_COL (port) - 1;
749
750 switch (chr)
751 {
752 case '`':
753 p = scm_sym_quasiquote;
754 break;
755
756 case '\'':
757 p = scm_sym_quote;
758 break;
759
760 case ',':
761 {
762 scm_t_wchar c;
763
764 c = scm_getc (port);
765 if ('@' == c)
766 p = scm_sym_uq_splicing;
767 else
768 {
769 scm_ungetc (c, port);
770 p = scm_sym_unquote;
771 }
772 break;
773 }
774
775 default:
776 fprintf (stderr, "%s: unhandled quote character (%i)\n",
777 "scm_read_quote", chr);
778 abort ();
779 }
780
781 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
782 return maybe_annotate_source (p, port, line, column);
783 }
784
785 SCM_SYMBOL (sym_syntax, "syntax");
786 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
787 SCM_SYMBOL (sym_unsyntax, "unsyntax");
788 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
789
790 static SCM
791 scm_read_syntax (int chr, SCM port)
792 {
793 SCM p;
794 long line = SCM_LINUM (port);
795 int column = SCM_COL (port) - 1;
796
797 switch (chr)
798 {
799 case '`':
800 p = sym_quasisyntax;
801 break;
802
803 case '\'':
804 p = sym_syntax;
805 break;
806
807 case ',':
808 {
809 int c;
810
811 c = scm_getc (port);
812 if ('@' == c)
813 p = sym_unsyntax_splicing;
814 else
815 {
816 scm_ungetc (c, port);
817 p = sym_unsyntax;
818 }
819 break;
820 }
821
822 default:
823 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
824 "scm_read_syntax", chr);
825 abort ();
826 }
827
828 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
829 return maybe_annotate_source (p, port, line, column);
830 }
831
832 static SCM
833 scm_read_nil (int chr, SCM port)
834 {
835 SCM id = scm_read_mixed_case_symbol (chr, port);
836
837 if (!scm_is_eq (id, sym_nil))
838 scm_i_input_error ("scm_read_nil", port,
839 "unexpected input while reading #nil: ~a",
840 scm_list_1 (id));
841
842 return SCM_ELISP_NIL;
843 }
844
845 static SCM
846 scm_read_semicolon_comment (int chr, SCM port)
847 {
848 int c;
849
850 /* We use the get_byte here because there is no need to get the
851 locale correct with comment input. This presumes that newline
852 always represents itself no matter what the encoding is. */
853 for (c = scm_get_byte_or_eof (port);
854 (c != EOF) && (c != '\n');
855 c = scm_get_byte_or_eof (port));
856
857 return SCM_UNSPECIFIED;
858 }
859
860 \f
861 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
862
863 static SCM
864 scm_read_boolean (int chr, SCM port)
865 {
866 switch (chr)
867 {
868 case 't':
869 case 'T':
870 return SCM_BOOL_T;
871
872 case 'f':
873 case 'F':
874 return SCM_BOOL_F;
875 }
876
877 return SCM_UNSPECIFIED;
878 }
879
880 static SCM
881 scm_read_character (scm_t_wchar chr, SCM port)
882 #define FUNC_NAME "scm_lreadr"
883 {
884 char buffer[READER_CHAR_NAME_MAX_SIZE];
885 SCM charname;
886 size_t charname_len, bytes_read;
887 scm_t_wchar cp;
888 int overflow;
889 scm_t_port *pt;
890
891 overflow = read_token (port, buffer, READER_CHAR_NAME_MAX_SIZE, &bytes_read);
892 if (overflow)
893 scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
894
895 if (bytes_read == 0)
896 {
897 chr = scm_getc (port);
898 if (chr == EOF)
899 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
900 "while reading character", SCM_EOL);
901
902 /* CHR must be a token delimiter, like a whitespace. */
903 return (SCM_MAKE_CHAR (chr));
904 }
905
906 pt = SCM_PTAB_ENTRY (port);
907
908 /* Simple ASCII characters can be processed immediately. Also, simple
909 ISO-8859-1 characters can be processed immediately if the encoding for this
910 port is ISO-8859-1. */
911 if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
912 {
913 SCM_COL (port) += 1;
914 return SCM_MAKE_CHAR (buffer[0]);
915 }
916
917 /* Otherwise, convert the buffer into a proper scheme string for
918 processing. */
919 charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
920 pt->ilseq_handler);
921 charname_len = scm_i_string_length (charname);
922 SCM_COL (port) += charname_len;
923 cp = scm_i_string_ref (charname, 0);
924 if (charname_len == 1)
925 return SCM_MAKE_CHAR (cp);
926
927 /* Ignore dotted circles, which may be used to keep combining characters from
928 combining with the backslash in #\charname. */
929 if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
930 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
931
932 if (cp >= '0' && cp < '8')
933 {
934 /* Dirk:FIXME:: This type of character syntax is not R5RS
935 * compliant. Further, it should be verified that the constant
936 * does only consist of octal digits. */
937 SCM p = scm_string_to_number (charname, scm_from_uint (8));
938 if (SCM_I_INUMP (p))
939 {
940 scm_t_wchar c = scm_to_uint32 (p);
941 if (SCM_IS_UNICODE_CHAR (c))
942 return SCM_MAKE_CHAR (c);
943 else
944 scm_i_input_error (FUNC_NAME, port,
945 "out-of-range octal character escape: ~a",
946 scm_list_1 (charname));
947 }
948 }
949
950 if (cp == 'x' && (charname_len > 1))
951 {
952 SCM p;
953
954 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
955 p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
956 scm_from_uint (16));
957 if (SCM_I_INUMP (p))
958 {
959 scm_t_wchar c = scm_to_uint32 (p);
960 if (SCM_IS_UNICODE_CHAR (c))
961 return SCM_MAKE_CHAR (c);
962 else
963 scm_i_input_error (FUNC_NAME, port,
964 "out-of-range hex character escape: ~a",
965 scm_list_1 (charname));
966 }
967 }
968
969 /* The names of characters should never have non-Latin1
970 characters. */
971 if (scm_i_is_narrow_string (charname)
972 || scm_i_try_narrow_string (charname))
973 { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
974 charname_len);
975 if (scm_is_true (ch))
976 return ch;
977 }
978
979 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
980 scm_list_1 (charname));
981
982 return SCM_UNSPECIFIED;
983 }
984 #undef FUNC_NAME
985
986 static SCM
987 scm_read_keyword (int chr, SCM port)
988 {
989 SCM symbol;
990
991 /* Read the symbol that comprises the keyword. Doing this instead of
992 invoking a specific symbol reader function allows `scm_read_keyword ()'
993 to adapt to the delimiters currently valid of symbols.
994
995 XXX: This implementation allows sloppy syntaxes like `#: key'. */
996 symbol = scm_read_expression (port);
997 if (!scm_is_symbol (symbol))
998 scm_i_input_error ("scm_read_keyword", port,
999 "keyword prefix `~a' not followed by a symbol: ~s",
1000 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1001
1002 return (scm_symbol_to_keyword (symbol));
1003 }
1004
1005 static SCM
1006 scm_read_vector (int chr, SCM port)
1007 {
1008 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1009 guarantee that it's going to do what we want. After all, this is an
1010 implementation detail of `scm_read_vector ()', not a desirable
1011 property. */
1012 return (scm_vector (scm_read_sexp (chr, port)));
1013 }
1014
1015 static SCM
1016 scm_read_srfi4_vector (int chr, SCM port)
1017 {
1018 return scm_i_read_array (port, chr);
1019 }
1020
1021 static SCM
1022 scm_read_bytevector (scm_t_wchar chr, SCM port)
1023 {
1024 chr = scm_getc (port);
1025 if (chr != 'u')
1026 goto syntax;
1027
1028 chr = scm_getc (port);
1029 if (chr != '8')
1030 goto syntax;
1031
1032 chr = scm_getc (port);
1033 if (chr != '(')
1034 goto syntax;
1035
1036 return scm_u8_list_to_bytevector (scm_read_sexp (chr, port));
1037
1038 syntax:
1039 scm_i_input_error ("read_bytevector", port,
1040 "invalid bytevector prefix",
1041 SCM_MAKE_CHAR (chr));
1042 return SCM_UNSPECIFIED;
1043 }
1044
1045 static SCM
1046 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port)
1047 {
1048 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1049 terribly inefficient but who cares? */
1050 SCM s_bits = SCM_EOL;
1051
1052 for (chr = scm_getc (port);
1053 (chr != EOF) && ((chr == '0') || (chr == '1'));
1054 chr = scm_getc (port))
1055 {
1056 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1057 }
1058
1059 if (chr != EOF)
1060 scm_ungetc (chr, port);
1061
1062 return scm_bitvector (scm_reverse_x (s_bits, SCM_EOL));
1063 }
1064
1065 static SCM
1066 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1067 {
1068 int bang_seen = 0;
1069
1070 for (;;)
1071 {
1072 int c = scm_getc (port);
1073
1074 if (c == EOF)
1075 scm_i_input_error ("skip_block_comment", port,
1076 "unterminated `#! ... !#' comment", SCM_EOL);
1077
1078 if (c == '!')
1079 bang_seen = 1;
1080 else if (c == '#' && bang_seen)
1081 break;
1082 else
1083 bang_seen = 0;
1084 }
1085
1086 return SCM_UNSPECIFIED;
1087 }
1088
1089 static SCM
1090 scm_read_shebang (scm_t_wchar chr, SCM port)
1091 {
1092 int c = 0;
1093 if ((c = scm_get_byte_or_eof (port)) != 'r')
1094 {
1095 scm_ungetc (c, port);
1096 return scm_read_scsh_block_comment (chr, port);
1097 }
1098 if ((c = scm_get_byte_or_eof (port)) != '6')
1099 {
1100 scm_ungetc (c, port);
1101 scm_ungetc ('r', port);
1102 return scm_read_scsh_block_comment (chr, port);
1103 }
1104 if ((c = scm_get_byte_or_eof (port)) != 'r')
1105 {
1106 scm_ungetc (c, port);
1107 scm_ungetc ('6', port);
1108 scm_ungetc ('r', port);
1109 return scm_read_scsh_block_comment (chr, port);
1110 }
1111 if ((c = scm_get_byte_or_eof (port)) != 's')
1112 {
1113 scm_ungetc (c, port);
1114 scm_ungetc ('r', port);
1115 scm_ungetc ('6', port);
1116 scm_ungetc ('r', port);
1117 return scm_read_scsh_block_comment (chr, port);
1118 }
1119
1120 return SCM_UNSPECIFIED;
1121 }
1122
1123 static SCM
1124 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1125 {
1126 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1127 nested. So care must be taken. */
1128 int nesting_level = 1;
1129
1130 int a = scm_getc (port);
1131
1132 if (a == EOF)
1133 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1134 "unterminated `#| ... |#' comment", SCM_EOL);
1135
1136 while (nesting_level > 0)
1137 {
1138 int b = scm_getc (port);
1139
1140 if (b == EOF)
1141 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1142 "unterminated `#| ... |#' comment", SCM_EOL);
1143
1144 if (a == '|' && b == '#')
1145 {
1146 nesting_level--;
1147 b = EOF;
1148 }
1149 else if (a == '#' && b == '|')
1150 {
1151 nesting_level++;
1152 b = EOF;
1153 }
1154
1155 a = b;
1156 }
1157
1158 return SCM_UNSPECIFIED;
1159 }
1160
1161 static SCM
1162 scm_read_commented_expression (scm_t_wchar chr, SCM port)
1163 {
1164 scm_t_wchar c;
1165
1166 c = flush_ws (port, (char *) NULL);
1167 if (EOF == c)
1168 scm_i_input_error ("read_commented_expression", port,
1169 "no expression after #; comment", SCM_EOL);
1170 scm_ungetc (c, port);
1171 scm_read_expression (port);
1172 return SCM_UNSPECIFIED;
1173 }
1174
1175 static SCM
1176 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1177 {
1178 /* Guile's extended symbol read syntax looks like this:
1179
1180 #{This is all a symbol name}#
1181
1182 So here, CHR is expected to be `{'. */
1183 int saw_brace = 0;
1184 size_t len = 0;
1185 SCM buf = scm_i_make_string (1024, NULL, 0);
1186
1187 buf = scm_i_string_start_writing (buf);
1188
1189 while ((chr = scm_getc (port)) != EOF)
1190 {
1191 if (saw_brace)
1192 {
1193 if (chr == '#')
1194 {
1195 break;
1196 }
1197 else
1198 {
1199 saw_brace = 0;
1200 scm_i_string_set_x (buf, len++, '}');
1201 }
1202 }
1203
1204 if (chr == '}')
1205 saw_brace = 1;
1206 else if (chr == '\\')
1207 {
1208 /* It used to be that print.c would print extended-read-syntax
1209 symbols with backslashes before "non-standard" chars, but
1210 this routine wouldn't do anything with those escapes.
1211 Bummer. What we've done is to change print.c to output
1212 R6RS hex escapes for those characters, relying on the fact
1213 that the extended read syntax would never put a `\' before
1214 an `x'. For now, we just ignore other instances of
1215 backslash in the string. */
1216 switch ((chr = scm_getc (port)))
1217 {
1218 case EOF:
1219 goto done;
1220 case 'x':
1221 {
1222 scm_t_wchar c;
1223
1224 SCM_READ_HEX_ESCAPE (10, ';');
1225 scm_i_string_set_x (buf, len++, c);
1226 break;
1227
1228 str_eof:
1229 chr = EOF;
1230 goto done;
1231
1232 bad_escaped:
1233 scm_i_string_stop_writing ();
1234 scm_i_input_error ("scm_read_extended_symbol", port,
1235 "illegal character in escape sequence: ~S",
1236 scm_list_1 (SCM_MAKE_CHAR (c)));
1237 break;
1238 }
1239 default:
1240 scm_i_string_set_x (buf, len++, chr);
1241 break;
1242 }
1243 }
1244 else
1245 scm_i_string_set_x (buf, len++, chr);
1246
1247 if (len >= scm_i_string_length (buf) - 2)
1248 {
1249 SCM addy;
1250
1251 scm_i_string_stop_writing ();
1252 addy = scm_i_make_string (1024, NULL, 0);
1253 buf = scm_string_append (scm_list_2 (buf, addy));
1254 len = 0;
1255 buf = scm_i_string_start_writing (buf);
1256 }
1257 }
1258
1259 done:
1260 scm_i_string_stop_writing ();
1261 if (chr == EOF)
1262 scm_i_input_error ("scm_read_extended_symbol", port,
1263 "end of file while reading symbol", SCM_EOL);
1264
1265 return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1266 }
1267
1268
1269 \f
1270 /* Top-level token readers, i.e., dispatchers. */
1271
1272 static SCM
1273 scm_read_sharp_extension (int chr, SCM port)
1274 {
1275 SCM proc;
1276
1277 proc = scm_get_hash_procedure (chr);
1278 if (scm_is_true (scm_procedure_p (proc)))
1279 {
1280 long line = SCM_LINUM (port);
1281 int column = SCM_COL (port) - 2;
1282 SCM got;
1283
1284 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1285
1286 if (scm_is_pair (got) && !scm_i_has_source_properties (got))
1287 scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1288
1289 return got;
1290 }
1291
1292 return SCM_UNSPECIFIED;
1293 }
1294
1295 /* The reader for the sharp `#' character. It basically dispatches reads
1296 among the above token readers. */
1297 static SCM
1298 scm_read_sharp (scm_t_wchar chr, SCM port)
1299 #define FUNC_NAME "scm_lreadr"
1300 {
1301 SCM result;
1302
1303 chr = scm_getc (port);
1304
1305 result = scm_read_sharp_extension (chr, port);
1306 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1307 return result;
1308
1309 switch (chr)
1310 {
1311 case '\\':
1312 return (scm_read_character (chr, port));
1313 case '(':
1314 return (scm_read_vector (chr, port));
1315 case 's':
1316 case 'u':
1317 case 'f':
1318 case 'c':
1319 /* This one may return either a boolean or an SRFI-4 vector. */
1320 return (scm_read_srfi4_vector (chr, port));
1321 case 'v':
1322 return (scm_read_bytevector (chr, port));
1323 case '*':
1324 return (scm_read_guile_bit_vector (chr, port));
1325 case 't':
1326 case 'T':
1327 case 'F':
1328 return (scm_read_boolean (chr, port));
1329 case ':':
1330 return (scm_read_keyword (chr, port));
1331 case '0': case '1': case '2': case '3': case '4':
1332 case '5': case '6': case '7': case '8': case '9':
1333 case '@':
1334 #if SCM_ENABLE_DEPRECATED
1335 /* See below for 'i' and 'e'. */
1336 case 'a':
1337 case 'y':
1338 case 'h':
1339 case 'l':
1340 #endif
1341 return (scm_i_read_array (port, chr));
1342
1343 case 'i':
1344 case 'e':
1345 #if SCM_ENABLE_DEPRECATED
1346 {
1347 /* When next char is '(', it really is an old-style
1348 uniform array. */
1349 scm_t_wchar next_c = scm_getc (port);
1350 if (next_c != EOF)
1351 scm_ungetc (next_c, port);
1352 if (next_c == '(')
1353 return scm_i_read_array (port, chr);
1354 /* Fall through. */
1355 }
1356 #endif
1357 case 'b':
1358 case 'B':
1359 case 'o':
1360 case 'O':
1361 case 'd':
1362 case 'D':
1363 case 'x':
1364 case 'X':
1365 case 'I':
1366 case 'E':
1367 return (scm_read_number_and_radix (chr, port));
1368 case '{':
1369 return (scm_read_extended_symbol (chr, port));
1370 case '!':
1371 return (scm_read_shebang (chr, port));
1372 case ';':
1373 return (scm_read_commented_expression (chr, port));
1374 case '`':
1375 case '\'':
1376 case ',':
1377 return (scm_read_syntax (chr, port));
1378 case 'n':
1379 return (scm_read_nil (chr, port));
1380 default:
1381 result = scm_read_sharp_extension (chr, port);
1382 if (scm_is_eq (result, SCM_UNSPECIFIED))
1383 {
1384 /* To remain compatible with 1.8 and earlier, the following
1385 characters have lower precedence than `read-hash-extend'
1386 characters. */
1387 switch (chr)
1388 {
1389 case '|':
1390 return scm_read_r6rs_block_comment (chr, port);
1391 default:
1392 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1393 scm_list_1 (SCM_MAKE_CHAR (chr)));
1394 }
1395 }
1396 else
1397 return result;
1398 }
1399
1400 return SCM_UNSPECIFIED;
1401 }
1402 #undef FUNC_NAME
1403
1404 static SCM
1405 scm_read_expression (SCM port)
1406 #define FUNC_NAME "scm_read_expression"
1407 {
1408 while (1)
1409 {
1410 scm_t_wchar chr;
1411
1412 chr = scm_getc (port);
1413
1414 switch (chr)
1415 {
1416 case SCM_WHITE_SPACES:
1417 case SCM_LINE_INCREMENTORS:
1418 break;
1419 case ';':
1420 (void) scm_read_semicolon_comment (chr, port);
1421 break;
1422 case '[':
1423 if (!SCM_SQUARE_BRACKETS_P)
1424 return (scm_read_mixed_case_symbol (chr, port));
1425 /* otherwise fall through */
1426 case '(':
1427 return (scm_read_sexp (chr, port));
1428 case '"':
1429 return (scm_read_string (chr, port));
1430 case '\'':
1431 case '`':
1432 case ',':
1433 return (scm_read_quote (chr, port));
1434 case '#':
1435 {
1436 SCM result;
1437 result = scm_read_sharp (chr, port);
1438 if (scm_is_eq (result, SCM_UNSPECIFIED))
1439 /* We read a comment or some such. */
1440 break;
1441 else
1442 return result;
1443 }
1444 case ')':
1445 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1446 break;
1447 case ']':
1448 if (SCM_SQUARE_BRACKETS_P)
1449 scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1450 /* otherwise fall through */
1451 case EOF:
1452 return SCM_EOF_VAL;
1453 case ':':
1454 if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1455 return scm_symbol_to_keyword (scm_read_expression (port));
1456 /* Fall through. */
1457
1458 default:
1459 {
1460 if (((chr >= '0') && (chr <= '9'))
1461 || (strchr ("+-.", chr)))
1462 return (scm_read_number (chr, port));
1463 else
1464 return (scm_read_mixed_case_symbol (chr, port));
1465 }
1466 }
1467 }
1468 }
1469 #undef FUNC_NAME
1470
1471 \f
1472 /* Actual reader. */
1473
1474 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1475 (SCM port),
1476 "Read an s-expression from the input port @var{port}, or from\n"
1477 "the current input port if @var{port} is not specified.\n"
1478 "Any whitespace before the next token is discarded.")
1479 #define FUNC_NAME s_scm_read
1480 {
1481 int c;
1482
1483 if (SCM_UNBNDP (port))
1484 port = scm_current_input_port ();
1485 SCM_VALIDATE_OPINPORT (1, port);
1486
1487 c = flush_ws (port, (char *) NULL);
1488 if (EOF == c)
1489 return SCM_EOF_VAL;
1490 scm_ungetc (c, port);
1491
1492 return (scm_read_expression (port));
1493 }
1494 #undef FUNC_NAME
1495
1496
1497 \f
1498
1499 /* Manipulate the read-hash-procedures alist. This could be written in
1500 Scheme, but maybe it will also be used by C code during initialisation. */
1501 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1502 (SCM chr, SCM proc),
1503 "Install the procedure @var{proc} for reading expressions\n"
1504 "starting with the character sequence @code{#} and @var{chr}.\n"
1505 "@var{proc} will be called with two arguments: the character\n"
1506 "@var{chr} and the port to read further data from. The object\n"
1507 "returned will be the return value of @code{read}. \n"
1508 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1509 )
1510 #define FUNC_NAME s_scm_read_hash_extend
1511 {
1512 SCM this;
1513 SCM prev;
1514
1515 SCM_VALIDATE_CHAR (1, chr);
1516 SCM_ASSERT (scm_is_false (proc)
1517 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1518 proc, SCM_ARG2, FUNC_NAME);
1519
1520 /* Check if chr is already in the alist. */
1521 this = scm_i_read_hash_procedures_ref ();
1522 prev = SCM_BOOL_F;
1523 while (1)
1524 {
1525 if (scm_is_null (this))
1526 {
1527 /* not found, so add it to the beginning. */
1528 if (scm_is_true (proc))
1529 {
1530 SCM new = scm_cons (scm_cons (chr, proc),
1531 scm_i_read_hash_procedures_ref ());
1532 scm_i_read_hash_procedures_set_x (new);
1533 }
1534 break;
1535 }
1536 if (scm_is_eq (chr, SCM_CAAR (this)))
1537 {
1538 /* already in the alist. */
1539 if (scm_is_false (proc))
1540 {
1541 /* remove it. */
1542 if (scm_is_false (prev))
1543 {
1544 SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1545 scm_i_read_hash_procedures_set_x (rest);
1546 }
1547 else
1548 scm_set_cdr_x (prev, SCM_CDR (this));
1549 }
1550 else
1551 {
1552 /* replace it. */
1553 scm_set_cdr_x (SCM_CAR (this), proc);
1554 }
1555 break;
1556 }
1557 prev = this;
1558 this = SCM_CDR (this);
1559 }
1560
1561 return SCM_UNSPECIFIED;
1562 }
1563 #undef FUNC_NAME
1564
1565 /* Recover the read-hash procedure corresponding to char c. */
1566 static SCM
1567 scm_get_hash_procedure (int c)
1568 {
1569 SCM rest = scm_i_read_hash_procedures_ref ();
1570
1571 while (1)
1572 {
1573 if (scm_is_null (rest))
1574 return SCM_BOOL_F;
1575
1576 if (SCM_CHAR (SCM_CAAR (rest)) == c)
1577 return SCM_CDAR (rest);
1578
1579 rest = SCM_CDR (rest);
1580 }
1581 }
1582
1583 #define SCM_ENCODING_SEARCH_SIZE (500)
1584
1585 /* Search the first few hundred characters of a file for an Emacs-like coding
1586 declaration. Returns either NULL or a string whose storage has been
1587 allocated with `scm_gc_malloc ()'. */
1588 char *
1589 scm_i_scan_for_encoding (SCM port)
1590 {
1591 scm_t_port *pt;
1592 char header[SCM_ENCODING_SEARCH_SIZE+1];
1593 size_t bytes_read, encoding_length, i;
1594 char *encoding = NULL;
1595 int utf8_bom = 0;
1596 char *pos, *encoding_start;
1597 int in_comment;
1598
1599 pt = SCM_PTAB_ENTRY (port);
1600
1601 if (pt->rw_active == SCM_PORT_WRITE)
1602 scm_flush (port);
1603
1604 if (pt->rw_random)
1605 pt->rw_active = SCM_PORT_READ;
1606
1607 if (pt->read_pos == pt->read_end)
1608 {
1609 /* We can use the read buffer, and thus avoid a seek. */
1610 if (scm_fill_input (port) == EOF)
1611 return NULL;
1612
1613 bytes_read = pt->read_end - pt->read_pos;
1614 if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
1615 bytes_read = SCM_ENCODING_SEARCH_SIZE;
1616
1617 if (bytes_read <= 1)
1618 /* An unbuffered port -- don't scan. */
1619 return NULL;
1620
1621 memcpy (header, pt->read_pos, bytes_read);
1622 header[bytes_read] = '\0';
1623 }
1624 else
1625 {
1626 /* Try to read some bytes and then seek back. Not all ports
1627 support seeking back; and indeed some file ports (like
1628 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
1629 check performed by SCM_FPORT_FDES---but fail to seek
1630 backwards. Hence this block comes second. We prefer to use
1631 the read buffer in-place. */
1632 if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1633 return NULL;
1634
1635 bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1636 header[bytes_read] = '\0';
1637 scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1638 }
1639
1640 if (bytes_read > 3
1641 && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1642 utf8_bom = 1;
1643
1644 /* search past "coding[:=]" */
1645 pos = header;
1646 while (1)
1647 {
1648 if ((pos = strstr(pos, "coding")) == NULL)
1649 return NULL;
1650
1651 pos += strlen("coding");
1652 if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1653 (*pos == ':' || *pos == '='))
1654 {
1655 pos ++;
1656 break;
1657 }
1658 }
1659
1660 /* skip spaces */
1661 while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1662 (*pos == ' ' || *pos == '\t'))
1663 pos ++;
1664
1665 /* grab the next token */
1666 encoding_start = pos;
1667 i = 0;
1668 while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
1669 && encoding_start + i - header < bytes_read
1670 && (isalnum ((int) encoding_start[i])
1671 || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
1672 i++;
1673
1674 encoding_length = i;
1675 if (encoding_length == 0)
1676 return NULL;
1677
1678 encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
1679 for (i = 0; i < encoding_length; i++)
1680 encoding[i] = toupper ((int) encoding[i]);
1681
1682 /* push backwards to make sure we were in a comment */
1683 in_comment = 0;
1684 pos = encoding_start;
1685 while (pos >= header)
1686 {
1687 if (*pos == ';')
1688 {
1689 in_comment = 1;
1690 break;
1691 }
1692 else if (*pos == '\n' || pos == header)
1693 {
1694 /* This wasn't in a semicolon comment. Check for a
1695 hash-bang comment. */
1696 char *beg = strstr (header, "#!");
1697 char *end = strstr (header, "!#");
1698 if (beg < encoding_start && encoding_start + encoding_length <= end)
1699 in_comment = 1;
1700 break;
1701 }
1702 else
1703 {
1704 pos --;
1705 continue;
1706 }
1707 }
1708 if (!in_comment)
1709 /* This wasn't in a comment */
1710 return NULL;
1711
1712 if (utf8_bom && strcmp(encoding, "UTF-8"))
1713 scm_misc_error (NULL,
1714 "the port input declares the encoding ~s but is encoded as UTF-8",
1715 scm_list_1 (scm_from_locale_string (encoding)));
1716
1717 return encoding;
1718 }
1719
1720 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1721 (SCM port),
1722 "Scans the port for an Emacs-like character coding declaration\n"
1723 "near the top of the contents of a port with random-accessible contents.\n"
1724 "The coding declaration is of the form\n"
1725 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1726 "\n"
1727 "Returns a string containing the character encoding of the file\n"
1728 "if a declaration was found, or @code{#f} otherwise.\n")
1729 #define FUNC_NAME s_scm_file_encoding
1730 {
1731 char *enc;
1732 SCM s_enc;
1733
1734 SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
1735
1736 enc = scm_i_scan_for_encoding (port);
1737 if (enc == NULL)
1738 return SCM_BOOL_F;
1739 else
1740 {
1741 s_enc = scm_from_locale_string (enc);
1742 return s_enc;
1743 }
1744
1745 return SCM_BOOL_F;
1746 }
1747 #undef FUNC_NAME
1748
1749 void
1750 scm_init_read ()
1751 {
1752 SCM read_hash_procs;
1753
1754 read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
1755
1756 scm_i_read_hash_procedures =
1757 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
1758
1759 scm_init_opts (scm_read_options, scm_read_opts);
1760 #include "libguile/read.x"
1761 }
1762
1763 /*
1764 Local Variables:
1765 c-file-style: "gnu"
1766 End:
1767 */