1 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
2 * 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
33 #include "libguile/_scm.h"
34 #include "libguile/bytevectors.h"
35 #include "libguile/chars.h"
36 #include "libguile/eval.h"
37 #include "libguile/arrays.h"
38 #include "libguile/bitvectors.h"
39 #include "libguile/keywords.h"
40 #include "libguile/alist.h"
41 #include "libguile/srcprop.h"
42 #include "libguile/hashtab.h"
43 #include "libguile/hash.h"
44 #include "libguile/ports.h"
45 #include "libguile/fports.h"
46 #include "libguile/root.h"
47 #include "libguile/strings.h"
48 #include "libguile/strports.h"
49 #include "libguile/vectors.h"
50 #include "libguile/validate.h"
51 #include "libguile/srfi-4.h"
52 #include "libguile/srfi-13.h"
54 #include "libguile/read.h"
55 #include "libguile/private-options.h"
60 SCM_GLOBAL_SYMBOL (scm_sym_dot
, ".");
61 SCM_SYMBOL (scm_keyword_prefix
, "prefix");
62 SCM_SYMBOL (scm_keyword_postfix
, "postfix");
63 SCM_SYMBOL (sym_nil
, "nil");
65 /* SRFI-105 curly infix expression support */
66 SCM_SYMBOL (sym_nfx
, "$nfx$");
67 SCM_SYMBOL (sym_bracket_list
, "$bracket-list$");
68 SCM_SYMBOL (sym_bracket_apply
, "$bracket-apply$");
70 scm_t_option scm_read_opts
[] =
72 { SCM_OPTION_BOOLEAN
, "copy", 0,
73 "Copy source code expressions." },
74 { SCM_OPTION_BOOLEAN
, "positions", 1,
75 "Record positions of source code expressions." },
76 { SCM_OPTION_BOOLEAN
, "case-insensitive", 0,
77 "Convert symbols to lower case."},
78 { SCM_OPTION_SCM
, "keywords", (scm_t_bits
) SCM_BOOL_F_BITS
,
79 "Style of keyword recognition: #f, 'prefix or 'postfix."},
80 { SCM_OPTION_BOOLEAN
, "r6rs-hex-escapes", 0,
81 "Use R6RS variable-length character and string hex escapes."},
82 { SCM_OPTION_BOOLEAN
, "square-brackets", 1,
83 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
84 { SCM_OPTION_BOOLEAN
, "hungry-eol-escapes", 0,
85 "In strings, consume leading whitespace after an escaped end-of-line."},
86 { SCM_OPTION_BOOLEAN
, "curly-infix", 0,
87 "Support SRFI-105 curly infix expressions."},
91 /* Internal read options structure. This is initialized by 'scm_read'
92 from the global and per-port read options, and a pointer is passed
93 down to all helper functions. */
97 KEYWORD_STYLE_HASH_PREFIX
,
104 enum t_keyword_style keyword_style
;
105 unsigned int copy_source_p
: 1;
106 unsigned int record_positions_p
: 1;
107 unsigned int case_insensitive_p
: 1;
108 unsigned int r6rs_escapes_p
: 1;
109 unsigned int square_brackets_p
: 1;
110 unsigned int hungry_eol_escapes_p
: 1;
111 unsigned int curly_infix_p
: 1;
112 unsigned int neoteric_p
: 1;
115 typedef struct t_read_opts scm_t_read_opts
;
119 Give meaningful error messages for errors
123 FILE:LINE:COL: MESSAGE
124 This happened in ....
126 This is not standard GNU format, but the test-suite likes the real
127 message to be in front.
133 scm_i_input_error (char const *function
,
134 SCM port
, const char *message
, SCM arg
)
136 SCM fn
= (scm_is_string (SCM_FILENAME(port
))
138 : scm_from_locale_string ("#<unknown port>"));
140 SCM string_port
= scm_open_output_string ();
141 SCM string
= SCM_EOL
;
142 scm_simple_format (string_port
,
143 scm_from_locale_string ("~A:~S:~S: ~A"),
145 scm_from_long (SCM_LINUM (port
) + 1),
146 scm_from_int (SCM_COL (port
) + 1),
147 scm_from_locale_string (message
)));
149 string
= scm_get_output_string (string_port
);
150 scm_close_output_port (string_port
);
151 scm_error_scm (scm_from_latin1_symbol ("read-error"),
152 function
? scm_from_locale_string (function
) : SCM_BOOL_F
,
159 SCM_DEFINE (scm_read_options
, "read-options-interface", 0, 1, 0,
161 "Option interface for the read options. Instead of using\n"
162 "this procedure directly, use the procedures @code{read-enable},\n"
163 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
164 #define FUNC_NAME s_scm_read_options
166 SCM ans
= scm_options (setting
,
169 if (SCM_COPY_SOURCE_P
)
170 SCM_RECORD_POSITIONS_P
= 1;
175 /* A fluid referring to an association list mapping extra hash
176 characters to procedures. */
177 static SCM
*scm_i_read_hash_procedures
;
180 scm_i_read_hash_procedures_ref (void)
182 return scm_fluid_ref (*scm_i_read_hash_procedures
);
186 scm_i_read_hash_procedures_set_x (SCM value
)
188 scm_fluid_set_x (*scm_i_read_hash_procedures
, value
);
195 /* Size of the C buffer used to read symbols and numbers. */
196 #define READER_BUFFER_SIZE 128
198 /* Number of 32-bit codepoints in the buffer used to read strings. */
199 #define READER_STRING_BUFFER_SIZE 128
201 /* The maximum size of Scheme character names. */
202 #define READER_CHAR_NAME_MAX_SIZE 50
204 /* The maximum size of reader directive names. */
205 #define READER_DIRECTIVE_NAME_MAX_SIZE 50
208 /* `isblank' is only in C99. */
209 #define CHAR_IS_BLANK_(_chr) \
210 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
211 || ((_chr) == '\f') || ((_chr) == '\r'))
214 # define CHAR_IS_BLANK(_chr) \
215 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
217 # define CHAR_IS_BLANK CHAR_IS_BLANK_
221 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
223 #define CHAR_IS_R5RS_DELIMITER(c) \
225 || (c) == ')' || (c) == '(' || (c) == ';' || (c) == '"')
227 #define CHAR_IS_DELIMITER(c) \
228 (CHAR_IS_R5RS_DELIMITER (c) \
229 || (((c) == ']' || (c) == '[') && (opts->square_brackets_p \
230 || opts->curly_infix_p)) \
231 || (((c) == '}' || (c) == '{') && opts->curly_infix_p))
233 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
235 #define CHAR_IS_EXPONENT_MARKER(_chr) \
236 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
237 || ((_chr) == 'd') || ((_chr) == 'l'))
239 /* Read an SCSH block comment. */
240 static SCM
scm_read_scsh_block_comment (scm_t_wchar
, SCM
);
241 static SCM
scm_read_r6rs_block_comment (scm_t_wchar
, SCM
);
242 static SCM
scm_read_commented_expression (scm_t_wchar
, SCM
, scm_t_read_opts
*);
243 static SCM
scm_read_shebang (scm_t_wchar
, SCM
, scm_t_read_opts
*);
244 static SCM
scm_get_hash_procedure (int);
246 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
247 result in the pre-allocated buffer BUF. Return zero if the whole token has
248 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
249 bytes actually read. */
251 read_token (SCM port
, scm_t_read_opts
*opts
,
252 char *buf
, size_t buf_size
, size_t *read
)
256 while (*read
< buf_size
)
260 chr
= scm_get_byte_or_eof_unlocked (port
);
264 else if (CHAR_IS_DELIMITER (chr
))
266 scm_unget_byte_unlocked (chr
, port
);
279 /* Like `read_token', but return either BUFFER, or a GC-allocated buffer
280 if the token doesn't fit in BUFFER_SIZE bytes. */
282 read_complete_token (SCM port
, scm_t_read_opts
*opts
,
283 char *buffer
, size_t buffer_size
, size_t *read
)
286 size_t bytes_read
, overflow_size
= 0;
287 char *overflow_buffer
= NULL
;
291 overflow
= read_token (port
, opts
, buffer
, buffer_size
, &bytes_read
);
294 if (overflow
|| overflow_size
!= 0)
296 if (overflow_size
== 0)
298 overflow_buffer
= scm_gc_malloc_pointerless (bytes_read
, "read");
299 memcpy (overflow_buffer
, buffer
, bytes_read
);
300 overflow_size
= bytes_read
;
305 scm_gc_malloc_pointerless (overflow_size
+ bytes_read
, "read");
307 memcpy (new_buf
, overflow_buffer
, overflow_size
);
308 memcpy (new_buf
+ overflow_size
, buffer
, bytes_read
);
310 overflow_buffer
= new_buf
;
311 overflow_size
+= bytes_read
;
318 *read
= overflow_size
;
322 return (overflow_size
> 0 ? overflow_buffer
: buffer
);
325 /* Skip whitespace from PORT and return the first non-whitespace character
326 read. Raise an error on end-of-file. */
328 flush_ws (SCM port
, scm_t_read_opts
*opts
, const char *eoferr
)
332 switch (c
= scm_getc_unlocked (port
))
338 scm_i_input_error (eoferr
,
347 switch (c
= scm_getc_unlocked (port
))
353 case SCM_LINE_INCREMENTORS
:
359 switch (c
= scm_getc_unlocked (port
))
362 eoferr
= "read_sharp";
365 scm_read_shebang (c
, port
, opts
);
368 scm_read_commented_expression (c
, port
, opts
);
371 if (scm_is_false (scm_get_hash_procedure (c
)))
373 scm_read_r6rs_block_comment (c
, port
);
378 scm_ungetc_unlocked (c
, port
);
383 case SCM_LINE_INCREMENTORS
:
384 case SCM_SINGLE_SPACES
:
399 static SCM
scm_read_expression (SCM port
, scm_t_read_opts
*opts
);
400 static SCM
scm_read_sharp (int chr
, SCM port
, scm_t_read_opts
*opts
,
401 long line
, int column
);
405 maybe_annotate_source (SCM x
, SCM port
, scm_t_read_opts
*opts
,
406 long line
, int column
)
408 if (opts
->record_positions_p
)
409 scm_i_set_source_properties_x (x
, line
, column
, SCM_FILENAME (port
));
414 scm_read_sexp (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
415 #define FUNC_NAME "scm_i_lreadparen"
418 SCM tmp
, tl
, ans
= SCM_EOL
;
419 const int curly_list_p
= (chr
== '{') && opts
->curly_infix_p
;
420 const int terminating_char
= ((chr
== '{') ? '}'
421 : ((chr
== '[') ? ']'
424 /* Need to capture line and column numbers here. */
425 long line
= SCM_LINUM (port
);
426 int column
= SCM_COL (port
) - 1;
428 c
= flush_ws (port
, opts
, FUNC_NAME
);
429 if (terminating_char
== c
)
432 scm_ungetc_unlocked (c
, port
);
433 tmp
= scm_read_expression (port
, opts
);
435 /* Note that it is possible for scm_read_expression to return
436 scm_sym_dot, but not as part of a dotted pair: as in #{.}#. So
437 check that it's a real dot by checking `c'. */
438 if (c
== '.' && scm_is_eq (scm_sym_dot
, tmp
))
440 ans
= scm_read_expression (port
, opts
);
441 if (terminating_char
!= (c
= flush_ws (port
, opts
, FUNC_NAME
)))
442 scm_i_input_error (FUNC_NAME
, port
, "missing close paren",
447 /* Build the head of the list structure. */
448 ans
= tl
= scm_cons (tmp
, SCM_EOL
);
450 while (terminating_char
!= (c
= flush_ws (port
, opts
, FUNC_NAME
)))
454 if (c
== ')' || (c
== ']' && opts
->square_brackets_p
)
455 || ((c
== '}' || c
== ']') && opts
->curly_infix_p
))
456 scm_i_input_error (FUNC_NAME
, port
,
457 "in pair: mismatched close paren: ~A",
458 scm_list_1 (SCM_MAKE_CHAR (c
)));
460 scm_ungetc_unlocked (c
, port
);
461 tmp
= scm_read_expression (port
, opts
);
463 /* See above note about scm_sym_dot. */
464 if (c
== '.' && scm_is_eq (scm_sym_dot
, tmp
))
466 SCM_SETCDR (tl
, scm_read_expression (port
, opts
));
468 c
= flush_ws (port
, opts
, FUNC_NAME
);
469 if (terminating_char
!= c
)
470 scm_i_input_error (FUNC_NAME
, port
,
471 "in pair: missing close paren", SCM_EOL
);
475 new_tail
= scm_cons (tmp
, SCM_EOL
);
476 SCM_SETCDR (tl
, new_tail
);
482 /* In addition to finding the length, 'scm_ilength' checks for
483 improper or circular lists, in which case it returns -1. */
484 int len
= scm_ilength (ans
);
486 /* The (len == 0) case is handled above */
488 /* Return directly to avoid re-annotating the element's source
489 location with the position of the outer brace. Also, it
490 might not be possible to annotate the element. */
491 return scm_car (ans
); /* {e} => e */
493 ; /* Leave the list unchanged: {e1 e2} => (e1 e2) */
494 else if (len
>= 3 && (len
& 1))
496 /* It's a proper list whose length is odd and at least 3. If
497 the elements at odd indices (the infix operator positions)
498 are all 'equal?', then it's a simple curly-infix list.
499 Otherwise it's a mixed curly-infix list. */
500 SCM op
= scm_cadr (ans
);
502 /* Check to see if the elements at odd indices are 'equal?' */
503 for (tl
= scm_cdddr (ans
); ; tl
= scm_cddr (tl
))
505 if (scm_is_null (tl
))
507 /* Convert simple curly-infix list to prefix:
508 {a <op> b <op> ...} => (<op> a b ...) */
510 while (scm_is_pair (scm_cdr (tl
)))
513 SCM_SETCDR (tl
, tmp
);
516 ans
= scm_cons (op
, ans
);
519 else if (scm_is_false (scm_equal_p (op
, scm_car (tl
))))
521 /* Mixed curly-infix list: {e ...} => ($nfx$ e ...) */
522 ans
= scm_cons (sym_nfx
, ans
);
528 /* Mixed curly-infix (possibly improper) list:
529 {e . tail} => ($nfx$ e . tail) */
530 ans
= scm_cons (sym_nfx
, ans
);
533 return maybe_annotate_source (ans
, port
, opts
, line
, column
);
538 /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
539 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
541 #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
547 while (i < ndigits) \
549 a = scm_getc_unlocked (port); \
553 && (a == (scm_t_wchar) terminator) \
556 if ('0' <= a && a <= '9') \
558 else if ('A' <= a && a <= 'F') \
560 else if ('a' <= a && a <= 'f') \
573 skip_intraline_whitespace (SCM port
)
579 c
= scm_getc_unlocked (port
);
583 while (c
== '\t' || uc_is_general_category (c
, UC_SPACE_SEPARATOR
));
585 scm_ungetc_unlocked (c
, port
);
589 scm_read_string (int chr
, SCM port
, scm_t_read_opts
*opts
)
590 #define FUNC_NAME "scm_lreadr"
592 /* For strings smaller than C_STR, this function creates only one Scheme
593 object (the string returned). */
596 size_t c_str_len
= 0;
597 scm_t_wchar c
, c_str
[READER_STRING_BUFFER_SIZE
];
599 /* Need to capture line and column numbers here. */
600 long line
= SCM_LINUM (port
);
601 int column
= SCM_COL (port
) - 1;
603 while ('"' != (c
= scm_getc_unlocked (port
)))
608 scm_i_input_error (FUNC_NAME
, port
,
609 "end of file in string constant", SCM_EOL
);
612 if (c_str_len
+ 1 >= READER_STRING_BUFFER_SIZE
)
614 str
= scm_cons (scm_from_utf32_stringn (c_str
, c_str_len
), str
);
620 switch (c
= scm_getc_unlocked (port
))
628 if (opts
->hungry_eol_escapes_p
)
629 skip_intraline_whitespace (port
);
656 if (opts
->r6rs_escapes_p
)
657 SCM_READ_HEX_ESCAPE (10, ';');
659 SCM_READ_HEX_ESCAPE (2, '\0');
662 if (!opts
->r6rs_escapes_p
)
664 SCM_READ_HEX_ESCAPE (4, '\0');
668 if (!opts
->r6rs_escapes_p
)
670 SCM_READ_HEX_ESCAPE (6, '\0');
675 scm_i_input_error (FUNC_NAME
, port
,
676 "illegal character in escape sequence: ~S",
677 scm_list_1 (SCM_MAKE_CHAR (c
)));
681 c_str
[c_str_len
++] = c
;
684 if (scm_is_null (str
))
685 /* Fast path: we got a string that fits in C_STR. */
686 str
= scm_from_utf32_stringn (c_str
, c_str_len
);
690 str
= scm_cons (scm_from_utf32_stringn (c_str
, c_str_len
), str
);
692 str
= scm_string_concatenate_reverse (str
, SCM_UNDEFINED
, SCM_UNDEFINED
);
695 return maybe_annotate_source (str
, port
, opts
, line
, column
);
701 scm_read_number (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
703 SCM result
, str
= SCM_EOL
;
704 char local_buffer
[READER_BUFFER_SIZE
], *buffer
;
707 /* Need to capture line and column numbers here. */
708 long line
= SCM_LINUM (port
);
709 int column
= SCM_COL (port
) - 1;
711 scm_ungetc_unlocked (chr
, port
);
712 buffer
= read_complete_token (port
, opts
, local_buffer
, sizeof local_buffer
,
715 str
= scm_from_port_stringn (buffer
, bytes_read
, port
);
717 result
= scm_string_to_number (str
, SCM_UNDEFINED
);
718 if (scm_is_false (result
))
720 /* Return a symbol instead of a number */
721 if (opts
->case_insensitive_p
)
722 str
= scm_string_downcase_x (str
);
723 result
= scm_string_to_symbol (str
);
725 else if (SCM_NIMP (result
))
726 result
= maybe_annotate_source (result
, port
, opts
, line
, column
);
728 SCM_COL (port
) += scm_i_string_length (str
);
733 scm_read_mixed_case_symbol (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
736 int ends_with_colon
= 0;
738 int postfix
= (opts
->keyword_style
== KEYWORD_STYLE_POSTFIX
);
739 char local_buffer
[READER_BUFFER_SIZE
], *buffer
;
742 scm_ungetc_unlocked (chr
, port
);
743 buffer
= read_complete_token (port
, opts
, local_buffer
, sizeof local_buffer
,
746 ends_with_colon
= buffer
[bytes_read
- 1] == ':';
748 if (postfix
&& ends_with_colon
&& (bytes_read
> 1))
750 str
= scm_from_port_stringn (buffer
, bytes_read
- 1, port
);
752 if (opts
->case_insensitive_p
)
753 str
= scm_string_downcase_x (str
);
754 result
= scm_symbol_to_keyword (scm_string_to_symbol (str
));
758 str
= scm_from_port_stringn (buffer
, bytes_read
, port
);
760 if (opts
->case_insensitive_p
)
761 str
= scm_string_downcase_x (str
);
762 result
= scm_string_to_symbol (str
);
765 SCM_COL (port
) += scm_i_string_length (str
);
770 scm_read_number_and_radix (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
771 #define FUNC_NAME "scm_lreadr"
775 char local_buffer
[READER_BUFFER_SIZE
], *buffer
;
802 scm_ungetc_unlocked (chr
, port
);
803 scm_ungetc_unlocked ('#', port
);
807 buffer
= read_complete_token (port
, opts
, local_buffer
, sizeof local_buffer
,
810 str
= scm_from_port_stringn (buffer
, read
, port
);
812 result
= scm_string_to_number (str
, scm_from_uint (radix
));
814 SCM_COL (port
) += scm_i_string_length (str
);
816 if (scm_is_true (result
))
819 scm_i_input_error (FUNC_NAME
, port
, "unknown # object", SCM_EOL
);
826 scm_read_quote (int chr
, SCM port
, scm_t_read_opts
*opts
)
829 long line
= SCM_LINUM (port
);
830 int column
= SCM_COL (port
) - 1;
835 p
= scm_sym_quasiquote
;
846 c
= scm_getc_unlocked (port
);
848 p
= scm_sym_uq_splicing
;
851 scm_ungetc_unlocked (c
, port
);
858 fprintf (stderr
, "%s: unhandled quote character (%i)\n",
859 "scm_read_quote", chr
);
863 p
= scm_cons2 (p
, scm_read_expression (port
, opts
), SCM_EOL
);
864 return maybe_annotate_source (p
, port
, opts
, line
, column
);
867 SCM_SYMBOL (sym_syntax
, "syntax");
868 SCM_SYMBOL (sym_quasisyntax
, "quasisyntax");
869 SCM_SYMBOL (sym_unsyntax
, "unsyntax");
870 SCM_SYMBOL (sym_unsyntax_splicing
, "unsyntax-splicing");
873 scm_read_syntax (int chr
, SCM port
, scm_t_read_opts
*opts
)
876 long line
= SCM_LINUM (port
);
877 int column
= SCM_COL (port
) - 1;
893 c
= scm_getc_unlocked (port
);
895 p
= sym_unsyntax_splicing
;
898 scm_ungetc_unlocked (c
, port
);
905 fprintf (stderr
, "%s: unhandled syntax character (%i)\n",
906 "scm_read_syntax", chr
);
910 p
= scm_cons2 (p
, scm_read_expression (port
, opts
), SCM_EOL
);
911 return maybe_annotate_source (p
, port
, opts
, line
, column
);
915 scm_read_nil (int chr
, SCM port
, scm_t_read_opts
*opts
)
917 SCM id
= scm_read_mixed_case_symbol (chr
, port
, opts
);
919 if (!scm_is_eq (id
, sym_nil
))
920 scm_i_input_error ("scm_read_nil", port
,
921 "unexpected input while reading #nil: ~a",
924 return SCM_ELISP_NIL
;
928 scm_read_semicolon_comment (int chr
, SCM port
)
932 /* We use the get_byte here because there is no need to get the
933 locale correct with comment input. This presumes that newline
934 always represents itself no matter what the encoding is. */
935 for (c
= scm_get_byte_or_eof_unlocked (port
);
936 (c
!= EOF
) && (c
!= '\n');
937 c
= scm_get_byte_or_eof_unlocked (port
));
939 return SCM_UNSPECIFIED
;
943 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
946 scm_read_boolean (int chr
, SCM port
)
959 return SCM_UNSPECIFIED
;
963 scm_read_character (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
964 #define FUNC_NAME "scm_lreadr"
966 char buffer
[READER_CHAR_NAME_MAX_SIZE
];
968 size_t charname_len
, bytes_read
;
973 overflow
= read_token (port
, opts
, buffer
, READER_CHAR_NAME_MAX_SIZE
,
976 scm_i_input_error (FUNC_NAME
, port
, "character name too long", SCM_EOL
);
980 chr
= scm_getc_unlocked (port
);
982 scm_i_input_error (FUNC_NAME
, port
, "unexpected end of file "
983 "while reading character", SCM_EOL
);
985 /* CHR must be a token delimiter, like a whitespace. */
986 return (SCM_MAKE_CHAR (chr
));
989 pt
= SCM_PTAB_ENTRY (port
);
991 /* Simple ASCII characters can be processed immediately. Also, simple
992 ISO-8859-1 characters can be processed immediately if the encoding for this
993 port is ISO-8859-1. */
994 if (bytes_read
== 1 &&
995 ((unsigned char) buffer
[0] <= 127
996 || pt
->encoding_mode
== SCM_PORT_ENCODING_MODE_LATIN1
))
999 return SCM_MAKE_CHAR (buffer
[0]);
1002 /* Otherwise, convert the buffer into a proper scheme string for
1004 charname
= scm_from_port_stringn (buffer
, bytes_read
, port
);
1005 charname_len
= scm_i_string_length (charname
);
1006 SCM_COL (port
) += charname_len
;
1007 cp
= scm_i_string_ref (charname
, 0);
1008 if (charname_len
== 1)
1009 return SCM_MAKE_CHAR (cp
);
1011 /* Ignore dotted circles, which may be used to keep combining characters from
1012 combining with the backslash in #\charname. */
1013 if (cp
== SCM_CODEPOINT_DOTTED_CIRCLE
&& charname_len
== 2)
1014 return SCM_MAKE_CHAR (scm_i_string_ref (charname
, 1));
1016 if (cp
>= '0' && cp
< '8')
1018 /* Dirk:FIXME:: This type of character syntax is not R5RS
1019 * compliant. Further, it should be verified that the constant
1020 * does only consist of octal digits. */
1021 SCM p
= scm_string_to_number (charname
, scm_from_uint (8));
1022 if (SCM_I_INUMP (p
))
1024 scm_t_wchar c
= scm_to_uint32 (p
);
1025 if (SCM_IS_UNICODE_CHAR (c
))
1026 return SCM_MAKE_CHAR (c
);
1028 scm_i_input_error (FUNC_NAME
, port
,
1029 "out-of-range octal character escape: ~a",
1030 scm_list_1 (charname
));
1034 if (cp
== 'x' && (charname_len
> 1))
1038 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1039 p
= scm_string_to_number (scm_c_substring (charname
, 1, charname_len
),
1040 scm_from_uint (16));
1041 if (SCM_I_INUMP (p
))
1043 scm_t_wchar c
= scm_to_uint32 (p
);
1044 if (SCM_IS_UNICODE_CHAR (c
))
1045 return SCM_MAKE_CHAR (c
);
1047 scm_i_input_error (FUNC_NAME
, port
,
1048 "out-of-range hex character escape: ~a",
1049 scm_list_1 (charname
));
1053 /* The names of characters should never have non-Latin1
1055 if (scm_i_is_narrow_string (charname
)
1056 || scm_i_try_narrow_string (charname
))
1057 { SCM ch
= scm_i_charname_to_char (scm_i_string_chars (charname
),
1059 if (scm_is_true (ch
))
1063 scm_i_input_error (FUNC_NAME
, port
, "unknown character name ~a",
1064 scm_list_1 (charname
));
1066 return SCM_UNSPECIFIED
;
1071 scm_read_keyword (int chr
, SCM port
, scm_t_read_opts
*opts
)
1075 /* Read the symbol that comprises the keyword. Doing this instead of
1076 invoking a specific symbol reader function allows `scm_read_keyword ()'
1077 to adapt to the delimiters currently valid of symbols.
1079 XXX: This implementation allows sloppy syntaxes like `#: key'. */
1080 symbol
= scm_read_expression (port
, opts
);
1081 if (!scm_is_symbol (symbol
))
1082 scm_i_input_error ("scm_read_keyword", port
,
1083 "keyword prefix `~a' not followed by a symbol: ~s",
1084 scm_list_2 (SCM_MAKE_CHAR (chr
), symbol
));
1086 return (scm_symbol_to_keyword (symbol
));
1090 scm_read_vector (int chr
, SCM port
, scm_t_read_opts
*opts
,
1091 long line
, int column
)
1093 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1094 guarantee that it's going to do what we want. After all, this is an
1095 implementation detail of `scm_read_vector ()', not a desirable
1097 return maybe_annotate_source (scm_vector (scm_read_sexp (chr
, port
, opts
)),
1098 port
, opts
, line
, column
);
1101 /* Helper used by scm_read_array */
1103 read_decimal_integer (SCM port
, int c
, ssize_t
*resp
)
1112 c
= scm_getc_unlocked (port
);
1115 while ('0' <= c
&& c
<= '9')
1117 res
= 10*res
+ c
-'0';
1119 c
= scm_getc_unlocked (port
);
1127 /* Read an array. This function can also read vectors and uniform
1128 vectors. Also, the conflict between '#f' and '#f32' and '#f64' is
1131 C is the first character read after the '#'. */
1133 scm_read_array (int c
, SCM port
, scm_t_read_opts
*opts
, long line
, int column
)
1136 scm_t_wchar tag_buf
[8];
1139 SCM tag
, shape
= SCM_BOOL_F
, elements
, array
;
1141 /* XXX - shortcut for ordinary vectors. Shouldn't be necessary but
1142 the array code can not deal with zero-length dimensions yet, and
1143 we want to allow zero-length vectors, of course. */
1145 return scm_read_vector (c
, port
, opts
, line
, column
);
1147 /* Disambiguate between '#f' and uniform floating point vectors. */
1150 c
= scm_getc_unlocked (port
);
1151 if (c
!= '3' && c
!= '6')
1154 scm_ungetc_unlocked (c
, port
);
1160 goto continue_reading_tag
;
1165 c
= read_decimal_integer (port
, c
, &rank
);
1167 scm_i_input_error (NULL
, port
, "array rank must be non-negative",
1172 continue_reading_tag
:
1173 while (c
!= EOF
&& c
!= '(' && c
!= '@' && c
!= ':'
1174 && tag_len
< sizeof tag_buf
/ sizeof tag_buf
[0])
1176 tag_buf
[tag_len
++] = c
;
1177 c
= scm_getc_unlocked (port
);
1183 tag
= scm_string_to_symbol (scm_from_utf32_stringn (tag_buf
, tag_len
));
1184 if (tag_len
== sizeof tag_buf
/ sizeof tag_buf
[0])
1185 scm_i_input_error (NULL
, port
, "invalid array tag, starting with: ~a",
1190 if (c
== '@' || c
== ':')
1196 ssize_t lbnd
= 0, len
= 0;
1201 c
= scm_getc_unlocked (port
);
1202 c
= read_decimal_integer (port
, c
, &lbnd
);
1205 s
= scm_from_ssize_t (lbnd
);
1209 c
= scm_getc_unlocked (port
);
1210 c
= read_decimal_integer (port
, c
, &len
);
1212 scm_i_input_error (NULL
, port
,
1213 "array length must be non-negative",
1216 s
= scm_list_2 (s
, scm_from_ssize_t (lbnd
+len
-1));
1219 shape
= scm_cons (s
, shape
);
1220 } while (c
== '@' || c
== ':');
1222 shape
= scm_reverse_x (shape
, SCM_EOL
);
1225 /* Read nested lists of elements. */
1227 scm_i_input_error (NULL
, port
,
1228 "missing '(' in vector or array literal",
1230 elements
= scm_read_sexp (c
, port
, opts
);
1232 if (scm_is_false (shape
))
1233 shape
= scm_from_ssize_t (rank
);
1234 else if (scm_ilength (shape
) != rank
)
1237 "the number of shape specifications must match the array rank",
1240 /* Handle special print syntax of rank zero arrays; see
1241 scm_i_print_array for a rationale. */
1244 if (!scm_is_pair (elements
))
1245 scm_i_input_error (NULL
, port
,
1246 "too few elements in array literal, need 1",
1248 if (!scm_is_null (SCM_CDR (elements
)))
1249 scm_i_input_error (NULL
, port
,
1250 "too many elements in array literal, want 1",
1252 elements
= SCM_CAR (elements
);
1255 /* Construct array, annotate with source location, and return. */
1256 array
= scm_list_to_typed_array (tag
, shape
, elements
);
1257 return maybe_annotate_source (array
, port
, opts
, line
, column
);
1261 scm_read_srfi4_vector (int chr
, SCM port
, scm_t_read_opts
*opts
,
1262 long line
, int column
)
1264 return scm_read_array (chr
, port
, opts
, line
, column
);
1268 scm_read_bytevector (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
,
1269 long line
, int column
)
1271 chr
= scm_getc_unlocked (port
);
1275 chr
= scm_getc_unlocked (port
);
1279 chr
= scm_getc_unlocked (port
);
1283 return maybe_annotate_source
1284 (scm_u8_list_to_bytevector (scm_read_sexp (chr
, port
, opts
)),
1285 port
, opts
, line
, column
);
1288 scm_i_input_error ("read_bytevector", port
,
1289 "invalid bytevector prefix",
1290 SCM_MAKE_CHAR (chr
));
1291 return SCM_UNSPECIFIED
;
1295 scm_read_guile_bit_vector (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
,
1296 long line
, int column
)
1298 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1299 terribly inefficient but who cares? */
1300 SCM s_bits
= SCM_EOL
;
1302 for (chr
= scm_getc_unlocked (port
);
1303 (chr
!= EOF
) && ((chr
== '0') || (chr
== '1'));
1304 chr
= scm_getc_unlocked (port
))
1306 s_bits
= scm_cons ((chr
== '0') ? SCM_BOOL_F
: SCM_BOOL_T
, s_bits
);
1310 scm_ungetc_unlocked (chr
, port
);
1312 return maybe_annotate_source
1313 (scm_bitvector (scm_reverse_x (s_bits
, SCM_EOL
)),
1314 port
, opts
, line
, column
);
1318 scm_read_scsh_block_comment (scm_t_wchar chr
, SCM port
)
1324 int c
= scm_getc_unlocked (port
);
1327 scm_i_input_error ("skip_block_comment", port
,
1328 "unterminated `#! ... !#' comment", SCM_EOL
);
1332 else if (c
== '#' && bang_seen
)
1338 return SCM_UNSPECIFIED
;
1341 static void set_port_case_insensitive_p (SCM port
, scm_t_read_opts
*opts
,
1343 static void set_port_square_brackets_p (SCM port
, scm_t_read_opts
*opts
,
1345 static void set_port_curly_infix_p (SCM port
, scm_t_read_opts
*opts
,
1349 scm_read_shebang (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
1351 char name
[READER_DIRECTIVE_NAME_MAX_SIZE
+ 1];
1355 while (i
<= READER_DIRECTIVE_NAME_MAX_SIZE
)
1357 c
= scm_getc_unlocked (port
);
1359 scm_i_input_error ("skip_block_comment", port
,
1360 "unterminated `#! ... !#' comment", SCM_EOL
);
1361 else if (('a' <= c
&& c
<= 'z') || ('0' <= c
&& c
<= '9') || c
== '-')
1363 else if (CHAR_IS_DELIMITER (c
))
1365 scm_ungetc_unlocked (c
, port
);
1367 if (0 == strcmp ("r6rs", name
))
1368 ; /* Silently ignore */
1369 else if (0 == strcmp ("fold-case", name
))
1370 set_port_case_insensitive_p (port
, opts
, 1);
1371 else if (0 == strcmp ("no-fold-case", name
))
1372 set_port_case_insensitive_p (port
, opts
, 0);
1373 else if (0 == strcmp ("curly-infix", name
))
1374 set_port_curly_infix_p (port
, opts
, 1);
1375 else if (0 == strcmp ("curly-infix-and-bracket-lists", name
))
1377 set_port_curly_infix_p (port
, opts
, 1);
1378 set_port_square_brackets_p (port
, opts
, 0);
1383 return SCM_UNSPECIFIED
;
1387 scm_ungetc_unlocked (c
, port
);
1392 scm_ungetc_unlocked (name
[--i
], port
);
1393 return scm_read_scsh_block_comment (chr
, port
);
1397 scm_read_r6rs_block_comment (scm_t_wchar chr
, SCM port
)
1399 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1400 nested. So care must be taken. */
1401 int nesting_level
= 1;
1403 int a
= scm_getc_unlocked (port
);
1406 scm_i_input_error ("scm_read_r6rs_block_comment", port
,
1407 "unterminated `#| ... |#' comment", SCM_EOL
);
1409 while (nesting_level
> 0)
1411 int b
= scm_getc_unlocked (port
);
1414 scm_i_input_error ("scm_read_r6rs_block_comment", port
,
1415 "unterminated `#| ... |#' comment", SCM_EOL
);
1417 if (a
== '|' && b
== '#')
1422 else if (a
== '#' && b
== '|')
1431 return SCM_UNSPECIFIED
;
1435 scm_read_commented_expression (scm_t_wchar chr
, SCM port
,
1436 scm_t_read_opts
*opts
)
1440 c
= flush_ws (port
, opts
, (char *) NULL
);
1442 scm_i_input_error ("read_commented_expression", port
,
1443 "no expression after #; comment", SCM_EOL
);
1444 scm_ungetc_unlocked (c
, port
);
1445 scm_read_expression (port
, opts
);
1446 return SCM_UNSPECIFIED
;
1450 scm_read_extended_symbol (scm_t_wchar chr
, SCM port
)
1452 /* Guile's extended symbol read syntax looks like this:
1454 #{This is all a symbol name}#
1456 So here, CHR is expected to be `{'. */
1459 SCM buf
= scm_i_make_string (1024, NULL
, 0);
1461 buf
= scm_i_string_start_writing (buf
);
1463 while ((chr
= scm_getc_unlocked (port
)) != EOF
)
1474 scm_i_string_set_x (buf
, len
++, '}');
1480 else if (chr
== '\\')
1482 /* It used to be that print.c would print extended-read-syntax
1483 symbols with backslashes before "non-standard" chars, but
1484 this routine wouldn't do anything with those escapes.
1485 Bummer. What we've done is to change print.c to output
1486 R6RS hex escapes for those characters, relying on the fact
1487 that the extended read syntax would never put a `\' before
1488 an `x'. For now, we just ignore other instances of
1489 backslash in the string. */
1490 switch ((chr
= scm_getc_unlocked (port
)))
1498 SCM_READ_HEX_ESCAPE (10, ';');
1499 scm_i_string_set_x (buf
, len
++, c
);
1507 scm_i_string_stop_writing ();
1508 scm_i_input_error ("scm_read_extended_symbol", port
,
1509 "illegal character in escape sequence: ~S",
1510 scm_list_1 (SCM_MAKE_CHAR (c
)));
1514 scm_i_string_set_x (buf
, len
++, chr
);
1519 scm_i_string_set_x (buf
, len
++, chr
);
1521 if (len
>= scm_i_string_length (buf
) - 2)
1525 scm_i_string_stop_writing ();
1526 addy
= scm_i_make_string (1024, NULL
, 0);
1527 buf
= scm_string_append (scm_list_2 (buf
, addy
));
1529 buf
= scm_i_string_start_writing (buf
);
1534 scm_i_string_stop_writing ();
1536 scm_i_input_error ("scm_read_extended_symbol", port
,
1537 "end of file while reading symbol", SCM_EOL
);
1539 return (scm_string_to_symbol (scm_c_substring (buf
, 0, len
)));
1544 /* Top-level token readers, i.e., dispatchers. */
1547 scm_read_sharp_extension (int chr
, SCM port
, scm_t_read_opts
*opts
)
1551 proc
= scm_get_hash_procedure (chr
);
1552 if (scm_is_true (scm_procedure_p (proc
)))
1554 long line
= SCM_LINUM (port
);
1555 int column
= SCM_COL (port
) - 2;
1558 got
= scm_call_2 (proc
, SCM_MAKE_CHAR (chr
), port
);
1560 if (opts
->record_positions_p
&& SCM_NIMP (got
)
1561 && !scm_i_has_source_properties (got
))
1562 scm_i_set_source_properties_x (got
, line
, column
, SCM_FILENAME (port
));
1567 return SCM_UNSPECIFIED
;
1570 /* The reader for the sharp `#' character. It basically dispatches reads
1571 among the above token readers. */
1573 scm_read_sharp (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
,
1574 long line
, int column
)
1575 #define FUNC_NAME "scm_lreadr"
1579 chr
= scm_getc_unlocked (port
);
1581 result
= scm_read_sharp_extension (chr
, port
, opts
);
1582 if (!scm_is_eq (result
, SCM_UNSPECIFIED
))
1588 return (scm_read_character (chr
, port
, opts
));
1590 return (scm_read_vector (chr
, port
, opts
, line
, column
));
1595 /* This one may return either a boolean or an SRFI-4 vector. */
1596 return (scm_read_srfi4_vector (chr
, port
, opts
, line
, column
));
1598 return (scm_read_bytevector (chr
, port
, opts
, line
, column
));
1600 return (scm_read_guile_bit_vector (chr
, port
, opts
, line
, column
));
1604 return (scm_read_boolean (chr
, port
));
1606 return (scm_read_keyword (chr
, port
, opts
));
1607 case '0': case '1': case '2': case '3': case '4':
1608 case '5': case '6': case '7': case '8': case '9':
1610 return (scm_read_array (chr
, port
, opts
, line
, column
));
1624 return (scm_read_number_and_radix (chr
, port
, opts
));
1626 return (scm_read_extended_symbol (chr
, port
));
1628 return (scm_read_shebang (chr
, port
, opts
));
1630 return (scm_read_commented_expression (chr
, port
, opts
));
1634 return (scm_read_syntax (chr
, port
, opts
));
1636 return (scm_read_nil (chr
, port
, opts
));
1638 result
= scm_read_sharp_extension (chr
, port
, opts
);
1639 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1641 /* To remain compatible with 1.8 and earlier, the following
1642 characters have lower precedence than `read-hash-extend'
1647 return scm_read_r6rs_block_comment (chr
, port
);
1649 scm_i_input_error (FUNC_NAME
, port
, "Unknown # object: ~S",
1650 scm_list_1 (SCM_MAKE_CHAR (chr
)));
1657 return SCM_UNSPECIFIED
;
1662 read_inner_expression (SCM port
, scm_t_read_opts
*opts
)
1663 #define FUNC_NAME "read_inner_expression"
1669 chr
= scm_getc_unlocked (port
);
1673 case SCM_WHITE_SPACES
:
1674 case SCM_LINE_INCREMENTORS
:
1677 (void) scm_read_semicolon_comment (chr
, port
);
1680 if (opts
->curly_infix_p
)
1682 if (opts
->neoteric_p
)
1683 return scm_read_sexp (chr
, port
, opts
);
1688 /* Enable neoteric expressions within curly braces */
1689 opts
->neoteric_p
= 1;
1690 expr
= scm_read_sexp (chr
, port
, opts
);
1691 opts
->neoteric_p
= 0;
1696 return scm_read_mixed_case_symbol (chr
, port
, opts
);
1698 if (opts
->square_brackets_p
)
1699 return scm_read_sexp (chr
, port
, opts
);
1700 else if (opts
->curly_infix_p
)
1702 /* The syntax of neoteric expressions requires that '[' be
1703 a delimiter when curly-infix is enabled, so it cannot
1704 be part of an unescaped symbol. We might as well do
1705 something useful with it, so we adopt Kawa's convention:
1706 [...] => ($bracket-list$ ...) */
1707 long line
= SCM_LINUM (port
);
1708 int column
= SCM_COL (port
) - 1;
1709 return maybe_annotate_source
1710 (scm_cons (sym_bracket_list
, scm_read_sexp (chr
, port
, opts
)),
1711 port
, opts
, line
, column
);
1714 return scm_read_mixed_case_symbol (chr
, port
, opts
);
1716 return (scm_read_sexp (chr
, port
, opts
));
1718 return (scm_read_string (chr
, port
, opts
));
1722 return (scm_read_quote (chr
, port
, opts
));
1725 long line
= SCM_LINUM (port
);
1726 int column
= SCM_COL (port
) - 1;
1727 SCM result
= scm_read_sharp (chr
, port
, opts
, line
, column
);
1728 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1729 /* We read a comment or some such. */
1735 scm_i_input_error (FUNC_NAME
, port
, "unexpected \")\"", SCM_EOL
);
1738 if (opts
->curly_infix_p
)
1739 scm_i_input_error (FUNC_NAME
, port
, "unexpected \"}\"", SCM_EOL
);
1741 return scm_read_mixed_case_symbol (chr
, port
, opts
);
1743 if (opts
->square_brackets_p
)
1744 scm_i_input_error (FUNC_NAME
, port
, "unexpected \"]\"", SCM_EOL
);
1745 /* otherwise fall through */
1749 if (opts
->keyword_style
== KEYWORD_STYLE_PREFIX
)
1750 return scm_symbol_to_keyword (scm_read_expression (port
, opts
));
1755 if (((chr
>= '0') && (chr
<= '9'))
1756 || (strchr ("+-.", chr
)))
1757 return (scm_read_number (chr
, port
, opts
));
1759 return (scm_read_mixed_case_symbol (chr
, port
, opts
));
1767 scm_read_expression (SCM port
, scm_t_read_opts
*opts
)
1768 #define FUNC_NAME "scm_read_expression"
1770 if (!opts
->neoteric_p
)
1771 return read_inner_expression (port
, opts
);
1778 if (opts
->record_positions_p
)
1780 /* We need to get the position of the first non-whitespace
1781 character in order to correctly annotate neoteric
1782 expressions. For example, for the expression 'f(x)', the
1783 first call to 'read_inner_expression' reads the 'f' (which
1784 cannot be annotated), and then we later read the '(x)' and
1785 use it to construct the new list (f x). */
1786 int c
= flush_ws (port
, opts
, (char *) NULL
);
1789 scm_ungetc_unlocked (c
, port
);
1790 line
= SCM_LINUM (port
);
1791 column
= SCM_COL (port
);
1794 expr
= read_inner_expression (port
, opts
);
1796 /* 'expr' is the first component of the neoteric expression. Now
1797 we loop, and as long as the next character is '(', '[', or '{',
1798 (without any intervening whitespace), we use it to construct a
1799 new expression. For example, f{n - 1}(x) => ((f (- n 1)) x). */
1802 int chr
= scm_getc_unlocked (port
);
1805 /* e(...) => (e ...) */
1806 expr
= scm_cons (expr
, scm_read_sexp (chr
, port
, opts
));
1807 else if (chr
== '[')
1808 /* e[...] => ($bracket-apply$ e ...) */
1809 expr
= scm_cons (sym_bracket_apply
,
1811 scm_read_sexp (chr
, port
, opts
)));
1812 else if (chr
== '{')
1814 SCM arg
= scm_read_sexp (chr
, port
, opts
);
1816 if (scm_is_null (arg
))
1817 expr
= scm_list_1 (expr
); /* e{} => (e) */
1819 expr
= scm_list_2 (expr
, arg
); /* e{...} => (e {...}) */
1824 scm_ungetc_unlocked (chr
, port
);
1827 maybe_annotate_source (expr
, port
, opts
, line
, column
);
1835 /* Actual reader. */
1837 static void init_read_options (SCM port
, scm_t_read_opts
*opts
);
1839 SCM_DEFINE (scm_read
, "read", 0, 1, 0,
1841 "Read an s-expression from the input port @var{port}, or from\n"
1842 "the current input port if @var{port} is not specified.\n"
1843 "Any whitespace before the next token is discarded.")
1844 #define FUNC_NAME s_scm_read
1846 scm_t_read_opts opts
;
1849 if (SCM_UNBNDP (port
))
1850 port
= scm_current_input_port ();
1851 SCM_VALIDATE_OPINPORT (1, port
);
1853 init_read_options (port
, &opts
);
1855 c
= flush_ws (port
, &opts
, (char *) NULL
);
1858 scm_ungetc_unlocked (c
, port
);
1860 return (scm_read_expression (port
, &opts
));
1867 /* Manipulate the read-hash-procedures alist. This could be written in
1868 Scheme, but maybe it will also be used by C code during initialisation. */
1869 SCM_DEFINE (scm_read_hash_extend
, "read-hash-extend", 2, 0, 0,
1870 (SCM chr
, SCM proc
),
1871 "Install the procedure @var{proc} for reading expressions\n"
1872 "starting with the character sequence @code{#} and @var{chr}.\n"
1873 "@var{proc} will be called with two arguments: the character\n"
1874 "@var{chr} and the port to read further data from. The object\n"
1875 "returned will be the return value of @code{read}. \n"
1876 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1878 #define FUNC_NAME s_scm_read_hash_extend
1883 SCM_VALIDATE_CHAR (1, chr
);
1884 SCM_ASSERT (scm_is_false (proc
)
1885 || scm_is_eq (scm_procedure_p (proc
), SCM_BOOL_T
),
1886 proc
, SCM_ARG2
, FUNC_NAME
);
1888 /* Check if chr is already in the alist. */
1889 this = scm_i_read_hash_procedures_ref ();
1893 if (scm_is_null (this))
1895 /* not found, so add it to the beginning. */
1896 if (scm_is_true (proc
))
1898 SCM
new = scm_cons (scm_cons (chr
, proc
),
1899 scm_i_read_hash_procedures_ref ());
1900 scm_i_read_hash_procedures_set_x (new);
1904 if (scm_is_eq (chr
, SCM_CAAR (this)))
1906 /* already in the alist. */
1907 if (scm_is_false (proc
))
1910 if (scm_is_false (prev
))
1912 SCM rest
= SCM_CDR (scm_i_read_hash_procedures_ref ());
1913 scm_i_read_hash_procedures_set_x (rest
);
1916 scm_set_cdr_x (prev
, SCM_CDR (this));
1921 scm_set_cdr_x (SCM_CAR (this), proc
);
1926 this = SCM_CDR (this);
1929 return SCM_UNSPECIFIED
;
1933 /* Recover the read-hash procedure corresponding to char c. */
1935 scm_get_hash_procedure (int c
)
1937 SCM rest
= scm_i_read_hash_procedures_ref ();
1941 if (scm_is_null (rest
))
1944 if (SCM_CHAR (SCM_CAAR (rest
)) == c
)
1945 return SCM_CDAR (rest
);
1947 rest
= SCM_CDR (rest
);
1951 #define SCM_ENCODING_SEARCH_SIZE (500)
1954 is_encoding_char (char c
)
1956 if (c
>= 'a' && c
<= 'z') return 1;
1957 if (c
>= 'A' && c
<= 'Z') return 1;
1958 if (c
>= '0' && c
<= '9') return 1;
1959 return strchr ("_-.:/,+=()", c
) != NULL
;
1962 /* Search the first few hundred characters of a file for an Emacs-like coding
1963 declaration. Returns either NULL or a string whose storage has been
1964 allocated with `scm_gc_malloc ()'. */
1966 scm_i_scan_for_encoding (SCM port
)
1969 char header
[SCM_ENCODING_SEARCH_SIZE
+1];
1970 size_t bytes_read
, encoding_length
, i
;
1971 char *encoding
= NULL
;
1973 char *pos
, *encoding_start
;
1976 pt
= SCM_PTAB_ENTRY (port
);
1978 if (pt
->rw_active
== SCM_PORT_WRITE
)
1979 scm_flush_unlocked (port
);
1982 pt
->rw_active
= SCM_PORT_READ
;
1984 if (pt
->read_pos
== pt
->read_end
)
1986 /* We can use the read buffer, and thus avoid a seek. */
1987 if (scm_fill_input_unlocked (port
) == EOF
)
1990 bytes_read
= pt
->read_end
- pt
->read_pos
;
1991 if (bytes_read
> SCM_ENCODING_SEARCH_SIZE
)
1992 bytes_read
= SCM_ENCODING_SEARCH_SIZE
;
1994 if (bytes_read
<= 1)
1995 /* An unbuffered port -- don't scan. */
1998 memcpy (header
, pt
->read_pos
, bytes_read
);
1999 header
[bytes_read
] = '\0';
2003 /* Try to read some bytes and then seek back. Not all ports
2004 support seeking back; and indeed some file ports (like
2005 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
2006 check performed by SCM_FPORT_FDES---but fail to seek
2007 backwards. Hence this block comes second. We prefer to use
2008 the read buffer in-place. */
2009 if (SCM_FPORTP (port
) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port
)))
2012 bytes_read
= scm_c_read_unlocked (port
, header
, SCM_ENCODING_SEARCH_SIZE
);
2013 header
[bytes_read
] = '\0';
2014 scm_seek (port
, scm_from_int (0), scm_from_int (SEEK_SET
));
2018 && header
[0] == '\xef' && header
[1] == '\xbb' && header
[2] == '\xbf')
2021 /* search past "coding[:=]" */
2025 if ((pos
= strstr(pos
, "coding")) == NULL
)
2028 pos
+= strlen("coding");
2029 if (pos
- header
>= SCM_ENCODING_SEARCH_SIZE
||
2030 (*pos
== ':' || *pos
== '='))
2038 while (pos
- header
<= SCM_ENCODING_SEARCH_SIZE
&&
2039 (*pos
== ' ' || *pos
== '\t'))
2042 /* grab the next token */
2043 encoding_start
= pos
;
2045 while (encoding_start
+ i
- header
<= SCM_ENCODING_SEARCH_SIZE
2046 && encoding_start
+ i
- header
< bytes_read
2047 && is_encoding_char (encoding_start
[i
]))
2050 encoding_length
= i
;
2051 if (encoding_length
== 0)
2054 encoding
= scm_gc_strndup (encoding_start
, encoding_length
, "encoding");
2056 /* push backwards to make sure we were in a comment */
2058 pos
= encoding_start
;
2059 while (pos
>= header
)
2066 else if (*pos
== '\n' || pos
== header
)
2068 /* This wasn't in a semicolon comment. Check for a
2069 hash-bang comment. */
2070 char *beg
= strstr (header
, "#!");
2071 char *end
= strstr (header
, "!#");
2072 if (beg
< encoding_start
&& encoding_start
+ encoding_length
<= end
)
2083 /* This wasn't in a comment */
2086 if (utf8_bom
&& strcasecmp (encoding
, "UTF-8"))
2087 scm_misc_error (NULL
,
2088 "the port input declares the encoding ~s but is encoded as UTF-8",
2089 scm_list_1 (scm_from_locale_string (encoding
)));
2094 SCM_DEFINE (scm_file_encoding
, "file-encoding", 1, 0, 0,
2096 "Scans the port for an Emacs-like character coding declaration\n"
2097 "near the top of the contents of a port with random-accessible contents.\n"
2098 "The coding declaration is of the form\n"
2099 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
2101 "Returns a string containing the character encoding of the file\n"
2102 "if a declaration was found, or @code{#f} otherwise.\n")
2103 #define FUNC_NAME s_scm_file_encoding
2108 SCM_VALIDATE_OPINPORT (SCM_ARG1
, port
);
2110 enc
= scm_i_scan_for_encoding (port
);
2115 s_enc
= scm_from_locale_string (enc
);
2124 /* Per-port read options.
2126 We store per-port read options in the 'port-read-options' key of the
2127 port's alist. The value stored in the alist is a single integer that
2128 contains a two-bit field for each read option.
2130 If a bit field contains READ_OPTION_INHERIT (3), that indicates that
2131 the applicable value should be inherited from the corresponding
2132 global read option. Otherwise, the bit field contains the value of
2133 the read option. For boolean read options that have been set
2134 per-port, the possible values are 0 or 1. If the 'keyword_style'
2135 read option has been set per-port, its possible values are those in
2136 'enum t_keyword_style'. */
2138 /* Key to read options in per-port alists. */
2139 SCM_SYMBOL (sym_port_read_options
, "port-read-options");
2141 /* Offsets of bit fields for each per-port override */
2142 #define READ_OPTION_COPY_SOURCE_P 0
2143 #define READ_OPTION_RECORD_POSITIONS_P 2
2144 #define READ_OPTION_CASE_INSENSITIVE_P 4
2145 #define READ_OPTION_KEYWORD_STYLE 6
2146 #define READ_OPTION_R6RS_ESCAPES_P 8
2147 #define READ_OPTION_SQUARE_BRACKETS_P 10
2148 #define READ_OPTION_HUNGRY_EOL_ESCAPES_P 12
2149 #define READ_OPTION_CURLY_INFIX_P 14
2151 /* The total width in bits of the per-port overrides */
2152 #define READ_OPTIONS_NUM_BITS 16
2154 #define READ_OPTIONS_INHERIT_ALL ((1UL << READ_OPTIONS_NUM_BITS) - 1)
2155 #define READ_OPTIONS_MAX_VALUE READ_OPTIONS_INHERIT_ALL
2157 #define READ_OPTION_MASK 3
2158 #define READ_OPTION_INHERIT 3
2161 set_port_read_option (SCM port
, int option
, int new_value
)
2163 SCM scm_read_options
;
2164 unsigned int read_options
;
2166 new_value
&= READ_OPTION_MASK
;
2167 scm_read_options
= scm_assq_ref (SCM_PTAB_ENTRY(port
)->alist
,
2168 sym_port_read_options
);
2169 if (scm_is_unsigned_integer (scm_read_options
, 0, READ_OPTIONS_MAX_VALUE
))
2170 read_options
= scm_to_uint (scm_read_options
);
2172 read_options
= READ_OPTIONS_INHERIT_ALL
;
2173 read_options
&= ~(READ_OPTION_MASK
<< option
);
2174 read_options
|= new_value
<< option
;
2175 scm_read_options
= scm_from_uint (read_options
);
2176 SCM_PTAB_ENTRY(port
)->alist
= scm_assq_set_x (SCM_PTAB_ENTRY(port
)->alist
,
2177 sym_port_read_options
,
2181 /* Set OPTS and PORT's case-insensitivity according to VALUE. */
2183 set_port_case_insensitive_p (SCM port
, scm_t_read_opts
*opts
, int value
)
2186 opts
->case_insensitive_p
= value
;
2187 set_port_read_option (port
, READ_OPTION_CASE_INSENSITIVE_P
, value
);
2190 /* Set OPTS and PORT's square_brackets_p option according to VALUE. */
2192 set_port_square_brackets_p (SCM port
, scm_t_read_opts
*opts
, int value
)
2195 opts
->square_brackets_p
= value
;
2196 set_port_read_option (port
, READ_OPTION_SQUARE_BRACKETS_P
, value
);
2199 /* Set OPTS and PORT's curly_infix_p option according to VALUE. */
2201 set_port_curly_infix_p (SCM port
, scm_t_read_opts
*opts
, int value
)
2204 opts
->curly_infix_p
= value
;
2205 set_port_read_option (port
, READ_OPTION_CURLY_INFIX_P
, value
);
2208 /* Initialize OPTS based on PORT's read options and the global read
2211 init_read_options (SCM port
, scm_t_read_opts
*opts
)
2213 SCM val
, scm_read_options
;
2214 unsigned int read_options
, x
;
2216 scm_read_options
= scm_assq_ref (SCM_PTAB_ENTRY(port
)->alist
,
2217 sym_port_read_options
);
2219 if (scm_is_unsigned_integer (scm_read_options
, 0, READ_OPTIONS_MAX_VALUE
))
2220 read_options
= scm_to_uint (scm_read_options
);
2222 read_options
= READ_OPTIONS_INHERIT_ALL
;
2224 x
= READ_OPTION_MASK
& (read_options
>> READ_OPTION_KEYWORD_STYLE
);
2225 if (x
== READ_OPTION_INHERIT
)
2227 val
= SCM_PACK (SCM_KEYWORD_STYLE
);
2228 if (scm_is_eq (val
, scm_keyword_prefix
))
2229 x
= KEYWORD_STYLE_PREFIX
;
2230 else if (scm_is_eq (val
, scm_keyword_postfix
))
2231 x
= KEYWORD_STYLE_POSTFIX
;
2233 x
= KEYWORD_STYLE_HASH_PREFIX
;
2235 opts
->keyword_style
= x
;
2237 #define RESOLVE_BOOLEAN_OPTION(NAME, name) \
2240 x = READ_OPTION_MASK & (read_options >> READ_OPTION_ ## NAME); \
2241 if (x == READ_OPTION_INHERIT) \
2242 x = !!SCM_ ## NAME; \
2247 RESOLVE_BOOLEAN_OPTION (COPY_SOURCE_P
, copy_source_p
);
2248 RESOLVE_BOOLEAN_OPTION (RECORD_POSITIONS_P
, record_positions_p
);
2249 RESOLVE_BOOLEAN_OPTION (CASE_INSENSITIVE_P
, case_insensitive_p
);
2250 RESOLVE_BOOLEAN_OPTION (R6RS_ESCAPES_P
, r6rs_escapes_p
);
2251 RESOLVE_BOOLEAN_OPTION (SQUARE_BRACKETS_P
, square_brackets_p
);
2252 RESOLVE_BOOLEAN_OPTION (HUNGRY_EOL_ESCAPES_P
, hungry_eol_escapes_p
);
2253 RESOLVE_BOOLEAN_OPTION (CURLY_INFIX_P
, curly_infix_p
);
2255 #undef RESOLVE_BOOLEAN_OPTION
2257 opts
->neoteric_p
= 0;
2263 SCM read_hash_procs
;
2265 read_hash_procs
= scm_make_fluid_with_default (SCM_EOL
);
2267 scm_i_read_hash_procedures
=
2268 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs
));
2270 scm_init_opts (scm_read_options
, scm_read_opts
);
2271 #include "libguile/read.x"