1 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
2 * 2007, 2008, 2009, 2010, 2011, 2012, 2014 Free Software Foundation, Inc.
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
33 #include <c-strcase.h>
35 #include "libguile/_scm.h"
36 #include "libguile/bytevectors.h"
37 #include "libguile/chars.h"
38 #include "libguile/eval.h"
39 #include "libguile/arrays.h"
40 #include "libguile/bitvectors.h"
41 #include "libguile/keywords.h"
42 #include "libguile/alist.h"
43 #include "libguile/srcprop.h"
44 #include "libguile/hashtab.h"
45 #include "libguile/hash.h"
46 #include "libguile/ports.h"
47 #include "libguile/fports.h"
48 #include "libguile/root.h"
49 #include "libguile/strings.h"
50 #include "libguile/strports.h"
51 #include "libguile/vectors.h"
52 #include "libguile/validate.h"
53 #include "libguile/srfi-4.h"
54 #include "libguile/srfi-13.h"
56 #include "libguile/read.h"
57 #include "libguile/private-options.h"
62 SCM_GLOBAL_SYMBOL (scm_sym_dot
, ".");
63 SCM_SYMBOL (scm_keyword_prefix
, "prefix");
64 SCM_SYMBOL (scm_keyword_postfix
, "postfix");
65 SCM_SYMBOL (sym_nil
, "nil");
67 /* SRFI-105 curly infix expression support */
68 SCM_SYMBOL (sym_nfx
, "$nfx$");
69 SCM_SYMBOL (sym_bracket_list
, "$bracket-list$");
70 SCM_SYMBOL (sym_bracket_apply
, "$bracket-apply$");
72 scm_t_option scm_read_opts
[] =
74 { SCM_OPTION_BOOLEAN
, "copy", 0,
75 "Copy source code expressions." },
76 { SCM_OPTION_BOOLEAN
, "positions", 1,
77 "Record positions of source code expressions." },
78 { SCM_OPTION_BOOLEAN
, "case-insensitive", 0,
79 "Convert symbols to lower case."},
80 { SCM_OPTION_SCM
, "keywords", (scm_t_bits
) SCM_BOOL_F_BITS
,
81 "Style of keyword recognition: #f, 'prefix or 'postfix."},
82 { SCM_OPTION_BOOLEAN
, "r6rs-hex-escapes", 0,
83 "Use R6RS variable-length character and string hex escapes."},
84 { SCM_OPTION_BOOLEAN
, "square-brackets", 1,
85 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
86 { SCM_OPTION_BOOLEAN
, "hungry-eol-escapes", 0,
87 "In strings, consume leading whitespace after an escaped end-of-line."},
88 { SCM_OPTION_BOOLEAN
, "curly-infix", 0,
89 "Support SRFI-105 curly infix expressions."},
93 /* Internal read options structure. This is initialized by 'scm_read'
94 from the global and per-port read options, and a pointer is passed
95 down to all helper functions. */
99 KEYWORD_STYLE_HASH_PREFIX
,
100 KEYWORD_STYLE_PREFIX
,
101 KEYWORD_STYLE_POSTFIX
106 enum t_keyword_style keyword_style
;
107 unsigned int copy_source_p
: 1;
108 unsigned int record_positions_p
: 1;
109 unsigned int case_insensitive_p
: 1;
110 unsigned int r6rs_escapes_p
: 1;
111 unsigned int square_brackets_p
: 1;
112 unsigned int hungry_eol_escapes_p
: 1;
113 unsigned int curly_infix_p
: 1;
114 unsigned int neoteric_p
: 1;
117 typedef struct t_read_opts scm_t_read_opts
;
121 Give meaningful error messages for errors
125 FILE:LINE:COL: MESSAGE
126 This happened in ....
128 This is not standard GNU format, but the test-suite likes the real
129 message to be in front.
135 scm_i_input_error (char const *function
,
136 SCM port
, const char *message
, SCM arg
)
138 SCM fn
= (scm_is_string (SCM_FILENAME(port
))
140 : scm_from_locale_string ("#<unknown port>"));
142 SCM string_port
= scm_open_output_string ();
143 SCM string
= SCM_EOL
;
144 scm_simple_format (string_port
,
145 scm_from_locale_string ("~A:~S:~S: ~A"),
147 scm_from_long (SCM_LINUM (port
) + 1),
148 scm_from_int (SCM_COL (port
) + 1),
149 scm_from_locale_string (message
)));
151 string
= scm_get_output_string (string_port
);
152 scm_close_output_port (string_port
);
153 scm_error_scm (scm_from_latin1_symbol ("read-error"),
154 function
? scm_from_locale_string (function
) : SCM_BOOL_F
,
161 SCM_DEFINE (scm_read_options
, "read-options-interface", 0, 1, 0,
163 "Option interface for the read options. Instead of using\n"
164 "this procedure directly, use the procedures @code{read-enable},\n"
165 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
166 #define FUNC_NAME s_scm_read_options
168 SCM ans
= scm_options (setting
,
171 if (SCM_COPY_SOURCE_P
)
172 SCM_RECORD_POSITIONS_P
= 1;
177 /* A fluid referring to an association list mapping extra hash
178 characters to procedures. */
179 static SCM
*scm_i_read_hash_procedures
;
182 scm_i_read_hash_procedures_ref (void)
184 return scm_fluid_ref (*scm_i_read_hash_procedures
);
188 scm_i_read_hash_procedures_set_x (SCM value
)
190 scm_fluid_set_x (*scm_i_read_hash_procedures
, value
);
197 /* Size of the C buffer used to read symbols and numbers. */
198 #define READER_BUFFER_SIZE 128
200 /* Number of 32-bit codepoints in the buffer used to read strings. */
201 #define READER_STRING_BUFFER_SIZE 128
203 /* The maximum size of Scheme character names. */
204 #define READER_CHAR_NAME_MAX_SIZE 50
206 /* The maximum size of reader directive names. */
207 #define READER_DIRECTIVE_NAME_MAX_SIZE 50
210 /* `isblank' is only in C99. */
211 #define CHAR_IS_BLANK_(_chr) \
212 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
213 || ((_chr) == '\f') || ((_chr) == '\r'))
216 # define CHAR_IS_BLANK(_chr) \
217 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
219 # define CHAR_IS_BLANK CHAR_IS_BLANK_
223 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
225 #define CHAR_IS_R5RS_DELIMITER(c) \
227 || (c) == ')' || (c) == '(' || (c) == ';' || (c) == '"')
229 #define CHAR_IS_DELIMITER(c) \
230 (CHAR_IS_R5RS_DELIMITER (c) \
231 || (((c) == ']' || (c) == '[') && (opts->square_brackets_p \
232 || opts->curly_infix_p)) \
233 || (((c) == '}' || (c) == '{') && opts->curly_infix_p))
235 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
237 #define CHAR_IS_EXPONENT_MARKER(_chr) \
238 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
239 || ((_chr) == 'd') || ((_chr) == 'l'))
241 /* Read an SCSH block comment. */
242 static SCM
scm_read_scsh_block_comment (scm_t_wchar
, SCM
);
243 static SCM
scm_read_r6rs_block_comment (scm_t_wchar
, SCM
);
244 static SCM
scm_read_commented_expression (scm_t_wchar
, SCM
, scm_t_read_opts
*);
245 static SCM
scm_read_shebang (scm_t_wchar
, SCM
, scm_t_read_opts
*);
246 static SCM
scm_get_hash_procedure (int);
248 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
249 result in the pre-allocated buffer BUF. Return zero if the whole token has
250 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
251 bytes actually read. */
253 read_token (SCM port
, scm_t_read_opts
*opts
,
254 char *buf
, size_t buf_size
, size_t *read
)
258 while (*read
< buf_size
)
262 chr
= scm_get_byte_or_eof (port
);
266 else if (CHAR_IS_DELIMITER (chr
))
268 scm_unget_byte (chr
, port
);
281 /* Like `read_token', but return either BUFFER, or a GC-allocated buffer
282 if the token doesn't fit in BUFFER_SIZE bytes. */
284 read_complete_token (SCM port
, scm_t_read_opts
*opts
,
285 char *buffer
, size_t buffer_size
, size_t *read
)
288 size_t bytes_read
, overflow_size
= 0;
289 char *overflow_buffer
= NULL
;
293 overflow
= read_token (port
, opts
, buffer
, buffer_size
, &bytes_read
);
296 if (overflow
|| overflow_size
!= 0)
298 if (overflow_size
== 0)
300 overflow_buffer
= scm_gc_malloc_pointerless (bytes_read
, "read");
301 memcpy (overflow_buffer
, buffer
, bytes_read
);
302 overflow_size
= bytes_read
;
307 scm_gc_malloc_pointerless (overflow_size
+ bytes_read
, "read");
309 memcpy (new_buf
, overflow_buffer
, overflow_size
);
310 memcpy (new_buf
+ overflow_size
, buffer
, bytes_read
);
312 overflow_buffer
= new_buf
;
313 overflow_size
+= bytes_read
;
320 *read
= overflow_size
;
324 return (overflow_size
> 0 ? overflow_buffer
: buffer
);
327 /* Skip whitespace from PORT and return the first non-whitespace character
328 read. Raise an error on end-of-file. */
330 flush_ws (SCM port
, scm_t_read_opts
*opts
, const char *eoferr
)
334 switch (c
= scm_getc (port
))
340 scm_i_input_error (eoferr
,
349 switch (c
= scm_getc (port
))
355 case SCM_LINE_INCREMENTORS
:
361 switch (c
= scm_getc (port
))
364 eoferr
= "read_sharp";
367 scm_read_shebang (c
, port
, opts
);
370 scm_read_commented_expression (c
, port
, opts
);
373 if (scm_is_false (scm_get_hash_procedure (c
)))
375 scm_read_r6rs_block_comment (c
, port
);
380 scm_ungetc (c
, port
);
385 case SCM_LINE_INCREMENTORS
:
386 case SCM_SINGLE_SPACES
:
401 static SCM
scm_read_expression (SCM port
, scm_t_read_opts
*opts
);
402 static SCM
scm_read_sharp (int chr
, SCM port
, scm_t_read_opts
*opts
,
403 long line
, int column
);
407 maybe_annotate_source (SCM x
, SCM port
, scm_t_read_opts
*opts
,
408 long line
, int column
)
410 if (opts
->record_positions_p
)
411 scm_i_set_source_properties_x (x
, line
, column
, SCM_FILENAME (port
));
416 scm_read_sexp (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
417 #define FUNC_NAME "scm_i_lreadparen"
420 SCM tmp
, tl
, ans
= SCM_EOL
;
421 const int curly_list_p
= (chr
== '{') && opts
->curly_infix_p
;
422 const int terminating_char
= ((chr
== '{') ? '}'
423 : ((chr
== '[') ? ']'
426 /* Need to capture line and column numbers here. */
427 long line
= SCM_LINUM (port
);
428 int column
= SCM_COL (port
) - 1;
430 c
= flush_ws (port
, opts
, FUNC_NAME
);
431 if (terminating_char
== c
)
434 scm_ungetc (c
, port
);
435 tmp
= scm_read_expression (port
, opts
);
437 /* Note that it is possible for scm_read_expression to return
438 scm_sym_dot, but not as part of a dotted pair: as in #{.}#. So
439 check that it's a real dot by checking `c'. */
440 if (c
== '.' && scm_is_eq (scm_sym_dot
, tmp
))
442 ans
= scm_read_expression (port
, opts
);
443 if (terminating_char
!= (c
= flush_ws (port
, opts
, FUNC_NAME
)))
444 scm_i_input_error (FUNC_NAME
, port
, "missing close paren",
449 /* Build the head of the list structure. */
450 ans
= tl
= scm_cons (tmp
, SCM_EOL
);
452 while (terminating_char
!= (c
= flush_ws (port
, opts
, FUNC_NAME
)))
456 if (c
== ')' || (c
== ']' && opts
->square_brackets_p
)
457 || ((c
== '}' || c
== ']') && opts
->curly_infix_p
))
458 scm_i_input_error (FUNC_NAME
, port
,
459 "in pair: mismatched close paren: ~A",
460 scm_list_1 (SCM_MAKE_CHAR (c
)));
462 scm_ungetc (c
, port
);
463 tmp
= scm_read_expression (port
, opts
);
465 /* See above note about scm_sym_dot. */
466 if (c
== '.' && scm_is_eq (scm_sym_dot
, tmp
))
468 SCM_SETCDR (tl
, scm_read_expression (port
, opts
));
470 c
= flush_ws (port
, opts
, FUNC_NAME
);
471 if (terminating_char
!= c
)
472 scm_i_input_error (FUNC_NAME
, port
,
473 "in pair: missing close paren", SCM_EOL
);
477 new_tail
= scm_cons (tmp
, SCM_EOL
);
478 SCM_SETCDR (tl
, new_tail
);
484 /* In addition to finding the length, 'scm_ilength' checks for
485 improper or circular lists, in which case it returns -1. */
486 int len
= scm_ilength (ans
);
488 /* The (len == 0) case is handled above */
490 /* Return directly to avoid re-annotating the element's source
491 location with the position of the outer brace. Also, it
492 might not be possible to annotate the element. */
493 return scm_car (ans
); /* {e} => e */
495 ; /* Leave the list unchanged: {e1 e2} => (e1 e2) */
496 else if (len
>= 3 && (len
& 1))
498 /* It's a proper list whose length is odd and at least 3. If
499 the elements at odd indices (the infix operator positions)
500 are all 'equal?', then it's a simple curly-infix list.
501 Otherwise it's a mixed curly-infix list. */
502 SCM op
= scm_cadr (ans
);
504 /* Check to see if the elements at odd indices are 'equal?' */
505 for (tl
= scm_cdddr (ans
); ; tl
= scm_cddr (tl
))
507 if (scm_is_null (tl
))
509 /* Convert simple curly-infix list to prefix:
510 {a <op> b <op> ...} => (<op> a b ...) */
512 while (scm_is_pair (scm_cdr (tl
)))
515 SCM_SETCDR (tl
, tmp
);
518 ans
= scm_cons (op
, ans
);
521 else if (scm_is_false (scm_equal_p (op
, scm_car (tl
))))
523 /* Mixed curly-infix list: {e ...} => ($nfx$ e ...) */
524 ans
= scm_cons (sym_nfx
, ans
);
530 /* Mixed curly-infix (possibly improper) list:
531 {e . tail} => ($nfx$ e . tail) */
532 ans
= scm_cons (sym_nfx
, ans
);
535 return maybe_annotate_source (ans
, port
, opts
, line
, column
);
540 /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
541 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
543 #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
549 while (i < ndigits) \
551 a = scm_getc (port); \
555 && (a == (scm_t_wchar) terminator) \
558 if ('0' <= a && a <= '9') \
560 else if ('A' <= a && a <= 'F') \
562 else if ('a' <= a && a <= 'f') \
575 skip_intraline_whitespace (SCM port
)
585 while (c
== '\t' || uc_is_general_category (c
, UC_SPACE_SEPARATOR
));
587 scm_ungetc (c
, port
);
591 scm_read_string (int chr
, SCM port
, scm_t_read_opts
*opts
)
592 #define FUNC_NAME "scm_lreadr"
594 /* For strings smaller than C_STR, this function creates only one Scheme
595 object (the string returned). */
598 size_t c_str_len
= 0;
599 scm_t_wchar c
, c_str
[READER_STRING_BUFFER_SIZE
];
601 /* Need to capture line and column numbers here. */
602 long line
= SCM_LINUM (port
);
603 int column
= SCM_COL (port
) - 1;
605 while ('"' != (c
= scm_getc (port
)))
610 scm_i_input_error (FUNC_NAME
, port
,
611 "end of file in string constant", SCM_EOL
);
614 if (c_str_len
+ 1 >= READER_STRING_BUFFER_SIZE
)
616 str
= scm_cons (scm_from_utf32_stringn (c_str
, c_str_len
), str
);
622 switch (c
= scm_getc (port
))
631 if (opts
->hungry_eol_escapes_p
)
632 skip_intraline_whitespace (port
);
659 if (opts
->r6rs_escapes_p
)
660 SCM_READ_HEX_ESCAPE (10, ';');
662 SCM_READ_HEX_ESCAPE (2, '\0');
665 if (!opts
->r6rs_escapes_p
)
667 SCM_READ_HEX_ESCAPE (4, '\0');
671 if (!opts
->r6rs_escapes_p
)
673 SCM_READ_HEX_ESCAPE (6, '\0');
678 scm_i_input_error (FUNC_NAME
, port
,
679 "illegal character in escape sequence: ~S",
680 scm_list_1 (SCM_MAKE_CHAR (c
)));
684 c_str
[c_str_len
++] = c
;
687 if (scm_is_null (str
))
688 /* Fast path: we got a string that fits in C_STR. */
689 str
= scm_from_utf32_stringn (c_str
, c_str_len
);
693 str
= scm_cons (scm_from_utf32_stringn (c_str
, c_str_len
), str
);
695 str
= scm_string_concatenate_reverse (str
, SCM_UNDEFINED
, SCM_UNDEFINED
);
698 return maybe_annotate_source (str
, port
, opts
, line
, column
);
704 scm_read_number (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
706 SCM result
, str
= SCM_EOL
;
707 char local_buffer
[READER_BUFFER_SIZE
], *buffer
;
709 scm_t_port
*pt
= SCM_PTAB_ENTRY (port
);
711 /* Need to capture line and column numbers here. */
712 long line
= SCM_LINUM (port
);
713 int column
= SCM_COL (port
) - 1;
715 scm_ungetc (chr
, port
);
716 buffer
= read_complete_token (port
, opts
, local_buffer
, sizeof local_buffer
,
719 str
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
, pt
->ilseq_handler
);
721 result
= scm_string_to_number (str
, SCM_UNDEFINED
);
722 if (scm_is_false (result
))
724 /* Return a symbol instead of a number */
725 if (opts
->case_insensitive_p
)
726 str
= scm_string_downcase_x (str
);
727 result
= scm_string_to_symbol (str
);
729 else if (SCM_NIMP (result
))
730 result
= maybe_annotate_source (result
, port
, opts
, line
, column
);
732 SCM_COL (port
) += scm_i_string_length (str
);
737 scm_read_mixed_case_symbol (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
740 int ends_with_colon
= 0;
742 int postfix
= (opts
->keyword_style
== KEYWORD_STYLE_POSTFIX
);
743 char local_buffer
[READER_BUFFER_SIZE
], *buffer
;
744 scm_t_port
*pt
= SCM_PTAB_ENTRY (port
);
747 scm_ungetc (chr
, port
);
748 buffer
= read_complete_token (port
, opts
, local_buffer
, sizeof local_buffer
,
751 ends_with_colon
= buffer
[bytes_read
- 1] == ':';
753 if (postfix
&& ends_with_colon
&& (bytes_read
> 1))
755 str
= scm_from_stringn (buffer
, bytes_read
- 1,
756 pt
->encoding
, pt
->ilseq_handler
);
758 if (opts
->case_insensitive_p
)
759 str
= scm_string_downcase_x (str
);
760 result
= scm_symbol_to_keyword (scm_string_to_symbol (str
));
764 str
= scm_from_stringn (buffer
, bytes_read
,
765 pt
->encoding
, pt
->ilseq_handler
);
767 if (opts
->case_insensitive_p
)
768 str
= scm_string_downcase_x (str
);
769 result
= scm_string_to_symbol (str
);
772 SCM_COL (port
) += scm_i_string_length (str
);
777 scm_read_number_and_radix (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
778 #define FUNC_NAME "scm_lreadr"
782 char local_buffer
[READER_BUFFER_SIZE
], *buffer
;
810 scm_ungetc (chr
, port
);
811 scm_ungetc ('#', port
);
815 buffer
= read_complete_token (port
, opts
, local_buffer
, sizeof local_buffer
,
818 pt
= SCM_PTAB_ENTRY (port
);
819 str
= scm_from_stringn (buffer
, read
, pt
->encoding
, pt
->ilseq_handler
);
821 result
= scm_string_to_number (str
, scm_from_uint (radix
));
823 SCM_COL (port
) += scm_i_string_length (str
);
825 if (scm_is_true (result
))
828 scm_i_input_error (FUNC_NAME
, port
, "unknown # object", SCM_EOL
);
835 scm_read_quote (int chr
, SCM port
, scm_t_read_opts
*opts
)
838 long line
= SCM_LINUM (port
);
839 int column
= SCM_COL (port
) - 1;
844 p
= scm_sym_quasiquote
;
857 p
= scm_sym_uq_splicing
;
860 scm_ungetc (c
, port
);
867 fprintf (stderr
, "%s: unhandled quote character (%i)\n",
868 "scm_read_quote", chr
);
872 p
= scm_cons2 (p
, scm_read_expression (port
, opts
), SCM_EOL
);
873 return maybe_annotate_source (p
, port
, opts
, line
, column
);
876 SCM_SYMBOL (sym_syntax
, "syntax");
877 SCM_SYMBOL (sym_quasisyntax
, "quasisyntax");
878 SCM_SYMBOL (sym_unsyntax
, "unsyntax");
879 SCM_SYMBOL (sym_unsyntax_splicing
, "unsyntax-splicing");
882 scm_read_syntax (int chr
, SCM port
, scm_t_read_opts
*opts
)
885 long line
= SCM_LINUM (port
);
886 int column
= SCM_COL (port
) - 1;
904 p
= sym_unsyntax_splicing
;
907 scm_ungetc (c
, port
);
914 fprintf (stderr
, "%s: unhandled syntax character (%i)\n",
915 "scm_read_syntax", chr
);
919 p
= scm_cons2 (p
, scm_read_expression (port
, opts
), SCM_EOL
);
920 return maybe_annotate_source (p
, port
, opts
, line
, column
);
924 scm_read_nil (int chr
, SCM port
, scm_t_read_opts
*opts
)
926 SCM id
= scm_read_mixed_case_symbol (chr
, port
, opts
);
928 if (!scm_is_eq (id
, sym_nil
))
929 scm_i_input_error ("scm_read_nil", port
,
930 "unexpected input while reading #nil: ~a",
933 return SCM_ELISP_NIL
;
937 scm_read_semicolon_comment (int chr
, SCM port
)
941 /* We use the get_byte here because there is no need to get the
942 locale correct with comment input. This presumes that newline
943 always represents itself no matter what the encoding is. */
944 for (c
= scm_get_byte_or_eof (port
);
945 (c
!= EOF
) && (c
!= '\n');
946 c
= scm_get_byte_or_eof (port
));
948 return SCM_UNSPECIFIED
;
951 /* If the EXPECTED_CHARS are the next ones available from PORT, then
952 consume them and return 1. Otherwise leave the port position where
953 it was and return 0. EXPECTED_CHARS should be all lowercase, and
954 will be matched case-insensitively against the characters read from
957 try_read_ci_chars (SCM port
, const char *expected_chars
)
959 int num_chars_wanted
= strlen (expected_chars
);
960 int num_chars_read
= 0;
961 char *chars_read
= alloca (num_chars_wanted
);
964 while (num_chars_read
< num_chars_wanted
)
969 else if (tolower (c
) != expected_chars
[num_chars_read
])
971 scm_ungetc (c
, port
);
975 chars_read
[num_chars_read
++] = c
;
978 if (num_chars_read
== num_chars_wanted
)
982 while (num_chars_read
> 0)
983 scm_ungetc (chars_read
[--num_chars_read
], port
);
989 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
992 scm_read_boolean (int chr
, SCM port
)
998 try_read_ci_chars (port
, "rue");
1003 try_read_ci_chars (port
, "alse");
1007 return SCM_UNSPECIFIED
;
1011 scm_read_character (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
1012 #define FUNC_NAME "scm_lreadr"
1014 char buffer
[READER_CHAR_NAME_MAX_SIZE
];
1016 size_t charname_len
, bytes_read
;
1021 overflow
= read_token (port
, opts
, buffer
, READER_CHAR_NAME_MAX_SIZE
,
1024 scm_i_input_error (FUNC_NAME
, port
, "character name too long", SCM_EOL
);
1026 if (bytes_read
== 0)
1028 chr
= scm_getc (port
);
1030 scm_i_input_error (FUNC_NAME
, port
, "unexpected end of file "
1031 "while reading character", SCM_EOL
);
1033 /* CHR must be a token delimiter, like a whitespace. */
1034 return (SCM_MAKE_CHAR (chr
));
1037 pt
= SCM_PTAB_ENTRY (port
);
1039 /* Simple ASCII characters can be processed immediately. Also, simple
1040 ISO-8859-1 characters can be processed immediately if the encoding for this
1041 port is ISO-8859-1. */
1042 if (bytes_read
== 1 && ((unsigned char) buffer
[0] <= 127 || pt
->encoding
== NULL
))
1044 SCM_COL (port
) += 1;
1045 return SCM_MAKE_CHAR (buffer
[0]);
1048 /* Otherwise, convert the buffer into a proper scheme string for
1050 charname
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
,
1052 charname_len
= scm_i_string_length (charname
);
1053 SCM_COL (port
) += charname_len
;
1054 cp
= scm_i_string_ref (charname
, 0);
1055 if (charname_len
== 1)
1056 return SCM_MAKE_CHAR (cp
);
1058 /* Ignore dotted circles, which may be used to keep combining characters from
1059 combining with the backslash in #\charname. */
1060 if (cp
== SCM_CODEPOINT_DOTTED_CIRCLE
&& charname_len
== 2)
1061 return SCM_MAKE_CHAR (scm_i_string_ref (charname
, 1));
1063 if (cp
>= '0' && cp
< '8')
1065 /* Dirk:FIXME:: This type of character syntax is not R5RS
1066 * compliant. Further, it should be verified that the constant
1067 * does only consist of octal digits. */
1068 SCM p
= scm_string_to_number (charname
, scm_from_uint (8));
1069 if (SCM_I_INUMP (p
))
1071 scm_t_wchar c
= scm_to_uint32 (p
);
1072 if (SCM_IS_UNICODE_CHAR (c
))
1073 return SCM_MAKE_CHAR (c
);
1075 scm_i_input_error (FUNC_NAME
, port
,
1076 "out-of-range octal character escape: ~a",
1077 scm_list_1 (charname
));
1081 if (cp
== 'x' && (charname_len
> 1))
1085 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1086 p
= scm_string_to_number (scm_c_substring (charname
, 1, charname_len
),
1087 scm_from_uint (16));
1088 if (SCM_I_INUMP (p
))
1090 scm_t_wchar c
= scm_to_uint32 (p
);
1091 if (SCM_IS_UNICODE_CHAR (c
))
1092 return SCM_MAKE_CHAR (c
);
1094 scm_i_input_error (FUNC_NAME
, port
,
1095 "out-of-range hex character escape: ~a",
1096 scm_list_1 (charname
));
1100 /* The names of characters should never have non-Latin1
1102 if (scm_i_is_narrow_string (charname
)
1103 || scm_i_try_narrow_string (charname
))
1104 { SCM ch
= scm_i_charname_to_char (scm_i_string_chars (charname
),
1106 if (scm_is_true (ch
))
1110 scm_i_input_error (FUNC_NAME
, port
, "unknown character name ~a",
1111 scm_list_1 (charname
));
1113 return SCM_UNSPECIFIED
;
1118 scm_read_keyword (int chr
, SCM port
, scm_t_read_opts
*opts
)
1122 /* Read the symbol that comprises the keyword. Doing this instead of
1123 invoking a specific symbol reader function allows `scm_read_keyword ()'
1124 to adapt to the delimiters currently valid of symbols.
1126 XXX: This implementation allows sloppy syntaxes like `#: key'. */
1127 symbol
= scm_read_expression (port
, opts
);
1128 if (!scm_is_symbol (symbol
))
1129 scm_i_input_error ("scm_read_keyword", port
,
1130 "keyword prefix `~a' not followed by a symbol: ~s",
1131 scm_list_2 (SCM_MAKE_CHAR (chr
), symbol
));
1133 return (scm_symbol_to_keyword (symbol
));
1137 scm_read_vector (int chr
, SCM port
, scm_t_read_opts
*opts
,
1138 long line
, int column
)
1140 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1141 guarantee that it's going to do what we want. After all, this is an
1142 implementation detail of `scm_read_vector ()', not a desirable
1144 return maybe_annotate_source (scm_vector (scm_read_sexp (chr
, port
, opts
)),
1145 port
, opts
, line
, column
);
1148 /* Helper used by scm_read_array */
1150 read_decimal_integer (SCM port
, int c
, ssize_t
*resp
)
1159 c
= scm_getc (port
);
1162 while ('0' <= c
&& c
<= '9')
1164 if (((SSIZE_MAX
- (c
-'0')) / 10) <= res
)
1165 scm_i_input_error ("read_decimal_integer", port
,
1166 "number too large", SCM_EOL
);
1167 res
= 10*res
+ c
-'0';
1169 c
= scm_getc (port
);
1177 /* Read an array. This function can also read vectors and uniform
1178 vectors. Also, the conflict between '#f' and '#f32' and '#f64' is
1181 C is the first character read after the '#'. */
1183 scm_read_array (int c
, SCM port
, scm_t_read_opts
*opts
, long line
, int column
)
1186 scm_t_wchar tag_buf
[8];
1189 SCM tag
, shape
= SCM_BOOL_F
, elements
, array
;
1191 /* XXX - shortcut for ordinary vectors. Shouldn't be necessary but
1192 the array code can not deal with zero-length dimensions yet, and
1193 we want to allow zero-length vectors, of course. */
1195 return scm_read_vector (c
, port
, opts
, line
, column
);
1197 /* Disambiguate between '#f' and uniform floating point vectors. */
1200 c
= scm_getc (port
);
1201 if (c
!= '3' && c
!= '6')
1203 if (c
== 'a' && try_read_ci_chars (port
, "lse"))
1206 scm_ungetc (c
, port
);
1212 goto continue_reading_tag
;
1217 c
= read_decimal_integer (port
, c
, &rank
);
1219 scm_i_input_error (NULL
, port
, "array rank must be non-negative",
1224 continue_reading_tag
:
1225 while (c
!= EOF
&& c
!= '(' && c
!= '@' && c
!= ':'
1226 && tag_len
< sizeof tag_buf
/ sizeof tag_buf
[0])
1228 tag_buf
[tag_len
++] = c
;
1229 c
= scm_getc (port
);
1235 tag
= scm_string_to_symbol (scm_from_utf32_stringn (tag_buf
, tag_len
));
1236 if (tag_len
== sizeof tag_buf
/ sizeof tag_buf
[0])
1237 scm_i_input_error (NULL
, port
, "invalid array tag, starting with: ~a",
1242 if (c
== '@' || c
== ':')
1248 ssize_t lbnd
= 0, len
= 0;
1253 c
= scm_getc (port
);
1254 c
= read_decimal_integer (port
, c
, &lbnd
);
1257 s
= scm_from_ssize_t (lbnd
);
1261 c
= scm_getc (port
);
1262 c
= read_decimal_integer (port
, c
, &len
);
1264 scm_i_input_error (NULL
, port
,
1265 "array length must be non-negative",
1268 s
= scm_list_2 (s
, scm_from_ssize_t (lbnd
+len
-1));
1271 shape
= scm_cons (s
, shape
);
1272 } while (c
== '@' || c
== ':');
1274 shape
= scm_reverse_x (shape
, SCM_EOL
);
1277 /* Read nested lists of elements. */
1279 scm_i_input_error (NULL
, port
,
1280 "missing '(' in vector or array literal",
1282 elements
= scm_read_sexp (c
, port
, opts
);
1284 if (scm_is_false (shape
))
1285 shape
= scm_from_ssize_t (rank
);
1286 else if (scm_ilength (shape
) != rank
)
1289 "the number of shape specifications must match the array rank",
1292 /* Handle special print syntax of rank zero arrays; see
1293 scm_i_print_array for a rationale. */
1296 if (!scm_is_pair (elements
))
1297 scm_i_input_error (NULL
, port
,
1298 "too few elements in array literal, need 1",
1300 if (!scm_is_null (SCM_CDR (elements
)))
1301 scm_i_input_error (NULL
, port
,
1302 "too many elements in array literal, want 1",
1304 elements
= SCM_CAR (elements
);
1307 /* Construct array, annotate with source location, and return. */
1308 array
= scm_list_to_typed_array (tag
, shape
, elements
);
1309 return maybe_annotate_source (array
, port
, opts
, line
, column
);
1313 scm_read_srfi4_vector (int chr
, SCM port
, scm_t_read_opts
*opts
,
1314 long line
, int column
)
1316 return scm_read_array (chr
, port
, opts
, line
, column
);
1320 scm_read_bytevector (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
,
1321 long line
, int column
)
1323 chr
= scm_getc (port
);
1327 chr
= scm_getc (port
);
1331 chr
= scm_getc (port
);
1335 return maybe_annotate_source
1336 (scm_u8_list_to_bytevector (scm_read_sexp (chr
, port
, opts
)),
1337 port
, opts
, line
, column
);
1340 scm_i_input_error ("read_bytevector", port
,
1341 "invalid bytevector prefix",
1342 SCM_MAKE_CHAR (chr
));
1343 return SCM_UNSPECIFIED
;
1347 scm_read_guile_bit_vector (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
,
1348 long line
, int column
)
1350 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1351 terribly inefficient but who cares? */
1352 SCM s_bits
= SCM_EOL
;
1354 for (chr
= scm_getc (port
);
1355 (chr
!= EOF
) && ((chr
== '0') || (chr
== '1'));
1356 chr
= scm_getc (port
))
1358 s_bits
= scm_cons ((chr
== '0') ? SCM_BOOL_F
: SCM_BOOL_T
, s_bits
);
1362 scm_ungetc (chr
, port
);
1364 return maybe_annotate_source
1365 (scm_bitvector (scm_reverse_x (s_bits
, SCM_EOL
)),
1366 port
, opts
, line
, column
);
1370 scm_read_scsh_block_comment (scm_t_wchar chr
, SCM port
)
1376 int c
= scm_getc (port
);
1379 scm_i_input_error ("skip_block_comment", port
,
1380 "unterminated `#! ... !#' comment", SCM_EOL
);
1384 else if (c
== '#' && bang_seen
)
1390 return SCM_UNSPECIFIED
;
1393 static void set_port_case_insensitive_p (SCM port
, scm_t_read_opts
*opts
,
1395 static void set_port_square_brackets_p (SCM port
, scm_t_read_opts
*opts
,
1397 static void set_port_curly_infix_p (SCM port
, scm_t_read_opts
*opts
,
1401 scm_read_shebang (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
1403 char name
[READER_DIRECTIVE_NAME_MAX_SIZE
+ 1];
1407 while (i
<= READER_DIRECTIVE_NAME_MAX_SIZE
)
1409 c
= scm_getc (port
);
1411 scm_i_input_error ("skip_block_comment", port
,
1412 "unterminated `#! ... !#' comment", SCM_EOL
);
1413 else if (('a' <= c
&& c
<= 'z') || ('0' <= c
&& c
<= '9') || c
== '-')
1415 else if (CHAR_IS_DELIMITER (c
))
1417 scm_ungetc (c
, port
);
1419 if (0 == strcmp ("r6rs", name
))
1420 ; /* Silently ignore */
1421 else if (0 == strcmp ("fold-case", name
))
1422 set_port_case_insensitive_p (port
, opts
, 1);
1423 else if (0 == strcmp ("no-fold-case", name
))
1424 set_port_case_insensitive_p (port
, opts
, 0);
1425 else if (0 == strcmp ("curly-infix", name
))
1426 set_port_curly_infix_p (port
, opts
, 1);
1427 else if (0 == strcmp ("curly-infix-and-bracket-lists", name
))
1429 set_port_curly_infix_p (port
, opts
, 1);
1430 set_port_square_brackets_p (port
, opts
, 0);
1435 return SCM_UNSPECIFIED
;
1439 scm_ungetc (c
, port
);
1444 scm_ungetc (name
[--i
], port
);
1445 return scm_read_scsh_block_comment (chr
, port
);
1449 scm_read_r6rs_block_comment (scm_t_wchar chr
, SCM port
)
1451 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1452 nested. So care must be taken. */
1453 int nesting_level
= 1;
1455 int a
= scm_getc (port
);
1458 scm_i_input_error ("scm_read_r6rs_block_comment", port
,
1459 "unterminated `#| ... |#' comment", SCM_EOL
);
1461 while (nesting_level
> 0)
1463 int b
= scm_getc (port
);
1466 scm_i_input_error ("scm_read_r6rs_block_comment", port
,
1467 "unterminated `#| ... |#' comment", SCM_EOL
);
1469 if (a
== '|' && b
== '#')
1474 else if (a
== '#' && b
== '|')
1483 return SCM_UNSPECIFIED
;
1487 scm_read_commented_expression (scm_t_wchar chr
, SCM port
,
1488 scm_t_read_opts
*opts
)
1492 c
= flush_ws (port
, opts
, (char *) NULL
);
1494 scm_i_input_error ("read_commented_expression", port
,
1495 "no expression after #; comment", SCM_EOL
);
1496 scm_ungetc (c
, port
);
1497 scm_read_expression (port
, opts
);
1498 return SCM_UNSPECIFIED
;
1502 scm_read_extended_symbol (scm_t_wchar chr
, SCM port
)
1504 /* Guile's extended symbol read syntax looks like this:
1506 #{This is all a symbol name}#
1508 So here, CHR is expected to be `{'. */
1511 SCM buf
= scm_i_make_string (1024, NULL
, 0);
1513 buf
= scm_i_string_start_writing (buf
);
1515 while ((chr
= scm_getc (port
)) != EOF
)
1526 scm_i_string_set_x (buf
, len
++, '}');
1532 else if (chr
== '\\')
1534 /* It used to be that print.c would print extended-read-syntax
1535 symbols with backslashes before "non-standard" chars, but
1536 this routine wouldn't do anything with those escapes.
1537 Bummer. What we've done is to change print.c to output
1538 R6RS hex escapes for those characters, relying on the fact
1539 that the extended read syntax would never put a `\' before
1540 an `x'. For now, we just ignore other instances of
1541 backslash in the string. */
1542 switch ((chr
= scm_getc (port
)))
1550 SCM_READ_HEX_ESCAPE (10, ';');
1551 scm_i_string_set_x (buf
, len
++, c
);
1559 scm_i_string_stop_writing ();
1560 scm_i_input_error ("scm_read_extended_symbol", port
,
1561 "illegal character in escape sequence: ~S",
1562 scm_list_1 (SCM_MAKE_CHAR (c
)));
1566 scm_i_string_set_x (buf
, len
++, chr
);
1571 scm_i_string_set_x (buf
, len
++, chr
);
1573 if (len
>= scm_i_string_length (buf
) - 2)
1577 scm_i_string_stop_writing ();
1578 addy
= scm_i_make_string (1024, NULL
, 0);
1579 buf
= scm_string_append (scm_list_2 (buf
, addy
));
1581 buf
= scm_i_string_start_writing (buf
);
1586 scm_i_string_stop_writing ();
1588 scm_i_input_error ("scm_read_extended_symbol", port
,
1589 "end of file while reading symbol", SCM_EOL
);
1591 return (scm_string_to_symbol (scm_c_substring (buf
, 0, len
)));
1596 /* Top-level token readers, i.e., dispatchers. */
1599 scm_read_sharp_extension (int chr
, SCM port
, scm_t_read_opts
*opts
)
1603 proc
= scm_get_hash_procedure (chr
);
1604 if (scm_is_true (scm_procedure_p (proc
)))
1606 long line
= SCM_LINUM (port
);
1607 int column
= SCM_COL (port
) - 2;
1610 got
= scm_call_2 (proc
, SCM_MAKE_CHAR (chr
), port
);
1612 if (opts
->record_positions_p
&& SCM_NIMP (got
)
1613 && !scm_i_has_source_properties (got
))
1614 scm_i_set_source_properties_x (got
, line
, column
, SCM_FILENAME (port
));
1619 return SCM_UNSPECIFIED
;
1622 /* The reader for the sharp `#' character. It basically dispatches reads
1623 among the above token readers. */
1625 scm_read_sharp (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
,
1626 long line
, int column
)
1627 #define FUNC_NAME "scm_lreadr"
1631 chr
= scm_getc (port
);
1633 result
= scm_read_sharp_extension (chr
, port
, opts
);
1634 if (!scm_is_eq (result
, SCM_UNSPECIFIED
))
1640 return (scm_read_character (chr
, port
, opts
));
1642 return (scm_read_vector (chr
, port
, opts
, line
, column
));
1647 /* This one may return either a boolean or an SRFI-4 vector. */
1648 return (scm_read_srfi4_vector (chr
, port
, opts
, line
, column
));
1650 return (scm_read_bytevector (chr
, port
, opts
, line
, column
));
1652 return (scm_read_guile_bit_vector (chr
, port
, opts
, line
, column
));
1656 return (scm_read_boolean (chr
, port
));
1658 return (scm_read_keyword (chr
, port
, opts
));
1659 case '0': case '1': case '2': case '3': case '4':
1660 case '5': case '6': case '7': case '8': case '9':
1662 #if SCM_ENABLE_DEPRECATED
1663 /* See below for 'i' and 'e'. */
1669 return (scm_read_array (chr
, port
, opts
, line
, column
));
1673 #if SCM_ENABLE_DEPRECATED
1675 /* When next char is '(', it really is an old-style
1677 scm_t_wchar next_c
= scm_getc (port
);
1679 scm_ungetc (next_c
, port
);
1681 return scm_read_array (chr
, port
, opts
, line
, column
);
1695 return (scm_read_number_and_radix (chr
, port
, opts
));
1697 return (scm_read_extended_symbol (chr
, port
));
1699 return (scm_read_shebang (chr
, port
, opts
));
1701 return (scm_read_commented_expression (chr
, port
, opts
));
1705 return (scm_read_syntax (chr
, port
, opts
));
1707 return (scm_read_nil (chr
, port
, opts
));
1709 result
= scm_read_sharp_extension (chr
, port
, opts
);
1710 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1712 /* To remain compatible with 1.8 and earlier, the following
1713 characters have lower precedence than `read-hash-extend'
1718 return scm_read_r6rs_block_comment (chr
, port
);
1720 scm_i_input_error (FUNC_NAME
, port
, "Unknown # object: ~S",
1721 scm_list_1 (SCM_MAKE_CHAR (chr
)));
1728 return SCM_UNSPECIFIED
;
1733 read_inner_expression (SCM port
, scm_t_read_opts
*opts
)
1734 #define FUNC_NAME "read_inner_expression"
1740 chr
= scm_getc (port
);
1744 case SCM_WHITE_SPACES
:
1745 case SCM_LINE_INCREMENTORS
:
1748 (void) scm_read_semicolon_comment (chr
, port
);
1751 if (opts
->curly_infix_p
)
1753 if (opts
->neoteric_p
)
1754 return scm_read_sexp (chr
, port
, opts
);
1759 /* Enable neoteric expressions within curly braces */
1760 opts
->neoteric_p
= 1;
1761 expr
= scm_read_sexp (chr
, port
, opts
);
1762 opts
->neoteric_p
= 0;
1767 return scm_read_mixed_case_symbol (chr
, port
, opts
);
1769 if (opts
->square_brackets_p
)
1770 return scm_read_sexp (chr
, port
, opts
);
1771 else if (opts
->curly_infix_p
)
1773 /* The syntax of neoteric expressions requires that '[' be
1774 a delimiter when curly-infix is enabled, so it cannot
1775 be part of an unescaped symbol. We might as well do
1776 something useful with it, so we adopt Kawa's convention:
1777 [...] => ($bracket-list$ ...) */
1778 long line
= SCM_LINUM (port
);
1779 int column
= SCM_COL (port
) - 1;
1780 return maybe_annotate_source
1781 (scm_cons (sym_bracket_list
, scm_read_sexp (chr
, port
, opts
)),
1782 port
, opts
, line
, column
);
1785 return scm_read_mixed_case_symbol (chr
, port
, opts
);
1787 return (scm_read_sexp (chr
, port
, opts
));
1789 return (scm_read_string (chr
, port
, opts
));
1793 return (scm_read_quote (chr
, port
, opts
));
1796 long line
= SCM_LINUM (port
);
1797 int column
= SCM_COL (port
) - 1;
1798 SCM result
= scm_read_sharp (chr
, port
, opts
, line
, column
);
1799 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1800 /* We read a comment or some such. */
1806 scm_i_input_error (FUNC_NAME
, port
, "unexpected \")\"", SCM_EOL
);
1809 if (opts
->curly_infix_p
)
1810 scm_i_input_error (FUNC_NAME
, port
, "unexpected \"}\"", SCM_EOL
);
1812 return scm_read_mixed_case_symbol (chr
, port
, opts
);
1814 if (opts
->square_brackets_p
)
1815 scm_i_input_error (FUNC_NAME
, port
, "unexpected \"]\"", SCM_EOL
);
1816 /* otherwise fall through */
1820 if (opts
->keyword_style
== KEYWORD_STYLE_PREFIX
)
1821 return scm_symbol_to_keyword (scm_read_expression (port
, opts
));
1826 if (((chr
>= '0') && (chr
<= '9'))
1827 || (strchr ("+-.", chr
)))
1828 return (scm_read_number (chr
, port
, opts
));
1830 return (scm_read_mixed_case_symbol (chr
, port
, opts
));
1838 scm_read_expression (SCM port
, scm_t_read_opts
*opts
)
1839 #define FUNC_NAME "scm_read_expression"
1841 if (!opts
->neoteric_p
)
1842 return read_inner_expression (port
, opts
);
1849 if (opts
->record_positions_p
)
1851 /* We need to get the position of the first non-whitespace
1852 character in order to correctly annotate neoteric
1853 expressions. For example, for the expression 'f(x)', the
1854 first call to 'read_inner_expression' reads the 'f' (which
1855 cannot be annotated), and then we later read the '(x)' and
1856 use it to construct the new list (f x). */
1857 int c
= flush_ws (port
, opts
, (char *) NULL
);
1860 scm_ungetc (c
, port
);
1861 line
= SCM_LINUM (port
);
1862 column
= SCM_COL (port
);
1865 expr
= read_inner_expression (port
, opts
);
1867 /* 'expr' is the first component of the neoteric expression. Now
1868 we loop, and as long as the next character is '(', '[', or '{',
1869 (without any intervening whitespace), we use it to construct a
1870 new expression. For example, f{n - 1}(x) => ((f (- n 1)) x). */
1873 int chr
= scm_getc (port
);
1876 /* e(...) => (e ...) */
1877 expr
= scm_cons (expr
, scm_read_sexp (chr
, port
, opts
));
1878 else if (chr
== '[')
1879 /* e[...] => ($bracket-apply$ e ...) */
1880 expr
= scm_cons (sym_bracket_apply
,
1882 scm_read_sexp (chr
, port
, opts
)));
1883 else if (chr
== '{')
1885 SCM arg
= scm_read_sexp (chr
, port
, opts
);
1887 if (scm_is_null (arg
))
1888 expr
= scm_list_1 (expr
); /* e{} => (e) */
1890 expr
= scm_list_2 (expr
, arg
); /* e{...} => (e {...}) */
1895 scm_ungetc (chr
, port
);
1898 maybe_annotate_source (expr
, port
, opts
, line
, column
);
1906 /* Actual reader. */
1908 static void init_read_options (SCM port
, scm_t_read_opts
*opts
);
1910 SCM_DEFINE (scm_read
, "read", 0, 1, 0,
1912 "Read an s-expression from the input port @var{port}, or from\n"
1913 "the current input port if @var{port} is not specified.\n"
1914 "Any whitespace before the next token is discarded.")
1915 #define FUNC_NAME s_scm_read
1917 scm_t_read_opts opts
;
1920 if (SCM_UNBNDP (port
))
1921 port
= scm_current_input_port ();
1922 SCM_VALIDATE_OPINPORT (1, port
);
1924 init_read_options (port
, &opts
);
1926 c
= flush_ws (port
, &opts
, (char *) NULL
);
1929 scm_ungetc (c
, port
);
1931 return (scm_read_expression (port
, &opts
));
1938 /* Manipulate the read-hash-procedures alist. This could be written in
1939 Scheme, but maybe it will also be used by C code during initialisation. */
1940 SCM_DEFINE (scm_read_hash_extend
, "read-hash-extend", 2, 0, 0,
1941 (SCM chr
, SCM proc
),
1942 "Install the procedure @var{proc} for reading expressions\n"
1943 "starting with the character sequence @code{#} and @var{chr}.\n"
1944 "@var{proc} will be called with two arguments: the character\n"
1945 "@var{chr} and the port to read further data from. The object\n"
1946 "returned will be the return value of @code{read}. \n"
1947 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1949 #define FUNC_NAME s_scm_read_hash_extend
1954 SCM_VALIDATE_CHAR (1, chr
);
1955 SCM_ASSERT (scm_is_false (proc
)
1956 || scm_is_eq (scm_procedure_p (proc
), SCM_BOOL_T
),
1957 proc
, SCM_ARG2
, FUNC_NAME
);
1959 /* Check if chr is already in the alist. */
1960 this = scm_i_read_hash_procedures_ref ();
1964 if (scm_is_null (this))
1966 /* not found, so add it to the beginning. */
1967 if (scm_is_true (proc
))
1969 SCM
new = scm_cons (scm_cons (chr
, proc
),
1970 scm_i_read_hash_procedures_ref ());
1971 scm_i_read_hash_procedures_set_x (new);
1975 if (scm_is_eq (chr
, SCM_CAAR (this)))
1977 /* already in the alist. */
1978 if (scm_is_false (proc
))
1981 if (scm_is_false (prev
))
1983 SCM rest
= SCM_CDR (scm_i_read_hash_procedures_ref ());
1984 scm_i_read_hash_procedures_set_x (rest
);
1987 scm_set_cdr_x (prev
, SCM_CDR (this));
1992 scm_set_cdr_x (SCM_CAR (this), proc
);
1997 this = SCM_CDR (this);
2000 return SCM_UNSPECIFIED
;
2004 /* Recover the read-hash procedure corresponding to char c. */
2006 scm_get_hash_procedure (int c
)
2008 SCM rest
= scm_i_read_hash_procedures_ref ();
2012 if (scm_is_null (rest
))
2015 if (SCM_CHAR (SCM_CAAR (rest
)) == c
)
2016 return SCM_CDAR (rest
);
2018 rest
= SCM_CDR (rest
);
2022 #define SCM_ENCODING_SEARCH_SIZE (500)
2024 /* Search the first few hundred characters of a file for an Emacs-like coding
2025 declaration. Returns either NULL or a string whose storage has been
2026 allocated with `scm_gc_malloc ()'. */
2028 scm_i_scan_for_encoding (SCM port
)
2031 char header
[SCM_ENCODING_SEARCH_SIZE
+1];
2032 size_t bytes_read
, encoding_length
, i
;
2033 char *encoding
= NULL
;
2034 char *pos
, *encoding_start
;
2037 pt
= SCM_PTAB_ENTRY (port
);
2039 if (pt
->rw_active
== SCM_PORT_WRITE
)
2043 pt
->rw_active
= SCM_PORT_READ
;
2045 if (pt
->read_pos
== pt
->read_end
)
2047 /* We can use the read buffer, and thus avoid a seek. */
2048 if (scm_fill_input (port
) == EOF
)
2051 bytes_read
= pt
->read_end
- pt
->read_pos
;
2052 if (bytes_read
> SCM_ENCODING_SEARCH_SIZE
)
2053 bytes_read
= SCM_ENCODING_SEARCH_SIZE
;
2055 if (bytes_read
<= 1)
2056 /* An unbuffered port -- don't scan. */
2059 memcpy (header
, pt
->read_pos
, bytes_read
);
2060 header
[bytes_read
] = '\0';
2064 /* Try to read some bytes and then seek back. Not all ports
2065 support seeking back; and indeed some file ports (like
2066 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
2067 check performed by SCM_FPORT_FDES---but fail to seek
2068 backwards. Hence this block comes second. We prefer to use
2069 the read buffer in-place. */
2070 if (SCM_FPORTP (port
) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port
)))
2073 bytes_read
= scm_c_read (port
, header
, SCM_ENCODING_SEARCH_SIZE
);
2074 header
[bytes_read
] = '\0';
2075 scm_seek (port
, scm_from_int (0), scm_from_int (SEEK_SET
));
2078 /* search past "coding[:=]" */
2082 if ((pos
= strstr(pos
, "coding")) == NULL
)
2085 pos
+= strlen("coding");
2086 if (pos
- header
>= SCM_ENCODING_SEARCH_SIZE
||
2087 (*pos
== ':' || *pos
== '='))
2095 while (pos
- header
<= SCM_ENCODING_SEARCH_SIZE
&&
2096 (*pos
== ' ' || *pos
== '\t'))
2099 /* grab the next token */
2100 encoding_start
= pos
;
2102 while (encoding_start
+ i
- header
<= SCM_ENCODING_SEARCH_SIZE
2103 && encoding_start
+ i
- header
< bytes_read
2104 && (isalnum ((int) encoding_start
[i
])
2105 || strchr ("_-.:/,+=()", encoding_start
[i
]) != NULL
))
2108 encoding_length
= i
;
2109 if (encoding_length
== 0)
2112 encoding
= scm_gc_strndup (encoding_start
, encoding_length
, "encoding");
2113 for (i
= 0; i
< encoding_length
; i
++)
2114 encoding
[i
] = toupper ((int) encoding
[i
]);
2116 /* push backwards to make sure we were in a comment */
2118 pos
= encoding_start
;
2119 while (pos
>= header
)
2126 else if (*pos
== '\n' || pos
== header
)
2128 /* This wasn't in a semicolon comment. Check for a
2129 hash-bang comment. */
2130 char *beg
= strstr (header
, "#!");
2131 char *end
= strstr (header
, "!#");
2132 if (beg
< encoding_start
&& encoding_start
+ encoding_length
<= end
)
2143 /* This wasn't in a comment */
2149 SCM_DEFINE (scm_file_encoding
, "file-encoding", 1, 0, 0,
2151 "Scans the port for an Emacs-like character coding declaration\n"
2152 "near the top of the contents of a port with random-accessible contents.\n"
2153 "The coding declaration is of the form\n"
2154 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
2156 "Returns a string containing the character encoding of the file\n"
2157 "if a declaration was found, or @code{#f} otherwise.\n")
2158 #define FUNC_NAME s_scm_file_encoding
2163 SCM_VALIDATE_OPINPORT (SCM_ARG1
, port
);
2165 enc
= scm_i_scan_for_encoding (port
);
2170 s_enc
= scm_from_locale_string (enc
);
2179 /* Per-port read options.
2181 We store per-port read options in the 'port-read-options' port
2182 property, which is stored in the internal port structure. The value
2183 stored is a single integer that contains a two-bit field for each
2186 If a bit field contains READ_OPTION_INHERIT (3), that indicates that
2187 the applicable value should be inherited from the corresponding
2188 global read option. Otherwise, the bit field contains the value of
2189 the read option. For boolean read options that have been set
2190 per-port, the possible values are 0 or 1. If the 'keyword_style'
2191 read option has been set per-port, its possible values are those in
2192 'enum t_keyword_style'. */
2194 /* Key to read options in port properties. */
2195 SCM_SYMBOL (sym_port_read_options
, "port-read-options");
2197 /* Offsets of bit fields for each per-port override */
2198 #define READ_OPTION_COPY_SOURCE_P 0
2199 #define READ_OPTION_RECORD_POSITIONS_P 2
2200 #define READ_OPTION_CASE_INSENSITIVE_P 4
2201 #define READ_OPTION_KEYWORD_STYLE 6
2202 #define READ_OPTION_R6RS_ESCAPES_P 8
2203 #define READ_OPTION_SQUARE_BRACKETS_P 10
2204 #define READ_OPTION_HUNGRY_EOL_ESCAPES_P 12
2205 #define READ_OPTION_CURLY_INFIX_P 14
2207 /* The total width in bits of the per-port overrides */
2208 #define READ_OPTIONS_NUM_BITS 16
2210 #define READ_OPTIONS_INHERIT_ALL ((1UL << READ_OPTIONS_NUM_BITS) - 1)
2211 #define READ_OPTIONS_MAX_VALUE READ_OPTIONS_INHERIT_ALL
2213 #define READ_OPTION_MASK 3
2214 #define READ_OPTION_INHERIT 3
2217 set_port_read_option (SCM port
, int option
, int new_value
)
2219 SCM scm_read_options
;
2220 unsigned int read_options
;
2222 new_value
&= READ_OPTION_MASK
;
2223 scm_read_options
= scm_i_port_property (port
, sym_port_read_options
);
2224 if (scm_is_unsigned_integer (scm_read_options
, 0, READ_OPTIONS_MAX_VALUE
))
2225 read_options
= scm_to_uint (scm_read_options
);
2227 read_options
= READ_OPTIONS_INHERIT_ALL
;
2228 read_options
&= ~(READ_OPTION_MASK
<< option
);
2229 read_options
|= new_value
<< option
;
2230 scm_read_options
= scm_from_uint (read_options
);
2231 scm_i_set_port_property_x (port
, sym_port_read_options
, scm_read_options
);
2234 /* Set OPTS and PORT's case-insensitivity according to VALUE. */
2236 set_port_case_insensitive_p (SCM port
, scm_t_read_opts
*opts
, int value
)
2239 opts
->case_insensitive_p
= value
;
2240 set_port_read_option (port
, READ_OPTION_CASE_INSENSITIVE_P
, value
);
2243 /* Set OPTS and PORT's square_brackets_p option according to VALUE. */
2245 set_port_square_brackets_p (SCM port
, scm_t_read_opts
*opts
, int value
)
2248 opts
->square_brackets_p
= value
;
2249 set_port_read_option (port
, READ_OPTION_SQUARE_BRACKETS_P
, value
);
2252 /* Set OPTS and PORT's curly_infix_p option according to VALUE. */
2254 set_port_curly_infix_p (SCM port
, scm_t_read_opts
*opts
, int value
)
2257 opts
->curly_infix_p
= value
;
2258 set_port_read_option (port
, READ_OPTION_CURLY_INFIX_P
, value
);
2261 /* Initialize OPTS based on PORT's read options and the global read
2264 init_read_options (SCM port
, scm_t_read_opts
*opts
)
2266 SCM val
, scm_read_options
;
2267 unsigned int read_options
, x
;
2269 scm_read_options
= scm_i_port_property (port
, sym_port_read_options
);
2271 if (scm_is_unsigned_integer (scm_read_options
, 0, READ_OPTIONS_MAX_VALUE
))
2272 read_options
= scm_to_uint (scm_read_options
);
2274 read_options
= READ_OPTIONS_INHERIT_ALL
;
2276 x
= READ_OPTION_MASK
& (read_options
>> READ_OPTION_KEYWORD_STYLE
);
2277 if (x
== READ_OPTION_INHERIT
)
2279 val
= SCM_PACK (SCM_KEYWORD_STYLE
);
2280 if (scm_is_eq (val
, scm_keyword_prefix
))
2281 x
= KEYWORD_STYLE_PREFIX
;
2282 else if (scm_is_eq (val
, scm_keyword_postfix
))
2283 x
= KEYWORD_STYLE_POSTFIX
;
2285 x
= KEYWORD_STYLE_HASH_PREFIX
;
2287 opts
->keyword_style
= x
;
2289 #define RESOLVE_BOOLEAN_OPTION(NAME, name) \
2292 x = READ_OPTION_MASK & (read_options >> READ_OPTION_ ## NAME); \
2293 if (x == READ_OPTION_INHERIT) \
2294 x = !!SCM_ ## NAME; \
2299 RESOLVE_BOOLEAN_OPTION (COPY_SOURCE_P
, copy_source_p
);
2300 RESOLVE_BOOLEAN_OPTION (RECORD_POSITIONS_P
, record_positions_p
);
2301 RESOLVE_BOOLEAN_OPTION (CASE_INSENSITIVE_P
, case_insensitive_p
);
2302 RESOLVE_BOOLEAN_OPTION (R6RS_ESCAPES_P
, r6rs_escapes_p
);
2303 RESOLVE_BOOLEAN_OPTION (SQUARE_BRACKETS_P
, square_brackets_p
);
2304 RESOLVE_BOOLEAN_OPTION (HUNGRY_EOL_ESCAPES_P
, hungry_eol_escapes_p
);
2305 RESOLVE_BOOLEAN_OPTION (CURLY_INFIX_P
, curly_infix_p
);
2307 #undef RESOLVE_BOOLEAN_OPTION
2309 opts
->neoteric_p
= 0;
2315 SCM read_hash_procs
;
2317 read_hash_procs
= scm_make_fluid_with_default (SCM_EOL
);
2319 scm_i_read_hash_procedures
=
2320 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs
));
2322 scm_init_opts (scm_read_options
, scm_read_opts
);
2323 #include "libguile/read.x"