1 /* Copyright (C) 1995-1997, 1999-2001, 2003, 2004, 2006-2012, 2014
2 * Free Software Foundation, Inc.
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
32 #include <c-strcase.h>
35 #include "libguile/_scm.h"
36 #include "libguile/bytevectors.h"
37 #include "libguile/chars.h"
38 #include "libguile/eval.h"
39 #include "libguile/arrays.h"
40 #include "libguile/bitvectors.h"
41 #include "libguile/keywords.h"
42 #include "libguile/alist.h"
43 #include "libguile/srcprop.h"
44 #include "libguile/hashtab.h"
45 #include "libguile/hash.h"
46 #include "libguile/ports.h"
47 #include "libguile/ports-internal.h"
48 #include "libguile/fports.h"
49 #include "libguile/root.h"
50 #include "libguile/strings.h"
51 #include "libguile/strports.h"
52 #include "libguile/vectors.h"
53 #include "libguile/validate.h"
54 #include "libguile/srfi-4.h"
55 #include "libguile/srfi-13.h"
57 #include "libguile/read.h"
58 #include "libguile/private-options.h"
63 SCM_GLOBAL_SYMBOL (scm_sym_dot
, ".");
64 SCM_SYMBOL (scm_keyword_prefix
, "prefix");
65 SCM_SYMBOL (scm_keyword_postfix
, "postfix");
66 SCM_SYMBOL (sym_nil
, "nil");
68 /* SRFI-105 curly infix expression support */
69 SCM_SYMBOL (sym_nfx
, "$nfx$");
70 SCM_SYMBOL (sym_bracket_list
, "$bracket-list$");
71 SCM_SYMBOL (sym_bracket_apply
, "$bracket-apply$");
73 scm_t_option scm_read_opts
[] =
75 { SCM_OPTION_BOOLEAN
, "copy", 0,
76 "Copy source code expressions." },
77 { SCM_OPTION_BOOLEAN
, "positions", 1,
78 "Record positions of source code expressions." },
79 { SCM_OPTION_BOOLEAN
, "case-insensitive", 0,
80 "Convert symbols to lower case."},
81 { SCM_OPTION_SCM
, "keywords", (scm_t_bits
) SCM_BOOL_F_BITS
,
82 "Style of keyword recognition: #f, 'prefix or 'postfix."},
83 { SCM_OPTION_BOOLEAN
, "r6rs-hex-escapes", 0,
84 "Use R6RS variable-length character and string hex escapes."},
85 { SCM_OPTION_BOOLEAN
, "square-brackets", 1,
86 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
87 { SCM_OPTION_BOOLEAN
, "hungry-eol-escapes", 0,
88 "In strings, consume leading whitespace after an escaped end-of-line."},
89 { SCM_OPTION_BOOLEAN
, "curly-infix", 0,
90 "Support SRFI-105 curly infix expressions."},
91 { SCM_OPTION_BOOLEAN
, "r7rs-symbols", 0,
92 "Support R7RS |...| symbol notation."},
96 /* Internal read options structure. This is initialized by 'scm_read'
97 from the global and per-port read options, and a pointer is passed
98 down to all helper functions. */
102 KEYWORD_STYLE_HASH_PREFIX
,
103 KEYWORD_STYLE_PREFIX
,
104 KEYWORD_STYLE_POSTFIX
109 enum t_keyword_style keyword_style
;
110 unsigned int copy_source_p
: 1;
111 unsigned int record_positions_p
: 1;
112 unsigned int case_insensitive_p
: 1;
113 unsigned int r6rs_escapes_p
: 1;
114 unsigned int square_brackets_p
: 1;
115 unsigned int hungry_eol_escapes_p
: 1;
116 unsigned int curly_infix_p
: 1;
117 unsigned int neoteric_p
: 1;
118 unsigned int r7rs_symbols_p
: 1;
121 typedef struct t_read_opts scm_t_read_opts
;
125 Give meaningful error messages for errors
129 FILE:LINE:COL: MESSAGE
130 This happened in ....
132 This is not standard GNU format, but the test-suite likes the real
133 message to be in front.
139 scm_i_input_error (char const *function
,
140 SCM port
, const char *message
, SCM arg
)
142 SCM fn
= (scm_is_string (SCM_FILENAME(port
))
144 : scm_from_locale_string ("#<unknown port>"));
146 SCM string_port
= scm_open_output_string ();
147 SCM string
= SCM_EOL
;
148 scm_simple_format (string_port
,
149 scm_from_locale_string ("~A:~S:~S: ~A"),
151 scm_from_long (SCM_LINUM (port
) + 1),
152 scm_from_int (SCM_COL (port
) + 1),
153 scm_from_locale_string (message
)));
155 string
= scm_get_output_string (string_port
);
156 scm_close_output_port (string_port
);
157 scm_error_scm (scm_from_latin1_symbol ("read-error"),
158 function
? scm_from_locale_string (function
) : SCM_BOOL_F
,
165 SCM_DEFINE (scm_read_options
, "read-options-interface", 0, 1, 0,
167 "Option interface for the read options. Instead of using\n"
168 "this procedure directly, use the procedures @code{read-enable},\n"
169 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
170 #define FUNC_NAME s_scm_read_options
172 SCM ans
= scm_options (setting
,
175 if (SCM_COPY_SOURCE_P
)
176 SCM_RECORD_POSITIONS_P
= 1;
181 /* A fluid referring to an association list mapping extra hash
182 characters to procedures. */
183 static SCM
*scm_i_read_hash_procedures
;
186 scm_i_read_hash_procedures_ref (void)
188 return scm_fluid_ref (*scm_i_read_hash_procedures
);
192 scm_i_read_hash_procedures_set_x (SCM value
)
194 scm_fluid_set_x (*scm_i_read_hash_procedures
, value
);
201 /* Size of the C buffer used to read symbols and numbers. */
202 #define READER_BUFFER_SIZE 128
204 /* Number of 32-bit codepoints in the buffer used to read strings. */
205 #define READER_STRING_BUFFER_SIZE 128
207 /* The maximum size of Scheme character names. */
208 #define READER_CHAR_NAME_MAX_SIZE 50
210 /* The maximum size of reader directive names. */
211 #define READER_DIRECTIVE_NAME_MAX_SIZE 50
214 /* `isblank' is only in C99. */
215 #define CHAR_IS_BLANK_(_chr) \
216 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
217 || ((_chr) == '\f') || ((_chr) == '\r'))
220 # define CHAR_IS_BLANK(_chr) \
221 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
223 # define CHAR_IS_BLANK CHAR_IS_BLANK_
227 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
229 #define CHAR_IS_R5RS_DELIMITER(c) \
231 || (c) == ')' || (c) == '(' || (c) == ';' || (c) == '"')
233 #define CHAR_IS_DELIMITER(c) \
234 (CHAR_IS_R5RS_DELIMITER (c) \
235 || (((c) == ']' || (c) == '[') && (opts->square_brackets_p \
236 || opts->curly_infix_p)) \
237 || (((c) == '}' || (c) == '{') && opts->curly_infix_p))
239 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
241 #define CHAR_IS_EXPONENT_MARKER(_chr) \
242 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
243 || ((_chr) == 'd') || ((_chr) == 'l'))
245 /* Read an SCSH block comment. */
246 static SCM
scm_read_scsh_block_comment (scm_t_wchar
, SCM
);
247 static SCM
scm_read_r6rs_block_comment (scm_t_wchar
, SCM
);
248 static SCM
scm_read_commented_expression (scm_t_wchar
, SCM
, scm_t_read_opts
*);
249 static SCM
scm_read_shebang (scm_t_wchar
, SCM
, scm_t_read_opts
*);
250 static SCM
scm_get_hash_procedure (int);
252 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
253 result in the pre-allocated buffer BUF. Return zero if the whole token has
254 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
255 bytes actually read. */
257 read_token (SCM port
, scm_t_read_opts
*opts
,
258 char *buf
, size_t buf_size
, size_t *read
)
262 while (*read
< buf_size
)
266 chr
= scm_get_byte_or_eof_unlocked (port
);
270 else if (CHAR_IS_DELIMITER (chr
))
272 scm_unget_byte_unlocked (chr
, port
);
285 /* Like `read_token', but return either BUFFER, or a GC-allocated buffer
286 if the token doesn't fit in BUFFER_SIZE bytes. */
288 read_complete_token (SCM port
, scm_t_read_opts
*opts
,
289 char *buffer
, size_t buffer_size
, size_t *read
)
292 size_t bytes_read
, overflow_size
= 0;
293 char *overflow_buffer
= NULL
;
297 overflow
= read_token (port
, opts
, buffer
, buffer_size
, &bytes_read
);
300 if (overflow
|| overflow_size
!= 0)
302 if (overflow_size
== 0)
304 overflow_buffer
= scm_gc_malloc_pointerless (bytes_read
, "read");
305 memcpy (overflow_buffer
, buffer
, bytes_read
);
306 overflow_size
= bytes_read
;
311 scm_gc_malloc_pointerless (overflow_size
+ bytes_read
, "read");
313 memcpy (new_buf
, overflow_buffer
, overflow_size
);
314 memcpy (new_buf
+ overflow_size
, buffer
, bytes_read
);
316 overflow_buffer
= new_buf
;
317 overflow_size
+= bytes_read
;
324 *read
= overflow_size
;
328 return (overflow_size
> 0 ? overflow_buffer
: buffer
);
331 /* Skip whitespace from PORT and return the first non-whitespace character
332 read. Raise an error on end-of-file. */
334 flush_ws (SCM port
, scm_t_read_opts
*opts
, const char *eoferr
)
338 switch (c
= scm_getc_unlocked (port
))
344 scm_i_input_error (eoferr
,
353 switch (c
= scm_getc_unlocked (port
))
359 case SCM_LINE_INCREMENTORS
:
365 switch (c
= scm_getc_unlocked (port
))
368 eoferr
= "read_sharp";
371 scm_read_shebang (c
, port
, opts
);
374 scm_read_commented_expression (c
, port
, opts
);
377 if (scm_is_false (scm_get_hash_procedure (c
)))
379 scm_read_r6rs_block_comment (c
, port
);
384 scm_ungetc_unlocked (c
, port
);
389 case SCM_LINE_INCREMENTORS
:
390 case SCM_SINGLE_SPACES
:
405 static SCM
scm_read_expression (SCM port
, scm_t_read_opts
*opts
);
406 static SCM
scm_read_sharp (int chr
, SCM port
, scm_t_read_opts
*opts
,
407 long line
, int column
);
411 maybe_annotate_source (SCM x
, SCM port
, scm_t_read_opts
*opts
,
412 long line
, int column
)
414 /* This condition can be caused by a user calling
416 if (line
< 0 || column
< 0)
419 if (opts
->record_positions_p
)
420 scm_i_set_source_properties_x (x
, line
, column
, SCM_FILENAME (port
));
425 scm_read_sexp (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
426 #define FUNC_NAME "scm_i_lreadparen"
429 SCM tmp
, tl
, ans
= SCM_EOL
;
430 const int curly_list_p
= (chr
== '{') && opts
->curly_infix_p
;
431 const int terminating_char
= ((chr
== '{') ? '}'
432 : ((chr
== '[') ? ']'
435 /* Need to capture line and column numbers here. */
436 long line
= SCM_LINUM (port
);
437 int column
= SCM_COL (port
) - 1;
439 c
= flush_ws (port
, opts
, FUNC_NAME
);
440 if (terminating_char
== c
)
443 scm_ungetc_unlocked (c
, port
);
444 tmp
= scm_read_expression (port
, opts
);
446 /* Note that it is possible for scm_read_expression to return
447 scm_sym_dot, but not as part of a dotted pair: as in #{.}#. So
448 check that it's a real dot by checking `c'. */
449 if (c
== '.' && scm_is_eq (scm_sym_dot
, tmp
))
451 ans
= scm_read_expression (port
, opts
);
452 if (terminating_char
!= (c
= flush_ws (port
, opts
, FUNC_NAME
)))
453 scm_i_input_error (FUNC_NAME
, port
, "missing close paren",
458 /* Build the head of the list structure. */
459 ans
= tl
= scm_cons (tmp
, SCM_EOL
);
461 while (terminating_char
!= (c
= flush_ws (port
, opts
, FUNC_NAME
)))
465 if (c
== ')' || (c
== ']' && opts
->square_brackets_p
)
466 || ((c
== '}' || c
== ']') && opts
->curly_infix_p
))
467 scm_i_input_error (FUNC_NAME
, port
,
468 "in pair: mismatched close paren: ~A",
469 scm_list_1 (SCM_MAKE_CHAR (c
)));
471 scm_ungetc_unlocked (c
, port
);
472 tmp
= scm_read_expression (port
, opts
);
474 /* See above note about scm_sym_dot. */
475 if (c
== '.' && scm_is_eq (scm_sym_dot
, tmp
))
477 SCM_SETCDR (tl
, scm_read_expression (port
, opts
));
479 c
= flush_ws (port
, opts
, FUNC_NAME
);
480 if (terminating_char
!= c
)
481 scm_i_input_error (FUNC_NAME
, port
,
482 "in pair: missing close paren", SCM_EOL
);
486 new_tail
= scm_cons (tmp
, SCM_EOL
);
487 SCM_SETCDR (tl
, new_tail
);
493 /* In addition to finding the length, 'scm_ilength' checks for
494 improper or circular lists, in which case it returns -1. */
495 int len
= scm_ilength (ans
);
497 /* The (len == 0) case is handled above */
499 /* Return directly to avoid re-annotating the element's source
500 location with the position of the outer brace. Also, it
501 might not be possible to annotate the element. */
502 return scm_car (ans
); /* {e} => e */
504 ; /* Leave the list unchanged: {e1 e2} => (e1 e2) */
505 else if (len
>= 3 && (len
& 1))
507 /* It's a proper list whose length is odd and at least 3. If
508 the elements at odd indices (the infix operator positions)
509 are all 'equal?', then it's a simple curly-infix list.
510 Otherwise it's a mixed curly-infix list. */
511 SCM op
= scm_cadr (ans
);
513 /* Check to see if the elements at odd indices are 'equal?' */
514 for (tl
= scm_cdddr (ans
); ; tl
= scm_cddr (tl
))
516 if (scm_is_null (tl
))
518 /* Convert simple curly-infix list to prefix:
519 {a <op> b <op> ...} => (<op> a b ...) */
521 while (scm_is_pair (scm_cdr (tl
)))
524 SCM_SETCDR (tl
, tmp
);
527 ans
= scm_cons (op
, ans
);
530 else if (scm_is_false (scm_equal_p (op
, scm_car (tl
))))
532 /* Mixed curly-infix list: {e ...} => ($nfx$ e ...) */
533 ans
= scm_cons (sym_nfx
, ans
);
539 /* Mixed curly-infix (possibly improper) list:
540 {e . tail} => ($nfx$ e . tail) */
541 ans
= scm_cons (sym_nfx
, ans
);
544 return maybe_annotate_source (ans
, port
, opts
, line
, column
);
549 /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
550 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
552 #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
558 while (i < ndigits) \
560 a = scm_getc_unlocked (port); \
564 && (a == (scm_t_wchar) terminator) \
567 if ('0' <= a && a <= '9') \
569 else if ('A' <= a && a <= 'F') \
571 else if ('a' <= a && a <= 'f') \
584 skip_intraline_whitespace (SCM port
)
590 c
= scm_getc_unlocked (port
);
594 while (c
== '\t' || uc_is_general_category (c
, UC_SPACE_SEPARATOR
));
596 scm_ungetc_unlocked (c
, port
);
599 /* Read either a double-quoted string or an R7RS-style symbol delimited
600 by vertical lines, depending on the value of 'chr' ('"' or '|').
601 Regardless, the result is always returned as a string. */
603 scm_read_string_like_syntax (int chr
, SCM port
, scm_t_read_opts
*opts
)
604 #define FUNC_NAME "scm_lreadr"
606 /* For strings smaller than C_STR, this function creates only one Scheme
607 object (the string returned). */
610 size_t c_str_len
= 0;
611 scm_t_wchar c
, c_str
[READER_STRING_BUFFER_SIZE
];
613 /* Need to capture line and column numbers here. */
614 long line
= SCM_LINUM (port
);
615 int column
= SCM_COL (port
) - 1;
617 while (chr
!= (c
= scm_getc_unlocked (port
)))
622 scm_i_input_error (FUNC_NAME
, port
,
624 ? "end of file in symbol"
625 : "end of file in string constant"),
629 if (c_str_len
+ 1 >= READER_STRING_BUFFER_SIZE
)
631 str
= scm_cons (scm_from_utf32_stringn (c_str
, c_str_len
), str
);
637 switch (c
= scm_getc_unlocked (port
))
643 case '(': /* Accept "\(" for use at the beginning of lines
644 in multiline strings to avoid confusing emacs
648 if (opts
->hungry_eol_escapes_p
)
649 skip_intraline_whitespace (port
);
676 if (opts
->r6rs_escapes_p
|| chr
== '|')
677 SCM_READ_HEX_ESCAPE (10, ';');
679 SCM_READ_HEX_ESCAPE (2, '\0');
682 if (!opts
->r6rs_escapes_p
)
684 SCM_READ_HEX_ESCAPE (4, '\0');
688 if (!opts
->r6rs_escapes_p
)
690 SCM_READ_HEX_ESCAPE (6, '\0');
697 scm_i_input_error (FUNC_NAME
, port
,
698 "illegal character in escape sequence: ~S",
699 scm_list_1 (SCM_MAKE_CHAR (c
)));
703 c_str
[c_str_len
++] = c
;
706 if (scm_is_null (str
))
707 /* Fast path: we got a string that fits in C_STR. */
708 str
= scm_from_utf32_stringn (c_str
, c_str_len
);
712 str
= scm_cons (scm_from_utf32_stringn (c_str
, c_str_len
), str
);
714 str
= scm_string_concatenate_reverse (str
, SCM_UNDEFINED
, SCM_UNDEFINED
);
717 return maybe_annotate_source (str
, port
, opts
, line
, column
);
722 scm_read_string (int chr
, SCM port
, scm_t_read_opts
*opts
)
724 return scm_read_string_like_syntax (chr
, port
, opts
);
728 scm_read_r7rs_symbol (int chr
, SCM port
, scm_t_read_opts
*opts
)
730 return scm_string_to_symbol (scm_read_string_like_syntax (chr
, port
, opts
));
734 scm_read_number (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
736 SCM result
, str
= SCM_EOL
;
737 char local_buffer
[READER_BUFFER_SIZE
], *buffer
;
740 /* Need to capture line and column numbers here. */
741 long line
= SCM_LINUM (port
);
742 int column
= SCM_COL (port
) - 1;
744 scm_ungetc_unlocked (chr
, port
);
745 buffer
= read_complete_token (port
, opts
, local_buffer
, sizeof local_buffer
,
748 str
= scm_from_port_stringn (buffer
, bytes_read
, port
);
750 result
= scm_string_to_number (str
, SCM_UNDEFINED
);
751 if (scm_is_false (result
))
753 /* Return a symbol instead of a number */
754 if (opts
->case_insensitive_p
)
755 str
= scm_string_downcase_x (str
);
756 result
= scm_string_to_symbol (str
);
758 else if (SCM_NIMP (result
))
759 result
= maybe_annotate_source (result
, port
, opts
, line
, column
);
761 SCM_COL (port
) += scm_i_string_length (str
);
766 scm_read_mixed_case_symbol (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
769 int ends_with_colon
= 0;
771 int postfix
= (opts
->keyword_style
== KEYWORD_STYLE_POSTFIX
);
772 char local_buffer
[READER_BUFFER_SIZE
], *buffer
;
775 scm_ungetc_unlocked (chr
, port
);
776 buffer
= read_complete_token (port
, opts
, local_buffer
, sizeof local_buffer
,
779 ends_with_colon
= buffer
[bytes_read
- 1] == ':';
781 if (postfix
&& ends_with_colon
&& (bytes_read
> 1))
783 str
= scm_from_port_stringn (buffer
, bytes_read
- 1, port
);
785 if (opts
->case_insensitive_p
)
786 str
= scm_string_downcase_x (str
);
787 result
= scm_symbol_to_keyword (scm_string_to_symbol (str
));
791 str
= scm_from_port_stringn (buffer
, bytes_read
, port
);
793 if (opts
->case_insensitive_p
)
794 str
= scm_string_downcase_x (str
);
795 result
= scm_string_to_symbol (str
);
798 SCM_COL (port
) += scm_i_string_length (str
);
803 scm_read_number_and_radix (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
804 #define FUNC_NAME "scm_lreadr"
808 char local_buffer
[READER_BUFFER_SIZE
], *buffer
;
835 scm_ungetc_unlocked (chr
, port
);
836 scm_ungetc_unlocked ('#', port
);
840 buffer
= read_complete_token (port
, opts
, local_buffer
, sizeof local_buffer
,
843 str
= scm_from_port_stringn (buffer
, read
, port
);
845 result
= scm_string_to_number (str
, scm_from_uint (radix
));
847 SCM_COL (port
) += scm_i_string_length (str
);
849 if (scm_is_true (result
))
852 scm_i_input_error (FUNC_NAME
, port
, "unknown # object", SCM_EOL
);
859 scm_read_quote (int chr
, SCM port
, scm_t_read_opts
*opts
)
862 long line
= SCM_LINUM (port
);
863 int column
= SCM_COL (port
) - 1;
868 p
= scm_sym_quasiquote
;
879 c
= scm_getc_unlocked (port
);
881 p
= scm_sym_uq_splicing
;
884 scm_ungetc_unlocked (c
, port
);
891 fprintf (stderr
, "%s: unhandled quote character (%i)\n",
892 "scm_read_quote", chr
);
896 p
= scm_cons2 (p
, scm_read_expression (port
, opts
), SCM_EOL
);
897 return maybe_annotate_source (p
, port
, opts
, line
, column
);
900 SCM_SYMBOL (sym_syntax
, "syntax");
901 SCM_SYMBOL (sym_quasisyntax
, "quasisyntax");
902 SCM_SYMBOL (sym_unsyntax
, "unsyntax");
903 SCM_SYMBOL (sym_unsyntax_splicing
, "unsyntax-splicing");
906 scm_read_syntax (int chr
, SCM port
, scm_t_read_opts
*opts
)
909 long line
= SCM_LINUM (port
);
910 int column
= SCM_COL (port
) - 1;
926 c
= scm_getc_unlocked (port
);
928 p
= sym_unsyntax_splicing
;
931 scm_ungetc_unlocked (c
, port
);
938 fprintf (stderr
, "%s: unhandled syntax character (%i)\n",
939 "scm_read_syntax", chr
);
943 p
= scm_cons2 (p
, scm_read_expression (port
, opts
), SCM_EOL
);
944 return maybe_annotate_source (p
, port
, opts
, line
, column
);
948 scm_read_nil (int chr
, SCM port
, scm_t_read_opts
*opts
)
950 SCM id
= scm_read_mixed_case_symbol (chr
, port
, opts
);
952 if (!scm_is_eq (id
, sym_nil
))
953 scm_i_input_error ("scm_read_nil", port
,
954 "unexpected input while reading #nil: ~a",
957 return SCM_ELISP_NIL
;
961 scm_read_semicolon_comment (int chr
, SCM port
)
965 /* We use the get_byte here because there is no need to get the
966 locale correct with comment input. This presumes that newline
967 always represents itself no matter what the encoding is. */
968 for (c
= scm_get_byte_or_eof_unlocked (port
);
969 (c
!= EOF
) && (c
!= '\n');
970 c
= scm_get_byte_or_eof_unlocked (port
));
972 return SCM_UNSPECIFIED
;
975 /* If the EXPECTED_CHARS are the next ones available from PORT, then
976 consume them and return 1. Otherwise leave the port position where
977 it was and return 0. EXPECTED_CHARS should be all lowercase, and
978 will be matched case-insensitively against the characters read from
981 try_read_ci_chars (SCM port
, const char *expected_chars
)
983 int num_chars_wanted
= strlen (expected_chars
);
984 int num_chars_read
= 0;
985 char *chars_read
= alloca (num_chars_wanted
);
988 while (num_chars_read
< num_chars_wanted
)
990 c
= scm_getc_unlocked (port
);
993 else if (c_tolower (c
) != expected_chars
[num_chars_read
])
995 scm_ungetc_unlocked (c
, port
);
999 chars_read
[num_chars_read
++] = c
;
1002 if (num_chars_read
== num_chars_wanted
)
1006 while (num_chars_read
> 0)
1007 scm_ungetc_unlocked (chars_read
[--num_chars_read
], port
);
1013 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
1016 scm_read_boolean (int chr
, SCM port
)
1022 try_read_ci_chars (port
, "rue");
1027 try_read_ci_chars (port
, "alse");
1031 return SCM_UNSPECIFIED
;
1035 scm_read_character (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
1036 #define FUNC_NAME "scm_lreadr"
1038 char buffer
[READER_CHAR_NAME_MAX_SIZE
];
1040 size_t charname_len
, bytes_read
;
1043 scm_t_port_internal
*pti
;
1045 overflow
= read_token (port
, opts
, buffer
, READER_CHAR_NAME_MAX_SIZE
,
1048 scm_i_input_error (FUNC_NAME
, port
, "character name too long", SCM_EOL
);
1050 if (bytes_read
== 0)
1052 chr
= scm_getc_unlocked (port
);
1054 scm_i_input_error (FUNC_NAME
, port
, "unexpected end of file "
1055 "while reading character", SCM_EOL
);
1057 /* CHR must be a token delimiter, like a whitespace. */
1058 return (SCM_MAKE_CHAR (chr
));
1061 pti
= SCM_PORT_GET_INTERNAL (port
);
1063 /* Simple ASCII characters can be processed immediately. Also, simple
1064 ISO-8859-1 characters can be processed immediately if the encoding for this
1065 port is ISO-8859-1. */
1066 if (bytes_read
== 1 &&
1067 ((unsigned char) buffer
[0] <= 127
1068 || pti
->encoding_mode
== SCM_PORT_ENCODING_MODE_LATIN1
))
1070 SCM_COL (port
) += 1;
1071 return SCM_MAKE_CHAR (buffer
[0]);
1074 /* Otherwise, convert the buffer into a proper scheme string for
1076 charname
= scm_from_port_stringn (buffer
, bytes_read
, port
);
1077 charname_len
= scm_i_string_length (charname
);
1078 SCM_COL (port
) += charname_len
;
1079 cp
= scm_i_string_ref (charname
, 0);
1080 if (charname_len
== 1)
1081 return SCM_MAKE_CHAR (cp
);
1083 /* Ignore dotted circles, which may be used to keep combining characters from
1084 combining with the backslash in #\charname. */
1085 if (cp
== SCM_CODEPOINT_DOTTED_CIRCLE
&& charname_len
== 2)
1086 return SCM_MAKE_CHAR (scm_i_string_ref (charname
, 1));
1088 if (cp
>= '0' && cp
< '8')
1090 /* Dirk:FIXME:: This type of character syntax is not R5RS
1091 * compliant. Further, it should be verified that the constant
1092 * does only consist of octal digits. */
1093 SCM p
= scm_string_to_number (charname
, scm_from_uint (8));
1094 if (SCM_I_INUMP (p
))
1096 scm_t_wchar c
= scm_to_uint32 (p
);
1097 if (SCM_IS_UNICODE_CHAR (c
))
1098 return SCM_MAKE_CHAR (c
);
1100 scm_i_input_error (FUNC_NAME
, port
,
1101 "out-of-range octal character escape: ~a",
1102 scm_list_1 (charname
));
1106 if (cp
== 'x' && (charname_len
> 1))
1110 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1111 p
= scm_string_to_number (scm_c_substring (charname
, 1, charname_len
),
1112 scm_from_uint (16));
1113 if (SCM_I_INUMP (p
))
1115 scm_t_wchar c
= scm_to_uint32 (p
);
1116 if (SCM_IS_UNICODE_CHAR (c
))
1117 return SCM_MAKE_CHAR (c
);
1119 scm_i_input_error (FUNC_NAME
, port
,
1120 "out-of-range hex character escape: ~a",
1121 scm_list_1 (charname
));
1125 /* The names of characters should never have non-Latin1
1127 if (scm_i_is_narrow_string (charname
)
1128 || scm_i_try_narrow_string (charname
))
1129 { SCM ch
= scm_i_charname_to_char (scm_i_string_chars (charname
),
1131 if (scm_is_true (ch
))
1135 scm_i_input_error (FUNC_NAME
, port
, "unknown character name ~a",
1136 scm_list_1 (charname
));
1138 return SCM_UNSPECIFIED
;
1143 scm_read_keyword (int chr
, SCM port
, scm_t_read_opts
*opts
)
1147 /* Read the symbol that comprises the keyword. Doing this instead of
1148 invoking a specific symbol reader function allows `scm_read_keyword ()'
1149 to adapt to the delimiters currently valid of symbols.
1151 XXX: This implementation allows sloppy syntaxes like `#: key'. */
1152 symbol
= scm_read_expression (port
, opts
);
1153 if (!scm_is_symbol (symbol
))
1154 scm_i_input_error ("scm_read_keyword", port
,
1155 "keyword prefix `~a' not followed by a symbol: ~s",
1156 scm_list_2 (SCM_MAKE_CHAR (chr
), symbol
));
1158 return (scm_symbol_to_keyword (symbol
));
1162 scm_read_vector (int chr
, SCM port
, scm_t_read_opts
*opts
,
1163 long line
, int column
)
1165 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1166 guarantee that it's going to do what we want. After all, this is an
1167 implementation detail of `scm_read_vector ()', not a desirable
1169 return maybe_annotate_source (scm_vector (scm_read_sexp (chr
, port
, opts
)),
1170 port
, opts
, line
, column
);
1173 /* Helper used by scm_read_array */
1175 read_decimal_integer (SCM port
, int c
, ssize_t
*resp
)
1184 c
= scm_getc_unlocked (port
);
1187 while ('0' <= c
&& c
<= '9')
1189 if (((SSIZE_MAX
- (c
-'0')) / 10) <= res
)
1190 scm_i_input_error ("read_decimal_integer", port
,
1191 "number too large", SCM_EOL
);
1192 res
= 10*res
+ c
-'0';
1194 c
= scm_getc_unlocked (port
);
1202 /* Read an array. This function can also read vectors and uniform
1203 vectors. Also, the conflict between '#f' and '#f32' and '#f64' is
1206 C is the first character read after the '#'. */
1208 scm_read_array (int c
, SCM port
, scm_t_read_opts
*opts
, long line
, int column
)
1211 scm_t_wchar tag_buf
[8];
1214 SCM tag
, shape
= SCM_BOOL_F
, elements
, array
;
1216 /* XXX - shortcut for ordinary vectors. Shouldn't be necessary but
1217 the array code can not deal with zero-length dimensions yet, and
1218 we want to allow zero-length vectors, of course. */
1220 return scm_read_vector (c
, port
, opts
, line
, column
);
1222 /* Disambiguate between '#f' and uniform floating point vectors. */
1225 c
= scm_getc_unlocked (port
);
1226 if (c
!= '3' && c
!= '6')
1228 if (c
== 'a' && try_read_ci_chars (port
, "lse"))
1231 scm_ungetc_unlocked (c
, port
);
1237 goto continue_reading_tag
;
1242 c
= read_decimal_integer (port
, c
, &rank
);
1244 scm_i_input_error (NULL
, port
, "array rank must be non-negative",
1249 continue_reading_tag
:
1250 while (c
!= EOF
&& c
!= '(' && c
!= '@' && c
!= ':'
1251 && tag_len
< sizeof tag_buf
/ sizeof tag_buf
[0])
1253 tag_buf
[tag_len
++] = c
;
1254 c
= scm_getc_unlocked (port
);
1260 tag
= scm_string_to_symbol (scm_from_utf32_stringn (tag_buf
, tag_len
));
1261 if (tag_len
== sizeof tag_buf
/ sizeof tag_buf
[0])
1262 scm_i_input_error (NULL
, port
, "invalid array tag, starting with: ~a",
1267 if (c
== '@' || c
== ':')
1273 ssize_t lbnd
= 0, len
= 0;
1278 c
= scm_getc_unlocked (port
);
1279 c
= read_decimal_integer (port
, c
, &lbnd
);
1282 s
= scm_from_ssize_t (lbnd
);
1286 c
= scm_getc_unlocked (port
);
1287 c
= read_decimal_integer (port
, c
, &len
);
1289 scm_i_input_error (NULL
, port
,
1290 "array length must be non-negative",
1293 s
= scm_list_2 (s
, scm_from_ssize_t (lbnd
+len
-1));
1296 shape
= scm_cons (s
, shape
);
1297 } while (c
== '@' || c
== ':');
1299 shape
= scm_reverse_x (shape
, SCM_EOL
);
1302 /* Read nested lists of elements. */
1304 scm_i_input_error (NULL
, port
,
1305 "missing '(' in vector or array literal",
1307 elements
= scm_read_sexp (c
, port
, opts
);
1309 if (scm_is_false (shape
))
1310 shape
= scm_from_ssize_t (rank
);
1311 else if (scm_ilength (shape
) != rank
)
1314 "the number of shape specifications must match the array rank",
1317 /* Handle special print syntax of rank zero arrays; see
1318 scm_i_print_array for a rationale. */
1321 if (!scm_is_pair (elements
))
1322 scm_i_input_error (NULL
, port
,
1323 "too few elements in array literal, need 1",
1325 if (!scm_is_null (SCM_CDR (elements
)))
1326 scm_i_input_error (NULL
, port
,
1327 "too many elements in array literal, want 1",
1329 elements
= SCM_CAR (elements
);
1332 /* Construct array, annotate with source location, and return. */
1333 array
= scm_list_to_typed_array (tag
, shape
, elements
);
1334 return maybe_annotate_source (array
, port
, opts
, line
, column
);
1338 scm_read_srfi4_vector (int chr
, SCM port
, scm_t_read_opts
*opts
,
1339 long line
, int column
)
1341 return scm_read_array (chr
, port
, opts
, line
, column
);
1345 scm_read_bytevector (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
,
1346 long line
, int column
)
1348 chr
= scm_getc_unlocked (port
);
1352 chr
= scm_getc_unlocked (port
);
1356 chr
= scm_getc_unlocked (port
);
1360 return maybe_annotate_source
1361 (scm_u8_list_to_bytevector (scm_read_sexp (chr
, port
, opts
)),
1362 port
, opts
, line
, column
);
1365 scm_i_input_error ("read_bytevector", port
,
1366 "invalid bytevector prefix",
1367 SCM_MAKE_CHAR (chr
));
1368 return SCM_UNSPECIFIED
;
1372 scm_read_guile_bit_vector (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
,
1373 long line
, int column
)
1375 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1376 terribly inefficient but who cares? */
1377 SCM s_bits
= SCM_EOL
;
1379 for (chr
= scm_getc_unlocked (port
);
1380 (chr
!= EOF
) && ((chr
== '0') || (chr
== '1'));
1381 chr
= scm_getc_unlocked (port
))
1383 s_bits
= scm_cons ((chr
== '0') ? SCM_BOOL_F
: SCM_BOOL_T
, s_bits
);
1387 scm_ungetc_unlocked (chr
, port
);
1389 return maybe_annotate_source
1390 (scm_bitvector (scm_reverse_x (s_bits
, SCM_EOL
)),
1391 port
, opts
, line
, column
);
1395 scm_read_scsh_block_comment (scm_t_wchar chr
, SCM port
)
1401 int c
= scm_getc_unlocked (port
);
1404 scm_i_input_error ("skip_block_comment", port
,
1405 "unterminated `#! ... !#' comment", SCM_EOL
);
1409 else if (c
== '#' && bang_seen
)
1415 return SCM_UNSPECIFIED
;
1418 static void set_port_case_insensitive_p (SCM port
, scm_t_read_opts
*opts
,
1420 static void set_port_square_brackets_p (SCM port
, scm_t_read_opts
*opts
,
1422 static void set_port_curly_infix_p (SCM port
, scm_t_read_opts
*opts
,
1426 scm_read_shebang (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
1428 char name
[READER_DIRECTIVE_NAME_MAX_SIZE
+ 1];
1432 while (i
<= READER_DIRECTIVE_NAME_MAX_SIZE
)
1434 c
= scm_getc_unlocked (port
);
1436 scm_i_input_error ("skip_block_comment", port
,
1437 "unterminated `#! ... !#' comment", SCM_EOL
);
1438 else if (('a' <= c
&& c
<= 'z') || ('0' <= c
&& c
<= '9') || c
== '-')
1440 else if (CHAR_IS_DELIMITER (c
))
1442 scm_ungetc_unlocked (c
, port
);
1444 if (0 == strcmp ("r6rs", name
))
1445 ; /* Silently ignore */
1446 else if (0 == strcmp ("fold-case", name
))
1447 set_port_case_insensitive_p (port
, opts
, 1);
1448 else if (0 == strcmp ("no-fold-case", name
))
1449 set_port_case_insensitive_p (port
, opts
, 0);
1450 else if (0 == strcmp ("curly-infix", name
))
1451 set_port_curly_infix_p (port
, opts
, 1);
1452 else if (0 == strcmp ("curly-infix-and-bracket-lists", name
))
1454 set_port_curly_infix_p (port
, opts
, 1);
1455 set_port_square_brackets_p (port
, opts
, 0);
1460 return SCM_UNSPECIFIED
;
1464 scm_ungetc_unlocked (c
, port
);
1469 scm_ungetc_unlocked (name
[--i
], port
);
1470 return scm_read_scsh_block_comment (chr
, port
);
1474 scm_read_r6rs_block_comment (scm_t_wchar chr
, SCM port
)
1476 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1477 nested. So care must be taken. */
1478 int nesting_level
= 1;
1480 int a
= scm_getc_unlocked (port
);
1483 scm_i_input_error ("scm_read_r6rs_block_comment", port
,
1484 "unterminated `#| ... |#' comment", SCM_EOL
);
1486 while (nesting_level
> 0)
1488 int b
= scm_getc_unlocked (port
);
1491 scm_i_input_error ("scm_read_r6rs_block_comment", port
,
1492 "unterminated `#| ... |#' comment", SCM_EOL
);
1494 if (a
== '|' && b
== '#')
1499 else if (a
== '#' && b
== '|')
1508 return SCM_UNSPECIFIED
;
1512 scm_read_commented_expression (scm_t_wchar chr
, SCM port
,
1513 scm_t_read_opts
*opts
)
1517 c
= flush_ws (port
, opts
, (char *) NULL
);
1519 scm_i_input_error ("read_commented_expression", port
,
1520 "no expression after #; comment", SCM_EOL
);
1521 scm_ungetc_unlocked (c
, port
);
1522 scm_read_expression (port
, opts
);
1523 return SCM_UNSPECIFIED
;
1527 scm_read_extended_symbol (scm_t_wchar chr
, SCM port
)
1529 /* Guile's extended symbol read syntax looks like this:
1531 #{This is all a symbol name}#
1533 So here, CHR is expected to be `{'. */
1536 SCM buf
= scm_i_make_string (1024, NULL
, 0);
1538 buf
= scm_i_string_start_writing (buf
);
1540 while ((chr
= scm_getc_unlocked (port
)) != EOF
)
1551 scm_i_string_set_x (buf
, len
++, '}');
1557 else if (chr
== '\\')
1559 /* It used to be that print.c would print extended-read-syntax
1560 symbols with backslashes before "non-standard" chars, but
1561 this routine wouldn't do anything with those escapes.
1562 Bummer. What we've done is to change print.c to output
1563 R6RS hex escapes for those characters, relying on the fact
1564 that the extended read syntax would never put a `\' before
1565 an `x'. For now, we just ignore other instances of
1566 backslash in the string. */
1567 switch ((chr
= scm_getc_unlocked (port
)))
1575 SCM_READ_HEX_ESCAPE (10, ';');
1576 scm_i_string_set_x (buf
, len
++, c
);
1584 scm_i_string_stop_writing ();
1585 scm_i_input_error ("scm_read_extended_symbol", port
,
1586 "illegal character in escape sequence: ~S",
1587 scm_list_1 (SCM_MAKE_CHAR (c
)));
1591 scm_i_string_set_x (buf
, len
++, chr
);
1596 scm_i_string_set_x (buf
, len
++, chr
);
1598 if (len
>= scm_i_string_length (buf
) - 2)
1602 scm_i_string_stop_writing ();
1603 addy
= scm_i_make_string (1024, NULL
, 0);
1604 buf
= scm_string_append (scm_list_2 (buf
, addy
));
1606 buf
= scm_i_string_start_writing (buf
);
1611 scm_i_string_stop_writing ();
1613 scm_i_input_error ("scm_read_extended_symbol", port
,
1614 "end of file while reading symbol", SCM_EOL
);
1616 return (scm_string_to_symbol (scm_c_substring (buf
, 0, len
)));
1621 /* Top-level token readers, i.e., dispatchers. */
1624 scm_read_sharp_extension (int chr
, SCM port
, scm_t_read_opts
*opts
)
1628 proc
= scm_get_hash_procedure (chr
);
1629 if (scm_is_true (scm_procedure_p (proc
)))
1631 long line
= SCM_LINUM (port
);
1632 int column
= SCM_COL (port
) - 2;
1635 got
= scm_call_2 (proc
, SCM_MAKE_CHAR (chr
), port
);
1637 if (opts
->record_positions_p
&& SCM_NIMP (got
)
1638 && !scm_i_has_source_properties (got
))
1639 scm_i_set_source_properties_x (got
, line
, column
, SCM_FILENAME (port
));
1644 return SCM_UNSPECIFIED
;
1647 /* The reader for the sharp `#' character. It basically dispatches reads
1648 among the above token readers. */
1650 scm_read_sharp (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
,
1651 long line
, int column
)
1652 #define FUNC_NAME "scm_lreadr"
1656 chr
= scm_getc_unlocked (port
);
1658 result
= scm_read_sharp_extension (chr
, port
, opts
);
1659 if (!scm_is_eq (result
, SCM_UNSPECIFIED
))
1665 return (scm_read_character (chr
, port
, opts
));
1667 return (scm_read_vector (chr
, port
, opts
, line
, column
));
1672 /* This one may return either a boolean or an SRFI-4 vector. */
1673 return (scm_read_srfi4_vector (chr
, port
, opts
, line
, column
));
1675 return (scm_read_bytevector (chr
, port
, opts
, line
, column
));
1677 return (scm_read_guile_bit_vector (chr
, port
, opts
, line
, column
));
1681 return (scm_read_boolean (chr
, port
));
1683 return (scm_read_keyword (chr
, port
, opts
));
1684 case '0': case '1': case '2': case '3': case '4':
1685 case '5': case '6': case '7': case '8': case '9':
1687 return (scm_read_array (chr
, port
, opts
, line
, column
));
1701 return (scm_read_number_and_radix (chr
, port
, opts
));
1703 return (scm_read_extended_symbol (chr
, port
));
1705 return (scm_read_shebang (chr
, port
, opts
));
1707 return (scm_read_commented_expression (chr
, port
, opts
));
1711 return (scm_read_syntax (chr
, port
, opts
));
1713 return (scm_read_nil (chr
, port
, opts
));
1715 result
= scm_read_sharp_extension (chr
, port
, opts
);
1716 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1718 /* To remain compatible with 1.8 and earlier, the following
1719 characters have lower precedence than `read-hash-extend'
1724 return scm_read_r6rs_block_comment (chr
, port
);
1726 scm_i_input_error (FUNC_NAME
, port
, "Unknown # object: ~S",
1727 scm_list_1 (SCM_MAKE_CHAR (chr
)));
1734 return SCM_UNSPECIFIED
;
1739 read_inner_expression (SCM port
, scm_t_read_opts
*opts
)
1740 #define FUNC_NAME "read_inner_expression"
1746 chr
= scm_getc_unlocked (port
);
1750 case SCM_WHITE_SPACES
:
1751 case SCM_LINE_INCREMENTORS
:
1754 (void) scm_read_semicolon_comment (chr
, port
);
1757 if (opts
->curly_infix_p
)
1759 if (opts
->neoteric_p
)
1760 return scm_read_sexp (chr
, port
, opts
);
1765 /* Enable neoteric expressions within curly braces */
1766 opts
->neoteric_p
= 1;
1767 expr
= scm_read_sexp (chr
, port
, opts
);
1768 opts
->neoteric_p
= 0;
1773 return scm_read_mixed_case_symbol (chr
, port
, opts
);
1775 if (opts
->square_brackets_p
)
1776 return scm_read_sexp (chr
, port
, opts
);
1777 else if (opts
->curly_infix_p
)
1779 /* The syntax of neoteric expressions requires that '[' be
1780 a delimiter when curly-infix is enabled, so it cannot
1781 be part of an unescaped symbol. We might as well do
1782 something useful with it, so we adopt Kawa's convention:
1783 [...] => ($bracket-list$ ...) */
1784 long line
= SCM_LINUM (port
);
1785 int column
= SCM_COL (port
) - 1;
1786 return maybe_annotate_source
1787 (scm_cons (sym_bracket_list
, scm_read_sexp (chr
, port
, opts
)),
1788 port
, opts
, line
, column
);
1791 return scm_read_mixed_case_symbol (chr
, port
, opts
);
1793 return (scm_read_sexp (chr
, port
, opts
));
1795 return (scm_read_string (chr
, port
, opts
));
1797 if (opts
->r7rs_symbols_p
)
1798 return scm_read_r7rs_symbol (chr
, port
, opts
);
1800 return scm_read_mixed_case_symbol (chr
, port
, opts
);
1804 return (scm_read_quote (chr
, port
, opts
));
1807 long line
= SCM_LINUM (port
);
1808 int column
= SCM_COL (port
) - 1;
1809 SCM result
= scm_read_sharp (chr
, port
, opts
, line
, column
);
1810 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1811 /* We read a comment or some such. */
1817 scm_i_input_error (FUNC_NAME
, port
, "unexpected \")\"", SCM_EOL
);
1820 if (opts
->curly_infix_p
)
1821 scm_i_input_error (FUNC_NAME
, port
, "unexpected \"}\"", SCM_EOL
);
1823 return scm_read_mixed_case_symbol (chr
, port
, opts
);
1825 if (opts
->square_brackets_p
)
1826 scm_i_input_error (FUNC_NAME
, port
, "unexpected \"]\"", SCM_EOL
);
1827 /* otherwise fall through */
1831 if (opts
->keyword_style
== KEYWORD_STYLE_PREFIX
)
1832 return scm_symbol_to_keyword (scm_read_expression (port
, opts
));
1837 if (((chr
>= '0') && (chr
<= '9'))
1838 || (strchr ("+-.", chr
)))
1839 return (scm_read_number (chr
, port
, opts
));
1841 return (scm_read_mixed_case_symbol (chr
, port
, opts
));
1849 scm_read_expression (SCM port
, scm_t_read_opts
*opts
)
1850 #define FUNC_NAME "scm_read_expression"
1852 if (!opts
->neoteric_p
)
1853 return read_inner_expression (port
, opts
);
1860 if (opts
->record_positions_p
)
1862 /* We need to get the position of the first non-whitespace
1863 character in order to correctly annotate neoteric
1864 expressions. For example, for the expression 'f(x)', the
1865 first call to 'read_inner_expression' reads the 'f' (which
1866 cannot be annotated), and then we later read the '(x)' and
1867 use it to construct the new list (f x). */
1868 int c
= flush_ws (port
, opts
, (char *) NULL
);
1871 scm_ungetc_unlocked (c
, port
);
1872 line
= SCM_LINUM (port
);
1873 column
= SCM_COL (port
);
1876 expr
= read_inner_expression (port
, opts
);
1878 /* 'expr' is the first component of the neoteric expression. Now
1879 we loop, and as long as the next character is '(', '[', or '{',
1880 (without any intervening whitespace), we use it to construct a
1881 new expression. For example, f{n - 1}(x) => ((f (- n 1)) x). */
1884 int chr
= scm_getc_unlocked (port
);
1887 /* e(...) => (e ...) */
1888 expr
= scm_cons (expr
, scm_read_sexp (chr
, port
, opts
));
1889 else if (chr
== '[')
1890 /* e[...] => ($bracket-apply$ e ...) */
1891 expr
= scm_cons (sym_bracket_apply
,
1893 scm_read_sexp (chr
, port
, opts
)));
1894 else if (chr
== '{')
1896 SCM arg
= scm_read_sexp (chr
, port
, opts
);
1898 if (scm_is_null (arg
))
1899 expr
= scm_list_1 (expr
); /* e{} => (e) */
1901 expr
= scm_list_2 (expr
, arg
); /* e{...} => (e {...}) */
1906 scm_ungetc_unlocked (chr
, port
);
1909 maybe_annotate_source (expr
, port
, opts
, line
, column
);
1917 /* Actual reader. */
1919 static void init_read_options (SCM port
, scm_t_read_opts
*opts
);
1921 SCM_DEFINE (scm_read
, "read", 0, 1, 0,
1923 "Read an s-expression from the input port @var{port}, or from\n"
1924 "the current input port if @var{port} is not specified.\n"
1925 "Any whitespace before the next token is discarded.")
1926 #define FUNC_NAME s_scm_read
1928 scm_t_read_opts opts
;
1931 if (SCM_UNBNDP (port
))
1932 port
= scm_current_input_port ();
1933 SCM_VALIDATE_OPINPORT (1, port
);
1935 init_read_options (port
, &opts
);
1937 c
= flush_ws (port
, &opts
, (char *) NULL
);
1940 scm_ungetc_unlocked (c
, port
);
1942 return (scm_read_expression (port
, &opts
));
1949 /* Manipulate the read-hash-procedures alist. This could be written in
1950 Scheme, but maybe it will also be used by C code during initialisation. */
1951 SCM_DEFINE (scm_read_hash_extend
, "read-hash-extend", 2, 0, 0,
1952 (SCM chr
, SCM proc
),
1953 "Install the procedure @var{proc} for reading expressions\n"
1954 "starting with the character sequence @code{#} and @var{chr}.\n"
1955 "@var{proc} will be called with two arguments: the character\n"
1956 "@var{chr} and the port to read further data from. The object\n"
1957 "returned will be the return value of @code{read}. \n"
1958 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1960 #define FUNC_NAME s_scm_read_hash_extend
1965 SCM_VALIDATE_CHAR (1, chr
);
1966 SCM_ASSERT (scm_is_false (proc
)
1967 || scm_is_eq (scm_procedure_p (proc
), SCM_BOOL_T
),
1968 proc
, SCM_ARG2
, FUNC_NAME
);
1970 /* Check if chr is already in the alist. */
1971 this = scm_i_read_hash_procedures_ref ();
1975 if (scm_is_null (this))
1977 /* not found, so add it to the beginning. */
1978 if (scm_is_true (proc
))
1980 SCM
new = scm_cons (scm_cons (chr
, proc
),
1981 scm_i_read_hash_procedures_ref ());
1982 scm_i_read_hash_procedures_set_x (new);
1986 if (scm_is_eq (chr
, SCM_CAAR (this)))
1988 /* already in the alist. */
1989 if (scm_is_false (proc
))
1992 if (scm_is_false (prev
))
1994 SCM rest
= SCM_CDR (scm_i_read_hash_procedures_ref ());
1995 scm_i_read_hash_procedures_set_x (rest
);
1998 scm_set_cdr_x (prev
, SCM_CDR (this));
2003 scm_set_cdr_x (SCM_CAR (this), proc
);
2008 this = SCM_CDR (this);
2011 return SCM_UNSPECIFIED
;
2015 /* Recover the read-hash procedure corresponding to char c. */
2017 scm_get_hash_procedure (int c
)
2019 SCM rest
= scm_i_read_hash_procedures_ref ();
2023 if (scm_is_null (rest
))
2026 if (SCM_CHAR (SCM_CAAR (rest
)) == c
)
2027 return SCM_CDAR (rest
);
2029 rest
= SCM_CDR (rest
);
2034 is_encoding_char (char c
)
2036 if (c
>= 'a' && c
<= 'z') return 1;
2037 if (c
>= 'A' && c
<= 'Z') return 1;
2038 if (c
>= '0' && c
<= '9') return 1;
2039 return strchr ("_-.:/,+=()", c
) != NULL
;
2042 /* Maximum size of an encoding name. This is a bit more than the
2043 longest name listed at
2044 <http://www.iana.org/assignments/character-sets> ("ISO-2022-JP-2", 13
2046 #define ENCODING_NAME_MAX_SIZE 20
2048 /* Number of bytes at the beginning or end of a file that are scanned
2049 for a "coding:" declaration. */
2050 #define SCM_ENCODING_SEARCH_SIZE (500 + ENCODING_NAME_MAX_SIZE)
2053 /* Search the SCM_ENCODING_SEARCH_SIZE bytes of a file for an Emacs-like
2054 coding declaration. Returns either NULL or a string whose storage
2055 has been allocated with `scm_gc_malloc'. */
2057 scm_i_scan_for_encoding (SCM port
)
2060 char header
[SCM_ENCODING_SEARCH_SIZE
+1];
2061 size_t bytes_read
, encoding_length
, i
;
2062 char *encoding
= NULL
;
2063 char *pos
, *encoding_start
;
2066 pt
= SCM_PTAB_ENTRY (port
);
2068 if (pt
->rw_active
== SCM_PORT_WRITE
)
2069 scm_flush_unlocked (port
);
2072 pt
->rw_active
= SCM_PORT_READ
;
2074 if (pt
->read_pos
== pt
->read_end
)
2076 /* We can use the read buffer, and thus avoid a seek. */
2077 if (scm_fill_input_unlocked (port
) == EOF
)
2080 bytes_read
= pt
->read_end
- pt
->read_pos
;
2081 if (bytes_read
> SCM_ENCODING_SEARCH_SIZE
)
2082 bytes_read
= SCM_ENCODING_SEARCH_SIZE
;
2084 if (bytes_read
<= 1)
2085 /* An unbuffered port -- don't scan. */
2088 memcpy (header
, pt
->read_pos
, bytes_read
);
2089 header
[bytes_read
] = '\0';
2093 /* Try to read some bytes and then seek back. Not all ports
2094 support seeking back; and indeed some file ports (like
2095 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
2096 check performed by SCM_FPORT_FDES---but fail to seek
2097 backwards. Hence this block comes second. We prefer to use
2098 the read buffer in-place. */
2099 if (SCM_FPORTP (port
) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port
)))
2102 bytes_read
= scm_c_read_unlocked (port
, header
, SCM_ENCODING_SEARCH_SIZE
);
2103 header
[bytes_read
] = '\0';
2104 scm_seek (port
, scm_from_int (0), scm_from_int (SEEK_SET
));
2107 /* search past "coding[:=]" */
2111 if ((pos
= strstr(pos
, "coding")) == NULL
)
2114 pos
+= strlen ("coding");
2115 if (pos
- header
>= SCM_ENCODING_SEARCH_SIZE
||
2116 (*pos
== ':' || *pos
== '='))
2124 while (pos
- header
<= SCM_ENCODING_SEARCH_SIZE
&&
2125 (*pos
== ' ' || *pos
== '\t'))
2128 if (pos
- header
>= SCM_ENCODING_SEARCH_SIZE
- ENCODING_NAME_MAX_SIZE
)
2129 /* We found the "coding:" string, but there is probably not enough
2130 room to store an encoding name in its entirety, so ignore it.
2131 This makes sure we do not end up returning a truncated encoding
2135 /* grab the next token */
2136 encoding_start
= pos
;
2138 while (encoding_start
+ i
- header
<= SCM_ENCODING_SEARCH_SIZE
2139 && encoding_start
+ i
- header
< bytes_read
2140 && is_encoding_char (encoding_start
[i
]))
2143 encoding_length
= i
;
2144 if (encoding_length
== 0)
2147 encoding
= scm_gc_strndup (encoding_start
, encoding_length
, "encoding");
2149 /* push backwards to make sure we were in a comment */
2151 pos
= encoding_start
;
2152 while (pos
>= header
)
2159 else if (*pos
== '\n' || pos
== header
)
2161 /* This wasn't in a semicolon comment. Check for a
2162 hash-bang comment. */
2163 char *beg
= strstr (header
, "#!");
2164 char *end
= strstr (header
, "!#");
2165 if (beg
< encoding_start
&& encoding_start
+ encoding_length
<= end
)
2176 /* This wasn't in a comment */
2182 SCM_DEFINE (scm_file_encoding
, "file-encoding", 1, 0, 0,
2184 "Scans the port for an Emacs-like character coding declaration\n"
2185 "near the top of the contents of a port with random-accessible contents.\n"
2186 "The coding declaration is of the form\n"
2187 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
2189 "Returns a string containing the character encoding of the file\n"
2190 "if a declaration was found, or @code{#f} otherwise.\n")
2191 #define FUNC_NAME s_scm_file_encoding
2196 SCM_VALIDATE_OPINPORT (SCM_ARG1
, port
);
2198 enc
= scm_i_scan_for_encoding (port
);
2203 s_enc
= scm_string_upcase (scm_from_locale_string (enc
));
2212 /* Per-port read options.
2214 We store per-port read options in the 'port-read-options' port
2215 property, which is stored in the internal port structure. The value
2216 stored is a single integer that contains a two-bit field for each
2219 If a bit field contains READ_OPTION_INHERIT (3), that indicates that
2220 the applicable value should be inherited from the corresponding
2221 global read option. Otherwise, the bit field contains the value of
2222 the read option. For boolean read options that have been set
2223 per-port, the possible values are 0 or 1. If the 'keyword_style'
2224 read option has been set per-port, its possible values are those in
2225 'enum t_keyword_style'. */
2227 /* Key to read options in port properties. */
2228 SCM_SYMBOL (sym_port_read_options
, "port-read-options");
2230 /* Offsets of bit fields for each per-port override */
2231 #define READ_OPTION_COPY_SOURCE_P 0
2232 #define READ_OPTION_RECORD_POSITIONS_P 2
2233 #define READ_OPTION_CASE_INSENSITIVE_P 4
2234 #define READ_OPTION_KEYWORD_STYLE 6
2235 #define READ_OPTION_R6RS_ESCAPES_P 8
2236 #define READ_OPTION_SQUARE_BRACKETS_P 10
2237 #define READ_OPTION_HUNGRY_EOL_ESCAPES_P 12
2238 #define READ_OPTION_CURLY_INFIX_P 14
2239 #define READ_OPTION_R7RS_SYMBOLS_P 16
2241 /* The total width in bits of the per-port overrides */
2242 #define READ_OPTIONS_NUM_BITS 18
2244 #define READ_OPTIONS_INHERIT_ALL ((1UL << READ_OPTIONS_NUM_BITS) - 1)
2245 #define READ_OPTIONS_MAX_VALUE READ_OPTIONS_INHERIT_ALL
2247 #define READ_OPTION_MASK 3
2248 #define READ_OPTION_INHERIT 3
2251 set_port_read_option (SCM port
, int option
, int new_value
)
2253 SCM scm_read_options
;
2254 unsigned int read_options
;
2256 new_value
&= READ_OPTION_MASK
;
2258 scm_dynwind_begin (0);
2259 scm_dynwind_lock_port (port
);
2261 scm_read_options
= scm_i_port_property (port
, sym_port_read_options
);
2262 if (scm_is_unsigned_integer (scm_read_options
, 0, READ_OPTIONS_MAX_VALUE
))
2263 read_options
= scm_to_uint (scm_read_options
);
2265 read_options
= READ_OPTIONS_INHERIT_ALL
;
2266 read_options
&= ~(READ_OPTION_MASK
<< option
);
2267 read_options
|= new_value
<< option
;
2268 scm_read_options
= scm_from_uint (read_options
);
2269 scm_i_set_port_property_x (port
, sym_port_read_options
, scm_read_options
);
2274 /* Set OPTS and PORT's case-insensitivity according to VALUE. */
2276 set_port_case_insensitive_p (SCM port
, scm_t_read_opts
*opts
, int value
)
2279 opts
->case_insensitive_p
= value
;
2280 set_port_read_option (port
, READ_OPTION_CASE_INSENSITIVE_P
, value
);
2283 /* Set OPTS and PORT's square_brackets_p option according to VALUE. */
2285 set_port_square_brackets_p (SCM port
, scm_t_read_opts
*opts
, int value
)
2288 opts
->square_brackets_p
= value
;
2289 set_port_read_option (port
, READ_OPTION_SQUARE_BRACKETS_P
, value
);
2292 /* Set OPTS and PORT's curly_infix_p option according to VALUE. */
2294 set_port_curly_infix_p (SCM port
, scm_t_read_opts
*opts
, int value
)
2297 opts
->curly_infix_p
= value
;
2298 set_port_read_option (port
, READ_OPTION_CURLY_INFIX_P
, value
);
2301 /* Initialize OPTS based on PORT's read options and the global read
2304 init_read_options (SCM port
, scm_t_read_opts
*opts
)
2306 SCM val
, scm_read_options
;
2307 unsigned int read_options
, x
;
2309 scm_read_options
= scm_i_port_property (port
, sym_port_read_options
);
2311 if (scm_is_unsigned_integer (scm_read_options
, 0, READ_OPTIONS_MAX_VALUE
))
2312 read_options
= scm_to_uint (scm_read_options
);
2314 read_options
= READ_OPTIONS_INHERIT_ALL
;
2316 x
= READ_OPTION_MASK
& (read_options
>> READ_OPTION_KEYWORD_STYLE
);
2317 if (x
== READ_OPTION_INHERIT
)
2319 val
= SCM_PACK (SCM_KEYWORD_STYLE
);
2320 if (scm_is_eq (val
, scm_keyword_prefix
))
2321 x
= KEYWORD_STYLE_PREFIX
;
2322 else if (scm_is_eq (val
, scm_keyword_postfix
))
2323 x
= KEYWORD_STYLE_POSTFIX
;
2325 x
= KEYWORD_STYLE_HASH_PREFIX
;
2327 opts
->keyword_style
= x
;
2329 #define RESOLVE_BOOLEAN_OPTION(NAME, name) \
2332 x = READ_OPTION_MASK & (read_options >> READ_OPTION_ ## NAME); \
2333 if (x == READ_OPTION_INHERIT) \
2334 x = !!SCM_ ## NAME; \
2339 RESOLVE_BOOLEAN_OPTION (COPY_SOURCE_P
, copy_source_p
);
2340 RESOLVE_BOOLEAN_OPTION (RECORD_POSITIONS_P
, record_positions_p
);
2341 RESOLVE_BOOLEAN_OPTION (CASE_INSENSITIVE_P
, case_insensitive_p
);
2342 RESOLVE_BOOLEAN_OPTION (R6RS_ESCAPES_P
, r6rs_escapes_p
);
2343 RESOLVE_BOOLEAN_OPTION (SQUARE_BRACKETS_P
, square_brackets_p
);
2344 RESOLVE_BOOLEAN_OPTION (HUNGRY_EOL_ESCAPES_P
, hungry_eol_escapes_p
);
2345 RESOLVE_BOOLEAN_OPTION (CURLY_INFIX_P
, curly_infix_p
);
2346 RESOLVE_BOOLEAN_OPTION (R7RS_SYMBOLS_P
, r7rs_symbols_p
);
2348 #undef RESOLVE_BOOLEAN_OPTION
2350 opts
->neoteric_p
= 0;
2356 SCM read_hash_procs
;
2358 read_hash_procs
= scm_make_fluid_with_default (SCM_EOL
);
2360 scm_i_read_hash_procedures
=
2361 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs
));
2363 scm_init_opts (scm_read_options
, scm_read_opts
);
2364 #include "libguile/read.x"