1 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
2 * 2007, 2008, 2009, 2010, 2011, 2012, 2014 Free Software Foundation, Inc.
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
32 #include <c-strcase.h>
35 #include "libguile/_scm.h"
36 #include "libguile/bytevectors.h"
37 #include "libguile/chars.h"
38 #include "libguile/eval.h"
39 #include "libguile/arrays.h"
40 #include "libguile/bitvectors.h"
41 #include "libguile/keywords.h"
42 #include "libguile/alist.h"
43 #include "libguile/srcprop.h"
44 #include "libguile/hashtab.h"
45 #include "libguile/hash.h"
46 #include "libguile/ports.h"
47 #include "libguile/ports-internal.h"
48 #include "libguile/fports.h"
49 #include "libguile/root.h"
50 #include "libguile/strings.h"
51 #include "libguile/strports.h"
52 #include "libguile/vectors.h"
53 #include "libguile/validate.h"
54 #include "libguile/srfi-4.h"
55 #include "libguile/srfi-13.h"
57 #include "libguile/read.h"
58 #include "libguile/private-options.h"
63 SCM_GLOBAL_SYMBOL (scm_sym_dot
, ".");
64 SCM_SYMBOL (scm_keyword_prefix
, "prefix");
65 SCM_SYMBOL (scm_keyword_postfix
, "postfix");
66 SCM_SYMBOL (sym_nil
, "nil");
68 /* SRFI-105 curly infix expression support */
69 SCM_SYMBOL (sym_nfx
, "$nfx$");
70 SCM_SYMBOL (sym_bracket_list
, "$bracket-list$");
71 SCM_SYMBOL (sym_bracket_apply
, "$bracket-apply$");
73 scm_t_option scm_read_opts
[] =
75 { SCM_OPTION_BOOLEAN
, "copy", 0,
76 "Copy source code expressions." },
77 { SCM_OPTION_BOOLEAN
, "positions", 1,
78 "Record positions of source code expressions." },
79 { SCM_OPTION_BOOLEAN
, "case-insensitive", 0,
80 "Convert symbols to lower case."},
81 { SCM_OPTION_SCM
, "keywords", (scm_t_bits
) SCM_BOOL_F_BITS
,
82 "Style of keyword recognition: #f, 'prefix or 'postfix."},
83 { SCM_OPTION_BOOLEAN
, "r6rs-hex-escapes", 0,
84 "Use R6RS variable-length character and string hex escapes."},
85 { SCM_OPTION_BOOLEAN
, "square-brackets", 1,
86 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
87 { SCM_OPTION_BOOLEAN
, "hungry-eol-escapes", 0,
88 "In strings, consume leading whitespace after an escaped end-of-line."},
89 { SCM_OPTION_BOOLEAN
, "curly-infix", 0,
90 "Support SRFI-105 curly infix expressions."},
91 { SCM_OPTION_BOOLEAN
, "r7rs-symbols", 0,
92 "Support R7RS |...| symbol notation."},
96 /* Internal read options structure. This is initialized by 'scm_read'
97 from the global and per-port read options, and a pointer is passed
98 down to all helper functions. */
102 KEYWORD_STYLE_HASH_PREFIX
,
103 KEYWORD_STYLE_PREFIX
,
104 KEYWORD_STYLE_POSTFIX
109 enum t_keyword_style keyword_style
;
110 unsigned int copy_source_p
: 1;
111 unsigned int record_positions_p
: 1;
112 unsigned int case_insensitive_p
: 1;
113 unsigned int r6rs_escapes_p
: 1;
114 unsigned int square_brackets_p
: 1;
115 unsigned int hungry_eol_escapes_p
: 1;
116 unsigned int curly_infix_p
: 1;
117 unsigned int neoteric_p
: 1;
118 unsigned int r7rs_symbols_p
: 1;
121 typedef struct t_read_opts scm_t_read_opts
;
125 Give meaningful error messages for errors
129 FILE:LINE:COL: MESSAGE
130 This happened in ....
132 This is not standard GNU format, but the test-suite likes the real
133 message to be in front.
139 scm_i_input_error (char const *function
,
140 SCM port
, const char *message
, SCM arg
)
142 SCM fn
= (scm_is_string (SCM_FILENAME(port
))
144 : scm_from_locale_string ("#<unknown port>"));
146 SCM string_port
= scm_open_output_string ();
147 SCM string
= SCM_EOL
;
148 scm_simple_format (string_port
,
149 scm_from_locale_string ("~A:~S:~S: ~A"),
151 scm_from_long (SCM_LINUM (port
) + 1),
152 scm_from_int (SCM_COL (port
) + 1),
153 scm_from_locale_string (message
)));
155 string
= scm_get_output_string (string_port
);
156 scm_close_output_port (string_port
);
157 scm_error_scm (scm_from_latin1_symbol ("read-error"),
158 function
? scm_from_locale_string (function
) : SCM_BOOL_F
,
165 SCM_DEFINE (scm_read_options
, "read-options-interface", 0, 1, 0,
167 "Option interface for the read options. Instead of using\n"
168 "this procedure directly, use the procedures @code{read-enable},\n"
169 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
170 #define FUNC_NAME s_scm_read_options
172 SCM ans
= scm_options (setting
,
175 if (SCM_COPY_SOURCE_P
)
176 SCM_RECORD_POSITIONS_P
= 1;
181 /* A fluid referring to an association list mapping extra hash
182 characters to procedures. */
183 static SCM
*scm_i_read_hash_procedures
;
186 scm_i_read_hash_procedures_ref (void)
188 return scm_fluid_ref (*scm_i_read_hash_procedures
);
192 scm_i_read_hash_procedures_set_x (SCM value
)
194 scm_fluid_set_x (*scm_i_read_hash_procedures
, value
);
201 /* Size of the C buffer used to read symbols and numbers. */
202 #define READER_BUFFER_SIZE 128
204 /* Number of 32-bit codepoints in the buffer used to read strings. */
205 #define READER_STRING_BUFFER_SIZE 128
207 /* The maximum size of Scheme character names. */
208 #define READER_CHAR_NAME_MAX_SIZE 50
210 /* The maximum size of reader directive names. */
211 #define READER_DIRECTIVE_NAME_MAX_SIZE 50
214 /* `isblank' is only in C99. */
215 #define CHAR_IS_BLANK_(_chr) \
216 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
217 || ((_chr) == '\f') || ((_chr) == '\r'))
220 # define CHAR_IS_BLANK(_chr) \
221 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
223 # define CHAR_IS_BLANK CHAR_IS_BLANK_
227 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
229 #define CHAR_IS_R5RS_DELIMITER(c) \
231 || (c) == ')' || (c) == '(' || (c) == ';' || (c) == '"')
233 #define CHAR_IS_DELIMITER(c) \
234 (CHAR_IS_R5RS_DELIMITER (c) \
235 || (((c) == ']' || (c) == '[') && (opts->square_brackets_p \
236 || opts->curly_infix_p)) \
237 || (((c) == '}' || (c) == '{') && opts->curly_infix_p))
239 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
241 #define CHAR_IS_EXPONENT_MARKER(_chr) \
242 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
243 || ((_chr) == 'd') || ((_chr) == 'l'))
245 /* Read an SCSH block comment. */
246 static SCM
scm_read_scsh_block_comment (scm_t_wchar
, SCM
);
247 static SCM
scm_read_r6rs_block_comment (scm_t_wchar
, SCM
);
248 static SCM
scm_read_commented_expression (scm_t_wchar
, SCM
, scm_t_read_opts
*);
249 static SCM
scm_read_shebang (scm_t_wchar
, SCM
, scm_t_read_opts
*);
250 static SCM
scm_get_hash_procedure (int);
252 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
253 result in the pre-allocated buffer BUF. Return zero if the whole token has
254 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
255 bytes actually read. */
257 read_token (SCM port
, scm_t_read_opts
*opts
,
258 char *buf
, size_t buf_size
, size_t *read
)
262 while (*read
< buf_size
)
266 chr
= scm_get_byte_or_eof_unlocked (port
);
270 else if (CHAR_IS_DELIMITER (chr
))
272 scm_unget_byte_unlocked (chr
, port
);
285 /* Like `read_token', but return either BUFFER, or a GC-allocated buffer
286 if the token doesn't fit in BUFFER_SIZE bytes. */
288 read_complete_token (SCM port
, scm_t_read_opts
*opts
,
289 char *buffer
, size_t buffer_size
, size_t *read
)
292 size_t bytes_read
, overflow_size
= 0;
293 char *overflow_buffer
= NULL
;
297 overflow
= read_token (port
, opts
, buffer
, buffer_size
, &bytes_read
);
300 if (overflow
|| overflow_size
!= 0)
302 if (overflow_size
== 0)
304 overflow_buffer
= scm_gc_malloc_pointerless (bytes_read
, "read");
305 memcpy (overflow_buffer
, buffer
, bytes_read
);
306 overflow_size
= bytes_read
;
311 scm_gc_malloc_pointerless (overflow_size
+ bytes_read
, "read");
313 memcpy (new_buf
, overflow_buffer
, overflow_size
);
314 memcpy (new_buf
+ overflow_size
, buffer
, bytes_read
);
316 overflow_buffer
= new_buf
;
317 overflow_size
+= bytes_read
;
324 *read
= overflow_size
;
328 return (overflow_size
> 0 ? overflow_buffer
: buffer
);
331 /* Skip whitespace from PORT and return the first non-whitespace character
332 read. Raise an error on end-of-file. */
334 flush_ws (SCM port
, scm_t_read_opts
*opts
, const char *eoferr
)
338 switch (c
= scm_getc_unlocked (port
))
344 scm_i_input_error (eoferr
,
353 switch (c
= scm_getc_unlocked (port
))
359 case SCM_LINE_INCREMENTORS
:
365 switch (c
= scm_getc_unlocked (port
))
368 eoferr
= "read_sharp";
371 scm_read_shebang (c
, port
, opts
);
374 scm_read_commented_expression (c
, port
, opts
);
377 if (scm_is_false (scm_get_hash_procedure (c
)))
379 scm_read_r6rs_block_comment (c
, port
);
384 scm_ungetc_unlocked (c
, port
);
389 case SCM_LINE_INCREMENTORS
:
390 case SCM_SINGLE_SPACES
:
405 static SCM
scm_read_expression (SCM port
, scm_t_read_opts
*opts
);
406 static SCM
scm_read_sharp (int chr
, SCM port
, scm_t_read_opts
*opts
,
407 long line
, int column
);
411 maybe_annotate_source (SCM x
, SCM port
, scm_t_read_opts
*opts
,
412 long line
, int column
)
414 /* This condition can be caused by a user calling
416 if (line
< 0 || column
< 0)
419 if (opts
->record_positions_p
)
420 scm_i_set_source_properties_x (x
, line
, column
, SCM_FILENAME (port
));
425 scm_read_sexp (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
426 #define FUNC_NAME "scm_i_lreadparen"
429 SCM tmp
, tl
, ans
= SCM_EOL
;
430 const int curly_list_p
= (chr
== '{') && opts
->curly_infix_p
;
431 const int terminating_char
= ((chr
== '{') ? '}'
432 : ((chr
== '[') ? ']'
435 /* Need to capture line and column numbers here. */
436 long line
= SCM_LINUM (port
);
437 int column
= SCM_COL (port
) - 1;
439 c
= flush_ws (port
, opts
, FUNC_NAME
);
440 if (terminating_char
== c
)
443 scm_ungetc_unlocked (c
, port
);
444 tmp
= scm_read_expression (port
, opts
);
446 /* Note that it is possible for scm_read_expression to return
447 scm_sym_dot, but not as part of a dotted pair: as in #{.}#. So
448 check that it's a real dot by checking `c'. */
449 if (c
== '.' && scm_is_eq (scm_sym_dot
, tmp
))
451 ans
= scm_read_expression (port
, opts
);
452 if (terminating_char
!= (c
= flush_ws (port
, opts
, FUNC_NAME
)))
453 scm_i_input_error (FUNC_NAME
, port
, "missing close paren",
458 /* Build the head of the list structure. */
459 ans
= tl
= scm_cons (tmp
, SCM_EOL
);
461 while (terminating_char
!= (c
= flush_ws (port
, opts
, FUNC_NAME
)))
465 if (c
== ')' || (c
== ']' && opts
->square_brackets_p
)
466 || ((c
== '}' || c
== ']') && opts
->curly_infix_p
))
467 scm_i_input_error (FUNC_NAME
, port
,
468 "in pair: mismatched close paren: ~A",
469 scm_list_1 (SCM_MAKE_CHAR (c
)));
471 scm_ungetc_unlocked (c
, port
);
472 tmp
= scm_read_expression (port
, opts
);
474 /* See above note about scm_sym_dot. */
475 if (c
== '.' && scm_is_eq (scm_sym_dot
, tmp
))
477 SCM_SETCDR (tl
, scm_read_expression (port
, opts
));
479 c
= flush_ws (port
, opts
, FUNC_NAME
);
480 if (terminating_char
!= c
)
481 scm_i_input_error (FUNC_NAME
, port
,
482 "in pair: missing close paren", SCM_EOL
);
486 new_tail
= scm_cons (tmp
, SCM_EOL
);
487 SCM_SETCDR (tl
, new_tail
);
493 /* In addition to finding the length, 'scm_ilength' checks for
494 improper or circular lists, in which case it returns -1. */
495 int len
= scm_ilength (ans
);
497 /* The (len == 0) case is handled above */
499 /* Return directly to avoid re-annotating the element's source
500 location with the position of the outer brace. Also, it
501 might not be possible to annotate the element. */
502 return scm_car (ans
); /* {e} => e */
504 ; /* Leave the list unchanged: {e1 e2} => (e1 e2) */
505 else if (len
>= 3 && (len
& 1))
507 /* It's a proper list whose length is odd and at least 3. If
508 the elements at odd indices (the infix operator positions)
509 are all 'equal?', then it's a simple curly-infix list.
510 Otherwise it's a mixed curly-infix list. */
511 SCM op
= scm_cadr (ans
);
513 /* Check to see if the elements at odd indices are 'equal?' */
514 for (tl
= scm_cdddr (ans
); ; tl
= scm_cddr (tl
))
516 if (scm_is_null (tl
))
518 /* Convert simple curly-infix list to prefix:
519 {a <op> b <op> ...} => (<op> a b ...) */
521 while (scm_is_pair (scm_cdr (tl
)))
524 SCM_SETCDR (tl
, tmp
);
527 ans
= scm_cons (op
, ans
);
530 else if (scm_is_false (scm_equal_p (op
, scm_car (tl
))))
532 /* Mixed curly-infix list: {e ...} => ($nfx$ e ...) */
533 ans
= scm_cons (sym_nfx
, ans
);
539 /* Mixed curly-infix (possibly improper) list:
540 {e . tail} => ($nfx$ e . tail) */
541 ans
= scm_cons (sym_nfx
, ans
);
544 return maybe_annotate_source (ans
, port
, opts
, line
, column
);
549 /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
550 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
552 #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
558 while (i < ndigits) \
560 a = scm_getc_unlocked (port); \
564 && (a == (scm_t_wchar) terminator) \
567 if ('0' <= a && a <= '9') \
569 else if ('A' <= a && a <= 'F') \
571 else if ('a' <= a && a <= 'f') \
584 skip_intraline_whitespace (SCM port
)
590 c
= scm_getc_unlocked (port
);
594 while (c
== '\t' || uc_is_general_category (c
, UC_SPACE_SEPARATOR
));
596 scm_ungetc_unlocked (c
, port
);
599 /* Read either a double-quoted string or an R7RS-style symbol delimited
600 by vertical lines, depending on the value of 'chr' ('"' or '|').
601 Regardless, the result is always returned as a string. */
603 scm_read_string_like_syntax (int chr
, SCM port
, scm_t_read_opts
*opts
)
604 #define FUNC_NAME "scm_lreadr"
606 /* For strings smaller than C_STR, this function creates only one Scheme
607 object (the string returned). */
610 size_t c_str_len
= 0;
611 scm_t_wchar c
, c_str
[READER_STRING_BUFFER_SIZE
];
613 /* Need to capture line and column numbers here. */
614 long line
= SCM_LINUM (port
);
615 int column
= SCM_COL (port
) - 1;
617 while (chr
!= (c
= scm_getc_unlocked (port
)))
622 scm_i_input_error (FUNC_NAME
, port
,
624 ? "end of file in symbol"
625 : "end of file in string constant"),
629 if (c_str_len
+ 1 >= READER_STRING_BUFFER_SIZE
)
631 str
= scm_cons (scm_from_utf32_stringn (c_str
, c_str_len
), str
);
637 switch (c
= scm_getc_unlocked (port
))
645 if (opts
->hungry_eol_escapes_p
)
646 skip_intraline_whitespace (port
);
673 if (opts
->r6rs_escapes_p
|| chr
== '|')
674 SCM_READ_HEX_ESCAPE (10, ';');
676 SCM_READ_HEX_ESCAPE (2, '\0');
679 if (!opts
->r6rs_escapes_p
)
681 SCM_READ_HEX_ESCAPE (4, '\0');
685 if (!opts
->r6rs_escapes_p
)
687 SCM_READ_HEX_ESCAPE (6, '\0');
694 scm_i_input_error (FUNC_NAME
, port
,
695 "illegal character in escape sequence: ~S",
696 scm_list_1 (SCM_MAKE_CHAR (c
)));
700 c_str
[c_str_len
++] = c
;
703 if (scm_is_null (str
))
704 /* Fast path: we got a string that fits in C_STR. */
705 str
= scm_from_utf32_stringn (c_str
, c_str_len
);
709 str
= scm_cons (scm_from_utf32_stringn (c_str
, c_str_len
), str
);
711 str
= scm_string_concatenate_reverse (str
, SCM_UNDEFINED
, SCM_UNDEFINED
);
714 return maybe_annotate_source (str
, port
, opts
, line
, column
);
719 scm_read_string (int chr
, SCM port
, scm_t_read_opts
*opts
)
721 return scm_read_string_like_syntax (chr
, port
, opts
);
725 scm_read_r7rs_symbol (int chr
, SCM port
, scm_t_read_opts
*opts
)
727 return scm_string_to_symbol (scm_read_string_like_syntax (chr
, port
, opts
));
731 scm_read_number (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
733 SCM result
, str
= SCM_EOL
;
734 char local_buffer
[READER_BUFFER_SIZE
], *buffer
;
737 /* Need to capture line and column numbers here. */
738 long line
= SCM_LINUM (port
);
739 int column
= SCM_COL (port
) - 1;
741 scm_ungetc_unlocked (chr
, port
);
742 buffer
= read_complete_token (port
, opts
, local_buffer
, sizeof local_buffer
,
745 str
= scm_from_port_stringn (buffer
, bytes_read
, port
);
747 result
= scm_string_to_number (str
, SCM_UNDEFINED
);
748 if (scm_is_false (result
))
750 /* Return a symbol instead of a number */
751 if (opts
->case_insensitive_p
)
752 str
= scm_string_downcase_x (str
);
753 result
= scm_string_to_symbol (str
);
755 else if (SCM_NIMP (result
))
756 result
= maybe_annotate_source (result
, port
, opts
, line
, column
);
758 SCM_COL (port
) += scm_i_string_length (str
);
763 scm_read_mixed_case_symbol (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
766 int ends_with_colon
= 0;
768 int postfix
= (opts
->keyword_style
== KEYWORD_STYLE_POSTFIX
);
769 char local_buffer
[READER_BUFFER_SIZE
], *buffer
;
772 scm_ungetc_unlocked (chr
, port
);
773 buffer
= read_complete_token (port
, opts
, local_buffer
, sizeof local_buffer
,
776 ends_with_colon
= buffer
[bytes_read
- 1] == ':';
778 if (postfix
&& ends_with_colon
&& (bytes_read
> 1))
780 str
= scm_from_port_stringn (buffer
, bytes_read
- 1, port
);
782 if (opts
->case_insensitive_p
)
783 str
= scm_string_downcase_x (str
);
784 result
= scm_symbol_to_keyword (scm_string_to_symbol (str
));
788 str
= scm_from_port_stringn (buffer
, bytes_read
, port
);
790 if (opts
->case_insensitive_p
)
791 str
= scm_string_downcase_x (str
);
792 result
= scm_string_to_symbol (str
);
795 SCM_COL (port
) += scm_i_string_length (str
);
800 scm_read_number_and_radix (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
801 #define FUNC_NAME "scm_lreadr"
805 char local_buffer
[READER_BUFFER_SIZE
], *buffer
;
832 scm_ungetc_unlocked (chr
, port
);
833 scm_ungetc_unlocked ('#', port
);
837 buffer
= read_complete_token (port
, opts
, local_buffer
, sizeof local_buffer
,
840 str
= scm_from_port_stringn (buffer
, read
, port
);
842 result
= scm_string_to_number (str
, scm_from_uint (radix
));
844 SCM_COL (port
) += scm_i_string_length (str
);
846 if (scm_is_true (result
))
849 scm_i_input_error (FUNC_NAME
, port
, "unknown # object", SCM_EOL
);
856 scm_read_quote (int chr
, SCM port
, scm_t_read_opts
*opts
)
859 long line
= SCM_LINUM (port
);
860 int column
= SCM_COL (port
) - 1;
865 p
= scm_sym_quasiquote
;
876 c
= scm_getc_unlocked (port
);
878 p
= scm_sym_uq_splicing
;
881 scm_ungetc_unlocked (c
, port
);
888 fprintf (stderr
, "%s: unhandled quote character (%i)\n",
889 "scm_read_quote", chr
);
893 p
= scm_cons2 (p
, scm_read_expression (port
, opts
), SCM_EOL
);
894 return maybe_annotate_source (p
, port
, opts
, line
, column
);
897 SCM_SYMBOL (sym_syntax
, "syntax");
898 SCM_SYMBOL (sym_quasisyntax
, "quasisyntax");
899 SCM_SYMBOL (sym_unsyntax
, "unsyntax");
900 SCM_SYMBOL (sym_unsyntax_splicing
, "unsyntax-splicing");
903 scm_read_syntax (int chr
, SCM port
, scm_t_read_opts
*opts
)
906 long line
= SCM_LINUM (port
);
907 int column
= SCM_COL (port
) - 1;
923 c
= scm_getc_unlocked (port
);
925 p
= sym_unsyntax_splicing
;
928 scm_ungetc_unlocked (c
, port
);
935 fprintf (stderr
, "%s: unhandled syntax character (%i)\n",
936 "scm_read_syntax", chr
);
940 p
= scm_cons2 (p
, scm_read_expression (port
, opts
), SCM_EOL
);
941 return maybe_annotate_source (p
, port
, opts
, line
, column
);
945 scm_read_nil (int chr
, SCM port
, scm_t_read_opts
*opts
)
947 SCM id
= scm_read_mixed_case_symbol (chr
, port
, opts
);
949 if (!scm_is_eq (id
, sym_nil
))
950 scm_i_input_error ("scm_read_nil", port
,
951 "unexpected input while reading #nil: ~a",
954 return SCM_ELISP_NIL
;
958 scm_read_semicolon_comment (int chr
, SCM port
)
962 /* We use the get_byte here because there is no need to get the
963 locale correct with comment input. This presumes that newline
964 always represents itself no matter what the encoding is. */
965 for (c
= scm_get_byte_or_eof_unlocked (port
);
966 (c
!= EOF
) && (c
!= '\n');
967 c
= scm_get_byte_or_eof_unlocked (port
));
969 return SCM_UNSPECIFIED
;
972 /* If the EXPECTED_CHARS are the next ones available from PORT, then
973 consume them and return 1. Otherwise leave the port position where
974 it was and return 0. EXPECTED_CHARS should be all lowercase, and
975 will be matched case-insensitively against the characters read from
978 try_read_ci_chars (SCM port
, const char *expected_chars
)
980 int num_chars_wanted
= strlen (expected_chars
);
981 int num_chars_read
= 0;
982 char *chars_read
= alloca (num_chars_wanted
);
985 while (num_chars_read
< num_chars_wanted
)
987 c
= scm_getc_unlocked (port
);
990 else if (c_tolower (c
) != expected_chars
[num_chars_read
])
992 scm_ungetc_unlocked (c
, port
);
996 chars_read
[num_chars_read
++] = c
;
999 if (num_chars_read
== num_chars_wanted
)
1003 while (num_chars_read
> 0)
1004 scm_ungetc_unlocked (chars_read
[--num_chars_read
], port
);
1010 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
1013 scm_read_boolean (int chr
, SCM port
)
1019 try_read_ci_chars (port
, "rue");
1024 try_read_ci_chars (port
, "alse");
1028 return SCM_UNSPECIFIED
;
1032 scm_read_character (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
1033 #define FUNC_NAME "scm_lreadr"
1035 char buffer
[READER_CHAR_NAME_MAX_SIZE
];
1037 size_t charname_len
, bytes_read
;
1040 scm_t_port_internal
*pti
;
1042 overflow
= read_token (port
, opts
, buffer
, READER_CHAR_NAME_MAX_SIZE
,
1045 scm_i_input_error (FUNC_NAME
, port
, "character name too long", SCM_EOL
);
1047 if (bytes_read
== 0)
1049 chr
= scm_getc_unlocked (port
);
1051 scm_i_input_error (FUNC_NAME
, port
, "unexpected end of file "
1052 "while reading character", SCM_EOL
);
1054 /* CHR must be a token delimiter, like a whitespace. */
1055 return (SCM_MAKE_CHAR (chr
));
1058 pti
= SCM_PORT_GET_INTERNAL (port
);
1060 /* Simple ASCII characters can be processed immediately. Also, simple
1061 ISO-8859-1 characters can be processed immediately if the encoding for this
1062 port is ISO-8859-1. */
1063 if (bytes_read
== 1 &&
1064 ((unsigned char) buffer
[0] <= 127
1065 || pti
->encoding_mode
== SCM_PORT_ENCODING_MODE_LATIN1
))
1067 SCM_COL (port
) += 1;
1068 return SCM_MAKE_CHAR (buffer
[0]);
1071 /* Otherwise, convert the buffer into a proper scheme string for
1073 charname
= scm_from_port_stringn (buffer
, bytes_read
, port
);
1074 charname_len
= scm_i_string_length (charname
);
1075 SCM_COL (port
) += charname_len
;
1076 cp
= scm_i_string_ref (charname
, 0);
1077 if (charname_len
== 1)
1078 return SCM_MAKE_CHAR (cp
);
1080 /* Ignore dotted circles, which may be used to keep combining characters from
1081 combining with the backslash in #\charname. */
1082 if (cp
== SCM_CODEPOINT_DOTTED_CIRCLE
&& charname_len
== 2)
1083 return SCM_MAKE_CHAR (scm_i_string_ref (charname
, 1));
1085 if (cp
>= '0' && cp
< '8')
1087 /* Dirk:FIXME:: This type of character syntax is not R5RS
1088 * compliant. Further, it should be verified that the constant
1089 * does only consist of octal digits. */
1090 SCM p
= scm_string_to_number (charname
, scm_from_uint (8));
1091 if (SCM_I_INUMP (p
))
1093 scm_t_wchar c
= scm_to_uint32 (p
);
1094 if (SCM_IS_UNICODE_CHAR (c
))
1095 return SCM_MAKE_CHAR (c
);
1097 scm_i_input_error (FUNC_NAME
, port
,
1098 "out-of-range octal character escape: ~a",
1099 scm_list_1 (charname
));
1103 if (cp
== 'x' && (charname_len
> 1))
1107 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1108 p
= scm_string_to_number (scm_c_substring (charname
, 1, charname_len
),
1109 scm_from_uint (16));
1110 if (SCM_I_INUMP (p
))
1112 scm_t_wchar c
= scm_to_uint32 (p
);
1113 if (SCM_IS_UNICODE_CHAR (c
))
1114 return SCM_MAKE_CHAR (c
);
1116 scm_i_input_error (FUNC_NAME
, port
,
1117 "out-of-range hex character escape: ~a",
1118 scm_list_1 (charname
));
1122 /* The names of characters should never have non-Latin1
1124 if (scm_i_is_narrow_string (charname
)
1125 || scm_i_try_narrow_string (charname
))
1126 { SCM ch
= scm_i_charname_to_char (scm_i_string_chars (charname
),
1128 if (scm_is_true (ch
))
1132 scm_i_input_error (FUNC_NAME
, port
, "unknown character name ~a",
1133 scm_list_1 (charname
));
1135 return SCM_UNSPECIFIED
;
1140 scm_read_keyword (int chr
, SCM port
, scm_t_read_opts
*opts
)
1144 /* Read the symbol that comprises the keyword. Doing this instead of
1145 invoking a specific symbol reader function allows `scm_read_keyword ()'
1146 to adapt to the delimiters currently valid of symbols.
1148 XXX: This implementation allows sloppy syntaxes like `#: key'. */
1149 symbol
= scm_read_expression (port
, opts
);
1150 if (!scm_is_symbol (symbol
))
1151 scm_i_input_error ("scm_read_keyword", port
,
1152 "keyword prefix `~a' not followed by a symbol: ~s",
1153 scm_list_2 (SCM_MAKE_CHAR (chr
), symbol
));
1155 return (scm_symbol_to_keyword (symbol
));
1159 scm_read_vector (int chr
, SCM port
, scm_t_read_opts
*opts
,
1160 long line
, int column
)
1162 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1163 guarantee that it's going to do what we want. After all, this is an
1164 implementation detail of `scm_read_vector ()', not a desirable
1166 return maybe_annotate_source (scm_vector (scm_read_sexp (chr
, port
, opts
)),
1167 port
, opts
, line
, column
);
1170 /* Helper used by scm_read_array */
1172 read_decimal_integer (SCM port
, int c
, ssize_t
*resp
)
1181 c
= scm_getc_unlocked (port
);
1184 while ('0' <= c
&& c
<= '9')
1186 if (((SSIZE_MAX
- (c
-'0')) / 10) <= res
)
1187 scm_i_input_error ("read_decimal_integer", port
,
1188 "number too large", SCM_EOL
);
1189 res
= 10*res
+ c
-'0';
1191 c
= scm_getc_unlocked (port
);
1199 /* Read an array. This function can also read vectors and uniform
1200 vectors. Also, the conflict between '#f' and '#f32' and '#f64' is
1203 C is the first character read after the '#'. */
1205 scm_read_array (int c
, SCM port
, scm_t_read_opts
*opts
, long line
, int column
)
1208 scm_t_wchar tag_buf
[8];
1211 SCM tag
, shape
= SCM_BOOL_F
, elements
, array
;
1213 /* XXX - shortcut for ordinary vectors. Shouldn't be necessary but
1214 the array code can not deal with zero-length dimensions yet, and
1215 we want to allow zero-length vectors, of course. */
1217 return scm_read_vector (c
, port
, opts
, line
, column
);
1219 /* Disambiguate between '#f' and uniform floating point vectors. */
1222 c
= scm_getc_unlocked (port
);
1223 if (c
!= '3' && c
!= '6')
1225 if (c
== 'a' && try_read_ci_chars (port
, "lse"))
1228 scm_ungetc_unlocked (c
, port
);
1234 goto continue_reading_tag
;
1239 c
= read_decimal_integer (port
, c
, &rank
);
1241 scm_i_input_error (NULL
, port
, "array rank must be non-negative",
1246 continue_reading_tag
:
1247 while (c
!= EOF
&& c
!= '(' && c
!= '@' && c
!= ':'
1248 && tag_len
< sizeof tag_buf
/ sizeof tag_buf
[0])
1250 tag_buf
[tag_len
++] = c
;
1251 c
= scm_getc_unlocked (port
);
1257 tag
= scm_string_to_symbol (scm_from_utf32_stringn (tag_buf
, tag_len
));
1258 if (tag_len
== sizeof tag_buf
/ sizeof tag_buf
[0])
1259 scm_i_input_error (NULL
, port
, "invalid array tag, starting with: ~a",
1264 if (c
== '@' || c
== ':')
1270 ssize_t lbnd
= 0, len
= 0;
1275 c
= scm_getc_unlocked (port
);
1276 c
= read_decimal_integer (port
, c
, &lbnd
);
1279 s
= scm_from_ssize_t (lbnd
);
1283 c
= scm_getc_unlocked (port
);
1284 c
= read_decimal_integer (port
, c
, &len
);
1286 scm_i_input_error (NULL
, port
,
1287 "array length must be non-negative",
1290 s
= scm_list_2 (s
, scm_from_ssize_t (lbnd
+len
-1));
1293 shape
= scm_cons (s
, shape
);
1294 } while (c
== '@' || c
== ':');
1296 shape
= scm_reverse_x (shape
, SCM_EOL
);
1299 /* Read nested lists of elements. */
1301 scm_i_input_error (NULL
, port
,
1302 "missing '(' in vector or array literal",
1304 elements
= scm_read_sexp (c
, port
, opts
);
1306 if (scm_is_false (shape
))
1307 shape
= scm_from_ssize_t (rank
);
1308 else if (scm_ilength (shape
) != rank
)
1311 "the number of shape specifications must match the array rank",
1314 /* Handle special print syntax of rank zero arrays; see
1315 scm_i_print_array for a rationale. */
1318 if (!scm_is_pair (elements
))
1319 scm_i_input_error (NULL
, port
,
1320 "too few elements in array literal, need 1",
1322 if (!scm_is_null (SCM_CDR (elements
)))
1323 scm_i_input_error (NULL
, port
,
1324 "too many elements in array literal, want 1",
1326 elements
= SCM_CAR (elements
);
1329 /* Construct array, annotate with source location, and return. */
1330 array
= scm_list_to_typed_array (tag
, shape
, elements
);
1331 return maybe_annotate_source (array
, port
, opts
, line
, column
);
1335 scm_read_srfi4_vector (int chr
, SCM port
, scm_t_read_opts
*opts
,
1336 long line
, int column
)
1338 return scm_read_array (chr
, port
, opts
, line
, column
);
1342 scm_read_bytevector (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
,
1343 long line
, int column
)
1345 chr
= scm_getc_unlocked (port
);
1349 chr
= scm_getc_unlocked (port
);
1353 chr
= scm_getc_unlocked (port
);
1357 return maybe_annotate_source
1358 (scm_u8_list_to_bytevector (scm_read_sexp (chr
, port
, opts
)),
1359 port
, opts
, line
, column
);
1362 scm_i_input_error ("read_bytevector", port
,
1363 "invalid bytevector prefix",
1364 SCM_MAKE_CHAR (chr
));
1365 return SCM_UNSPECIFIED
;
1369 scm_read_guile_bit_vector (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
,
1370 long line
, int column
)
1372 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1373 terribly inefficient but who cares? */
1374 SCM s_bits
= SCM_EOL
;
1376 for (chr
= scm_getc_unlocked (port
);
1377 (chr
!= EOF
) && ((chr
== '0') || (chr
== '1'));
1378 chr
= scm_getc_unlocked (port
))
1380 s_bits
= scm_cons ((chr
== '0') ? SCM_BOOL_F
: SCM_BOOL_T
, s_bits
);
1384 scm_ungetc_unlocked (chr
, port
);
1386 return maybe_annotate_source
1387 (scm_bitvector (scm_reverse_x (s_bits
, SCM_EOL
)),
1388 port
, opts
, line
, column
);
1392 scm_read_scsh_block_comment (scm_t_wchar chr
, SCM port
)
1398 int c
= scm_getc_unlocked (port
);
1401 scm_i_input_error ("skip_block_comment", port
,
1402 "unterminated `#! ... !#' comment", SCM_EOL
);
1406 else if (c
== '#' && bang_seen
)
1412 return SCM_UNSPECIFIED
;
1415 static void set_port_case_insensitive_p (SCM port
, scm_t_read_opts
*opts
,
1417 static void set_port_square_brackets_p (SCM port
, scm_t_read_opts
*opts
,
1419 static void set_port_curly_infix_p (SCM port
, scm_t_read_opts
*opts
,
1423 scm_read_shebang (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
1425 char name
[READER_DIRECTIVE_NAME_MAX_SIZE
+ 1];
1429 while (i
<= READER_DIRECTIVE_NAME_MAX_SIZE
)
1431 c
= scm_getc_unlocked (port
);
1433 scm_i_input_error ("skip_block_comment", port
,
1434 "unterminated `#! ... !#' comment", SCM_EOL
);
1435 else if (('a' <= c
&& c
<= 'z') || ('0' <= c
&& c
<= '9') || c
== '-')
1437 else if (CHAR_IS_DELIMITER (c
))
1439 scm_ungetc_unlocked (c
, port
);
1441 if (0 == strcmp ("r6rs", name
))
1442 ; /* Silently ignore */
1443 else if (0 == strcmp ("fold-case", name
))
1444 set_port_case_insensitive_p (port
, opts
, 1);
1445 else if (0 == strcmp ("no-fold-case", name
))
1446 set_port_case_insensitive_p (port
, opts
, 0);
1447 else if (0 == strcmp ("curly-infix", name
))
1448 set_port_curly_infix_p (port
, opts
, 1);
1449 else if (0 == strcmp ("curly-infix-and-bracket-lists", name
))
1451 set_port_curly_infix_p (port
, opts
, 1);
1452 set_port_square_brackets_p (port
, opts
, 0);
1457 return SCM_UNSPECIFIED
;
1461 scm_ungetc_unlocked (c
, port
);
1466 scm_ungetc_unlocked (name
[--i
], port
);
1467 return scm_read_scsh_block_comment (chr
, port
);
1471 scm_read_r6rs_block_comment (scm_t_wchar chr
, SCM port
)
1473 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1474 nested. So care must be taken. */
1475 int nesting_level
= 1;
1477 int a
= scm_getc_unlocked (port
);
1480 scm_i_input_error ("scm_read_r6rs_block_comment", port
,
1481 "unterminated `#| ... |#' comment", SCM_EOL
);
1483 while (nesting_level
> 0)
1485 int b
= scm_getc_unlocked (port
);
1488 scm_i_input_error ("scm_read_r6rs_block_comment", port
,
1489 "unterminated `#| ... |#' comment", SCM_EOL
);
1491 if (a
== '|' && b
== '#')
1496 else if (a
== '#' && b
== '|')
1505 return SCM_UNSPECIFIED
;
1509 scm_read_commented_expression (scm_t_wchar chr
, SCM port
,
1510 scm_t_read_opts
*opts
)
1514 c
= flush_ws (port
, opts
, (char *) NULL
);
1516 scm_i_input_error ("read_commented_expression", port
,
1517 "no expression after #; comment", SCM_EOL
);
1518 scm_ungetc_unlocked (c
, port
);
1519 scm_read_expression (port
, opts
);
1520 return SCM_UNSPECIFIED
;
1524 scm_read_extended_symbol (scm_t_wchar chr
, SCM port
)
1526 /* Guile's extended symbol read syntax looks like this:
1528 #{This is all a symbol name}#
1530 So here, CHR is expected to be `{'. */
1533 SCM buf
= scm_i_make_string (1024, NULL
, 0);
1535 buf
= scm_i_string_start_writing (buf
);
1537 while ((chr
= scm_getc_unlocked (port
)) != EOF
)
1548 scm_i_string_set_x (buf
, len
++, '}');
1554 else if (chr
== '\\')
1556 /* It used to be that print.c would print extended-read-syntax
1557 symbols with backslashes before "non-standard" chars, but
1558 this routine wouldn't do anything with those escapes.
1559 Bummer. What we've done is to change print.c to output
1560 R6RS hex escapes for those characters, relying on the fact
1561 that the extended read syntax would never put a `\' before
1562 an `x'. For now, we just ignore other instances of
1563 backslash in the string. */
1564 switch ((chr
= scm_getc_unlocked (port
)))
1572 SCM_READ_HEX_ESCAPE (10, ';');
1573 scm_i_string_set_x (buf
, len
++, c
);
1581 scm_i_string_stop_writing ();
1582 scm_i_input_error ("scm_read_extended_symbol", port
,
1583 "illegal character in escape sequence: ~S",
1584 scm_list_1 (SCM_MAKE_CHAR (c
)));
1588 scm_i_string_set_x (buf
, len
++, chr
);
1593 scm_i_string_set_x (buf
, len
++, chr
);
1595 if (len
>= scm_i_string_length (buf
) - 2)
1599 scm_i_string_stop_writing ();
1600 addy
= scm_i_make_string (1024, NULL
, 0);
1601 buf
= scm_string_append (scm_list_2 (buf
, addy
));
1603 buf
= scm_i_string_start_writing (buf
);
1608 scm_i_string_stop_writing ();
1610 scm_i_input_error ("scm_read_extended_symbol", port
,
1611 "end of file while reading symbol", SCM_EOL
);
1613 return (scm_string_to_symbol (scm_c_substring (buf
, 0, len
)));
1618 /* Top-level token readers, i.e., dispatchers. */
1621 scm_read_sharp_extension (int chr
, SCM port
, scm_t_read_opts
*opts
)
1625 proc
= scm_get_hash_procedure (chr
);
1626 if (scm_is_true (scm_procedure_p (proc
)))
1628 long line
= SCM_LINUM (port
);
1629 int column
= SCM_COL (port
) - 2;
1632 got
= scm_call_2 (proc
, SCM_MAKE_CHAR (chr
), port
);
1634 if (opts
->record_positions_p
&& SCM_NIMP (got
)
1635 && !scm_i_has_source_properties (got
))
1636 scm_i_set_source_properties_x (got
, line
, column
, SCM_FILENAME (port
));
1641 return SCM_UNSPECIFIED
;
1644 /* The reader for the sharp `#' character. It basically dispatches reads
1645 among the above token readers. */
1647 scm_read_sharp (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
,
1648 long line
, int column
)
1649 #define FUNC_NAME "scm_lreadr"
1653 chr
= scm_getc_unlocked (port
);
1655 result
= scm_read_sharp_extension (chr
, port
, opts
);
1656 if (!scm_is_eq (result
, SCM_UNSPECIFIED
))
1662 return (scm_read_character (chr
, port
, opts
));
1664 return (scm_read_vector (chr
, port
, opts
, line
, column
));
1669 /* This one may return either a boolean or an SRFI-4 vector. */
1670 return (scm_read_srfi4_vector (chr
, port
, opts
, line
, column
));
1672 return (scm_read_bytevector (chr
, port
, opts
, line
, column
));
1674 return (scm_read_guile_bit_vector (chr
, port
, opts
, line
, column
));
1678 return (scm_read_boolean (chr
, port
));
1680 return (scm_read_keyword (chr
, port
, opts
));
1681 case '0': case '1': case '2': case '3': case '4':
1682 case '5': case '6': case '7': case '8': case '9':
1684 return (scm_read_array (chr
, port
, opts
, line
, column
));
1698 return (scm_read_number_and_radix (chr
, port
, opts
));
1700 return (scm_read_extended_symbol (chr
, port
));
1702 return (scm_read_shebang (chr
, port
, opts
));
1704 return (scm_read_commented_expression (chr
, port
, opts
));
1708 return (scm_read_syntax (chr
, port
, opts
));
1710 return (scm_read_nil (chr
, port
, opts
));
1712 result
= scm_read_sharp_extension (chr
, port
, opts
);
1713 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1715 /* To remain compatible with 1.8 and earlier, the following
1716 characters have lower precedence than `read-hash-extend'
1721 return scm_read_r6rs_block_comment (chr
, port
);
1723 scm_i_input_error (FUNC_NAME
, port
, "Unknown # object: ~S",
1724 scm_list_1 (SCM_MAKE_CHAR (chr
)));
1731 return SCM_UNSPECIFIED
;
1736 read_inner_expression (SCM port
, scm_t_read_opts
*opts
)
1737 #define FUNC_NAME "read_inner_expression"
1743 chr
= scm_getc_unlocked (port
);
1747 case SCM_WHITE_SPACES
:
1748 case SCM_LINE_INCREMENTORS
:
1751 (void) scm_read_semicolon_comment (chr
, port
);
1754 if (opts
->curly_infix_p
)
1756 if (opts
->neoteric_p
)
1757 return scm_read_sexp (chr
, port
, opts
);
1762 /* Enable neoteric expressions within curly braces */
1763 opts
->neoteric_p
= 1;
1764 expr
= scm_read_sexp (chr
, port
, opts
);
1765 opts
->neoteric_p
= 0;
1770 return scm_read_mixed_case_symbol (chr
, port
, opts
);
1772 if (opts
->square_brackets_p
)
1773 return scm_read_sexp (chr
, port
, opts
);
1774 else if (opts
->curly_infix_p
)
1776 /* The syntax of neoteric expressions requires that '[' be
1777 a delimiter when curly-infix is enabled, so it cannot
1778 be part of an unescaped symbol. We might as well do
1779 something useful with it, so we adopt Kawa's convention:
1780 [...] => ($bracket-list$ ...) */
1781 long line
= SCM_LINUM (port
);
1782 int column
= SCM_COL (port
) - 1;
1783 return maybe_annotate_source
1784 (scm_cons (sym_bracket_list
, scm_read_sexp (chr
, port
, opts
)),
1785 port
, opts
, line
, column
);
1788 return scm_read_mixed_case_symbol (chr
, port
, opts
);
1790 return (scm_read_sexp (chr
, port
, opts
));
1792 return (scm_read_string (chr
, port
, opts
));
1794 if (opts
->r7rs_symbols_p
)
1795 return scm_read_r7rs_symbol (chr
, port
, opts
);
1797 return scm_read_mixed_case_symbol (chr
, port
, opts
);
1801 return (scm_read_quote (chr
, port
, opts
));
1804 long line
= SCM_LINUM (port
);
1805 int column
= SCM_COL (port
) - 1;
1806 SCM result
= scm_read_sharp (chr
, port
, opts
, line
, column
);
1807 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1808 /* We read a comment or some such. */
1814 scm_i_input_error (FUNC_NAME
, port
, "unexpected \")\"", SCM_EOL
);
1817 if (opts
->curly_infix_p
)
1818 scm_i_input_error (FUNC_NAME
, port
, "unexpected \"}\"", SCM_EOL
);
1820 return scm_read_mixed_case_symbol (chr
, port
, opts
);
1822 if (opts
->square_brackets_p
)
1823 scm_i_input_error (FUNC_NAME
, port
, "unexpected \"]\"", SCM_EOL
);
1824 /* otherwise fall through */
1828 if (opts
->keyword_style
== KEYWORD_STYLE_PREFIX
)
1829 return scm_symbol_to_keyword (scm_read_expression (port
, opts
));
1834 if (((chr
>= '0') && (chr
<= '9'))
1835 || (strchr ("+-.", chr
)))
1836 return (scm_read_number (chr
, port
, opts
));
1838 return (scm_read_mixed_case_symbol (chr
, port
, opts
));
1846 scm_read_expression (SCM port
, scm_t_read_opts
*opts
)
1847 #define FUNC_NAME "scm_read_expression"
1849 if (!opts
->neoteric_p
)
1850 return read_inner_expression (port
, opts
);
1857 if (opts
->record_positions_p
)
1859 /* We need to get the position of the first non-whitespace
1860 character in order to correctly annotate neoteric
1861 expressions. For example, for the expression 'f(x)', the
1862 first call to 'read_inner_expression' reads the 'f' (which
1863 cannot be annotated), and then we later read the '(x)' and
1864 use it to construct the new list (f x). */
1865 int c
= flush_ws (port
, opts
, (char *) NULL
);
1868 scm_ungetc_unlocked (c
, port
);
1869 line
= SCM_LINUM (port
);
1870 column
= SCM_COL (port
);
1873 expr
= read_inner_expression (port
, opts
);
1875 /* 'expr' is the first component of the neoteric expression. Now
1876 we loop, and as long as the next character is '(', '[', or '{',
1877 (without any intervening whitespace), we use it to construct a
1878 new expression. For example, f{n - 1}(x) => ((f (- n 1)) x). */
1881 int chr
= scm_getc_unlocked (port
);
1884 /* e(...) => (e ...) */
1885 expr
= scm_cons (expr
, scm_read_sexp (chr
, port
, opts
));
1886 else if (chr
== '[')
1887 /* e[...] => ($bracket-apply$ e ...) */
1888 expr
= scm_cons (sym_bracket_apply
,
1890 scm_read_sexp (chr
, port
, opts
)));
1891 else if (chr
== '{')
1893 SCM arg
= scm_read_sexp (chr
, port
, opts
);
1895 if (scm_is_null (arg
))
1896 expr
= scm_list_1 (expr
); /* e{} => (e) */
1898 expr
= scm_list_2 (expr
, arg
); /* e{...} => (e {...}) */
1903 scm_ungetc_unlocked (chr
, port
);
1906 maybe_annotate_source (expr
, port
, opts
, line
, column
);
1914 /* Actual reader. */
1916 static void init_read_options (SCM port
, scm_t_read_opts
*opts
);
1918 SCM_DEFINE (scm_read
, "read", 0, 1, 0,
1920 "Read an s-expression from the input port @var{port}, or from\n"
1921 "the current input port if @var{port} is not specified.\n"
1922 "Any whitespace before the next token is discarded.")
1923 #define FUNC_NAME s_scm_read
1925 scm_t_read_opts opts
;
1928 if (SCM_UNBNDP (port
))
1929 port
= scm_current_input_port ();
1930 SCM_VALIDATE_OPINPORT (1, port
);
1932 init_read_options (port
, &opts
);
1934 c
= flush_ws (port
, &opts
, (char *) NULL
);
1937 scm_ungetc_unlocked (c
, port
);
1939 return (scm_read_expression (port
, &opts
));
1946 /* Manipulate the read-hash-procedures alist. This could be written in
1947 Scheme, but maybe it will also be used by C code during initialisation. */
1948 SCM_DEFINE (scm_read_hash_extend
, "read-hash-extend", 2, 0, 0,
1949 (SCM chr
, SCM proc
),
1950 "Install the procedure @var{proc} for reading expressions\n"
1951 "starting with the character sequence @code{#} and @var{chr}.\n"
1952 "@var{proc} will be called with two arguments: the character\n"
1953 "@var{chr} and the port to read further data from. The object\n"
1954 "returned will be the return value of @code{read}. \n"
1955 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1957 #define FUNC_NAME s_scm_read_hash_extend
1962 SCM_VALIDATE_CHAR (1, chr
);
1963 SCM_ASSERT (scm_is_false (proc
)
1964 || scm_is_eq (scm_procedure_p (proc
), SCM_BOOL_T
),
1965 proc
, SCM_ARG2
, FUNC_NAME
);
1967 /* Check if chr is already in the alist. */
1968 this = scm_i_read_hash_procedures_ref ();
1972 if (scm_is_null (this))
1974 /* not found, so add it to the beginning. */
1975 if (scm_is_true (proc
))
1977 SCM
new = scm_cons (scm_cons (chr
, proc
),
1978 scm_i_read_hash_procedures_ref ());
1979 scm_i_read_hash_procedures_set_x (new);
1983 if (scm_is_eq (chr
, SCM_CAAR (this)))
1985 /* already in the alist. */
1986 if (scm_is_false (proc
))
1989 if (scm_is_false (prev
))
1991 SCM rest
= SCM_CDR (scm_i_read_hash_procedures_ref ());
1992 scm_i_read_hash_procedures_set_x (rest
);
1995 scm_set_cdr_x (prev
, SCM_CDR (this));
2000 scm_set_cdr_x (SCM_CAR (this), proc
);
2005 this = SCM_CDR (this);
2008 return SCM_UNSPECIFIED
;
2012 /* Recover the read-hash procedure corresponding to char c. */
2014 scm_get_hash_procedure (int c
)
2016 SCM rest
= scm_i_read_hash_procedures_ref ();
2020 if (scm_is_null (rest
))
2023 if (SCM_CHAR (SCM_CAAR (rest
)) == c
)
2024 return SCM_CDAR (rest
);
2026 rest
= SCM_CDR (rest
);
2031 is_encoding_char (char c
)
2033 if (c
>= 'a' && c
<= 'z') return 1;
2034 if (c
>= 'A' && c
<= 'Z') return 1;
2035 if (c
>= '0' && c
<= '9') return 1;
2036 return strchr ("_-.:/,+=()", c
) != NULL
;
2039 /* Maximum size of an encoding name. This is a bit more than the
2040 longest name listed at
2041 <http://www.iana.org/assignments/character-sets> ("ISO-2022-JP-2", 13
2043 #define ENCODING_NAME_MAX_SIZE 20
2045 /* Number of bytes at the beginning or end of a file that are scanned
2046 for a "coding:" declaration. */
2047 #define SCM_ENCODING_SEARCH_SIZE (500 + ENCODING_NAME_MAX_SIZE)
2050 /* Search the SCM_ENCODING_SEARCH_SIZE bytes of a file for an Emacs-like
2051 coding declaration. Returns either NULL or a string whose storage
2052 has been allocated with `scm_gc_malloc'. */
2054 scm_i_scan_for_encoding (SCM port
)
2057 char header
[SCM_ENCODING_SEARCH_SIZE
+1];
2058 size_t bytes_read
, encoding_length
, i
;
2059 char *encoding
= NULL
;
2060 char *pos
, *encoding_start
;
2063 pt
= SCM_PTAB_ENTRY (port
);
2065 if (pt
->rw_active
== SCM_PORT_WRITE
)
2066 scm_flush_unlocked (port
);
2069 pt
->rw_active
= SCM_PORT_READ
;
2071 if (pt
->read_pos
== pt
->read_end
)
2073 /* We can use the read buffer, and thus avoid a seek. */
2074 if (scm_fill_input_unlocked (port
) == EOF
)
2077 bytes_read
= pt
->read_end
- pt
->read_pos
;
2078 if (bytes_read
> SCM_ENCODING_SEARCH_SIZE
)
2079 bytes_read
= SCM_ENCODING_SEARCH_SIZE
;
2081 if (bytes_read
<= 1)
2082 /* An unbuffered port -- don't scan. */
2085 memcpy (header
, pt
->read_pos
, bytes_read
);
2086 header
[bytes_read
] = '\0';
2090 /* Try to read some bytes and then seek back. Not all ports
2091 support seeking back; and indeed some file ports (like
2092 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
2093 check performed by SCM_FPORT_FDES---but fail to seek
2094 backwards. Hence this block comes second. We prefer to use
2095 the read buffer in-place. */
2096 if (SCM_FPORTP (port
) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port
)))
2099 bytes_read
= scm_c_read_unlocked (port
, header
, SCM_ENCODING_SEARCH_SIZE
);
2100 header
[bytes_read
] = '\0';
2101 scm_seek (port
, scm_from_int (0), scm_from_int (SEEK_SET
));
2104 /* search past "coding[:=]" */
2108 if ((pos
= strstr(pos
, "coding")) == NULL
)
2111 pos
+= strlen ("coding");
2112 if (pos
- header
>= SCM_ENCODING_SEARCH_SIZE
||
2113 (*pos
== ':' || *pos
== '='))
2121 while (pos
- header
<= SCM_ENCODING_SEARCH_SIZE
&&
2122 (*pos
== ' ' || *pos
== '\t'))
2125 if (pos
- header
>= SCM_ENCODING_SEARCH_SIZE
- ENCODING_NAME_MAX_SIZE
)
2126 /* We found the "coding:" string, but there is probably not enough
2127 room to store an encoding name in its entirety, so ignore it.
2128 This makes sure we do not end up returning a truncated encoding
2132 /* grab the next token */
2133 encoding_start
= pos
;
2135 while (encoding_start
+ i
- header
<= SCM_ENCODING_SEARCH_SIZE
2136 && encoding_start
+ i
- header
< bytes_read
2137 && is_encoding_char (encoding_start
[i
]))
2140 encoding_length
= i
;
2141 if (encoding_length
== 0)
2144 encoding
= scm_gc_strndup (encoding_start
, encoding_length
, "encoding");
2146 /* push backwards to make sure we were in a comment */
2148 pos
= encoding_start
;
2149 while (pos
>= header
)
2156 else if (*pos
== '\n' || pos
== header
)
2158 /* This wasn't in a semicolon comment. Check for a
2159 hash-bang comment. */
2160 char *beg
= strstr (header
, "#!");
2161 char *end
= strstr (header
, "!#");
2162 if (beg
< encoding_start
&& encoding_start
+ encoding_length
<= end
)
2173 /* This wasn't in a comment */
2179 SCM_DEFINE (scm_file_encoding
, "file-encoding", 1, 0, 0,
2181 "Scans the port for an Emacs-like character coding declaration\n"
2182 "near the top of the contents of a port with random-accessible contents.\n"
2183 "The coding declaration is of the form\n"
2184 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
2186 "Returns a string containing the character encoding of the file\n"
2187 "if a declaration was found, or @code{#f} otherwise.\n")
2188 #define FUNC_NAME s_scm_file_encoding
2193 SCM_VALIDATE_OPINPORT (SCM_ARG1
, port
);
2195 enc
= scm_i_scan_for_encoding (port
);
2200 s_enc
= scm_string_upcase (scm_from_locale_string (enc
));
2209 /* Per-port read options.
2211 We store per-port read options in the 'port-read-options' port
2212 property, which is stored in the internal port structure. The value
2213 stored is a single integer that contains a two-bit field for each
2216 If a bit field contains READ_OPTION_INHERIT (3), that indicates that
2217 the applicable value should be inherited from the corresponding
2218 global read option. Otherwise, the bit field contains the value of
2219 the read option. For boolean read options that have been set
2220 per-port, the possible values are 0 or 1. If the 'keyword_style'
2221 read option has been set per-port, its possible values are those in
2222 'enum t_keyword_style'. */
2224 /* Key to read options in port properties. */
2225 SCM_SYMBOL (sym_port_read_options
, "port-read-options");
2227 /* Offsets of bit fields for each per-port override */
2228 #define READ_OPTION_COPY_SOURCE_P 0
2229 #define READ_OPTION_RECORD_POSITIONS_P 2
2230 #define READ_OPTION_CASE_INSENSITIVE_P 4
2231 #define READ_OPTION_KEYWORD_STYLE 6
2232 #define READ_OPTION_R6RS_ESCAPES_P 8
2233 #define READ_OPTION_SQUARE_BRACKETS_P 10
2234 #define READ_OPTION_HUNGRY_EOL_ESCAPES_P 12
2235 #define READ_OPTION_CURLY_INFIX_P 14
2236 #define READ_OPTION_R7RS_SYMBOLS_P 16
2238 /* The total width in bits of the per-port overrides */
2239 #define READ_OPTIONS_NUM_BITS 18
2241 #define READ_OPTIONS_INHERIT_ALL ((1UL << READ_OPTIONS_NUM_BITS) - 1)
2242 #define READ_OPTIONS_MAX_VALUE READ_OPTIONS_INHERIT_ALL
2244 #define READ_OPTION_MASK 3
2245 #define READ_OPTION_INHERIT 3
2248 set_port_read_option (SCM port
, int option
, int new_value
)
2250 SCM scm_read_options
;
2251 unsigned int read_options
;
2253 new_value
&= READ_OPTION_MASK
;
2255 scm_dynwind_begin (0);
2256 scm_dynwind_lock_port (port
);
2258 scm_read_options
= scm_i_port_property (port
, sym_port_read_options
);
2259 if (scm_is_unsigned_integer (scm_read_options
, 0, READ_OPTIONS_MAX_VALUE
))
2260 read_options
= scm_to_uint (scm_read_options
);
2262 read_options
= READ_OPTIONS_INHERIT_ALL
;
2263 read_options
&= ~(READ_OPTION_MASK
<< option
);
2264 read_options
|= new_value
<< option
;
2265 scm_read_options
= scm_from_uint (read_options
);
2266 scm_i_set_port_property_x (port
, sym_port_read_options
, scm_read_options
);
2271 /* Set OPTS and PORT's case-insensitivity according to VALUE. */
2273 set_port_case_insensitive_p (SCM port
, scm_t_read_opts
*opts
, int value
)
2276 opts
->case_insensitive_p
= value
;
2277 set_port_read_option (port
, READ_OPTION_CASE_INSENSITIVE_P
, value
);
2280 /* Set OPTS and PORT's square_brackets_p option according to VALUE. */
2282 set_port_square_brackets_p (SCM port
, scm_t_read_opts
*opts
, int value
)
2285 opts
->square_brackets_p
= value
;
2286 set_port_read_option (port
, READ_OPTION_SQUARE_BRACKETS_P
, value
);
2289 /* Set OPTS and PORT's curly_infix_p option according to VALUE. */
2291 set_port_curly_infix_p (SCM port
, scm_t_read_opts
*opts
, int value
)
2294 opts
->curly_infix_p
= value
;
2295 set_port_read_option (port
, READ_OPTION_CURLY_INFIX_P
, value
);
2298 /* Initialize OPTS based on PORT's read options and the global read
2301 init_read_options (SCM port
, scm_t_read_opts
*opts
)
2303 SCM val
, scm_read_options
;
2304 unsigned int read_options
, x
;
2306 scm_read_options
= scm_i_port_property (port
, sym_port_read_options
);
2308 if (scm_is_unsigned_integer (scm_read_options
, 0, READ_OPTIONS_MAX_VALUE
))
2309 read_options
= scm_to_uint (scm_read_options
);
2311 read_options
= READ_OPTIONS_INHERIT_ALL
;
2313 x
= READ_OPTION_MASK
& (read_options
>> READ_OPTION_KEYWORD_STYLE
);
2314 if (x
== READ_OPTION_INHERIT
)
2316 val
= SCM_PACK (SCM_KEYWORD_STYLE
);
2317 if (scm_is_eq (val
, scm_keyword_prefix
))
2318 x
= KEYWORD_STYLE_PREFIX
;
2319 else if (scm_is_eq (val
, scm_keyword_postfix
))
2320 x
= KEYWORD_STYLE_POSTFIX
;
2322 x
= KEYWORD_STYLE_HASH_PREFIX
;
2324 opts
->keyword_style
= x
;
2326 #define RESOLVE_BOOLEAN_OPTION(NAME, name) \
2329 x = READ_OPTION_MASK & (read_options >> READ_OPTION_ ## NAME); \
2330 if (x == READ_OPTION_INHERIT) \
2331 x = !!SCM_ ## NAME; \
2336 RESOLVE_BOOLEAN_OPTION (COPY_SOURCE_P
, copy_source_p
);
2337 RESOLVE_BOOLEAN_OPTION (RECORD_POSITIONS_P
, record_positions_p
);
2338 RESOLVE_BOOLEAN_OPTION (CASE_INSENSITIVE_P
, case_insensitive_p
);
2339 RESOLVE_BOOLEAN_OPTION (R6RS_ESCAPES_P
, r6rs_escapes_p
);
2340 RESOLVE_BOOLEAN_OPTION (SQUARE_BRACKETS_P
, square_brackets_p
);
2341 RESOLVE_BOOLEAN_OPTION (HUNGRY_EOL_ESCAPES_P
, hungry_eol_escapes_p
);
2342 RESOLVE_BOOLEAN_OPTION (CURLY_INFIX_P
, curly_infix_p
);
2343 RESOLVE_BOOLEAN_OPTION (R7RS_SYMBOLS_P
, r7rs_symbols_p
);
2345 #undef RESOLVE_BOOLEAN_OPTION
2347 opts
->neoteric_p
= 0;
2353 SCM read_hash_procs
;
2355 read_hash_procs
= scm_make_fluid_with_default (SCM_EOL
);
2357 scm_i_read_hash_procedures
=
2358 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs
));
2360 scm_init_opts (scm_read_options
, scm_read_opts
);
2361 #include "libguile/read.x"