1 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
2 * 2007, 2008, 2009, 2010, 2011, 2012, 2014 Free Software Foundation, Inc.
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
32 #include <c-strcase.h>
35 #include "libguile/_scm.h"
36 #include "libguile/bytevectors.h"
37 #include "libguile/chars.h"
38 #include "libguile/eval.h"
39 #include "libguile/arrays.h"
40 #include "libguile/bitvectors.h"
41 #include "libguile/keywords.h"
42 #include "libguile/alist.h"
43 #include "libguile/srcprop.h"
44 #include "libguile/hashtab.h"
45 #include "libguile/hash.h"
46 #include "libguile/ports.h"
47 #include "libguile/ports-internal.h"
48 #include "libguile/fports.h"
49 #include "libguile/root.h"
50 #include "libguile/strings.h"
51 #include "libguile/strports.h"
52 #include "libguile/vectors.h"
53 #include "libguile/validate.h"
54 #include "libguile/srfi-4.h"
55 #include "libguile/srfi-13.h"
57 #include "libguile/read.h"
58 #include "libguile/private-options.h"
63 SCM_GLOBAL_SYMBOL (scm_sym_dot
, ".");
64 SCM_SYMBOL (scm_keyword_prefix
, "prefix");
65 SCM_SYMBOL (scm_keyword_postfix
, "postfix");
66 SCM_SYMBOL (sym_nil
, "nil");
68 /* SRFI-105 curly infix expression support */
69 SCM_SYMBOL (sym_nfx
, "$nfx$");
70 SCM_SYMBOL (sym_bracket_list
, "$bracket-list$");
71 SCM_SYMBOL (sym_bracket_apply
, "$bracket-apply$");
73 scm_t_option scm_read_opts
[] =
75 { SCM_OPTION_BOOLEAN
, "copy", 0,
76 "Copy source code expressions." },
77 { SCM_OPTION_BOOLEAN
, "positions", 1,
78 "Record positions of source code expressions." },
79 { SCM_OPTION_BOOLEAN
, "case-insensitive", 0,
80 "Convert symbols to lower case."},
81 { SCM_OPTION_SCM
, "keywords", (scm_t_bits
) SCM_BOOL_F_BITS
,
82 "Style of keyword recognition: #f, 'prefix or 'postfix."},
83 { SCM_OPTION_BOOLEAN
, "r6rs-hex-escapes", 0,
84 "Use R6RS variable-length character and string hex escapes."},
85 { SCM_OPTION_BOOLEAN
, "square-brackets", 1,
86 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
87 { SCM_OPTION_BOOLEAN
, "hungry-eol-escapes", 0,
88 "In strings, consume leading whitespace after an escaped end-of-line."},
89 { SCM_OPTION_BOOLEAN
, "curly-infix", 0,
90 "Support SRFI-105 curly infix expressions."},
91 { SCM_OPTION_BOOLEAN
, "r7rs-symbols", 0,
92 "Support R7RS |...| symbol notation."},
96 /* Internal read options structure. This is initialized by 'scm_read'
97 from the global and per-port read options, and a pointer is passed
98 down to all helper functions. */
102 KEYWORD_STYLE_HASH_PREFIX
,
103 KEYWORD_STYLE_PREFIX
,
104 KEYWORD_STYLE_POSTFIX
109 enum t_keyword_style keyword_style
;
110 unsigned int copy_source_p
: 1;
111 unsigned int record_positions_p
: 1;
112 unsigned int case_insensitive_p
: 1;
113 unsigned int r6rs_escapes_p
: 1;
114 unsigned int square_brackets_p
: 1;
115 unsigned int hungry_eol_escapes_p
: 1;
116 unsigned int curly_infix_p
: 1;
117 unsigned int neoteric_p
: 1;
118 unsigned int r7rs_symbols_p
: 1;
121 typedef struct t_read_opts scm_t_read_opts
;
125 Give meaningful error messages for errors
129 FILE:LINE:COL: MESSAGE
130 This happened in ....
132 This is not standard GNU format, but the test-suite likes the real
133 message to be in front.
139 scm_i_input_error (char const *function
,
140 SCM port
, const char *message
, SCM arg
)
142 SCM fn
= (scm_is_string (SCM_FILENAME(port
))
144 : scm_from_locale_string ("#<unknown port>"));
146 SCM string_port
= scm_open_output_string ();
147 SCM string
= SCM_EOL
;
148 scm_simple_format (string_port
,
149 scm_from_locale_string ("~A:~S:~S: ~A"),
151 scm_from_long (SCM_LINUM (port
) + 1),
152 scm_from_int (SCM_COL (port
) + 1),
153 scm_from_locale_string (message
)));
155 string
= scm_get_output_string (string_port
);
156 scm_close_output_port (string_port
);
157 scm_error_scm (scm_from_latin1_symbol ("read-error"),
158 function
? scm_from_locale_string (function
) : SCM_BOOL_F
,
165 SCM_DEFINE (scm_read_options
, "read-options-interface", 0, 1, 0,
167 "Option interface for the read options. Instead of using\n"
168 "this procedure directly, use the procedures @code{read-enable},\n"
169 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
170 #define FUNC_NAME s_scm_read_options
172 SCM ans
= scm_options (setting
,
175 if (SCM_COPY_SOURCE_P
)
176 SCM_RECORD_POSITIONS_P
= 1;
181 /* A fluid referring to an association list mapping extra hash
182 characters to procedures. */
183 static SCM
*scm_i_read_hash_procedures
;
186 scm_i_read_hash_procedures_ref (void)
188 return scm_fluid_ref (*scm_i_read_hash_procedures
);
192 scm_i_read_hash_procedures_set_x (SCM value
)
194 scm_fluid_set_x (*scm_i_read_hash_procedures
, value
);
201 /* Size of the C buffer used to read symbols and numbers. */
202 #define READER_BUFFER_SIZE 128
204 /* Number of 32-bit codepoints in the buffer used to read strings. */
205 #define READER_STRING_BUFFER_SIZE 128
207 /* The maximum size of Scheme character names. */
208 #define READER_CHAR_NAME_MAX_SIZE 50
210 /* The maximum size of reader directive names. */
211 #define READER_DIRECTIVE_NAME_MAX_SIZE 50
214 /* `isblank' is only in C99. */
215 #define CHAR_IS_BLANK_(_chr) \
216 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
217 || ((_chr) == '\f') || ((_chr) == '\r'))
220 # define CHAR_IS_BLANK(_chr) \
221 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
223 # define CHAR_IS_BLANK CHAR_IS_BLANK_
227 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
229 #define CHAR_IS_R5RS_DELIMITER(c) \
231 || (c) == ')' || (c) == '(' || (c) == ';' || (c) == '"')
233 #define CHAR_IS_DELIMITER(c) \
234 (CHAR_IS_R5RS_DELIMITER (c) \
235 || (((c) == ']' || (c) == '[') && (opts->square_brackets_p \
236 || opts->curly_infix_p)) \
237 || (((c) == '}' || (c) == '{') && opts->curly_infix_p))
239 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
241 #define CHAR_IS_EXPONENT_MARKER(_chr) \
242 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
243 || ((_chr) == 'd') || ((_chr) == 'l'))
245 /* Read an SCSH block comment. */
246 static SCM
scm_read_scsh_block_comment (scm_t_wchar
, SCM
);
247 static SCM
scm_read_r6rs_block_comment (scm_t_wchar
, SCM
);
248 static SCM
scm_read_commented_expression (scm_t_wchar
, SCM
, scm_t_read_opts
*);
249 static SCM
scm_read_shebang (scm_t_wchar
, SCM
, scm_t_read_opts
*);
250 static SCM
scm_get_hash_procedure (int);
252 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
253 result in the pre-allocated buffer BUF. Return zero if the whole token has
254 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
255 bytes actually read. */
257 read_token (SCM port
, scm_t_read_opts
*opts
,
258 char *buf
, size_t buf_size
, size_t *read
)
262 while (*read
< buf_size
)
266 chr
= scm_get_byte_or_eof_unlocked (port
);
270 else if (CHAR_IS_DELIMITER (chr
))
272 scm_unget_byte_unlocked (chr
, port
);
285 /* Like `read_token', but return either BUFFER, or a GC-allocated buffer
286 if the token doesn't fit in BUFFER_SIZE bytes. */
288 read_complete_token (SCM port
, scm_t_read_opts
*opts
,
289 char *buffer
, size_t buffer_size
, size_t *read
)
292 size_t bytes_read
, overflow_size
= 0;
293 char *overflow_buffer
= NULL
;
297 overflow
= read_token (port
, opts
, buffer
, buffer_size
, &bytes_read
);
300 if (overflow
|| overflow_size
!= 0)
302 if (overflow_size
== 0)
304 overflow_buffer
= scm_gc_malloc_pointerless (bytes_read
, "read");
305 memcpy (overflow_buffer
, buffer
, bytes_read
);
306 overflow_size
= bytes_read
;
311 scm_gc_malloc_pointerless (overflow_size
+ bytes_read
, "read");
313 memcpy (new_buf
, overflow_buffer
, overflow_size
);
314 memcpy (new_buf
+ overflow_size
, buffer
, bytes_read
);
316 overflow_buffer
= new_buf
;
317 overflow_size
+= bytes_read
;
324 *read
= overflow_size
;
328 return (overflow_size
> 0 ? overflow_buffer
: buffer
);
331 /* Skip whitespace from PORT and return the first non-whitespace character
332 read. Raise an error on end-of-file. */
334 flush_ws (SCM port
, scm_t_read_opts
*opts
, const char *eoferr
)
338 switch (c
= scm_getc_unlocked (port
))
344 scm_i_input_error (eoferr
,
353 switch (c
= scm_getc_unlocked (port
))
359 case SCM_LINE_INCREMENTORS
:
365 switch (c
= scm_getc_unlocked (port
))
368 eoferr
= "read_sharp";
371 scm_read_shebang (c
, port
, opts
);
374 scm_read_commented_expression (c
, port
, opts
);
377 if (scm_is_false (scm_get_hash_procedure (c
)))
379 scm_read_r6rs_block_comment (c
, port
);
384 scm_ungetc_unlocked (c
, port
);
389 case SCM_LINE_INCREMENTORS
:
390 case SCM_SINGLE_SPACES
:
405 static SCM
scm_read_expression (SCM port
, scm_t_read_opts
*opts
);
406 static SCM
scm_read_sharp (int chr
, SCM port
, scm_t_read_opts
*opts
,
407 long line
, int column
);
411 maybe_annotate_source (SCM x
, SCM port
, scm_t_read_opts
*opts
,
412 long line
, int column
)
414 if (opts
->record_positions_p
)
415 scm_i_set_source_properties_x (x
, line
, column
, SCM_FILENAME (port
));
420 scm_read_sexp (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
421 #define FUNC_NAME "scm_i_lreadparen"
424 SCM tmp
, tl
, ans
= SCM_EOL
;
425 const int curly_list_p
= (chr
== '{') && opts
->curly_infix_p
;
426 const int terminating_char
= ((chr
== '{') ? '}'
427 : ((chr
== '[') ? ']'
430 /* Need to capture line and column numbers here. */
431 long line
= SCM_LINUM (port
);
432 int column
= SCM_COL (port
) - 1;
434 c
= flush_ws (port
, opts
, FUNC_NAME
);
435 if (terminating_char
== c
)
438 scm_ungetc_unlocked (c
, port
);
439 tmp
= scm_read_expression (port
, opts
);
441 /* Note that it is possible for scm_read_expression to return
442 scm_sym_dot, but not as part of a dotted pair: as in #{.}#. So
443 check that it's a real dot by checking `c'. */
444 if (c
== '.' && scm_is_eq (scm_sym_dot
, tmp
))
446 ans
= scm_read_expression (port
, opts
);
447 if (terminating_char
!= (c
= flush_ws (port
, opts
, FUNC_NAME
)))
448 scm_i_input_error (FUNC_NAME
, port
, "missing close paren",
453 /* Build the head of the list structure. */
454 ans
= tl
= scm_cons (tmp
, SCM_EOL
);
456 while (terminating_char
!= (c
= flush_ws (port
, opts
, FUNC_NAME
)))
460 if (c
== ')' || (c
== ']' && opts
->square_brackets_p
)
461 || ((c
== '}' || c
== ']') && opts
->curly_infix_p
))
462 scm_i_input_error (FUNC_NAME
, port
,
463 "in pair: mismatched close paren: ~A",
464 scm_list_1 (SCM_MAKE_CHAR (c
)));
466 scm_ungetc_unlocked (c
, port
);
467 tmp
= scm_read_expression (port
, opts
);
469 /* See above note about scm_sym_dot. */
470 if (c
== '.' && scm_is_eq (scm_sym_dot
, tmp
))
472 SCM_SETCDR (tl
, scm_read_expression (port
, opts
));
474 c
= flush_ws (port
, opts
, FUNC_NAME
);
475 if (terminating_char
!= c
)
476 scm_i_input_error (FUNC_NAME
, port
,
477 "in pair: missing close paren", SCM_EOL
);
481 new_tail
= scm_cons (tmp
, SCM_EOL
);
482 SCM_SETCDR (tl
, new_tail
);
488 /* In addition to finding the length, 'scm_ilength' checks for
489 improper or circular lists, in which case it returns -1. */
490 int len
= scm_ilength (ans
);
492 /* The (len == 0) case is handled above */
494 /* Return directly to avoid re-annotating the element's source
495 location with the position of the outer brace. Also, it
496 might not be possible to annotate the element. */
497 return scm_car (ans
); /* {e} => e */
499 ; /* Leave the list unchanged: {e1 e2} => (e1 e2) */
500 else if (len
>= 3 && (len
& 1))
502 /* It's a proper list whose length is odd and at least 3. If
503 the elements at odd indices (the infix operator positions)
504 are all 'equal?', then it's a simple curly-infix list.
505 Otherwise it's a mixed curly-infix list. */
506 SCM op
= scm_cadr (ans
);
508 /* Check to see if the elements at odd indices are 'equal?' */
509 for (tl
= scm_cdddr (ans
); ; tl
= scm_cddr (tl
))
511 if (scm_is_null (tl
))
513 /* Convert simple curly-infix list to prefix:
514 {a <op> b <op> ...} => (<op> a b ...) */
516 while (scm_is_pair (scm_cdr (tl
)))
519 SCM_SETCDR (tl
, tmp
);
522 ans
= scm_cons (op
, ans
);
525 else if (scm_is_false (scm_equal_p (op
, scm_car (tl
))))
527 /* Mixed curly-infix list: {e ...} => ($nfx$ e ...) */
528 ans
= scm_cons (sym_nfx
, ans
);
534 /* Mixed curly-infix (possibly improper) list:
535 {e . tail} => ($nfx$ e . tail) */
536 ans
= scm_cons (sym_nfx
, ans
);
539 return maybe_annotate_source (ans
, port
, opts
, line
, column
);
544 /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
545 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
547 #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
553 while (i < ndigits) \
555 a = scm_getc_unlocked (port); \
559 && (a == (scm_t_wchar) terminator) \
562 if ('0' <= a && a <= '9') \
564 else if ('A' <= a && a <= 'F') \
566 else if ('a' <= a && a <= 'f') \
579 skip_intraline_whitespace (SCM port
)
585 c
= scm_getc_unlocked (port
);
589 while (c
== '\t' || uc_is_general_category (c
, UC_SPACE_SEPARATOR
));
591 scm_ungetc_unlocked (c
, port
);
594 /* Read either a double-quoted string or an R7RS-style symbol delimited
595 by vertical lines, depending on the value of 'chr' ('"' or '|').
596 Regardless, the result is always returned as a string. */
598 scm_read_string_like_syntax (int chr
, SCM port
, scm_t_read_opts
*opts
)
599 #define FUNC_NAME "scm_lreadr"
601 /* For strings smaller than C_STR, this function creates only one Scheme
602 object (the string returned). */
605 size_t c_str_len
= 0;
606 scm_t_wchar c
, c_str
[READER_STRING_BUFFER_SIZE
];
608 /* Need to capture line and column numbers here. */
609 long line
= SCM_LINUM (port
);
610 int column
= SCM_COL (port
) - 1;
612 while (chr
!= (c
= scm_getc_unlocked (port
)))
617 scm_i_input_error (FUNC_NAME
, port
,
619 ? "end of file in symbol"
620 : "end of file in string constant"),
624 if (c_str_len
+ 1 >= READER_STRING_BUFFER_SIZE
)
626 str
= scm_cons (scm_from_utf32_stringn (c_str
, c_str_len
), str
);
632 switch (c
= scm_getc_unlocked (port
))
640 if (opts
->hungry_eol_escapes_p
)
641 skip_intraline_whitespace (port
);
668 if (opts
->r6rs_escapes_p
|| chr
== '|')
669 SCM_READ_HEX_ESCAPE (10, ';');
671 SCM_READ_HEX_ESCAPE (2, '\0');
674 if (!opts
->r6rs_escapes_p
)
676 SCM_READ_HEX_ESCAPE (4, '\0');
680 if (!opts
->r6rs_escapes_p
)
682 SCM_READ_HEX_ESCAPE (6, '\0');
689 scm_i_input_error (FUNC_NAME
, port
,
690 "illegal character in escape sequence: ~S",
691 scm_list_1 (SCM_MAKE_CHAR (c
)));
695 c_str
[c_str_len
++] = c
;
698 if (scm_is_null (str
))
699 /* Fast path: we got a string that fits in C_STR. */
700 str
= scm_from_utf32_stringn (c_str
, c_str_len
);
704 str
= scm_cons (scm_from_utf32_stringn (c_str
, c_str_len
), str
);
706 str
= scm_string_concatenate_reverse (str
, SCM_UNDEFINED
, SCM_UNDEFINED
);
709 return maybe_annotate_source (str
, port
, opts
, line
, column
);
714 scm_read_string (int chr
, SCM port
, scm_t_read_opts
*opts
)
716 return scm_read_string_like_syntax (chr
, port
, opts
);
720 scm_read_r7rs_symbol (int chr
, SCM port
, scm_t_read_opts
*opts
)
722 return scm_string_to_symbol (scm_read_string_like_syntax (chr
, port
, opts
));
726 scm_read_number (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
728 SCM result
, str
= SCM_EOL
;
729 char local_buffer
[READER_BUFFER_SIZE
], *buffer
;
732 /* Need to capture line and column numbers here. */
733 long line
= SCM_LINUM (port
);
734 int column
= SCM_COL (port
) - 1;
736 scm_ungetc_unlocked (chr
, port
);
737 buffer
= read_complete_token (port
, opts
, local_buffer
, sizeof local_buffer
,
740 str
= scm_from_port_stringn (buffer
, bytes_read
, port
);
742 result
= scm_string_to_number (str
, SCM_UNDEFINED
);
743 if (scm_is_false (result
))
745 /* Return a symbol instead of a number */
746 if (opts
->case_insensitive_p
)
747 str
= scm_string_downcase_x (str
);
748 result
= scm_string_to_symbol (str
);
750 else if (SCM_NIMP (result
))
751 result
= maybe_annotate_source (result
, port
, opts
, line
, column
);
753 SCM_COL (port
) += scm_i_string_length (str
);
758 scm_read_mixed_case_symbol (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
761 int ends_with_colon
= 0;
763 int postfix
= (opts
->keyword_style
== KEYWORD_STYLE_POSTFIX
);
764 char local_buffer
[READER_BUFFER_SIZE
], *buffer
;
767 scm_ungetc_unlocked (chr
, port
);
768 buffer
= read_complete_token (port
, opts
, local_buffer
, sizeof local_buffer
,
771 ends_with_colon
= buffer
[bytes_read
- 1] == ':';
773 if (postfix
&& ends_with_colon
&& (bytes_read
> 1))
775 str
= scm_from_port_stringn (buffer
, bytes_read
- 1, port
);
777 if (opts
->case_insensitive_p
)
778 str
= scm_string_downcase_x (str
);
779 result
= scm_symbol_to_keyword (scm_string_to_symbol (str
));
783 str
= scm_from_port_stringn (buffer
, bytes_read
, port
);
785 if (opts
->case_insensitive_p
)
786 str
= scm_string_downcase_x (str
);
787 result
= scm_string_to_symbol (str
);
790 SCM_COL (port
) += scm_i_string_length (str
);
795 scm_read_number_and_radix (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
796 #define FUNC_NAME "scm_lreadr"
800 char local_buffer
[READER_BUFFER_SIZE
], *buffer
;
827 scm_ungetc_unlocked (chr
, port
);
828 scm_ungetc_unlocked ('#', port
);
832 buffer
= read_complete_token (port
, opts
, local_buffer
, sizeof local_buffer
,
835 str
= scm_from_port_stringn (buffer
, read
, port
);
837 result
= scm_string_to_number (str
, scm_from_uint (radix
));
839 SCM_COL (port
) += scm_i_string_length (str
);
841 if (scm_is_true (result
))
844 scm_i_input_error (FUNC_NAME
, port
, "unknown # object", SCM_EOL
);
851 scm_read_quote (int chr
, SCM port
, scm_t_read_opts
*opts
)
854 long line
= SCM_LINUM (port
);
855 int column
= SCM_COL (port
) - 1;
860 p
= scm_sym_quasiquote
;
871 c
= scm_getc_unlocked (port
);
873 p
= scm_sym_uq_splicing
;
876 scm_ungetc_unlocked (c
, port
);
883 fprintf (stderr
, "%s: unhandled quote character (%i)\n",
884 "scm_read_quote", chr
);
888 p
= scm_cons2 (p
, scm_read_expression (port
, opts
), SCM_EOL
);
889 return maybe_annotate_source (p
, port
, opts
, line
, column
);
892 SCM_SYMBOL (sym_syntax
, "syntax");
893 SCM_SYMBOL (sym_quasisyntax
, "quasisyntax");
894 SCM_SYMBOL (sym_unsyntax
, "unsyntax");
895 SCM_SYMBOL (sym_unsyntax_splicing
, "unsyntax-splicing");
898 scm_read_syntax (int chr
, SCM port
, scm_t_read_opts
*opts
)
901 long line
= SCM_LINUM (port
);
902 int column
= SCM_COL (port
) - 1;
918 c
= scm_getc_unlocked (port
);
920 p
= sym_unsyntax_splicing
;
923 scm_ungetc_unlocked (c
, port
);
930 fprintf (stderr
, "%s: unhandled syntax character (%i)\n",
931 "scm_read_syntax", chr
);
935 p
= scm_cons2 (p
, scm_read_expression (port
, opts
), SCM_EOL
);
936 return maybe_annotate_source (p
, port
, opts
, line
, column
);
940 scm_read_nil (int chr
, SCM port
, scm_t_read_opts
*opts
)
942 SCM id
= scm_read_mixed_case_symbol (chr
, port
, opts
);
944 if (!scm_is_eq (id
, sym_nil
))
945 scm_i_input_error ("scm_read_nil", port
,
946 "unexpected input while reading #nil: ~a",
949 return SCM_ELISP_NIL
;
953 scm_read_semicolon_comment (int chr
, SCM port
)
957 /* We use the get_byte here because there is no need to get the
958 locale correct with comment input. This presumes that newline
959 always represents itself no matter what the encoding is. */
960 for (c
= scm_get_byte_or_eof_unlocked (port
);
961 (c
!= EOF
) && (c
!= '\n');
962 c
= scm_get_byte_or_eof_unlocked (port
));
964 return SCM_UNSPECIFIED
;
967 /* If the EXPECTED_CHARS are the next ones available from PORT, then
968 consume them and return 1. Otherwise leave the port position where
969 it was and return 0. EXPECTED_CHARS should be all lowercase, and
970 will be matched case-insensitively against the characters read from
973 try_read_ci_chars (SCM port
, const char *expected_chars
)
975 int num_chars_wanted
= strlen (expected_chars
);
976 int num_chars_read
= 0;
977 char *chars_read
= alloca (num_chars_wanted
);
980 while (num_chars_read
< num_chars_wanted
)
982 c
= scm_getc_unlocked (port
);
985 else if (c_tolower (c
) != expected_chars
[num_chars_read
])
987 scm_ungetc_unlocked (c
, port
);
991 chars_read
[num_chars_read
++] = c
;
994 if (num_chars_read
== num_chars_wanted
)
998 while (num_chars_read
> 0)
999 scm_ungetc_unlocked (chars_read
[--num_chars_read
], port
);
1005 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
1008 scm_read_boolean (int chr
, SCM port
)
1014 try_read_ci_chars (port
, "rue");
1019 try_read_ci_chars (port
, "alse");
1023 return SCM_UNSPECIFIED
;
1027 scm_read_character (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
1028 #define FUNC_NAME "scm_lreadr"
1030 char buffer
[READER_CHAR_NAME_MAX_SIZE
];
1032 size_t charname_len
, bytes_read
;
1035 scm_t_port_internal
*pti
;
1037 overflow
= read_token (port
, opts
, buffer
, READER_CHAR_NAME_MAX_SIZE
,
1040 scm_i_input_error (FUNC_NAME
, port
, "character name too long", SCM_EOL
);
1042 if (bytes_read
== 0)
1044 chr
= scm_getc_unlocked (port
);
1046 scm_i_input_error (FUNC_NAME
, port
, "unexpected end of file "
1047 "while reading character", SCM_EOL
);
1049 /* CHR must be a token delimiter, like a whitespace. */
1050 return (SCM_MAKE_CHAR (chr
));
1053 pti
= SCM_PORT_GET_INTERNAL (port
);
1055 /* Simple ASCII characters can be processed immediately. Also, simple
1056 ISO-8859-1 characters can be processed immediately if the encoding for this
1057 port is ISO-8859-1. */
1058 if (bytes_read
== 1 &&
1059 ((unsigned char) buffer
[0] <= 127
1060 || pti
->encoding_mode
== SCM_PORT_ENCODING_MODE_LATIN1
))
1062 SCM_COL (port
) += 1;
1063 return SCM_MAKE_CHAR (buffer
[0]);
1066 /* Otherwise, convert the buffer into a proper scheme string for
1068 charname
= scm_from_port_stringn (buffer
, bytes_read
, port
);
1069 charname_len
= scm_i_string_length (charname
);
1070 SCM_COL (port
) += charname_len
;
1071 cp
= scm_i_string_ref (charname
, 0);
1072 if (charname_len
== 1)
1073 return SCM_MAKE_CHAR (cp
);
1075 /* Ignore dotted circles, which may be used to keep combining characters from
1076 combining with the backslash in #\charname. */
1077 if (cp
== SCM_CODEPOINT_DOTTED_CIRCLE
&& charname_len
== 2)
1078 return SCM_MAKE_CHAR (scm_i_string_ref (charname
, 1));
1080 if (cp
>= '0' && cp
< '8')
1082 /* Dirk:FIXME:: This type of character syntax is not R5RS
1083 * compliant. Further, it should be verified that the constant
1084 * does only consist of octal digits. */
1085 SCM p
= scm_string_to_number (charname
, scm_from_uint (8));
1086 if (SCM_I_INUMP (p
))
1088 scm_t_wchar c
= scm_to_uint32 (p
);
1089 if (SCM_IS_UNICODE_CHAR (c
))
1090 return SCM_MAKE_CHAR (c
);
1092 scm_i_input_error (FUNC_NAME
, port
,
1093 "out-of-range octal character escape: ~a",
1094 scm_list_1 (charname
));
1098 if (cp
== 'x' && (charname_len
> 1))
1102 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1103 p
= scm_string_to_number (scm_c_substring (charname
, 1, charname_len
),
1104 scm_from_uint (16));
1105 if (SCM_I_INUMP (p
))
1107 scm_t_wchar c
= scm_to_uint32 (p
);
1108 if (SCM_IS_UNICODE_CHAR (c
))
1109 return SCM_MAKE_CHAR (c
);
1111 scm_i_input_error (FUNC_NAME
, port
,
1112 "out-of-range hex character escape: ~a",
1113 scm_list_1 (charname
));
1117 /* The names of characters should never have non-Latin1
1119 if (scm_i_is_narrow_string (charname
)
1120 || scm_i_try_narrow_string (charname
))
1121 { SCM ch
= scm_i_charname_to_char (scm_i_string_chars (charname
),
1123 if (scm_is_true (ch
))
1127 scm_i_input_error (FUNC_NAME
, port
, "unknown character name ~a",
1128 scm_list_1 (charname
));
1130 return SCM_UNSPECIFIED
;
1135 scm_read_keyword (int chr
, SCM port
, scm_t_read_opts
*opts
)
1139 /* Read the symbol that comprises the keyword. Doing this instead of
1140 invoking a specific symbol reader function allows `scm_read_keyword ()'
1141 to adapt to the delimiters currently valid of symbols.
1143 XXX: This implementation allows sloppy syntaxes like `#: key'. */
1144 symbol
= scm_read_expression (port
, opts
);
1145 if (!scm_is_symbol (symbol
))
1146 scm_i_input_error ("scm_read_keyword", port
,
1147 "keyword prefix `~a' not followed by a symbol: ~s",
1148 scm_list_2 (SCM_MAKE_CHAR (chr
), symbol
));
1150 return (scm_symbol_to_keyword (symbol
));
1154 scm_read_vector (int chr
, SCM port
, scm_t_read_opts
*opts
,
1155 long line
, int column
)
1157 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1158 guarantee that it's going to do what we want. After all, this is an
1159 implementation detail of `scm_read_vector ()', not a desirable
1161 return maybe_annotate_source (scm_vector (scm_read_sexp (chr
, port
, opts
)),
1162 port
, opts
, line
, column
);
1165 /* Helper used by scm_read_array */
1167 read_decimal_integer (SCM port
, int c
, ssize_t
*resp
)
1176 c
= scm_getc_unlocked (port
);
1179 while ('0' <= c
&& c
<= '9')
1181 if (((SSIZE_MAX
- (c
-'0')) / 10) <= res
)
1182 scm_i_input_error ("read_decimal_integer", port
,
1183 "number too large", SCM_EOL
);
1184 res
= 10*res
+ c
-'0';
1186 c
= scm_getc_unlocked (port
);
1194 /* Read an array. This function can also read vectors and uniform
1195 vectors. Also, the conflict between '#f' and '#f32' and '#f64' is
1198 C is the first character read after the '#'. */
1200 scm_read_array (int c
, SCM port
, scm_t_read_opts
*opts
, long line
, int column
)
1203 scm_t_wchar tag_buf
[8];
1206 SCM tag
, shape
= SCM_BOOL_F
, elements
, array
;
1208 /* XXX - shortcut for ordinary vectors. Shouldn't be necessary but
1209 the array code can not deal with zero-length dimensions yet, and
1210 we want to allow zero-length vectors, of course. */
1212 return scm_read_vector (c
, port
, opts
, line
, column
);
1214 /* Disambiguate between '#f' and uniform floating point vectors. */
1217 c
= scm_getc_unlocked (port
);
1218 if (c
!= '3' && c
!= '6')
1220 if (c
== 'a' && try_read_ci_chars (port
, "lse"))
1223 scm_ungetc_unlocked (c
, port
);
1229 goto continue_reading_tag
;
1234 c
= read_decimal_integer (port
, c
, &rank
);
1236 scm_i_input_error (NULL
, port
, "array rank must be non-negative",
1241 continue_reading_tag
:
1242 while (c
!= EOF
&& c
!= '(' && c
!= '@' && c
!= ':'
1243 && tag_len
< sizeof tag_buf
/ sizeof tag_buf
[0])
1245 tag_buf
[tag_len
++] = c
;
1246 c
= scm_getc_unlocked (port
);
1252 tag
= scm_string_to_symbol (scm_from_utf32_stringn (tag_buf
, tag_len
));
1253 if (tag_len
== sizeof tag_buf
/ sizeof tag_buf
[0])
1254 scm_i_input_error (NULL
, port
, "invalid array tag, starting with: ~a",
1259 if (c
== '@' || c
== ':')
1265 ssize_t lbnd
= 0, len
= 0;
1270 c
= scm_getc_unlocked (port
);
1271 c
= read_decimal_integer (port
, c
, &lbnd
);
1274 s
= scm_from_ssize_t (lbnd
);
1278 c
= scm_getc_unlocked (port
);
1279 c
= read_decimal_integer (port
, c
, &len
);
1281 scm_i_input_error (NULL
, port
,
1282 "array length must be non-negative",
1285 s
= scm_list_2 (s
, scm_from_ssize_t (lbnd
+len
-1));
1288 shape
= scm_cons (s
, shape
);
1289 } while (c
== '@' || c
== ':');
1291 shape
= scm_reverse_x (shape
, SCM_EOL
);
1294 /* Read nested lists of elements. */
1296 scm_i_input_error (NULL
, port
,
1297 "missing '(' in vector or array literal",
1299 elements
= scm_read_sexp (c
, port
, opts
);
1301 if (scm_is_false (shape
))
1302 shape
= scm_from_ssize_t (rank
);
1303 else if (scm_ilength (shape
) != rank
)
1306 "the number of shape specifications must match the array rank",
1309 /* Handle special print syntax of rank zero arrays; see
1310 scm_i_print_array for a rationale. */
1313 if (!scm_is_pair (elements
))
1314 scm_i_input_error (NULL
, port
,
1315 "too few elements in array literal, need 1",
1317 if (!scm_is_null (SCM_CDR (elements
)))
1318 scm_i_input_error (NULL
, port
,
1319 "too many elements in array literal, want 1",
1321 elements
= SCM_CAR (elements
);
1324 /* Construct array, annotate with source location, and return. */
1325 array
= scm_list_to_typed_array (tag
, shape
, elements
);
1326 return maybe_annotate_source (array
, port
, opts
, line
, column
);
1330 scm_read_srfi4_vector (int chr
, SCM port
, scm_t_read_opts
*opts
,
1331 long line
, int column
)
1333 return scm_read_array (chr
, port
, opts
, line
, column
);
1337 scm_read_bytevector (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
,
1338 long line
, int column
)
1340 chr
= scm_getc_unlocked (port
);
1344 chr
= scm_getc_unlocked (port
);
1348 chr
= scm_getc_unlocked (port
);
1352 return maybe_annotate_source
1353 (scm_u8_list_to_bytevector (scm_read_sexp (chr
, port
, opts
)),
1354 port
, opts
, line
, column
);
1357 scm_i_input_error ("read_bytevector", port
,
1358 "invalid bytevector prefix",
1359 SCM_MAKE_CHAR (chr
));
1360 return SCM_UNSPECIFIED
;
1364 scm_read_guile_bit_vector (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
,
1365 long line
, int column
)
1367 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1368 terribly inefficient but who cares? */
1369 SCM s_bits
= SCM_EOL
;
1371 for (chr
= scm_getc_unlocked (port
);
1372 (chr
!= EOF
) && ((chr
== '0') || (chr
== '1'));
1373 chr
= scm_getc_unlocked (port
))
1375 s_bits
= scm_cons ((chr
== '0') ? SCM_BOOL_F
: SCM_BOOL_T
, s_bits
);
1379 scm_ungetc_unlocked (chr
, port
);
1381 return maybe_annotate_source
1382 (scm_bitvector (scm_reverse_x (s_bits
, SCM_EOL
)),
1383 port
, opts
, line
, column
);
1387 scm_read_scsh_block_comment (scm_t_wchar chr
, SCM port
)
1393 int c
= scm_getc_unlocked (port
);
1396 scm_i_input_error ("skip_block_comment", port
,
1397 "unterminated `#! ... !#' comment", SCM_EOL
);
1401 else if (c
== '#' && bang_seen
)
1407 return SCM_UNSPECIFIED
;
1410 static void set_port_case_insensitive_p (SCM port
, scm_t_read_opts
*opts
,
1412 static void set_port_square_brackets_p (SCM port
, scm_t_read_opts
*opts
,
1414 static void set_port_curly_infix_p (SCM port
, scm_t_read_opts
*opts
,
1418 scm_read_shebang (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
1420 char name
[READER_DIRECTIVE_NAME_MAX_SIZE
+ 1];
1424 while (i
<= READER_DIRECTIVE_NAME_MAX_SIZE
)
1426 c
= scm_getc_unlocked (port
);
1428 scm_i_input_error ("skip_block_comment", port
,
1429 "unterminated `#! ... !#' comment", SCM_EOL
);
1430 else if (('a' <= c
&& c
<= 'z') || ('0' <= c
&& c
<= '9') || c
== '-')
1432 else if (CHAR_IS_DELIMITER (c
))
1434 scm_ungetc_unlocked (c
, port
);
1436 if (0 == strcmp ("r6rs", name
))
1437 ; /* Silently ignore */
1438 else if (0 == strcmp ("fold-case", name
))
1439 set_port_case_insensitive_p (port
, opts
, 1);
1440 else if (0 == strcmp ("no-fold-case", name
))
1441 set_port_case_insensitive_p (port
, opts
, 0);
1442 else if (0 == strcmp ("curly-infix", name
))
1443 set_port_curly_infix_p (port
, opts
, 1);
1444 else if (0 == strcmp ("curly-infix-and-bracket-lists", name
))
1446 set_port_curly_infix_p (port
, opts
, 1);
1447 set_port_square_brackets_p (port
, opts
, 0);
1452 return SCM_UNSPECIFIED
;
1456 scm_ungetc_unlocked (c
, port
);
1461 scm_ungetc_unlocked (name
[--i
], port
);
1462 return scm_read_scsh_block_comment (chr
, port
);
1466 scm_read_r6rs_block_comment (scm_t_wchar chr
, SCM port
)
1468 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1469 nested. So care must be taken. */
1470 int nesting_level
= 1;
1472 int a
= scm_getc_unlocked (port
);
1475 scm_i_input_error ("scm_read_r6rs_block_comment", port
,
1476 "unterminated `#| ... |#' comment", SCM_EOL
);
1478 while (nesting_level
> 0)
1480 int b
= scm_getc_unlocked (port
);
1483 scm_i_input_error ("scm_read_r6rs_block_comment", port
,
1484 "unterminated `#| ... |#' comment", SCM_EOL
);
1486 if (a
== '|' && b
== '#')
1491 else if (a
== '#' && b
== '|')
1500 return SCM_UNSPECIFIED
;
1504 scm_read_commented_expression (scm_t_wchar chr
, SCM port
,
1505 scm_t_read_opts
*opts
)
1509 c
= flush_ws (port
, opts
, (char *) NULL
);
1511 scm_i_input_error ("read_commented_expression", port
,
1512 "no expression after #; comment", SCM_EOL
);
1513 scm_ungetc_unlocked (c
, port
);
1514 scm_read_expression (port
, opts
);
1515 return SCM_UNSPECIFIED
;
1519 scm_read_extended_symbol (scm_t_wchar chr
, SCM port
)
1521 /* Guile's extended symbol read syntax looks like this:
1523 #{This is all a symbol name}#
1525 So here, CHR is expected to be `{'. */
1528 SCM buf
= scm_i_make_string (1024, NULL
, 0);
1530 buf
= scm_i_string_start_writing (buf
);
1532 while ((chr
= scm_getc_unlocked (port
)) != EOF
)
1543 scm_i_string_set_x (buf
, len
++, '}');
1549 else if (chr
== '\\')
1551 /* It used to be that print.c would print extended-read-syntax
1552 symbols with backslashes before "non-standard" chars, but
1553 this routine wouldn't do anything with those escapes.
1554 Bummer. What we've done is to change print.c to output
1555 R6RS hex escapes for those characters, relying on the fact
1556 that the extended read syntax would never put a `\' before
1557 an `x'. For now, we just ignore other instances of
1558 backslash in the string. */
1559 switch ((chr
= scm_getc_unlocked (port
)))
1567 SCM_READ_HEX_ESCAPE (10, ';');
1568 scm_i_string_set_x (buf
, len
++, c
);
1576 scm_i_string_stop_writing ();
1577 scm_i_input_error ("scm_read_extended_symbol", port
,
1578 "illegal character in escape sequence: ~S",
1579 scm_list_1 (SCM_MAKE_CHAR (c
)));
1583 scm_i_string_set_x (buf
, len
++, chr
);
1588 scm_i_string_set_x (buf
, len
++, chr
);
1590 if (len
>= scm_i_string_length (buf
) - 2)
1594 scm_i_string_stop_writing ();
1595 addy
= scm_i_make_string (1024, NULL
, 0);
1596 buf
= scm_string_append (scm_list_2 (buf
, addy
));
1598 buf
= scm_i_string_start_writing (buf
);
1603 scm_i_string_stop_writing ();
1605 scm_i_input_error ("scm_read_extended_symbol", port
,
1606 "end of file while reading symbol", SCM_EOL
);
1608 return (scm_string_to_symbol (scm_c_substring (buf
, 0, len
)));
1613 /* Top-level token readers, i.e., dispatchers. */
1616 scm_read_sharp_extension (int chr
, SCM port
, scm_t_read_opts
*opts
)
1620 proc
= scm_get_hash_procedure (chr
);
1621 if (scm_is_true (scm_procedure_p (proc
)))
1623 long line
= SCM_LINUM (port
);
1624 int column
= SCM_COL (port
) - 2;
1627 got
= scm_call_2 (proc
, SCM_MAKE_CHAR (chr
), port
);
1629 if (opts
->record_positions_p
&& SCM_NIMP (got
)
1630 && !scm_i_has_source_properties (got
))
1631 scm_i_set_source_properties_x (got
, line
, column
, SCM_FILENAME (port
));
1636 return SCM_UNSPECIFIED
;
1639 /* The reader for the sharp `#' character. It basically dispatches reads
1640 among the above token readers. */
1642 scm_read_sharp (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
,
1643 long line
, int column
)
1644 #define FUNC_NAME "scm_lreadr"
1648 chr
= scm_getc_unlocked (port
);
1650 result
= scm_read_sharp_extension (chr
, port
, opts
);
1651 if (!scm_is_eq (result
, SCM_UNSPECIFIED
))
1657 return (scm_read_character (chr
, port
, opts
));
1659 return (scm_read_vector (chr
, port
, opts
, line
, column
));
1664 /* This one may return either a boolean or an SRFI-4 vector. */
1665 return (scm_read_srfi4_vector (chr
, port
, opts
, line
, column
));
1667 return (scm_read_bytevector (chr
, port
, opts
, line
, column
));
1669 return (scm_read_guile_bit_vector (chr
, port
, opts
, line
, column
));
1673 return (scm_read_boolean (chr
, port
));
1675 return (scm_read_keyword (chr
, port
, opts
));
1676 case '0': case '1': case '2': case '3': case '4':
1677 case '5': case '6': case '7': case '8': case '9':
1679 return (scm_read_array (chr
, port
, opts
, line
, column
));
1693 return (scm_read_number_and_radix (chr
, port
, opts
));
1695 return (scm_read_extended_symbol (chr
, port
));
1697 return (scm_read_shebang (chr
, port
, opts
));
1699 return (scm_read_commented_expression (chr
, port
, opts
));
1703 return (scm_read_syntax (chr
, port
, opts
));
1705 return (scm_read_nil (chr
, port
, opts
));
1707 result
= scm_read_sharp_extension (chr
, port
, opts
);
1708 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1710 /* To remain compatible with 1.8 and earlier, the following
1711 characters have lower precedence than `read-hash-extend'
1716 return scm_read_r6rs_block_comment (chr
, port
);
1718 scm_i_input_error (FUNC_NAME
, port
, "Unknown # object: ~S",
1719 scm_list_1 (SCM_MAKE_CHAR (chr
)));
1726 return SCM_UNSPECIFIED
;
1731 read_inner_expression (SCM port
, scm_t_read_opts
*opts
)
1732 #define FUNC_NAME "read_inner_expression"
1738 chr
= scm_getc_unlocked (port
);
1742 case SCM_WHITE_SPACES
:
1743 case SCM_LINE_INCREMENTORS
:
1746 (void) scm_read_semicolon_comment (chr
, port
);
1749 if (opts
->curly_infix_p
)
1751 if (opts
->neoteric_p
)
1752 return scm_read_sexp (chr
, port
, opts
);
1757 /* Enable neoteric expressions within curly braces */
1758 opts
->neoteric_p
= 1;
1759 expr
= scm_read_sexp (chr
, port
, opts
);
1760 opts
->neoteric_p
= 0;
1765 return scm_read_mixed_case_symbol (chr
, port
, opts
);
1767 if (opts
->square_brackets_p
)
1768 return scm_read_sexp (chr
, port
, opts
);
1769 else if (opts
->curly_infix_p
)
1771 /* The syntax of neoteric expressions requires that '[' be
1772 a delimiter when curly-infix is enabled, so it cannot
1773 be part of an unescaped symbol. We might as well do
1774 something useful with it, so we adopt Kawa's convention:
1775 [...] => ($bracket-list$ ...) */
1776 long line
= SCM_LINUM (port
);
1777 int column
= SCM_COL (port
) - 1;
1778 return maybe_annotate_source
1779 (scm_cons (sym_bracket_list
, scm_read_sexp (chr
, port
, opts
)),
1780 port
, opts
, line
, column
);
1783 return scm_read_mixed_case_symbol (chr
, port
, opts
);
1785 return (scm_read_sexp (chr
, port
, opts
));
1787 return (scm_read_string (chr
, port
, opts
));
1789 if (opts
->r7rs_symbols_p
)
1790 return scm_read_r7rs_symbol (chr
, port
, opts
);
1792 return scm_read_mixed_case_symbol (chr
, port
, opts
);
1796 return (scm_read_quote (chr
, port
, opts
));
1799 long line
= SCM_LINUM (port
);
1800 int column
= SCM_COL (port
) - 1;
1801 SCM result
= scm_read_sharp (chr
, port
, opts
, line
, column
);
1802 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1803 /* We read a comment or some such. */
1809 scm_i_input_error (FUNC_NAME
, port
, "unexpected \")\"", SCM_EOL
);
1812 if (opts
->curly_infix_p
)
1813 scm_i_input_error (FUNC_NAME
, port
, "unexpected \"}\"", SCM_EOL
);
1815 return scm_read_mixed_case_symbol (chr
, port
, opts
);
1817 if (opts
->square_brackets_p
)
1818 scm_i_input_error (FUNC_NAME
, port
, "unexpected \"]\"", SCM_EOL
);
1819 /* otherwise fall through */
1823 if (opts
->keyword_style
== KEYWORD_STYLE_PREFIX
)
1824 return scm_symbol_to_keyword (scm_read_expression (port
, opts
));
1829 if (((chr
>= '0') && (chr
<= '9'))
1830 || (strchr ("+-.", chr
)))
1831 return (scm_read_number (chr
, port
, opts
));
1833 return (scm_read_mixed_case_symbol (chr
, port
, opts
));
1841 scm_read_expression (SCM port
, scm_t_read_opts
*opts
)
1842 #define FUNC_NAME "scm_read_expression"
1844 if (!opts
->neoteric_p
)
1845 return read_inner_expression (port
, opts
);
1852 if (opts
->record_positions_p
)
1854 /* We need to get the position of the first non-whitespace
1855 character in order to correctly annotate neoteric
1856 expressions. For example, for the expression 'f(x)', the
1857 first call to 'read_inner_expression' reads the 'f' (which
1858 cannot be annotated), and then we later read the '(x)' and
1859 use it to construct the new list (f x). */
1860 int c
= flush_ws (port
, opts
, (char *) NULL
);
1863 scm_ungetc_unlocked (c
, port
);
1864 line
= SCM_LINUM (port
);
1865 column
= SCM_COL (port
);
1868 expr
= read_inner_expression (port
, opts
);
1870 /* 'expr' is the first component of the neoteric expression. Now
1871 we loop, and as long as the next character is '(', '[', or '{',
1872 (without any intervening whitespace), we use it to construct a
1873 new expression. For example, f{n - 1}(x) => ((f (- n 1)) x). */
1876 int chr
= scm_getc_unlocked (port
);
1879 /* e(...) => (e ...) */
1880 expr
= scm_cons (expr
, scm_read_sexp (chr
, port
, opts
));
1881 else if (chr
== '[')
1882 /* e[...] => ($bracket-apply$ e ...) */
1883 expr
= scm_cons (sym_bracket_apply
,
1885 scm_read_sexp (chr
, port
, opts
)));
1886 else if (chr
== '{')
1888 SCM arg
= scm_read_sexp (chr
, port
, opts
);
1890 if (scm_is_null (arg
))
1891 expr
= scm_list_1 (expr
); /* e{} => (e) */
1893 expr
= scm_list_2 (expr
, arg
); /* e{...} => (e {...}) */
1898 scm_ungetc_unlocked (chr
, port
);
1901 maybe_annotate_source (expr
, port
, opts
, line
, column
);
1909 /* Actual reader. */
1911 static void init_read_options (SCM port
, scm_t_read_opts
*opts
);
1913 SCM_DEFINE (scm_read
, "read", 0, 1, 0,
1915 "Read an s-expression from the input port @var{port}, or from\n"
1916 "the current input port if @var{port} is not specified.\n"
1917 "Any whitespace before the next token is discarded.")
1918 #define FUNC_NAME s_scm_read
1920 scm_t_read_opts opts
;
1923 if (SCM_UNBNDP (port
))
1924 port
= scm_current_input_port ();
1925 SCM_VALIDATE_OPINPORT (1, port
);
1927 init_read_options (port
, &opts
);
1929 c
= flush_ws (port
, &opts
, (char *) NULL
);
1932 scm_ungetc_unlocked (c
, port
);
1934 return (scm_read_expression (port
, &opts
));
1941 /* Manipulate the read-hash-procedures alist. This could be written in
1942 Scheme, but maybe it will also be used by C code during initialisation. */
1943 SCM_DEFINE (scm_read_hash_extend
, "read-hash-extend", 2, 0, 0,
1944 (SCM chr
, SCM proc
),
1945 "Install the procedure @var{proc} for reading expressions\n"
1946 "starting with the character sequence @code{#} and @var{chr}.\n"
1947 "@var{proc} will be called with two arguments: the character\n"
1948 "@var{chr} and the port to read further data from. The object\n"
1949 "returned will be the return value of @code{read}. \n"
1950 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1952 #define FUNC_NAME s_scm_read_hash_extend
1957 SCM_VALIDATE_CHAR (1, chr
);
1958 SCM_ASSERT (scm_is_false (proc
)
1959 || scm_is_eq (scm_procedure_p (proc
), SCM_BOOL_T
),
1960 proc
, SCM_ARG2
, FUNC_NAME
);
1962 /* Check if chr is already in the alist. */
1963 this = scm_i_read_hash_procedures_ref ();
1967 if (scm_is_null (this))
1969 /* not found, so add it to the beginning. */
1970 if (scm_is_true (proc
))
1972 SCM
new = scm_cons (scm_cons (chr
, proc
),
1973 scm_i_read_hash_procedures_ref ());
1974 scm_i_read_hash_procedures_set_x (new);
1978 if (scm_is_eq (chr
, SCM_CAAR (this)))
1980 /* already in the alist. */
1981 if (scm_is_false (proc
))
1984 if (scm_is_false (prev
))
1986 SCM rest
= SCM_CDR (scm_i_read_hash_procedures_ref ());
1987 scm_i_read_hash_procedures_set_x (rest
);
1990 scm_set_cdr_x (prev
, SCM_CDR (this));
1995 scm_set_cdr_x (SCM_CAR (this), proc
);
2000 this = SCM_CDR (this);
2003 return SCM_UNSPECIFIED
;
2007 /* Recover the read-hash procedure corresponding to char c. */
2009 scm_get_hash_procedure (int c
)
2011 SCM rest
= scm_i_read_hash_procedures_ref ();
2015 if (scm_is_null (rest
))
2018 if (SCM_CHAR (SCM_CAAR (rest
)) == c
)
2019 return SCM_CDAR (rest
);
2021 rest
= SCM_CDR (rest
);
2026 is_encoding_char (char c
)
2028 if (c
>= 'a' && c
<= 'z') return 1;
2029 if (c
>= 'A' && c
<= 'Z') return 1;
2030 if (c
>= '0' && c
<= '9') return 1;
2031 return strchr ("_-.:/,+=()", c
) != NULL
;
2034 /* Maximum size of an encoding name. This is a bit more than the
2035 longest name listed at
2036 <http://www.iana.org/assignments/character-sets> ("ISO-2022-JP-2", 13
2038 #define ENCODING_NAME_MAX_SIZE 20
2040 /* Number of bytes at the beginning or end of a file that are scanned
2041 for a "coding:" declaration. */
2042 #define SCM_ENCODING_SEARCH_SIZE (500 + ENCODING_NAME_MAX_SIZE)
2045 /* Search the SCM_ENCODING_SEARCH_SIZE bytes of a file for an Emacs-like
2046 coding declaration. Returns either NULL or a string whose storage
2047 has been allocated with `scm_gc_malloc'. */
2049 scm_i_scan_for_encoding (SCM port
)
2052 char header
[SCM_ENCODING_SEARCH_SIZE
+1];
2053 size_t bytes_read
, encoding_length
, i
;
2054 char *encoding
= NULL
;
2055 char *pos
, *encoding_start
;
2058 pt
= SCM_PTAB_ENTRY (port
);
2060 if (pt
->rw_active
== SCM_PORT_WRITE
)
2061 scm_flush_unlocked (port
);
2064 pt
->rw_active
= SCM_PORT_READ
;
2066 if (pt
->read_pos
== pt
->read_end
)
2068 /* We can use the read buffer, and thus avoid a seek. */
2069 if (scm_fill_input_unlocked (port
) == EOF
)
2072 bytes_read
= pt
->read_end
- pt
->read_pos
;
2073 if (bytes_read
> SCM_ENCODING_SEARCH_SIZE
)
2074 bytes_read
= SCM_ENCODING_SEARCH_SIZE
;
2076 if (bytes_read
<= 1)
2077 /* An unbuffered port -- don't scan. */
2080 memcpy (header
, pt
->read_pos
, bytes_read
);
2081 header
[bytes_read
] = '\0';
2085 /* Try to read some bytes and then seek back. Not all ports
2086 support seeking back; and indeed some file ports (like
2087 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
2088 check performed by SCM_FPORT_FDES---but fail to seek
2089 backwards. Hence this block comes second. We prefer to use
2090 the read buffer in-place. */
2091 if (SCM_FPORTP (port
) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port
)))
2094 bytes_read
= scm_c_read_unlocked (port
, header
, SCM_ENCODING_SEARCH_SIZE
);
2095 header
[bytes_read
] = '\0';
2096 scm_seek (port
, scm_from_int (0), scm_from_int (SEEK_SET
));
2099 /* search past "coding[:=]" */
2103 if ((pos
= strstr(pos
, "coding")) == NULL
)
2106 pos
+= strlen ("coding");
2107 if (pos
- header
>= SCM_ENCODING_SEARCH_SIZE
||
2108 (*pos
== ':' || *pos
== '='))
2116 while (pos
- header
<= SCM_ENCODING_SEARCH_SIZE
&&
2117 (*pos
== ' ' || *pos
== '\t'))
2120 if (pos
- header
>= SCM_ENCODING_SEARCH_SIZE
- ENCODING_NAME_MAX_SIZE
)
2121 /* We found the "coding:" string, but there is probably not enough
2122 room to store an encoding name in its entirety, so ignore it.
2123 This makes sure we do not end up returning a truncated encoding
2127 /* grab the next token */
2128 encoding_start
= pos
;
2130 while (encoding_start
+ i
- header
<= SCM_ENCODING_SEARCH_SIZE
2131 && encoding_start
+ i
- header
< bytes_read
2132 && is_encoding_char (encoding_start
[i
]))
2135 encoding_length
= i
;
2136 if (encoding_length
== 0)
2139 encoding
= scm_gc_strndup (encoding_start
, encoding_length
, "encoding");
2141 /* push backwards to make sure we were in a comment */
2143 pos
= encoding_start
;
2144 while (pos
>= header
)
2151 else if (*pos
== '\n' || pos
== header
)
2153 /* This wasn't in a semicolon comment. Check for a
2154 hash-bang comment. */
2155 char *beg
= strstr (header
, "#!");
2156 char *end
= strstr (header
, "!#");
2157 if (beg
< encoding_start
&& encoding_start
+ encoding_length
<= end
)
2168 /* This wasn't in a comment */
2174 SCM_DEFINE (scm_file_encoding
, "file-encoding", 1, 0, 0,
2176 "Scans the port for an Emacs-like character coding declaration\n"
2177 "near the top of the contents of a port with random-accessible contents.\n"
2178 "The coding declaration is of the form\n"
2179 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
2181 "Returns a string containing the character encoding of the file\n"
2182 "if a declaration was found, or @code{#f} otherwise.\n")
2183 #define FUNC_NAME s_scm_file_encoding
2188 SCM_VALIDATE_OPINPORT (SCM_ARG1
, port
);
2190 enc
= scm_i_scan_for_encoding (port
);
2195 s_enc
= scm_string_upcase (scm_from_locale_string (enc
));
2204 /* Per-port read options.
2206 We store per-port read options in the 'port-read-options' port
2207 property, which is stored in the internal port structure. The value
2208 stored is a single integer that contains a two-bit field for each
2211 If a bit field contains READ_OPTION_INHERIT (3), that indicates that
2212 the applicable value should be inherited from the corresponding
2213 global read option. Otherwise, the bit field contains the value of
2214 the read option. For boolean read options that have been set
2215 per-port, the possible values are 0 or 1. If the 'keyword_style'
2216 read option has been set per-port, its possible values are those in
2217 'enum t_keyword_style'. */
2219 /* Key to read options in port properties. */
2220 SCM_SYMBOL (sym_port_read_options
, "port-read-options");
2222 /* Offsets of bit fields for each per-port override */
2223 #define READ_OPTION_COPY_SOURCE_P 0
2224 #define READ_OPTION_RECORD_POSITIONS_P 2
2225 #define READ_OPTION_CASE_INSENSITIVE_P 4
2226 #define READ_OPTION_KEYWORD_STYLE 6
2227 #define READ_OPTION_R6RS_ESCAPES_P 8
2228 #define READ_OPTION_SQUARE_BRACKETS_P 10
2229 #define READ_OPTION_HUNGRY_EOL_ESCAPES_P 12
2230 #define READ_OPTION_CURLY_INFIX_P 14
2231 #define READ_OPTION_R7RS_SYMBOLS_P 16
2233 /* The total width in bits of the per-port overrides */
2234 #define READ_OPTIONS_NUM_BITS 18
2236 #define READ_OPTIONS_INHERIT_ALL ((1UL << READ_OPTIONS_NUM_BITS) - 1)
2237 #define READ_OPTIONS_MAX_VALUE READ_OPTIONS_INHERIT_ALL
2239 #define READ_OPTION_MASK 3
2240 #define READ_OPTION_INHERIT 3
2243 set_port_read_option (SCM port
, int option
, int new_value
)
2245 SCM scm_read_options
;
2246 unsigned int read_options
;
2248 new_value
&= READ_OPTION_MASK
;
2250 scm_dynwind_begin (0);
2251 scm_dynwind_lock_port (port
);
2253 scm_read_options
= scm_i_port_property (port
, sym_port_read_options
);
2254 if (scm_is_unsigned_integer (scm_read_options
, 0, READ_OPTIONS_MAX_VALUE
))
2255 read_options
= scm_to_uint (scm_read_options
);
2257 read_options
= READ_OPTIONS_INHERIT_ALL
;
2258 read_options
&= ~(READ_OPTION_MASK
<< option
);
2259 read_options
|= new_value
<< option
;
2260 scm_read_options
= scm_from_uint (read_options
);
2261 scm_i_set_port_property_x (port
, sym_port_read_options
, scm_read_options
);
2266 /* Set OPTS and PORT's case-insensitivity according to VALUE. */
2268 set_port_case_insensitive_p (SCM port
, scm_t_read_opts
*opts
, int value
)
2271 opts
->case_insensitive_p
= value
;
2272 set_port_read_option (port
, READ_OPTION_CASE_INSENSITIVE_P
, value
);
2275 /* Set OPTS and PORT's square_brackets_p option according to VALUE. */
2277 set_port_square_brackets_p (SCM port
, scm_t_read_opts
*opts
, int value
)
2280 opts
->square_brackets_p
= value
;
2281 set_port_read_option (port
, READ_OPTION_SQUARE_BRACKETS_P
, value
);
2284 /* Set OPTS and PORT's curly_infix_p option according to VALUE. */
2286 set_port_curly_infix_p (SCM port
, scm_t_read_opts
*opts
, int value
)
2289 opts
->curly_infix_p
= value
;
2290 set_port_read_option (port
, READ_OPTION_CURLY_INFIX_P
, value
);
2293 /* Initialize OPTS based on PORT's read options and the global read
2296 init_read_options (SCM port
, scm_t_read_opts
*opts
)
2298 SCM val
, scm_read_options
;
2299 unsigned int read_options
, x
;
2301 scm_read_options
= scm_i_port_property (port
, sym_port_read_options
);
2303 if (scm_is_unsigned_integer (scm_read_options
, 0, READ_OPTIONS_MAX_VALUE
))
2304 read_options
= scm_to_uint (scm_read_options
);
2306 read_options
= READ_OPTIONS_INHERIT_ALL
;
2308 x
= READ_OPTION_MASK
& (read_options
>> READ_OPTION_KEYWORD_STYLE
);
2309 if (x
== READ_OPTION_INHERIT
)
2311 val
= SCM_PACK (SCM_KEYWORD_STYLE
);
2312 if (scm_is_eq (val
, scm_keyword_prefix
))
2313 x
= KEYWORD_STYLE_PREFIX
;
2314 else if (scm_is_eq (val
, scm_keyword_postfix
))
2315 x
= KEYWORD_STYLE_POSTFIX
;
2317 x
= KEYWORD_STYLE_HASH_PREFIX
;
2319 opts
->keyword_style
= x
;
2321 #define RESOLVE_BOOLEAN_OPTION(NAME, name) \
2324 x = READ_OPTION_MASK & (read_options >> READ_OPTION_ ## NAME); \
2325 if (x == READ_OPTION_INHERIT) \
2326 x = !!SCM_ ## NAME; \
2331 RESOLVE_BOOLEAN_OPTION (COPY_SOURCE_P
, copy_source_p
);
2332 RESOLVE_BOOLEAN_OPTION (RECORD_POSITIONS_P
, record_positions_p
);
2333 RESOLVE_BOOLEAN_OPTION (CASE_INSENSITIVE_P
, case_insensitive_p
);
2334 RESOLVE_BOOLEAN_OPTION (R6RS_ESCAPES_P
, r6rs_escapes_p
);
2335 RESOLVE_BOOLEAN_OPTION (SQUARE_BRACKETS_P
, square_brackets_p
);
2336 RESOLVE_BOOLEAN_OPTION (HUNGRY_EOL_ESCAPES_P
, hungry_eol_escapes_p
);
2337 RESOLVE_BOOLEAN_OPTION (CURLY_INFIX_P
, curly_infix_p
);
2338 RESOLVE_BOOLEAN_OPTION (R7RS_SYMBOLS_P
, r7rs_symbols_p
);
2340 #undef RESOLVE_BOOLEAN_OPTION
2342 opts
->neoteric_p
= 0;
2348 SCM read_hash_procs
;
2350 read_hash_procs
= scm_make_fluid_with_default (SCM_EOL
);
2352 scm_i_read_hash_procedures
=
2353 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs
));
2355 scm_init_opts (scm_read_options
, scm_read_opts
);
2356 #include "libguile/read.x"