1 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
2 * 2007, 2008, 2009, 2010, 2011, 2012, 2014 Free Software Foundation, Inc.
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
33 #include <c-strcase.h>
36 #include "libguile/_scm.h"
37 #include "libguile/bytevectors.h"
38 #include "libguile/chars.h"
39 #include "libguile/eval.h"
40 #include "libguile/arrays.h"
41 #include "libguile/bitvectors.h"
42 #include "libguile/keywords.h"
43 #include "libguile/alist.h"
44 #include "libguile/srcprop.h"
45 #include "libguile/hashtab.h"
46 #include "libguile/hash.h"
47 #include "libguile/ports.h"
48 #include "libguile/fports.h"
49 #include "libguile/root.h"
50 #include "libguile/strings.h"
51 #include "libguile/strports.h"
52 #include "libguile/vectors.h"
53 #include "libguile/validate.h"
54 #include "libguile/srfi-4.h"
55 #include "libguile/srfi-13.h"
57 #include "libguile/read.h"
58 #include "libguile/private-options.h"
63 SCM_GLOBAL_SYMBOL (scm_sym_dot
, ".");
64 SCM_SYMBOL (scm_keyword_prefix
, "prefix");
65 SCM_SYMBOL (scm_keyword_postfix
, "postfix");
66 SCM_SYMBOL (sym_nil
, "nil");
68 /* SRFI-105 curly infix expression support */
69 SCM_SYMBOL (sym_nfx
, "$nfx$");
70 SCM_SYMBOL (sym_bracket_list
, "$bracket-list$");
71 SCM_SYMBOL (sym_bracket_apply
, "$bracket-apply$");
73 scm_t_option scm_read_opts
[] =
75 { SCM_OPTION_BOOLEAN
, "copy", 0,
76 "Copy source code expressions." },
77 { SCM_OPTION_BOOLEAN
, "positions", 1,
78 "Record positions of source code expressions." },
79 { SCM_OPTION_BOOLEAN
, "case-insensitive", 0,
80 "Convert symbols to lower case."},
81 { SCM_OPTION_SCM
, "keywords", (scm_t_bits
) SCM_BOOL_F_BITS
,
82 "Style of keyword recognition: #f, 'prefix or 'postfix."},
83 { SCM_OPTION_BOOLEAN
, "r6rs-hex-escapes", 0,
84 "Use R6RS variable-length character and string hex escapes."},
85 { SCM_OPTION_BOOLEAN
, "square-brackets", 1,
86 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
87 { SCM_OPTION_BOOLEAN
, "hungry-eol-escapes", 0,
88 "In strings, consume leading whitespace after an escaped end-of-line."},
89 { SCM_OPTION_BOOLEAN
, "curly-infix", 0,
90 "Support SRFI-105 curly infix expressions."},
91 { SCM_OPTION_BOOLEAN
, "r7rs-symbols", 0,
92 "Support R7RS |...| symbol notation."},
96 /* Internal read options structure. This is initialized by 'scm_read'
97 from the global and per-port read options, and a pointer is passed
98 down to all helper functions. */
102 KEYWORD_STYLE_HASH_PREFIX
,
103 KEYWORD_STYLE_PREFIX
,
104 KEYWORD_STYLE_POSTFIX
109 enum t_keyword_style keyword_style
;
110 unsigned int copy_source_p
: 1;
111 unsigned int record_positions_p
: 1;
112 unsigned int case_insensitive_p
: 1;
113 unsigned int r6rs_escapes_p
: 1;
114 unsigned int square_brackets_p
: 1;
115 unsigned int hungry_eol_escapes_p
: 1;
116 unsigned int curly_infix_p
: 1;
117 unsigned int neoteric_p
: 1;
118 unsigned int r7rs_symbols_p
: 1;
121 typedef struct t_read_opts scm_t_read_opts
;
125 Give meaningful error messages for errors
129 FILE:LINE:COL: MESSAGE
130 This happened in ....
132 This is not standard GNU format, but the test-suite likes the real
133 message to be in front.
139 scm_i_input_error (char const *function
,
140 SCM port
, const char *message
, SCM arg
)
142 SCM fn
= (scm_is_string (SCM_FILENAME(port
))
144 : scm_from_locale_string ("#<unknown port>"));
146 SCM string_port
= scm_open_output_string ();
147 SCM string
= SCM_EOL
;
148 scm_simple_format (string_port
,
149 scm_from_locale_string ("~A:~S:~S: ~A"),
151 scm_from_long (SCM_LINUM (port
) + 1),
152 scm_from_int (SCM_COL (port
) + 1),
153 scm_from_locale_string (message
)));
155 string
= scm_get_output_string (string_port
);
156 scm_close_output_port (string_port
);
157 scm_error_scm (scm_from_latin1_symbol ("read-error"),
158 function
? scm_from_locale_string (function
) : SCM_BOOL_F
,
165 SCM_DEFINE (scm_read_options
, "read-options-interface", 0, 1, 0,
167 "Option interface for the read options. Instead of using\n"
168 "this procedure directly, use the procedures @code{read-enable},\n"
169 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
170 #define FUNC_NAME s_scm_read_options
172 SCM ans
= scm_options (setting
,
175 if (SCM_COPY_SOURCE_P
)
176 SCM_RECORD_POSITIONS_P
= 1;
181 /* A fluid referring to an association list mapping extra hash
182 characters to procedures. */
183 static SCM
*scm_i_read_hash_procedures
;
186 scm_i_read_hash_procedures_ref (void)
188 return scm_fluid_ref (*scm_i_read_hash_procedures
);
192 scm_i_read_hash_procedures_set_x (SCM value
)
194 scm_fluid_set_x (*scm_i_read_hash_procedures
, value
);
201 /* Size of the C buffer used to read symbols and numbers. */
202 #define READER_BUFFER_SIZE 128
204 /* Number of 32-bit codepoints in the buffer used to read strings. */
205 #define READER_STRING_BUFFER_SIZE 128
207 /* The maximum size of Scheme character names. */
208 #define READER_CHAR_NAME_MAX_SIZE 50
210 /* The maximum size of reader directive names. */
211 #define READER_DIRECTIVE_NAME_MAX_SIZE 50
214 /* `isblank' is only in C99. */
215 #define CHAR_IS_BLANK_(_chr) \
216 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
217 || ((_chr) == '\f') || ((_chr) == '\r'))
220 # define CHAR_IS_BLANK(_chr) \
221 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
223 # define CHAR_IS_BLANK CHAR_IS_BLANK_
227 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
229 #define CHAR_IS_R5RS_DELIMITER(c) \
231 || (c) == ')' || (c) == '(' || (c) == ';' || (c) == '"')
233 #define CHAR_IS_DELIMITER(c) \
234 (CHAR_IS_R5RS_DELIMITER (c) \
235 || (((c) == ']' || (c) == '[') && (opts->square_brackets_p \
236 || opts->curly_infix_p)) \
237 || (((c) == '}' || (c) == '{') && opts->curly_infix_p))
239 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
241 #define CHAR_IS_EXPONENT_MARKER(_chr) \
242 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
243 || ((_chr) == 'd') || ((_chr) == 'l'))
245 /* Read an SCSH block comment. */
246 static SCM
scm_read_scsh_block_comment (scm_t_wchar
, SCM
);
247 static SCM
scm_read_r6rs_block_comment (scm_t_wchar
, SCM
);
248 static SCM
scm_read_commented_expression (scm_t_wchar
, SCM
, scm_t_read_opts
*);
249 static SCM
scm_read_shebang (scm_t_wchar
, SCM
, scm_t_read_opts
*);
250 static SCM
scm_get_hash_procedure (int);
252 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
253 result in the pre-allocated buffer BUF. Return zero if the whole token has
254 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
255 bytes actually read. */
257 read_token (SCM port
, scm_t_read_opts
*opts
,
258 char *buf
, size_t buf_size
, size_t *read
)
262 while (*read
< buf_size
)
266 chr
= scm_get_byte_or_eof (port
);
270 else if (CHAR_IS_DELIMITER (chr
))
272 scm_unget_byte (chr
, port
);
285 /* Like `read_token', but return either BUFFER, or a GC-allocated buffer
286 if the token doesn't fit in BUFFER_SIZE bytes. */
288 read_complete_token (SCM port
, scm_t_read_opts
*opts
,
289 char *buffer
, size_t buffer_size
, size_t *read
)
292 size_t bytes_read
, overflow_size
= 0;
293 char *overflow_buffer
= NULL
;
297 overflow
= read_token (port
, opts
, buffer
, buffer_size
, &bytes_read
);
300 if (overflow
|| overflow_size
!= 0)
302 if (overflow_size
== 0)
304 overflow_buffer
= scm_gc_malloc_pointerless (bytes_read
, "read");
305 memcpy (overflow_buffer
, buffer
, bytes_read
);
306 overflow_size
= bytes_read
;
311 scm_gc_malloc_pointerless (overflow_size
+ bytes_read
, "read");
313 memcpy (new_buf
, overflow_buffer
, overflow_size
);
314 memcpy (new_buf
+ overflow_size
, buffer
, bytes_read
);
316 overflow_buffer
= new_buf
;
317 overflow_size
+= bytes_read
;
324 *read
= overflow_size
;
328 return (overflow_size
> 0 ? overflow_buffer
: buffer
);
331 /* Skip whitespace from PORT and return the first non-whitespace character
332 read. Raise an error on end-of-file. */
334 flush_ws (SCM port
, scm_t_read_opts
*opts
, const char *eoferr
)
338 switch (c
= scm_getc (port
))
344 scm_i_input_error (eoferr
,
353 switch (c
= scm_getc (port
))
359 case SCM_LINE_INCREMENTORS
:
365 switch (c
= scm_getc (port
))
368 eoferr
= "read_sharp";
371 scm_read_shebang (c
, port
, opts
);
374 scm_read_commented_expression (c
, port
, opts
);
377 if (scm_is_false (scm_get_hash_procedure (c
)))
379 scm_read_r6rs_block_comment (c
, port
);
384 scm_ungetc (c
, port
);
389 case SCM_LINE_INCREMENTORS
:
390 case SCM_SINGLE_SPACES
:
405 static SCM
scm_read_expression (SCM port
, scm_t_read_opts
*opts
);
406 static SCM
scm_read_sharp (int chr
, SCM port
, scm_t_read_opts
*opts
,
407 long line
, int column
);
411 maybe_annotate_source (SCM x
, SCM port
, scm_t_read_opts
*opts
,
412 long line
, int column
)
414 if (opts
->record_positions_p
)
415 scm_i_set_source_properties_x (x
, line
, column
, SCM_FILENAME (port
));
420 scm_read_sexp (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
421 #define FUNC_NAME "scm_i_lreadparen"
424 SCM tmp
, tl
, ans
= SCM_EOL
;
425 const int curly_list_p
= (chr
== '{') && opts
->curly_infix_p
;
426 const int terminating_char
= ((chr
== '{') ? '}'
427 : ((chr
== '[') ? ']'
430 /* Need to capture line and column numbers here. */
431 long line
= SCM_LINUM (port
);
432 int column
= SCM_COL (port
) - 1;
434 c
= flush_ws (port
, opts
, FUNC_NAME
);
435 if (terminating_char
== c
)
438 scm_ungetc (c
, port
);
439 tmp
= scm_read_expression (port
, opts
);
441 /* Note that it is possible for scm_read_expression to return
442 scm_sym_dot, but not as part of a dotted pair: as in #{.}#. So
443 check that it's a real dot by checking `c'. */
444 if (c
== '.' && scm_is_eq (scm_sym_dot
, tmp
))
446 ans
= scm_read_expression (port
, opts
);
447 if (terminating_char
!= (c
= flush_ws (port
, opts
, FUNC_NAME
)))
448 scm_i_input_error (FUNC_NAME
, port
, "missing close paren",
453 /* Build the head of the list structure. */
454 ans
= tl
= scm_cons (tmp
, SCM_EOL
);
456 while (terminating_char
!= (c
= flush_ws (port
, opts
, FUNC_NAME
)))
460 if (c
== ')' || (c
== ']' && opts
->square_brackets_p
)
461 || ((c
== '}' || c
== ']') && opts
->curly_infix_p
))
462 scm_i_input_error (FUNC_NAME
, port
,
463 "in pair: mismatched close paren: ~A",
464 scm_list_1 (SCM_MAKE_CHAR (c
)));
466 scm_ungetc (c
, port
);
467 tmp
= scm_read_expression (port
, opts
);
469 /* See above note about scm_sym_dot. */
470 if (c
== '.' && scm_is_eq (scm_sym_dot
, tmp
))
472 SCM_SETCDR (tl
, scm_read_expression (port
, opts
));
474 c
= flush_ws (port
, opts
, FUNC_NAME
);
475 if (terminating_char
!= c
)
476 scm_i_input_error (FUNC_NAME
, port
,
477 "in pair: missing close paren", SCM_EOL
);
481 new_tail
= scm_cons (tmp
, SCM_EOL
);
482 SCM_SETCDR (tl
, new_tail
);
488 /* In addition to finding the length, 'scm_ilength' checks for
489 improper or circular lists, in which case it returns -1. */
490 int len
= scm_ilength (ans
);
492 /* The (len == 0) case is handled above */
494 /* Return directly to avoid re-annotating the element's source
495 location with the position of the outer brace. Also, it
496 might not be possible to annotate the element. */
497 return scm_car (ans
); /* {e} => e */
499 ; /* Leave the list unchanged: {e1 e2} => (e1 e2) */
500 else if (len
>= 3 && (len
& 1))
502 /* It's a proper list whose length is odd and at least 3. If
503 the elements at odd indices (the infix operator positions)
504 are all 'equal?', then it's a simple curly-infix list.
505 Otherwise it's a mixed curly-infix list. */
506 SCM op
= scm_cadr (ans
);
508 /* Check to see if the elements at odd indices are 'equal?' */
509 for (tl
= scm_cdddr (ans
); ; tl
= scm_cddr (tl
))
511 if (scm_is_null (tl
))
513 /* Convert simple curly-infix list to prefix:
514 {a <op> b <op> ...} => (<op> a b ...) */
516 while (scm_is_pair (scm_cdr (tl
)))
519 SCM_SETCDR (tl
, tmp
);
522 ans
= scm_cons (op
, ans
);
525 else if (scm_is_false (scm_equal_p (op
, scm_car (tl
))))
527 /* Mixed curly-infix list: {e ...} => ($nfx$ e ...) */
528 ans
= scm_cons (sym_nfx
, ans
);
534 /* Mixed curly-infix (possibly improper) list:
535 {e . tail} => ($nfx$ e . tail) */
536 ans
= scm_cons (sym_nfx
, ans
);
539 return maybe_annotate_source (ans
, port
, opts
, line
, column
);
544 /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
545 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
547 #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
553 while (i < ndigits) \
555 a = scm_getc (port); \
559 && (a == (scm_t_wchar) terminator) \
562 if ('0' <= a && a <= '9') \
564 else if ('A' <= a && a <= 'F') \
566 else if ('a' <= a && a <= 'f') \
579 skip_intraline_whitespace (SCM port
)
589 while (c
== '\t' || uc_is_general_category (c
, UC_SPACE_SEPARATOR
));
591 scm_ungetc (c
, port
);
594 /* Read either a double-quoted string or an R7RS-style symbol delimited
595 by vertical lines, depending on the value of 'chr' ('"' or '|').
596 Regardless, the result is always returned as a string. */
598 scm_read_string_like_syntax (int chr
, SCM port
, scm_t_read_opts
*opts
)
599 #define FUNC_NAME "scm_lreadr"
601 /* For strings smaller than C_STR, this function creates only one Scheme
602 object (the string returned). */
605 size_t c_str_len
= 0;
606 scm_t_wchar c
, c_str
[READER_STRING_BUFFER_SIZE
];
608 /* Need to capture line and column numbers here. */
609 long line
= SCM_LINUM (port
);
610 int column
= SCM_COL (port
) - 1;
612 while (chr
!= (c
= scm_getc (port
)))
617 scm_i_input_error (FUNC_NAME
, port
,
619 ? "end of file in symbol"
620 : "end of file in string constant"),
624 if (c_str_len
+ 1 >= READER_STRING_BUFFER_SIZE
)
626 str
= scm_cons (scm_from_utf32_stringn (c_str
, c_str_len
), str
);
632 switch (c
= scm_getc (port
))
640 if (opts
->hungry_eol_escapes_p
)
641 skip_intraline_whitespace (port
);
668 if (opts
->r6rs_escapes_p
|| chr
== '|')
669 SCM_READ_HEX_ESCAPE (10, ';');
671 SCM_READ_HEX_ESCAPE (2, '\0');
674 if (!opts
->r6rs_escapes_p
)
676 SCM_READ_HEX_ESCAPE (4, '\0');
680 if (!opts
->r6rs_escapes_p
)
682 SCM_READ_HEX_ESCAPE (6, '\0');
689 scm_i_input_error (FUNC_NAME
, port
,
690 "illegal character in escape sequence: ~S",
691 scm_list_1 (SCM_MAKE_CHAR (c
)));
695 c_str
[c_str_len
++] = c
;
698 if (scm_is_null (str
))
699 /* Fast path: we got a string that fits in C_STR. */
700 str
= scm_from_utf32_stringn (c_str
, c_str_len
);
704 str
= scm_cons (scm_from_utf32_stringn (c_str
, c_str_len
), str
);
706 str
= scm_string_concatenate_reverse (str
, SCM_UNDEFINED
, SCM_UNDEFINED
);
709 return maybe_annotate_source (str
, port
, opts
, line
, column
);
714 scm_read_string (int chr
, SCM port
, scm_t_read_opts
*opts
)
716 return scm_read_string_like_syntax (chr
, port
, opts
);
720 scm_read_r7rs_symbol (int chr
, SCM port
, scm_t_read_opts
*opts
)
722 return scm_string_to_symbol (scm_read_string_like_syntax (chr
, port
, opts
));
726 scm_read_number (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
728 SCM result
, str
= SCM_EOL
;
729 char local_buffer
[READER_BUFFER_SIZE
], *buffer
;
731 scm_t_port
*pt
= SCM_PTAB_ENTRY (port
);
733 /* Need to capture line and column numbers here. */
734 long line
= SCM_LINUM (port
);
735 int column
= SCM_COL (port
) - 1;
737 scm_ungetc (chr
, port
);
738 buffer
= read_complete_token (port
, opts
, local_buffer
, sizeof local_buffer
,
741 str
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
, pt
->ilseq_handler
);
743 result
= scm_string_to_number (str
, SCM_UNDEFINED
);
744 if (scm_is_false (result
))
746 /* Return a symbol instead of a number */
747 if (opts
->case_insensitive_p
)
748 str
= scm_string_downcase_x (str
);
749 result
= scm_string_to_symbol (str
);
751 else if (SCM_NIMP (result
))
752 result
= maybe_annotate_source (result
, port
, opts
, line
, column
);
754 SCM_COL (port
) += scm_i_string_length (str
);
759 scm_read_mixed_case_symbol (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
762 int ends_with_colon
= 0;
764 int postfix
= (opts
->keyword_style
== KEYWORD_STYLE_POSTFIX
);
765 char local_buffer
[READER_BUFFER_SIZE
], *buffer
;
766 scm_t_port
*pt
= SCM_PTAB_ENTRY (port
);
769 scm_ungetc (chr
, port
);
770 buffer
= read_complete_token (port
, opts
, local_buffer
, sizeof local_buffer
,
773 ends_with_colon
= buffer
[bytes_read
- 1] == ':';
775 if (postfix
&& ends_with_colon
&& (bytes_read
> 1))
777 str
= scm_from_stringn (buffer
, bytes_read
- 1,
778 pt
->encoding
, pt
->ilseq_handler
);
780 if (opts
->case_insensitive_p
)
781 str
= scm_string_downcase_x (str
);
782 result
= scm_symbol_to_keyword (scm_string_to_symbol (str
));
786 str
= scm_from_stringn (buffer
, bytes_read
,
787 pt
->encoding
, pt
->ilseq_handler
);
789 if (opts
->case_insensitive_p
)
790 str
= scm_string_downcase_x (str
);
791 result
= scm_string_to_symbol (str
);
794 SCM_COL (port
) += scm_i_string_length (str
);
799 scm_read_number_and_radix (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
800 #define FUNC_NAME "scm_lreadr"
804 char local_buffer
[READER_BUFFER_SIZE
], *buffer
;
832 scm_ungetc (chr
, port
);
833 scm_ungetc ('#', port
);
837 buffer
= read_complete_token (port
, opts
, local_buffer
, sizeof local_buffer
,
840 pt
= SCM_PTAB_ENTRY (port
);
841 str
= scm_from_stringn (buffer
, read
, pt
->encoding
, pt
->ilseq_handler
);
843 result
= scm_string_to_number (str
, scm_from_uint (radix
));
845 SCM_COL (port
) += scm_i_string_length (str
);
847 if (scm_is_true (result
))
850 scm_i_input_error (FUNC_NAME
, port
, "unknown # object", SCM_EOL
);
857 scm_read_quote (int chr
, SCM port
, scm_t_read_opts
*opts
)
860 long line
= SCM_LINUM (port
);
861 int column
= SCM_COL (port
) - 1;
866 p
= scm_sym_quasiquote
;
879 p
= scm_sym_uq_splicing
;
882 scm_ungetc (c
, port
);
889 fprintf (stderr
, "%s: unhandled quote character (%i)\n",
890 "scm_read_quote", chr
);
894 p
= scm_cons2 (p
, scm_read_expression (port
, opts
), SCM_EOL
);
895 return maybe_annotate_source (p
, port
, opts
, line
, column
);
898 SCM_SYMBOL (sym_syntax
, "syntax");
899 SCM_SYMBOL (sym_quasisyntax
, "quasisyntax");
900 SCM_SYMBOL (sym_unsyntax
, "unsyntax");
901 SCM_SYMBOL (sym_unsyntax_splicing
, "unsyntax-splicing");
904 scm_read_syntax (int chr
, SCM port
, scm_t_read_opts
*opts
)
907 long line
= SCM_LINUM (port
);
908 int column
= SCM_COL (port
) - 1;
926 p
= sym_unsyntax_splicing
;
929 scm_ungetc (c
, port
);
936 fprintf (stderr
, "%s: unhandled syntax character (%i)\n",
937 "scm_read_syntax", chr
);
941 p
= scm_cons2 (p
, scm_read_expression (port
, opts
), SCM_EOL
);
942 return maybe_annotate_source (p
, port
, opts
, line
, column
);
946 scm_read_nil (int chr
, SCM port
, scm_t_read_opts
*opts
)
948 SCM id
= scm_read_mixed_case_symbol (chr
, port
, opts
);
950 if (!scm_is_eq (id
, sym_nil
))
951 scm_i_input_error ("scm_read_nil", port
,
952 "unexpected input while reading #nil: ~a",
955 return SCM_ELISP_NIL
;
959 scm_read_semicolon_comment (int chr
, SCM port
)
963 /* We use the get_byte here because there is no need to get the
964 locale correct with comment input. This presumes that newline
965 always represents itself no matter what the encoding is. */
966 for (c
= scm_get_byte_or_eof (port
);
967 (c
!= EOF
) && (c
!= '\n');
968 c
= scm_get_byte_or_eof (port
));
970 return SCM_UNSPECIFIED
;
973 /* If the EXPECTED_CHARS are the next ones available from PORT, then
974 consume them and return 1. Otherwise leave the port position where
975 it was and return 0. EXPECTED_CHARS should be all lowercase, and
976 will be matched case-insensitively against the characters read from
979 try_read_ci_chars (SCM port
, const char *expected_chars
)
981 int num_chars_wanted
= strlen (expected_chars
);
982 int num_chars_read
= 0;
983 char *chars_read
= alloca (num_chars_wanted
);
986 while (num_chars_read
< num_chars_wanted
)
991 else if (c_tolower (c
) != expected_chars
[num_chars_read
])
993 scm_ungetc (c
, port
);
997 chars_read
[num_chars_read
++] = c
;
1000 if (num_chars_read
== num_chars_wanted
)
1004 while (num_chars_read
> 0)
1005 scm_ungetc (chars_read
[--num_chars_read
], port
);
1011 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
1014 scm_read_boolean (int chr
, SCM port
)
1020 try_read_ci_chars (port
, "rue");
1025 try_read_ci_chars (port
, "alse");
1029 return SCM_UNSPECIFIED
;
1033 scm_read_character (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
1034 #define FUNC_NAME "scm_lreadr"
1036 char buffer
[READER_CHAR_NAME_MAX_SIZE
];
1038 size_t charname_len
, bytes_read
;
1043 overflow
= read_token (port
, opts
, buffer
, READER_CHAR_NAME_MAX_SIZE
,
1046 scm_i_input_error (FUNC_NAME
, port
, "character name too long", SCM_EOL
);
1048 if (bytes_read
== 0)
1050 chr
= scm_getc (port
);
1052 scm_i_input_error (FUNC_NAME
, port
, "unexpected end of file "
1053 "while reading character", SCM_EOL
);
1055 /* CHR must be a token delimiter, like a whitespace. */
1056 return (SCM_MAKE_CHAR (chr
));
1059 pt
= SCM_PTAB_ENTRY (port
);
1061 /* Simple ASCII characters can be processed immediately. Also, simple
1062 ISO-8859-1 characters can be processed immediately if the encoding for this
1063 port is ISO-8859-1. */
1064 if (bytes_read
== 1 && ((unsigned char) buffer
[0] <= 127 || pt
->encoding
== NULL
))
1066 SCM_COL (port
) += 1;
1067 return SCM_MAKE_CHAR (buffer
[0]);
1070 /* Otherwise, convert the buffer into a proper scheme string for
1072 charname
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
,
1074 charname_len
= scm_i_string_length (charname
);
1075 SCM_COL (port
) += charname_len
;
1076 cp
= scm_i_string_ref (charname
, 0);
1077 if (charname_len
== 1)
1078 return SCM_MAKE_CHAR (cp
);
1080 /* Ignore dotted circles, which may be used to keep combining characters from
1081 combining with the backslash in #\charname. */
1082 if (cp
== SCM_CODEPOINT_DOTTED_CIRCLE
&& charname_len
== 2)
1083 return SCM_MAKE_CHAR (scm_i_string_ref (charname
, 1));
1085 if (cp
>= '0' && cp
< '8')
1087 /* Dirk:FIXME:: This type of character syntax is not R5RS
1088 * compliant. Further, it should be verified that the constant
1089 * does only consist of octal digits. */
1090 SCM p
= scm_string_to_number (charname
, scm_from_uint (8));
1091 if (SCM_I_INUMP (p
))
1093 scm_t_wchar c
= scm_to_uint32 (p
);
1094 if (SCM_IS_UNICODE_CHAR (c
))
1095 return SCM_MAKE_CHAR (c
);
1097 scm_i_input_error (FUNC_NAME
, port
,
1098 "out-of-range octal character escape: ~a",
1099 scm_list_1 (charname
));
1103 if (cp
== 'x' && (charname_len
> 1))
1107 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1108 p
= scm_string_to_number (scm_c_substring (charname
, 1, charname_len
),
1109 scm_from_uint (16));
1110 if (SCM_I_INUMP (p
))
1112 scm_t_wchar c
= scm_to_uint32 (p
);
1113 if (SCM_IS_UNICODE_CHAR (c
))
1114 return SCM_MAKE_CHAR (c
);
1116 scm_i_input_error (FUNC_NAME
, port
,
1117 "out-of-range hex character escape: ~a",
1118 scm_list_1 (charname
));
1122 /* The names of characters should never have non-Latin1
1124 if (scm_i_is_narrow_string (charname
)
1125 || scm_i_try_narrow_string (charname
))
1126 { SCM ch
= scm_i_charname_to_char (scm_i_string_chars (charname
),
1128 if (scm_is_true (ch
))
1132 scm_i_input_error (FUNC_NAME
, port
, "unknown character name ~a",
1133 scm_list_1 (charname
));
1135 return SCM_UNSPECIFIED
;
1140 scm_read_keyword (int chr
, SCM port
, scm_t_read_opts
*opts
)
1144 /* Read the symbol that comprises the keyword. Doing this instead of
1145 invoking a specific symbol reader function allows `scm_read_keyword ()'
1146 to adapt to the delimiters currently valid of symbols.
1148 XXX: This implementation allows sloppy syntaxes like `#: key'. */
1149 symbol
= scm_read_expression (port
, opts
);
1150 if (!scm_is_symbol (symbol
))
1151 scm_i_input_error ("scm_read_keyword", port
,
1152 "keyword prefix `~a' not followed by a symbol: ~s",
1153 scm_list_2 (SCM_MAKE_CHAR (chr
), symbol
));
1155 return (scm_symbol_to_keyword (symbol
));
1159 scm_read_vector (int chr
, SCM port
, scm_t_read_opts
*opts
,
1160 long line
, int column
)
1162 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1163 guarantee that it's going to do what we want. After all, this is an
1164 implementation detail of `scm_read_vector ()', not a desirable
1166 return maybe_annotate_source (scm_vector (scm_read_sexp (chr
, port
, opts
)),
1167 port
, opts
, line
, column
);
1170 /* Helper used by scm_read_array */
1172 read_decimal_integer (SCM port
, int c
, ssize_t
*resp
)
1181 c
= scm_getc (port
);
1184 while ('0' <= c
&& c
<= '9')
1186 if (((SSIZE_MAX
- (c
-'0')) / 10) <= res
)
1187 scm_i_input_error ("read_decimal_integer", port
,
1188 "number too large", SCM_EOL
);
1189 res
= 10*res
+ c
-'0';
1191 c
= scm_getc (port
);
1199 /* Read an array. This function can also read vectors and uniform
1200 vectors. Also, the conflict between '#f' and '#f32' and '#f64' is
1203 C is the first character read after the '#'. */
1205 scm_read_array (int c
, SCM port
, scm_t_read_opts
*opts
, long line
, int column
)
1208 scm_t_wchar tag_buf
[8];
1211 SCM tag
, shape
= SCM_BOOL_F
, elements
, array
;
1213 /* XXX - shortcut for ordinary vectors. Shouldn't be necessary but
1214 the array code can not deal with zero-length dimensions yet, and
1215 we want to allow zero-length vectors, of course. */
1217 return scm_read_vector (c
, port
, opts
, line
, column
);
1219 /* Disambiguate between '#f' and uniform floating point vectors. */
1222 c
= scm_getc (port
);
1223 if (c
!= '3' && c
!= '6')
1225 if (c
== 'a' && try_read_ci_chars (port
, "lse"))
1228 scm_ungetc (c
, port
);
1234 goto continue_reading_tag
;
1239 c
= read_decimal_integer (port
, c
, &rank
);
1241 scm_i_input_error (NULL
, port
, "array rank must be non-negative",
1246 continue_reading_tag
:
1247 while (c
!= EOF
&& c
!= '(' && c
!= '@' && c
!= ':'
1248 && tag_len
< sizeof tag_buf
/ sizeof tag_buf
[0])
1250 tag_buf
[tag_len
++] = c
;
1251 c
= scm_getc (port
);
1257 tag
= scm_string_to_symbol (scm_from_utf32_stringn (tag_buf
, tag_len
));
1258 if (tag_len
== sizeof tag_buf
/ sizeof tag_buf
[0])
1259 scm_i_input_error (NULL
, port
, "invalid array tag, starting with: ~a",
1264 if (c
== '@' || c
== ':')
1270 ssize_t lbnd
= 0, len
= 0;
1275 c
= scm_getc (port
);
1276 c
= read_decimal_integer (port
, c
, &lbnd
);
1279 s
= scm_from_ssize_t (lbnd
);
1283 c
= scm_getc (port
);
1284 c
= read_decimal_integer (port
, c
, &len
);
1286 scm_i_input_error (NULL
, port
,
1287 "array length must be non-negative",
1290 s
= scm_list_2 (s
, scm_from_ssize_t (lbnd
+len
-1));
1293 shape
= scm_cons (s
, shape
);
1294 } while (c
== '@' || c
== ':');
1296 shape
= scm_reverse_x (shape
, SCM_EOL
);
1299 /* Read nested lists of elements. */
1301 scm_i_input_error (NULL
, port
,
1302 "missing '(' in vector or array literal",
1304 elements
= scm_read_sexp (c
, port
, opts
);
1306 if (scm_is_false (shape
))
1307 shape
= scm_from_ssize_t (rank
);
1308 else if (scm_ilength (shape
) != rank
)
1311 "the number of shape specifications must match the array rank",
1314 /* Handle special print syntax of rank zero arrays; see
1315 scm_i_print_array for a rationale. */
1318 if (!scm_is_pair (elements
))
1319 scm_i_input_error (NULL
, port
,
1320 "too few elements in array literal, need 1",
1322 if (!scm_is_null (SCM_CDR (elements
)))
1323 scm_i_input_error (NULL
, port
,
1324 "too many elements in array literal, want 1",
1326 elements
= SCM_CAR (elements
);
1329 /* Construct array, annotate with source location, and return. */
1330 array
= scm_list_to_typed_array (tag
, shape
, elements
);
1331 return maybe_annotate_source (array
, port
, opts
, line
, column
);
1335 scm_read_srfi4_vector (int chr
, SCM port
, scm_t_read_opts
*opts
,
1336 long line
, int column
)
1338 return scm_read_array (chr
, port
, opts
, line
, column
);
1342 scm_read_bytevector (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
,
1343 long line
, int column
)
1345 chr
= scm_getc (port
);
1349 chr
= scm_getc (port
);
1353 chr
= scm_getc (port
);
1357 return maybe_annotate_source
1358 (scm_u8_list_to_bytevector (scm_read_sexp (chr
, port
, opts
)),
1359 port
, opts
, line
, column
);
1362 scm_i_input_error ("read_bytevector", port
,
1363 "invalid bytevector prefix",
1364 SCM_MAKE_CHAR (chr
));
1365 return SCM_UNSPECIFIED
;
1369 scm_read_guile_bit_vector (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
,
1370 long line
, int column
)
1372 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1373 terribly inefficient but who cares? */
1374 SCM s_bits
= SCM_EOL
;
1376 for (chr
= scm_getc (port
);
1377 (chr
!= EOF
) && ((chr
== '0') || (chr
== '1'));
1378 chr
= scm_getc (port
))
1380 s_bits
= scm_cons ((chr
== '0') ? SCM_BOOL_F
: SCM_BOOL_T
, s_bits
);
1384 scm_ungetc (chr
, port
);
1386 return maybe_annotate_source
1387 (scm_bitvector (scm_reverse_x (s_bits
, SCM_EOL
)),
1388 port
, opts
, line
, column
);
1392 scm_read_scsh_block_comment (scm_t_wchar chr
, SCM port
)
1398 int c
= scm_getc (port
);
1401 scm_i_input_error ("skip_block_comment", port
,
1402 "unterminated `#! ... !#' comment", SCM_EOL
);
1406 else if (c
== '#' && bang_seen
)
1412 return SCM_UNSPECIFIED
;
1415 static void set_port_case_insensitive_p (SCM port
, scm_t_read_opts
*opts
,
1417 static void set_port_square_brackets_p (SCM port
, scm_t_read_opts
*opts
,
1419 static void set_port_curly_infix_p (SCM port
, scm_t_read_opts
*opts
,
1423 scm_read_shebang (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
1425 char name
[READER_DIRECTIVE_NAME_MAX_SIZE
+ 1];
1429 while (i
<= READER_DIRECTIVE_NAME_MAX_SIZE
)
1431 c
= scm_getc (port
);
1433 scm_i_input_error ("skip_block_comment", port
,
1434 "unterminated `#! ... !#' comment", SCM_EOL
);
1435 else if (('a' <= c
&& c
<= 'z') || ('0' <= c
&& c
<= '9') || c
== '-')
1437 else if (CHAR_IS_DELIMITER (c
))
1439 scm_ungetc (c
, port
);
1441 if (0 == strcmp ("r6rs", name
))
1442 ; /* Silently ignore */
1443 else if (0 == strcmp ("fold-case", name
))
1444 set_port_case_insensitive_p (port
, opts
, 1);
1445 else if (0 == strcmp ("no-fold-case", name
))
1446 set_port_case_insensitive_p (port
, opts
, 0);
1447 else if (0 == strcmp ("curly-infix", name
))
1448 set_port_curly_infix_p (port
, opts
, 1);
1449 else if (0 == strcmp ("curly-infix-and-bracket-lists", name
))
1451 set_port_curly_infix_p (port
, opts
, 1);
1452 set_port_square_brackets_p (port
, opts
, 0);
1457 return SCM_UNSPECIFIED
;
1461 scm_ungetc (c
, port
);
1466 scm_ungetc (name
[--i
], port
);
1467 return scm_read_scsh_block_comment (chr
, port
);
1471 scm_read_r6rs_block_comment (scm_t_wchar chr
, SCM port
)
1473 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1474 nested. So care must be taken. */
1475 int nesting_level
= 1;
1477 int a
= scm_getc (port
);
1480 scm_i_input_error ("scm_read_r6rs_block_comment", port
,
1481 "unterminated `#| ... |#' comment", SCM_EOL
);
1483 while (nesting_level
> 0)
1485 int b
= scm_getc (port
);
1488 scm_i_input_error ("scm_read_r6rs_block_comment", port
,
1489 "unterminated `#| ... |#' comment", SCM_EOL
);
1491 if (a
== '|' && b
== '#')
1496 else if (a
== '#' && b
== '|')
1505 return SCM_UNSPECIFIED
;
1509 scm_read_commented_expression (scm_t_wchar chr
, SCM port
,
1510 scm_t_read_opts
*opts
)
1514 c
= flush_ws (port
, opts
, (char *) NULL
);
1516 scm_i_input_error ("read_commented_expression", port
,
1517 "no expression after #; comment", SCM_EOL
);
1518 scm_ungetc (c
, port
);
1519 scm_read_expression (port
, opts
);
1520 return SCM_UNSPECIFIED
;
1524 scm_read_extended_symbol (scm_t_wchar chr
, SCM port
)
1526 /* Guile's extended symbol read syntax looks like this:
1528 #{This is all a symbol name}#
1530 So here, CHR is expected to be `{'. */
1533 SCM buf
= scm_i_make_string (1024, NULL
, 0);
1535 buf
= scm_i_string_start_writing (buf
);
1537 while ((chr
= scm_getc (port
)) != EOF
)
1548 scm_i_string_set_x (buf
, len
++, '}');
1554 else if (chr
== '\\')
1556 /* It used to be that print.c would print extended-read-syntax
1557 symbols with backslashes before "non-standard" chars, but
1558 this routine wouldn't do anything with those escapes.
1559 Bummer. What we've done is to change print.c to output
1560 R6RS hex escapes for those characters, relying on the fact
1561 that the extended read syntax would never put a `\' before
1562 an `x'. For now, we just ignore other instances of
1563 backslash in the string. */
1564 switch ((chr
= scm_getc (port
)))
1572 SCM_READ_HEX_ESCAPE (10, ';');
1573 scm_i_string_set_x (buf
, len
++, c
);
1581 scm_i_string_stop_writing ();
1582 scm_i_input_error ("scm_read_extended_symbol", port
,
1583 "illegal character in escape sequence: ~S",
1584 scm_list_1 (SCM_MAKE_CHAR (c
)));
1588 scm_i_string_set_x (buf
, len
++, chr
);
1593 scm_i_string_set_x (buf
, len
++, chr
);
1595 if (len
>= scm_i_string_length (buf
) - 2)
1599 scm_i_string_stop_writing ();
1600 addy
= scm_i_make_string (1024, NULL
, 0);
1601 buf
= scm_string_append (scm_list_2 (buf
, addy
));
1603 buf
= scm_i_string_start_writing (buf
);
1608 scm_i_string_stop_writing ();
1610 scm_i_input_error ("scm_read_extended_symbol", port
,
1611 "end of file while reading symbol", SCM_EOL
);
1613 return (scm_string_to_symbol (scm_c_substring (buf
, 0, len
)));
1618 /* Top-level token readers, i.e., dispatchers. */
1621 scm_read_sharp_extension (int chr
, SCM port
, scm_t_read_opts
*opts
)
1625 proc
= scm_get_hash_procedure (chr
);
1626 if (scm_is_true (scm_procedure_p (proc
)))
1628 long line
= SCM_LINUM (port
);
1629 int column
= SCM_COL (port
) - 2;
1632 got
= scm_call_2 (proc
, SCM_MAKE_CHAR (chr
), port
);
1634 if (opts
->record_positions_p
&& SCM_NIMP (got
)
1635 && !scm_i_has_source_properties (got
))
1636 scm_i_set_source_properties_x (got
, line
, column
, SCM_FILENAME (port
));
1641 return SCM_UNSPECIFIED
;
1644 /* The reader for the sharp `#' character. It basically dispatches reads
1645 among the above token readers. */
1647 scm_read_sharp (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
,
1648 long line
, int column
)
1649 #define FUNC_NAME "scm_lreadr"
1653 chr
= scm_getc (port
);
1655 result
= scm_read_sharp_extension (chr
, port
, opts
);
1656 if (!scm_is_eq (result
, SCM_UNSPECIFIED
))
1662 return (scm_read_character (chr
, port
, opts
));
1664 return (scm_read_vector (chr
, port
, opts
, line
, column
));
1669 /* This one may return either a boolean or an SRFI-4 vector. */
1670 return (scm_read_srfi4_vector (chr
, port
, opts
, line
, column
));
1672 return (scm_read_bytevector (chr
, port
, opts
, line
, column
));
1674 return (scm_read_guile_bit_vector (chr
, port
, opts
, line
, column
));
1678 return (scm_read_boolean (chr
, port
));
1680 return (scm_read_keyword (chr
, port
, opts
));
1681 case '0': case '1': case '2': case '3': case '4':
1682 case '5': case '6': case '7': case '8': case '9':
1684 #if SCM_ENABLE_DEPRECATED
1685 /* See below for 'i' and 'e'. */
1691 return (scm_read_array (chr
, port
, opts
, line
, column
));
1695 #if SCM_ENABLE_DEPRECATED
1697 /* When next char is '(', it really is an old-style
1699 scm_t_wchar next_c
= scm_getc (port
);
1701 scm_ungetc (next_c
, port
);
1703 return scm_read_array (chr
, port
, opts
, line
, column
);
1717 return (scm_read_number_and_radix (chr
, port
, opts
));
1719 return (scm_read_extended_symbol (chr
, port
));
1721 return (scm_read_shebang (chr
, port
, opts
));
1723 return (scm_read_commented_expression (chr
, port
, opts
));
1727 return (scm_read_syntax (chr
, port
, opts
));
1729 return (scm_read_nil (chr
, port
, opts
));
1731 result
= scm_read_sharp_extension (chr
, port
, opts
);
1732 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1734 /* To remain compatible with 1.8 and earlier, the following
1735 characters have lower precedence than `read-hash-extend'
1740 return scm_read_r6rs_block_comment (chr
, port
);
1742 scm_i_input_error (FUNC_NAME
, port
, "Unknown # object: ~S",
1743 scm_list_1 (SCM_MAKE_CHAR (chr
)));
1750 return SCM_UNSPECIFIED
;
1755 read_inner_expression (SCM port
, scm_t_read_opts
*opts
)
1756 #define FUNC_NAME "read_inner_expression"
1762 chr
= scm_getc (port
);
1766 case SCM_WHITE_SPACES
:
1767 case SCM_LINE_INCREMENTORS
:
1770 (void) scm_read_semicolon_comment (chr
, port
);
1773 if (opts
->curly_infix_p
)
1775 if (opts
->neoteric_p
)
1776 return scm_read_sexp (chr
, port
, opts
);
1781 /* Enable neoteric expressions within curly braces */
1782 opts
->neoteric_p
= 1;
1783 expr
= scm_read_sexp (chr
, port
, opts
);
1784 opts
->neoteric_p
= 0;
1789 return scm_read_mixed_case_symbol (chr
, port
, opts
);
1791 if (opts
->square_brackets_p
)
1792 return scm_read_sexp (chr
, port
, opts
);
1793 else if (opts
->curly_infix_p
)
1795 /* The syntax of neoteric expressions requires that '[' be
1796 a delimiter when curly-infix is enabled, so it cannot
1797 be part of an unescaped symbol. We might as well do
1798 something useful with it, so we adopt Kawa's convention:
1799 [...] => ($bracket-list$ ...) */
1800 long line
= SCM_LINUM (port
);
1801 int column
= SCM_COL (port
) - 1;
1802 return maybe_annotate_source
1803 (scm_cons (sym_bracket_list
, scm_read_sexp (chr
, port
, opts
)),
1804 port
, opts
, line
, column
);
1807 return scm_read_mixed_case_symbol (chr
, port
, opts
);
1809 return (scm_read_sexp (chr
, port
, opts
));
1811 return (scm_read_string (chr
, port
, opts
));
1813 if (opts
->r7rs_symbols_p
)
1814 return scm_read_r7rs_symbol (chr
, port
, opts
);
1816 return scm_read_mixed_case_symbol (chr
, port
, opts
);
1820 return (scm_read_quote (chr
, port
, opts
));
1823 long line
= SCM_LINUM (port
);
1824 int column
= SCM_COL (port
) - 1;
1825 SCM result
= scm_read_sharp (chr
, port
, opts
, line
, column
);
1826 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1827 /* We read a comment or some such. */
1833 scm_i_input_error (FUNC_NAME
, port
, "unexpected \")\"", SCM_EOL
);
1836 if (opts
->curly_infix_p
)
1837 scm_i_input_error (FUNC_NAME
, port
, "unexpected \"}\"", SCM_EOL
);
1839 return scm_read_mixed_case_symbol (chr
, port
, opts
);
1841 if (opts
->square_brackets_p
)
1842 scm_i_input_error (FUNC_NAME
, port
, "unexpected \"]\"", SCM_EOL
);
1843 /* otherwise fall through */
1847 if (opts
->keyword_style
== KEYWORD_STYLE_PREFIX
)
1848 return scm_symbol_to_keyword (scm_read_expression (port
, opts
));
1853 if (((chr
>= '0') && (chr
<= '9'))
1854 || (strchr ("+-.", chr
)))
1855 return (scm_read_number (chr
, port
, opts
));
1857 return (scm_read_mixed_case_symbol (chr
, port
, opts
));
1865 scm_read_expression (SCM port
, scm_t_read_opts
*opts
)
1866 #define FUNC_NAME "scm_read_expression"
1868 if (!opts
->neoteric_p
)
1869 return read_inner_expression (port
, opts
);
1876 if (opts
->record_positions_p
)
1878 /* We need to get the position of the first non-whitespace
1879 character in order to correctly annotate neoteric
1880 expressions. For example, for the expression 'f(x)', the
1881 first call to 'read_inner_expression' reads the 'f' (which
1882 cannot be annotated), and then we later read the '(x)' and
1883 use it to construct the new list (f x). */
1884 int c
= flush_ws (port
, opts
, (char *) NULL
);
1887 scm_ungetc (c
, port
);
1888 line
= SCM_LINUM (port
);
1889 column
= SCM_COL (port
);
1892 expr
= read_inner_expression (port
, opts
);
1894 /* 'expr' is the first component of the neoteric expression. Now
1895 we loop, and as long as the next character is '(', '[', or '{',
1896 (without any intervening whitespace), we use it to construct a
1897 new expression. For example, f{n - 1}(x) => ((f (- n 1)) x). */
1900 int chr
= scm_getc (port
);
1903 /* e(...) => (e ...) */
1904 expr
= scm_cons (expr
, scm_read_sexp (chr
, port
, opts
));
1905 else if (chr
== '[')
1906 /* e[...] => ($bracket-apply$ e ...) */
1907 expr
= scm_cons (sym_bracket_apply
,
1909 scm_read_sexp (chr
, port
, opts
)));
1910 else if (chr
== '{')
1912 SCM arg
= scm_read_sexp (chr
, port
, opts
);
1914 if (scm_is_null (arg
))
1915 expr
= scm_list_1 (expr
); /* e{} => (e) */
1917 expr
= scm_list_2 (expr
, arg
); /* e{...} => (e {...}) */
1922 scm_ungetc (chr
, port
);
1925 maybe_annotate_source (expr
, port
, opts
, line
, column
);
1933 /* Actual reader. */
1935 static void init_read_options (SCM port
, scm_t_read_opts
*opts
);
1937 SCM_DEFINE (scm_read
, "read", 0, 1, 0,
1939 "Read an s-expression from the input port @var{port}, or from\n"
1940 "the current input port if @var{port} is not specified.\n"
1941 "Any whitespace before the next token is discarded.")
1942 #define FUNC_NAME s_scm_read
1944 scm_t_read_opts opts
;
1947 if (SCM_UNBNDP (port
))
1948 port
= scm_current_input_port ();
1949 SCM_VALIDATE_OPINPORT (1, port
);
1951 init_read_options (port
, &opts
);
1953 c
= flush_ws (port
, &opts
, (char *) NULL
);
1956 scm_ungetc (c
, port
);
1958 return (scm_read_expression (port
, &opts
));
1965 /* Manipulate the read-hash-procedures alist. This could be written in
1966 Scheme, but maybe it will also be used by C code during initialisation. */
1967 SCM_DEFINE (scm_read_hash_extend
, "read-hash-extend", 2, 0, 0,
1968 (SCM chr
, SCM proc
),
1969 "Install the procedure @var{proc} for reading expressions\n"
1970 "starting with the character sequence @code{#} and @var{chr}.\n"
1971 "@var{proc} will be called with two arguments: the character\n"
1972 "@var{chr} and the port to read further data from. The object\n"
1973 "returned will be the return value of @code{read}. \n"
1974 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1976 #define FUNC_NAME s_scm_read_hash_extend
1981 SCM_VALIDATE_CHAR (1, chr
);
1982 SCM_ASSERT (scm_is_false (proc
)
1983 || scm_is_eq (scm_procedure_p (proc
), SCM_BOOL_T
),
1984 proc
, SCM_ARG2
, FUNC_NAME
);
1986 /* Check if chr is already in the alist. */
1987 this = scm_i_read_hash_procedures_ref ();
1991 if (scm_is_null (this))
1993 /* not found, so add it to the beginning. */
1994 if (scm_is_true (proc
))
1996 SCM
new = scm_cons (scm_cons (chr
, proc
),
1997 scm_i_read_hash_procedures_ref ());
1998 scm_i_read_hash_procedures_set_x (new);
2002 if (scm_is_eq (chr
, SCM_CAAR (this)))
2004 /* already in the alist. */
2005 if (scm_is_false (proc
))
2008 if (scm_is_false (prev
))
2010 SCM rest
= SCM_CDR (scm_i_read_hash_procedures_ref ());
2011 scm_i_read_hash_procedures_set_x (rest
);
2014 scm_set_cdr_x (prev
, SCM_CDR (this));
2019 scm_set_cdr_x (SCM_CAR (this), proc
);
2024 this = SCM_CDR (this);
2027 return SCM_UNSPECIFIED
;
2031 /* Recover the read-hash procedure corresponding to char c. */
2033 scm_get_hash_procedure (int c
)
2035 SCM rest
= scm_i_read_hash_procedures_ref ();
2039 if (scm_is_null (rest
))
2042 if (SCM_CHAR (SCM_CAAR (rest
)) == c
)
2043 return SCM_CDAR (rest
);
2045 rest
= SCM_CDR (rest
);
2049 #define SCM_ENCODING_SEARCH_SIZE (500)
2051 /* Search the first few hundred characters of a file for an Emacs-like coding
2052 declaration. Returns either NULL or a string whose storage has been
2053 allocated with `scm_gc_malloc ()'. */
2055 scm_i_scan_for_encoding (SCM port
)
2058 char header
[SCM_ENCODING_SEARCH_SIZE
+1];
2059 size_t bytes_read
, encoding_length
, i
;
2060 char *encoding
= NULL
;
2061 char *pos
, *encoding_start
;
2064 pt
= SCM_PTAB_ENTRY (port
);
2066 if (pt
->rw_active
== SCM_PORT_WRITE
)
2070 pt
->rw_active
= SCM_PORT_READ
;
2072 if (pt
->read_pos
== pt
->read_end
)
2074 /* We can use the read buffer, and thus avoid a seek. */
2075 if (scm_fill_input (port
) == EOF
)
2078 bytes_read
= pt
->read_end
- pt
->read_pos
;
2079 if (bytes_read
> SCM_ENCODING_SEARCH_SIZE
)
2080 bytes_read
= SCM_ENCODING_SEARCH_SIZE
;
2082 if (bytes_read
<= 1)
2083 /* An unbuffered port -- don't scan. */
2086 memcpy (header
, pt
->read_pos
, bytes_read
);
2087 header
[bytes_read
] = '\0';
2091 /* Try to read some bytes and then seek back. Not all ports
2092 support seeking back; and indeed some file ports (like
2093 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
2094 check performed by SCM_FPORT_FDES---but fail to seek
2095 backwards. Hence this block comes second. We prefer to use
2096 the read buffer in-place. */
2097 if (SCM_FPORTP (port
) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port
)))
2100 bytes_read
= scm_c_read (port
, header
, SCM_ENCODING_SEARCH_SIZE
);
2101 header
[bytes_read
] = '\0';
2102 scm_seek (port
, scm_from_int (0), scm_from_int (SEEK_SET
));
2105 /* search past "coding[:=]" */
2109 if ((pos
= strstr(pos
, "coding")) == NULL
)
2112 pos
+= strlen("coding");
2113 if (pos
- header
>= SCM_ENCODING_SEARCH_SIZE
||
2114 (*pos
== ':' || *pos
== '='))
2122 while (pos
- header
<= SCM_ENCODING_SEARCH_SIZE
&&
2123 (*pos
== ' ' || *pos
== '\t'))
2126 /* grab the next token */
2127 encoding_start
= pos
;
2129 while (encoding_start
+ i
- header
<= SCM_ENCODING_SEARCH_SIZE
2130 && encoding_start
+ i
- header
< bytes_read
2131 && (isalnum ((int) encoding_start
[i
])
2132 || strchr ("_-.:/,+=()", encoding_start
[i
]) != NULL
))
2135 encoding_length
= i
;
2136 if (encoding_length
== 0)
2139 encoding
= scm_gc_strndup (encoding_start
, encoding_length
, "encoding");
2140 for (i
= 0; i
< encoding_length
; i
++)
2141 encoding
[i
] = toupper ((int) encoding
[i
]);
2143 /* push backwards to make sure we were in a comment */
2145 pos
= encoding_start
;
2146 while (pos
>= header
)
2153 else if (*pos
== '\n' || pos
== header
)
2155 /* This wasn't in a semicolon comment. Check for a
2156 hash-bang comment. */
2157 char *beg
= strstr (header
, "#!");
2158 char *end
= strstr (header
, "!#");
2159 if (beg
< encoding_start
&& encoding_start
+ encoding_length
<= end
)
2170 /* This wasn't in a comment */
2176 SCM_DEFINE (scm_file_encoding
, "file-encoding", 1, 0, 0,
2178 "Scans the port for an Emacs-like character coding declaration\n"
2179 "near the top of the contents of a port with random-accessible contents.\n"
2180 "The coding declaration is of the form\n"
2181 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
2183 "Returns a string containing the character encoding of the file\n"
2184 "if a declaration was found, or @code{#f} otherwise.\n")
2185 #define FUNC_NAME s_scm_file_encoding
2190 SCM_VALIDATE_OPINPORT (SCM_ARG1
, port
);
2192 enc
= scm_i_scan_for_encoding (port
);
2197 s_enc
= scm_from_locale_string (enc
);
2206 /* Per-port read options.
2208 We store per-port read options in the 'port-read-options' port
2209 property, which is stored in the internal port structure. The value
2210 stored is a single integer that contains a two-bit field for each
2213 If a bit field contains READ_OPTION_INHERIT (3), that indicates that
2214 the applicable value should be inherited from the corresponding
2215 global read option. Otherwise, the bit field contains the value of
2216 the read option. For boolean read options that have been set
2217 per-port, the possible values are 0 or 1. If the 'keyword_style'
2218 read option has been set per-port, its possible values are those in
2219 'enum t_keyword_style'. */
2221 /* Key to read options in port properties. */
2222 SCM_SYMBOL (sym_port_read_options
, "port-read-options");
2224 /* Offsets of bit fields for each per-port override */
2225 #define READ_OPTION_COPY_SOURCE_P 0
2226 #define READ_OPTION_RECORD_POSITIONS_P 2
2227 #define READ_OPTION_CASE_INSENSITIVE_P 4
2228 #define READ_OPTION_KEYWORD_STYLE 6
2229 #define READ_OPTION_R6RS_ESCAPES_P 8
2230 #define READ_OPTION_SQUARE_BRACKETS_P 10
2231 #define READ_OPTION_HUNGRY_EOL_ESCAPES_P 12
2232 #define READ_OPTION_CURLY_INFIX_P 14
2233 #define READ_OPTION_R7RS_SYMBOLS_P 16
2235 /* The total width in bits of the per-port overrides */
2236 #define READ_OPTIONS_NUM_BITS 18
2238 #define READ_OPTIONS_INHERIT_ALL ((1UL << READ_OPTIONS_NUM_BITS) - 1)
2239 #define READ_OPTIONS_MAX_VALUE READ_OPTIONS_INHERIT_ALL
2241 #define READ_OPTION_MASK 3
2242 #define READ_OPTION_INHERIT 3
2245 set_port_read_option (SCM port
, int option
, int new_value
)
2247 SCM scm_read_options
;
2248 unsigned int read_options
;
2250 new_value
&= READ_OPTION_MASK
;
2251 scm_read_options
= scm_i_port_property (port
, sym_port_read_options
);
2252 if (scm_is_unsigned_integer (scm_read_options
, 0, READ_OPTIONS_MAX_VALUE
))
2253 read_options
= scm_to_uint (scm_read_options
);
2255 read_options
= READ_OPTIONS_INHERIT_ALL
;
2256 read_options
&= ~(READ_OPTION_MASK
<< option
);
2257 read_options
|= new_value
<< option
;
2258 scm_read_options
= scm_from_uint (read_options
);
2259 scm_i_set_port_property_x (port
, sym_port_read_options
, scm_read_options
);
2262 /* Set OPTS and PORT's case-insensitivity according to VALUE. */
2264 set_port_case_insensitive_p (SCM port
, scm_t_read_opts
*opts
, int value
)
2267 opts
->case_insensitive_p
= value
;
2268 set_port_read_option (port
, READ_OPTION_CASE_INSENSITIVE_P
, value
);
2271 /* Set OPTS and PORT's square_brackets_p option according to VALUE. */
2273 set_port_square_brackets_p (SCM port
, scm_t_read_opts
*opts
, int value
)
2276 opts
->square_brackets_p
= value
;
2277 set_port_read_option (port
, READ_OPTION_SQUARE_BRACKETS_P
, value
);
2280 /* Set OPTS and PORT's curly_infix_p option according to VALUE. */
2282 set_port_curly_infix_p (SCM port
, scm_t_read_opts
*opts
, int value
)
2285 opts
->curly_infix_p
= value
;
2286 set_port_read_option (port
, READ_OPTION_CURLY_INFIX_P
, value
);
2289 /* Initialize OPTS based on PORT's read options and the global read
2292 init_read_options (SCM port
, scm_t_read_opts
*opts
)
2294 SCM val
, scm_read_options
;
2295 unsigned int read_options
, x
;
2297 scm_read_options
= scm_i_port_property (port
, sym_port_read_options
);
2299 if (scm_is_unsigned_integer (scm_read_options
, 0, READ_OPTIONS_MAX_VALUE
))
2300 read_options
= scm_to_uint (scm_read_options
);
2302 read_options
= READ_OPTIONS_INHERIT_ALL
;
2304 x
= READ_OPTION_MASK
& (read_options
>> READ_OPTION_KEYWORD_STYLE
);
2305 if (x
== READ_OPTION_INHERIT
)
2307 val
= SCM_PACK (SCM_KEYWORD_STYLE
);
2308 if (scm_is_eq (val
, scm_keyword_prefix
))
2309 x
= KEYWORD_STYLE_PREFIX
;
2310 else if (scm_is_eq (val
, scm_keyword_postfix
))
2311 x
= KEYWORD_STYLE_POSTFIX
;
2313 x
= KEYWORD_STYLE_HASH_PREFIX
;
2315 opts
->keyword_style
= x
;
2317 #define RESOLVE_BOOLEAN_OPTION(NAME, name) \
2320 x = READ_OPTION_MASK & (read_options >> READ_OPTION_ ## NAME); \
2321 if (x == READ_OPTION_INHERIT) \
2322 x = !!SCM_ ## NAME; \
2327 RESOLVE_BOOLEAN_OPTION (COPY_SOURCE_P
, copy_source_p
);
2328 RESOLVE_BOOLEAN_OPTION (RECORD_POSITIONS_P
, record_positions_p
);
2329 RESOLVE_BOOLEAN_OPTION (CASE_INSENSITIVE_P
, case_insensitive_p
);
2330 RESOLVE_BOOLEAN_OPTION (R6RS_ESCAPES_P
, r6rs_escapes_p
);
2331 RESOLVE_BOOLEAN_OPTION (SQUARE_BRACKETS_P
, square_brackets_p
);
2332 RESOLVE_BOOLEAN_OPTION (HUNGRY_EOL_ESCAPES_P
, hungry_eol_escapes_p
);
2333 RESOLVE_BOOLEAN_OPTION (CURLY_INFIX_P
, curly_infix_p
);
2334 RESOLVE_BOOLEAN_OPTION (R7RS_SYMBOLS_P
, r7rs_symbols_p
);
2336 #undef RESOLVE_BOOLEAN_OPTION
2338 opts
->neoteric_p
= 0;
2344 SCM read_hash_procs
;
2346 read_hash_procs
= scm_make_fluid_with_default (SCM_EOL
);
2348 scm_i_read_hash_procedures
=
2349 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs
));
2351 scm_init_opts (scm_read_options
, scm_read_opts
);
2352 #include "libguile/read.x"