1 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
2 * 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
32 #include <c-strcase.h>
34 #include "libguile/_scm.h"
35 #include "libguile/bytevectors.h"
36 #include "libguile/chars.h"
37 #include "libguile/eval.h"
38 #include "libguile/arrays.h"
39 #include "libguile/bitvectors.h"
40 #include "libguile/keywords.h"
41 #include "libguile/alist.h"
42 #include "libguile/srcprop.h"
43 #include "libguile/hashtab.h"
44 #include "libguile/hash.h"
45 #include "libguile/ports.h"
46 #include "libguile/ports-internal.h"
47 #include "libguile/fports.h"
48 #include "libguile/root.h"
49 #include "libguile/strings.h"
50 #include "libguile/strports.h"
51 #include "libguile/vectors.h"
52 #include "libguile/validate.h"
53 #include "libguile/srfi-4.h"
54 #include "libguile/srfi-13.h"
56 #include "libguile/read.h"
57 #include "libguile/private-options.h"
62 SCM_GLOBAL_SYMBOL (scm_sym_dot
, ".");
63 SCM_SYMBOL (scm_keyword_prefix
, "prefix");
64 SCM_SYMBOL (scm_keyword_postfix
, "postfix");
65 SCM_SYMBOL (sym_nil
, "nil");
67 /* SRFI-105 curly infix expression support */
68 SCM_SYMBOL (sym_nfx
, "$nfx$");
69 SCM_SYMBOL (sym_bracket_list
, "$bracket-list$");
70 SCM_SYMBOL (sym_bracket_apply
, "$bracket-apply$");
72 scm_t_option scm_read_opts
[] =
74 { SCM_OPTION_BOOLEAN
, "copy", 0,
75 "Copy source code expressions." },
76 { SCM_OPTION_BOOLEAN
, "positions", 1,
77 "Record positions of source code expressions." },
78 { SCM_OPTION_BOOLEAN
, "case-insensitive", 0,
79 "Convert symbols to lower case."},
80 { SCM_OPTION_SCM
, "keywords", (scm_t_bits
) SCM_BOOL_F_BITS
,
81 "Style of keyword recognition: #f, 'prefix or 'postfix."},
82 { SCM_OPTION_BOOLEAN
, "r6rs-hex-escapes", 0,
83 "Use R6RS variable-length character and string hex escapes."},
84 { SCM_OPTION_BOOLEAN
, "square-brackets", 1,
85 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
86 { SCM_OPTION_BOOLEAN
, "hungry-eol-escapes", 0,
87 "In strings, consume leading whitespace after an escaped end-of-line."},
88 { SCM_OPTION_BOOLEAN
, "curly-infix", 0,
89 "Support SRFI-105 curly infix expressions."},
93 /* Internal read options structure. This is initialized by 'scm_read'
94 from the global and per-port read options, and a pointer is passed
95 down to all helper functions. */
99 KEYWORD_STYLE_HASH_PREFIX
,
100 KEYWORD_STYLE_PREFIX
,
101 KEYWORD_STYLE_POSTFIX
106 enum t_keyword_style keyword_style
;
107 unsigned int copy_source_p
: 1;
108 unsigned int record_positions_p
: 1;
109 unsigned int case_insensitive_p
: 1;
110 unsigned int r6rs_escapes_p
: 1;
111 unsigned int square_brackets_p
: 1;
112 unsigned int hungry_eol_escapes_p
: 1;
113 unsigned int curly_infix_p
: 1;
114 unsigned int neoteric_p
: 1;
117 typedef struct t_read_opts scm_t_read_opts
;
121 Give meaningful error messages for errors
125 FILE:LINE:COL: MESSAGE
126 This happened in ....
128 This is not standard GNU format, but the test-suite likes the real
129 message to be in front.
135 scm_i_input_error (char const *function
,
136 SCM port
, const char *message
, SCM arg
)
138 SCM fn
= (scm_is_string (SCM_FILENAME(port
))
140 : scm_from_locale_string ("#<unknown port>"));
142 SCM string_port
= scm_open_output_string ();
143 SCM string
= SCM_EOL
;
144 scm_simple_format (string_port
,
145 scm_from_locale_string ("~A:~S:~S: ~A"),
147 scm_from_long (SCM_LINUM (port
) + 1),
148 scm_from_int (SCM_COL (port
) + 1),
149 scm_from_locale_string (message
)));
151 string
= scm_get_output_string (string_port
);
152 scm_close_output_port (string_port
);
153 scm_error_scm (scm_from_latin1_symbol ("read-error"),
154 function
? scm_from_locale_string (function
) : SCM_BOOL_F
,
161 SCM_DEFINE (scm_read_options
, "read-options-interface", 0, 1, 0,
163 "Option interface for the read options. Instead of using\n"
164 "this procedure directly, use the procedures @code{read-enable},\n"
165 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
166 #define FUNC_NAME s_scm_read_options
168 SCM ans
= scm_options (setting
,
171 if (SCM_COPY_SOURCE_P
)
172 SCM_RECORD_POSITIONS_P
= 1;
177 /* A fluid referring to an association list mapping extra hash
178 characters to procedures. */
179 static SCM
*scm_i_read_hash_procedures
;
182 scm_i_read_hash_procedures_ref (void)
184 return scm_fluid_ref (*scm_i_read_hash_procedures
);
188 scm_i_read_hash_procedures_set_x (SCM value
)
190 scm_fluid_set_x (*scm_i_read_hash_procedures
, value
);
197 /* Size of the C buffer used to read symbols and numbers. */
198 #define READER_BUFFER_SIZE 128
200 /* Number of 32-bit codepoints in the buffer used to read strings. */
201 #define READER_STRING_BUFFER_SIZE 128
203 /* The maximum size of Scheme character names. */
204 #define READER_CHAR_NAME_MAX_SIZE 50
206 /* The maximum size of reader directive names. */
207 #define READER_DIRECTIVE_NAME_MAX_SIZE 50
210 /* `isblank' is only in C99. */
211 #define CHAR_IS_BLANK_(_chr) \
212 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
213 || ((_chr) == '\f') || ((_chr) == '\r'))
216 # define CHAR_IS_BLANK(_chr) \
217 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
219 # define CHAR_IS_BLANK CHAR_IS_BLANK_
223 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
225 #define CHAR_IS_R5RS_DELIMITER(c) \
227 || (c) == ')' || (c) == '(' || (c) == ';' || (c) == '"')
229 #define CHAR_IS_DELIMITER(c) \
230 (CHAR_IS_R5RS_DELIMITER (c) \
231 || (((c) == ']' || (c) == '[') && (opts->square_brackets_p \
232 || opts->curly_infix_p)) \
233 || (((c) == '}' || (c) == '{') && opts->curly_infix_p))
235 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
237 #define CHAR_IS_EXPONENT_MARKER(_chr) \
238 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
239 || ((_chr) == 'd') || ((_chr) == 'l'))
241 /* Read an SCSH block comment. */
242 static SCM
scm_read_scsh_block_comment (scm_t_wchar
, SCM
);
243 static SCM
scm_read_r6rs_block_comment (scm_t_wchar
, SCM
);
244 static SCM
scm_read_commented_expression (scm_t_wchar
, SCM
, scm_t_read_opts
*);
245 static SCM
scm_read_shebang (scm_t_wchar
, SCM
, scm_t_read_opts
*);
246 static SCM
scm_get_hash_procedure (int);
248 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
249 result in the pre-allocated buffer BUF. Return zero if the whole token has
250 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
251 bytes actually read. */
253 read_token (SCM port
, scm_t_read_opts
*opts
,
254 char *buf
, size_t buf_size
, size_t *read
)
258 while (*read
< buf_size
)
262 chr
= scm_get_byte_or_eof_unlocked (port
);
266 else if (CHAR_IS_DELIMITER (chr
))
268 scm_unget_byte_unlocked (chr
, port
);
281 /* Like `read_token', but return either BUFFER, or a GC-allocated buffer
282 if the token doesn't fit in BUFFER_SIZE bytes. */
284 read_complete_token (SCM port
, scm_t_read_opts
*opts
,
285 char *buffer
, size_t buffer_size
, size_t *read
)
288 size_t bytes_read
, overflow_size
= 0;
289 char *overflow_buffer
= NULL
;
293 overflow
= read_token (port
, opts
, buffer
, buffer_size
, &bytes_read
);
296 if (overflow
|| overflow_size
!= 0)
298 if (overflow_size
== 0)
300 overflow_buffer
= scm_gc_malloc_pointerless (bytes_read
, "read");
301 memcpy (overflow_buffer
, buffer
, bytes_read
);
302 overflow_size
= bytes_read
;
307 scm_gc_malloc_pointerless (overflow_size
+ bytes_read
, "read");
309 memcpy (new_buf
, overflow_buffer
, overflow_size
);
310 memcpy (new_buf
+ overflow_size
, buffer
, bytes_read
);
312 overflow_buffer
= new_buf
;
313 overflow_size
+= bytes_read
;
320 *read
= overflow_size
;
324 return (overflow_size
> 0 ? overflow_buffer
: buffer
);
327 /* Skip whitespace from PORT and return the first non-whitespace character
328 read. Raise an error on end-of-file. */
330 flush_ws (SCM port
, scm_t_read_opts
*opts
, const char *eoferr
)
334 switch (c
= scm_getc_unlocked (port
))
340 scm_i_input_error (eoferr
,
349 switch (c
= scm_getc_unlocked (port
))
355 case SCM_LINE_INCREMENTORS
:
361 switch (c
= scm_getc_unlocked (port
))
364 eoferr
= "read_sharp";
367 scm_read_shebang (c
, port
, opts
);
370 scm_read_commented_expression (c
, port
, opts
);
373 if (scm_is_false (scm_get_hash_procedure (c
)))
375 scm_read_r6rs_block_comment (c
, port
);
380 scm_ungetc_unlocked (c
, port
);
385 case SCM_LINE_INCREMENTORS
:
386 case SCM_SINGLE_SPACES
:
401 static SCM
scm_read_expression (SCM port
, scm_t_read_opts
*opts
);
402 static SCM
scm_read_sharp (int chr
, SCM port
, scm_t_read_opts
*opts
,
403 long line
, int column
);
407 maybe_annotate_source (SCM x
, SCM port
, scm_t_read_opts
*opts
,
408 long line
, int column
)
410 if (opts
->record_positions_p
)
411 scm_i_set_source_properties_x (x
, line
, column
, SCM_FILENAME (port
));
416 scm_read_sexp (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
417 #define FUNC_NAME "scm_i_lreadparen"
420 SCM tmp
, tl
, ans
= SCM_EOL
;
421 const int curly_list_p
= (chr
== '{') && opts
->curly_infix_p
;
422 const int terminating_char
= ((chr
== '{') ? '}'
423 : ((chr
== '[') ? ']'
426 /* Need to capture line and column numbers here. */
427 long line
= SCM_LINUM (port
);
428 int column
= SCM_COL (port
) - 1;
430 c
= flush_ws (port
, opts
, FUNC_NAME
);
431 if (terminating_char
== c
)
434 scm_ungetc_unlocked (c
, port
);
435 tmp
= scm_read_expression (port
, opts
);
437 /* Note that it is possible for scm_read_expression to return
438 scm_sym_dot, but not as part of a dotted pair: as in #{.}#. So
439 check that it's a real dot by checking `c'. */
440 if (c
== '.' && scm_is_eq (scm_sym_dot
, tmp
))
442 ans
= scm_read_expression (port
, opts
);
443 if (terminating_char
!= (c
= flush_ws (port
, opts
, FUNC_NAME
)))
444 scm_i_input_error (FUNC_NAME
, port
, "missing close paren",
449 /* Build the head of the list structure. */
450 ans
= tl
= scm_cons (tmp
, SCM_EOL
);
452 while (terminating_char
!= (c
= flush_ws (port
, opts
, FUNC_NAME
)))
456 if (c
== ')' || (c
== ']' && opts
->square_brackets_p
)
457 || ((c
== '}' || c
== ']') && opts
->curly_infix_p
))
458 scm_i_input_error (FUNC_NAME
, port
,
459 "in pair: mismatched close paren: ~A",
460 scm_list_1 (SCM_MAKE_CHAR (c
)));
462 scm_ungetc_unlocked (c
, port
);
463 tmp
= scm_read_expression (port
, opts
);
465 /* See above note about scm_sym_dot. */
466 if (c
== '.' && scm_is_eq (scm_sym_dot
, tmp
))
468 SCM_SETCDR (tl
, scm_read_expression (port
, opts
));
470 c
= flush_ws (port
, opts
, FUNC_NAME
);
471 if (terminating_char
!= c
)
472 scm_i_input_error (FUNC_NAME
, port
,
473 "in pair: missing close paren", SCM_EOL
);
477 new_tail
= scm_cons (tmp
, SCM_EOL
);
478 SCM_SETCDR (tl
, new_tail
);
484 /* In addition to finding the length, 'scm_ilength' checks for
485 improper or circular lists, in which case it returns -1. */
486 int len
= scm_ilength (ans
);
488 /* The (len == 0) case is handled above */
490 /* Return directly to avoid re-annotating the element's source
491 location with the position of the outer brace. Also, it
492 might not be possible to annotate the element. */
493 return scm_car (ans
); /* {e} => e */
495 ; /* Leave the list unchanged: {e1 e2} => (e1 e2) */
496 else if (len
>= 3 && (len
& 1))
498 /* It's a proper list whose length is odd and at least 3. If
499 the elements at odd indices (the infix operator positions)
500 are all 'equal?', then it's a simple curly-infix list.
501 Otherwise it's a mixed curly-infix list. */
502 SCM op
= scm_cadr (ans
);
504 /* Check to see if the elements at odd indices are 'equal?' */
505 for (tl
= scm_cdddr (ans
); ; tl
= scm_cddr (tl
))
507 if (scm_is_null (tl
))
509 /* Convert simple curly-infix list to prefix:
510 {a <op> b <op> ...} => (<op> a b ...) */
512 while (scm_is_pair (scm_cdr (tl
)))
515 SCM_SETCDR (tl
, tmp
);
518 ans
= scm_cons (op
, ans
);
521 else if (scm_is_false (scm_equal_p (op
, scm_car (tl
))))
523 /* Mixed curly-infix list: {e ...} => ($nfx$ e ...) */
524 ans
= scm_cons (sym_nfx
, ans
);
530 /* Mixed curly-infix (possibly improper) list:
531 {e . tail} => ($nfx$ e . tail) */
532 ans
= scm_cons (sym_nfx
, ans
);
535 return maybe_annotate_source (ans
, port
, opts
, line
, column
);
540 /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
541 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
543 #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
549 while (i < ndigits) \
551 a = scm_getc_unlocked (port); \
555 && (a == (scm_t_wchar) terminator) \
558 if ('0' <= a && a <= '9') \
560 else if ('A' <= a && a <= 'F') \
562 else if ('a' <= a && a <= 'f') \
575 skip_intraline_whitespace (SCM port
)
581 c
= scm_getc_unlocked (port
);
585 while (c
== '\t' || uc_is_general_category (c
, UC_SPACE_SEPARATOR
));
587 scm_ungetc_unlocked (c
, port
);
591 scm_read_string (int chr
, SCM port
, scm_t_read_opts
*opts
)
592 #define FUNC_NAME "scm_lreadr"
594 /* For strings smaller than C_STR, this function creates only one Scheme
595 object (the string returned). */
598 size_t c_str_len
= 0;
599 scm_t_wchar c
, c_str
[READER_STRING_BUFFER_SIZE
];
601 /* Need to capture line and column numbers here. */
602 long line
= SCM_LINUM (port
);
603 int column
= SCM_COL (port
) - 1;
605 while ('"' != (c
= scm_getc_unlocked (port
)))
610 scm_i_input_error (FUNC_NAME
, port
,
611 "end of file in string constant", SCM_EOL
);
614 if (c_str_len
+ 1 >= READER_STRING_BUFFER_SIZE
)
616 str
= scm_cons (scm_from_utf32_stringn (c_str
, c_str_len
), str
);
622 switch (c
= scm_getc_unlocked (port
))
630 if (opts
->hungry_eol_escapes_p
)
631 skip_intraline_whitespace (port
);
658 if (opts
->r6rs_escapes_p
)
659 SCM_READ_HEX_ESCAPE (10, ';');
661 SCM_READ_HEX_ESCAPE (2, '\0');
664 if (!opts
->r6rs_escapes_p
)
666 SCM_READ_HEX_ESCAPE (4, '\0');
670 if (!opts
->r6rs_escapes_p
)
672 SCM_READ_HEX_ESCAPE (6, '\0');
677 scm_i_input_error (FUNC_NAME
, port
,
678 "illegal character in escape sequence: ~S",
679 scm_list_1 (SCM_MAKE_CHAR (c
)));
683 c_str
[c_str_len
++] = c
;
686 if (scm_is_null (str
))
687 /* Fast path: we got a string that fits in C_STR. */
688 str
= scm_from_utf32_stringn (c_str
, c_str_len
);
692 str
= scm_cons (scm_from_utf32_stringn (c_str
, c_str_len
), str
);
694 str
= scm_string_concatenate_reverse (str
, SCM_UNDEFINED
, SCM_UNDEFINED
);
697 return maybe_annotate_source (str
, port
, opts
, line
, column
);
703 scm_read_number (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
705 SCM result
, str
= SCM_EOL
;
706 char local_buffer
[READER_BUFFER_SIZE
], *buffer
;
709 /* Need to capture line and column numbers here. */
710 long line
= SCM_LINUM (port
);
711 int column
= SCM_COL (port
) - 1;
713 scm_ungetc_unlocked (chr
, port
);
714 buffer
= read_complete_token (port
, opts
, local_buffer
, sizeof local_buffer
,
717 str
= scm_from_port_stringn (buffer
, bytes_read
, port
);
719 result
= scm_string_to_number (str
, SCM_UNDEFINED
);
720 if (scm_is_false (result
))
722 /* Return a symbol instead of a number */
723 if (opts
->case_insensitive_p
)
724 str
= scm_string_downcase_x (str
);
725 result
= scm_string_to_symbol (str
);
727 else if (SCM_NIMP (result
))
728 result
= maybe_annotate_source (result
, port
, opts
, line
, column
);
730 SCM_COL (port
) += scm_i_string_length (str
);
735 scm_read_mixed_case_symbol (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
738 int ends_with_colon
= 0;
740 int postfix
= (opts
->keyword_style
== KEYWORD_STYLE_POSTFIX
);
741 char local_buffer
[READER_BUFFER_SIZE
], *buffer
;
744 scm_ungetc_unlocked (chr
, port
);
745 buffer
= read_complete_token (port
, opts
, local_buffer
, sizeof local_buffer
,
748 ends_with_colon
= buffer
[bytes_read
- 1] == ':';
750 if (postfix
&& ends_with_colon
&& (bytes_read
> 1))
752 str
= scm_from_port_stringn (buffer
, bytes_read
- 1, port
);
754 if (opts
->case_insensitive_p
)
755 str
= scm_string_downcase_x (str
);
756 result
= scm_symbol_to_keyword (scm_string_to_symbol (str
));
760 str
= scm_from_port_stringn (buffer
, bytes_read
, port
);
762 if (opts
->case_insensitive_p
)
763 str
= scm_string_downcase_x (str
);
764 result
= scm_string_to_symbol (str
);
767 SCM_COL (port
) += scm_i_string_length (str
);
772 scm_read_number_and_radix (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
773 #define FUNC_NAME "scm_lreadr"
777 char local_buffer
[READER_BUFFER_SIZE
], *buffer
;
804 scm_ungetc_unlocked (chr
, port
);
805 scm_ungetc_unlocked ('#', port
);
809 buffer
= read_complete_token (port
, opts
, local_buffer
, sizeof local_buffer
,
812 str
= scm_from_port_stringn (buffer
, read
, port
);
814 result
= scm_string_to_number (str
, scm_from_uint (radix
));
816 SCM_COL (port
) += scm_i_string_length (str
);
818 if (scm_is_true (result
))
821 scm_i_input_error (FUNC_NAME
, port
, "unknown # object", SCM_EOL
);
828 scm_read_quote (int chr
, SCM port
, scm_t_read_opts
*opts
)
831 long line
= SCM_LINUM (port
);
832 int column
= SCM_COL (port
) - 1;
837 p
= scm_sym_quasiquote
;
848 c
= scm_getc_unlocked (port
);
850 p
= scm_sym_uq_splicing
;
853 scm_ungetc_unlocked (c
, port
);
860 fprintf (stderr
, "%s: unhandled quote character (%i)\n",
861 "scm_read_quote", chr
);
865 p
= scm_cons2 (p
, scm_read_expression (port
, opts
), SCM_EOL
);
866 return maybe_annotate_source (p
, port
, opts
, line
, column
);
869 SCM_SYMBOL (sym_syntax
, "syntax");
870 SCM_SYMBOL (sym_quasisyntax
, "quasisyntax");
871 SCM_SYMBOL (sym_unsyntax
, "unsyntax");
872 SCM_SYMBOL (sym_unsyntax_splicing
, "unsyntax-splicing");
875 scm_read_syntax (int chr
, SCM port
, scm_t_read_opts
*opts
)
878 long line
= SCM_LINUM (port
);
879 int column
= SCM_COL (port
) - 1;
895 c
= scm_getc_unlocked (port
);
897 p
= sym_unsyntax_splicing
;
900 scm_ungetc_unlocked (c
, port
);
907 fprintf (stderr
, "%s: unhandled syntax character (%i)\n",
908 "scm_read_syntax", chr
);
912 p
= scm_cons2 (p
, scm_read_expression (port
, opts
), SCM_EOL
);
913 return maybe_annotate_source (p
, port
, opts
, line
, column
);
917 scm_read_nil (int chr
, SCM port
, scm_t_read_opts
*opts
)
919 SCM id
= scm_read_mixed_case_symbol (chr
, port
, opts
);
921 if (!scm_is_eq (id
, sym_nil
))
922 scm_i_input_error ("scm_read_nil", port
,
923 "unexpected input while reading #nil: ~a",
926 return SCM_ELISP_NIL
;
930 scm_read_semicolon_comment (int chr
, SCM port
)
934 /* We use the get_byte here because there is no need to get the
935 locale correct with comment input. This presumes that newline
936 always represents itself no matter what the encoding is. */
937 for (c
= scm_get_byte_or_eof_unlocked (port
);
938 (c
!= EOF
) && (c
!= '\n');
939 c
= scm_get_byte_or_eof_unlocked (port
));
941 return SCM_UNSPECIFIED
;
945 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
948 scm_read_boolean (int chr
, SCM port
)
961 return SCM_UNSPECIFIED
;
965 scm_read_character (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
966 #define FUNC_NAME "scm_lreadr"
968 char buffer
[READER_CHAR_NAME_MAX_SIZE
];
970 size_t charname_len
, bytes_read
;
973 scm_t_port_internal
*pti
;
975 overflow
= read_token (port
, opts
, buffer
, READER_CHAR_NAME_MAX_SIZE
,
978 scm_i_input_error (FUNC_NAME
, port
, "character name too long", SCM_EOL
);
982 chr
= scm_getc_unlocked (port
);
984 scm_i_input_error (FUNC_NAME
, port
, "unexpected end of file "
985 "while reading character", SCM_EOL
);
987 /* CHR must be a token delimiter, like a whitespace. */
988 return (SCM_MAKE_CHAR (chr
));
991 pti
= SCM_PORT_GET_INTERNAL (port
);
993 /* Simple ASCII characters can be processed immediately. Also, simple
994 ISO-8859-1 characters can be processed immediately if the encoding for this
995 port is ISO-8859-1. */
996 if (bytes_read
== 1 &&
997 ((unsigned char) buffer
[0] <= 127
998 || pti
->encoding_mode
== SCM_PORT_ENCODING_MODE_LATIN1
))
1000 SCM_COL (port
) += 1;
1001 return SCM_MAKE_CHAR (buffer
[0]);
1004 /* Otherwise, convert the buffer into a proper scheme string for
1006 charname
= scm_from_port_stringn (buffer
, bytes_read
, port
);
1007 charname_len
= scm_i_string_length (charname
);
1008 SCM_COL (port
) += charname_len
;
1009 cp
= scm_i_string_ref (charname
, 0);
1010 if (charname_len
== 1)
1011 return SCM_MAKE_CHAR (cp
);
1013 /* Ignore dotted circles, which may be used to keep combining characters from
1014 combining with the backslash in #\charname. */
1015 if (cp
== SCM_CODEPOINT_DOTTED_CIRCLE
&& charname_len
== 2)
1016 return SCM_MAKE_CHAR (scm_i_string_ref (charname
, 1));
1018 if (cp
>= '0' && cp
< '8')
1020 /* Dirk:FIXME:: This type of character syntax is not R5RS
1021 * compliant. Further, it should be verified that the constant
1022 * does only consist of octal digits. */
1023 SCM p
= scm_string_to_number (charname
, scm_from_uint (8));
1024 if (SCM_I_INUMP (p
))
1026 scm_t_wchar c
= scm_to_uint32 (p
);
1027 if (SCM_IS_UNICODE_CHAR (c
))
1028 return SCM_MAKE_CHAR (c
);
1030 scm_i_input_error (FUNC_NAME
, port
,
1031 "out-of-range octal character escape: ~a",
1032 scm_list_1 (charname
));
1036 if (cp
== 'x' && (charname_len
> 1))
1040 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1041 p
= scm_string_to_number (scm_c_substring (charname
, 1, charname_len
),
1042 scm_from_uint (16));
1043 if (SCM_I_INUMP (p
))
1045 scm_t_wchar c
= scm_to_uint32 (p
);
1046 if (SCM_IS_UNICODE_CHAR (c
))
1047 return SCM_MAKE_CHAR (c
);
1049 scm_i_input_error (FUNC_NAME
, port
,
1050 "out-of-range hex character escape: ~a",
1051 scm_list_1 (charname
));
1055 /* The names of characters should never have non-Latin1
1057 if (scm_i_is_narrow_string (charname
)
1058 || scm_i_try_narrow_string (charname
))
1059 { SCM ch
= scm_i_charname_to_char (scm_i_string_chars (charname
),
1061 if (scm_is_true (ch
))
1065 scm_i_input_error (FUNC_NAME
, port
, "unknown character name ~a",
1066 scm_list_1 (charname
));
1068 return SCM_UNSPECIFIED
;
1073 scm_read_keyword (int chr
, SCM port
, scm_t_read_opts
*opts
)
1077 /* Read the symbol that comprises the keyword. Doing this instead of
1078 invoking a specific symbol reader function allows `scm_read_keyword ()'
1079 to adapt to the delimiters currently valid of symbols.
1081 XXX: This implementation allows sloppy syntaxes like `#: key'. */
1082 symbol
= scm_read_expression (port
, opts
);
1083 if (!scm_is_symbol (symbol
))
1084 scm_i_input_error ("scm_read_keyword", port
,
1085 "keyword prefix `~a' not followed by a symbol: ~s",
1086 scm_list_2 (SCM_MAKE_CHAR (chr
), symbol
));
1088 return (scm_symbol_to_keyword (symbol
));
1092 scm_read_vector (int chr
, SCM port
, scm_t_read_opts
*opts
,
1093 long line
, int column
)
1095 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1096 guarantee that it's going to do what we want. After all, this is an
1097 implementation detail of `scm_read_vector ()', not a desirable
1099 return maybe_annotate_source (scm_vector (scm_read_sexp (chr
, port
, opts
)),
1100 port
, opts
, line
, column
);
1103 /* Helper used by scm_read_array */
1105 read_decimal_integer (SCM port
, int c
, ssize_t
*resp
)
1114 c
= scm_getc_unlocked (port
);
1117 while ('0' <= c
&& c
<= '9')
1119 res
= 10*res
+ c
-'0';
1121 c
= scm_getc_unlocked (port
);
1129 /* Read an array. This function can also read vectors and uniform
1130 vectors. Also, the conflict between '#f' and '#f32' and '#f64' is
1133 C is the first character read after the '#'. */
1135 scm_read_array (int c
, SCM port
, scm_t_read_opts
*opts
, long line
, int column
)
1138 scm_t_wchar tag_buf
[8];
1141 SCM tag
, shape
= SCM_BOOL_F
, elements
, array
;
1143 /* XXX - shortcut for ordinary vectors. Shouldn't be necessary but
1144 the array code can not deal with zero-length dimensions yet, and
1145 we want to allow zero-length vectors, of course. */
1147 return scm_read_vector (c
, port
, opts
, line
, column
);
1149 /* Disambiguate between '#f' and uniform floating point vectors. */
1152 c
= scm_getc_unlocked (port
);
1153 if (c
!= '3' && c
!= '6')
1156 scm_ungetc_unlocked (c
, port
);
1162 goto continue_reading_tag
;
1167 c
= read_decimal_integer (port
, c
, &rank
);
1169 scm_i_input_error (NULL
, port
, "array rank must be non-negative",
1174 continue_reading_tag
:
1175 while (c
!= EOF
&& c
!= '(' && c
!= '@' && c
!= ':'
1176 && tag_len
< sizeof tag_buf
/ sizeof tag_buf
[0])
1178 tag_buf
[tag_len
++] = c
;
1179 c
= scm_getc_unlocked (port
);
1185 tag
= scm_string_to_symbol (scm_from_utf32_stringn (tag_buf
, tag_len
));
1186 if (tag_len
== sizeof tag_buf
/ sizeof tag_buf
[0])
1187 scm_i_input_error (NULL
, port
, "invalid array tag, starting with: ~a",
1192 if (c
== '@' || c
== ':')
1198 ssize_t lbnd
= 0, len
= 0;
1203 c
= scm_getc_unlocked (port
);
1204 c
= read_decimal_integer (port
, c
, &lbnd
);
1207 s
= scm_from_ssize_t (lbnd
);
1211 c
= scm_getc_unlocked (port
);
1212 c
= read_decimal_integer (port
, c
, &len
);
1214 scm_i_input_error (NULL
, port
,
1215 "array length must be non-negative",
1218 s
= scm_list_2 (s
, scm_from_ssize_t (lbnd
+len
-1));
1221 shape
= scm_cons (s
, shape
);
1222 } while (c
== '@' || c
== ':');
1224 shape
= scm_reverse_x (shape
, SCM_EOL
);
1227 /* Read nested lists of elements. */
1229 scm_i_input_error (NULL
, port
,
1230 "missing '(' in vector or array literal",
1232 elements
= scm_read_sexp (c
, port
, opts
);
1234 if (scm_is_false (shape
))
1235 shape
= scm_from_ssize_t (rank
);
1236 else if (scm_ilength (shape
) != rank
)
1239 "the number of shape specifications must match the array rank",
1242 /* Handle special print syntax of rank zero arrays; see
1243 scm_i_print_array for a rationale. */
1246 if (!scm_is_pair (elements
))
1247 scm_i_input_error (NULL
, port
,
1248 "too few elements in array literal, need 1",
1250 if (!scm_is_null (SCM_CDR (elements
)))
1251 scm_i_input_error (NULL
, port
,
1252 "too many elements in array literal, want 1",
1254 elements
= SCM_CAR (elements
);
1257 /* Construct array, annotate with source location, and return. */
1258 array
= scm_list_to_typed_array (tag
, shape
, elements
);
1259 return maybe_annotate_source (array
, port
, opts
, line
, column
);
1263 scm_read_srfi4_vector (int chr
, SCM port
, scm_t_read_opts
*opts
,
1264 long line
, int column
)
1266 return scm_read_array (chr
, port
, opts
, line
, column
);
1270 scm_read_bytevector (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
,
1271 long line
, int column
)
1273 chr
= scm_getc_unlocked (port
);
1277 chr
= scm_getc_unlocked (port
);
1281 chr
= scm_getc_unlocked (port
);
1285 return maybe_annotate_source
1286 (scm_u8_list_to_bytevector (scm_read_sexp (chr
, port
, opts
)),
1287 port
, opts
, line
, column
);
1290 scm_i_input_error ("read_bytevector", port
,
1291 "invalid bytevector prefix",
1292 SCM_MAKE_CHAR (chr
));
1293 return SCM_UNSPECIFIED
;
1297 scm_read_guile_bit_vector (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
,
1298 long line
, int column
)
1300 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1301 terribly inefficient but who cares? */
1302 SCM s_bits
= SCM_EOL
;
1304 for (chr
= scm_getc_unlocked (port
);
1305 (chr
!= EOF
) && ((chr
== '0') || (chr
== '1'));
1306 chr
= scm_getc_unlocked (port
))
1308 s_bits
= scm_cons ((chr
== '0') ? SCM_BOOL_F
: SCM_BOOL_T
, s_bits
);
1312 scm_ungetc_unlocked (chr
, port
);
1314 return maybe_annotate_source
1315 (scm_bitvector (scm_reverse_x (s_bits
, SCM_EOL
)),
1316 port
, opts
, line
, column
);
1320 scm_read_scsh_block_comment (scm_t_wchar chr
, SCM port
)
1326 int c
= scm_getc_unlocked (port
);
1329 scm_i_input_error ("skip_block_comment", port
,
1330 "unterminated `#! ... !#' comment", SCM_EOL
);
1334 else if (c
== '#' && bang_seen
)
1340 return SCM_UNSPECIFIED
;
1343 static void set_port_case_insensitive_p (SCM port
, scm_t_read_opts
*opts
,
1345 static void set_port_square_brackets_p (SCM port
, scm_t_read_opts
*opts
,
1347 static void set_port_curly_infix_p (SCM port
, scm_t_read_opts
*opts
,
1351 scm_read_shebang (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
1353 char name
[READER_DIRECTIVE_NAME_MAX_SIZE
+ 1];
1357 while (i
<= READER_DIRECTIVE_NAME_MAX_SIZE
)
1359 c
= scm_getc_unlocked (port
);
1361 scm_i_input_error ("skip_block_comment", port
,
1362 "unterminated `#! ... !#' comment", SCM_EOL
);
1363 else if (('a' <= c
&& c
<= 'z') || ('0' <= c
&& c
<= '9') || c
== '-')
1365 else if (CHAR_IS_DELIMITER (c
))
1367 scm_ungetc_unlocked (c
, port
);
1369 if (0 == strcmp ("r6rs", name
))
1370 ; /* Silently ignore */
1371 else if (0 == strcmp ("fold-case", name
))
1372 set_port_case_insensitive_p (port
, opts
, 1);
1373 else if (0 == strcmp ("no-fold-case", name
))
1374 set_port_case_insensitive_p (port
, opts
, 0);
1375 else if (0 == strcmp ("curly-infix", name
))
1376 set_port_curly_infix_p (port
, opts
, 1);
1377 else if (0 == strcmp ("curly-infix-and-bracket-lists", name
))
1379 set_port_curly_infix_p (port
, opts
, 1);
1380 set_port_square_brackets_p (port
, opts
, 0);
1385 return SCM_UNSPECIFIED
;
1389 scm_ungetc_unlocked (c
, port
);
1394 scm_ungetc_unlocked (name
[--i
], port
);
1395 return scm_read_scsh_block_comment (chr
, port
);
1399 scm_read_r6rs_block_comment (scm_t_wchar chr
, SCM port
)
1401 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1402 nested. So care must be taken. */
1403 int nesting_level
= 1;
1405 int a
= scm_getc_unlocked (port
);
1408 scm_i_input_error ("scm_read_r6rs_block_comment", port
,
1409 "unterminated `#| ... |#' comment", SCM_EOL
);
1411 while (nesting_level
> 0)
1413 int b
= scm_getc_unlocked (port
);
1416 scm_i_input_error ("scm_read_r6rs_block_comment", port
,
1417 "unterminated `#| ... |#' comment", SCM_EOL
);
1419 if (a
== '|' && b
== '#')
1424 else if (a
== '#' && b
== '|')
1433 return SCM_UNSPECIFIED
;
1437 scm_read_commented_expression (scm_t_wchar chr
, SCM port
,
1438 scm_t_read_opts
*opts
)
1442 c
= flush_ws (port
, opts
, (char *) NULL
);
1444 scm_i_input_error ("read_commented_expression", port
,
1445 "no expression after #; comment", SCM_EOL
);
1446 scm_ungetc_unlocked (c
, port
);
1447 scm_read_expression (port
, opts
);
1448 return SCM_UNSPECIFIED
;
1452 scm_read_extended_symbol (scm_t_wchar chr
, SCM port
)
1454 /* Guile's extended symbol read syntax looks like this:
1456 #{This is all a symbol name}#
1458 So here, CHR is expected to be `{'. */
1461 SCM buf
= scm_i_make_string (1024, NULL
, 0);
1463 buf
= scm_i_string_start_writing (buf
);
1465 while ((chr
= scm_getc_unlocked (port
)) != EOF
)
1476 scm_i_string_set_x (buf
, len
++, '}');
1482 else if (chr
== '\\')
1484 /* It used to be that print.c would print extended-read-syntax
1485 symbols with backslashes before "non-standard" chars, but
1486 this routine wouldn't do anything with those escapes.
1487 Bummer. What we've done is to change print.c to output
1488 R6RS hex escapes for those characters, relying on the fact
1489 that the extended read syntax would never put a `\' before
1490 an `x'. For now, we just ignore other instances of
1491 backslash in the string. */
1492 switch ((chr
= scm_getc_unlocked (port
)))
1500 SCM_READ_HEX_ESCAPE (10, ';');
1501 scm_i_string_set_x (buf
, len
++, c
);
1509 scm_i_string_stop_writing ();
1510 scm_i_input_error ("scm_read_extended_symbol", port
,
1511 "illegal character in escape sequence: ~S",
1512 scm_list_1 (SCM_MAKE_CHAR (c
)));
1516 scm_i_string_set_x (buf
, len
++, chr
);
1521 scm_i_string_set_x (buf
, len
++, chr
);
1523 if (len
>= scm_i_string_length (buf
) - 2)
1527 scm_i_string_stop_writing ();
1528 addy
= scm_i_make_string (1024, NULL
, 0);
1529 buf
= scm_string_append (scm_list_2 (buf
, addy
));
1531 buf
= scm_i_string_start_writing (buf
);
1536 scm_i_string_stop_writing ();
1538 scm_i_input_error ("scm_read_extended_symbol", port
,
1539 "end of file while reading symbol", SCM_EOL
);
1541 return (scm_string_to_symbol (scm_c_substring (buf
, 0, len
)));
1546 /* Top-level token readers, i.e., dispatchers. */
1549 scm_read_sharp_extension (int chr
, SCM port
, scm_t_read_opts
*opts
)
1553 proc
= scm_get_hash_procedure (chr
);
1554 if (scm_is_true (scm_procedure_p (proc
)))
1556 long line
= SCM_LINUM (port
);
1557 int column
= SCM_COL (port
) - 2;
1560 got
= scm_call_2 (proc
, SCM_MAKE_CHAR (chr
), port
);
1562 if (opts
->record_positions_p
&& SCM_NIMP (got
)
1563 && !scm_i_has_source_properties (got
))
1564 scm_i_set_source_properties_x (got
, line
, column
, SCM_FILENAME (port
));
1569 return SCM_UNSPECIFIED
;
1572 /* The reader for the sharp `#' character. It basically dispatches reads
1573 among the above token readers. */
1575 scm_read_sharp (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
,
1576 long line
, int column
)
1577 #define FUNC_NAME "scm_lreadr"
1581 chr
= scm_getc_unlocked (port
);
1583 result
= scm_read_sharp_extension (chr
, port
, opts
);
1584 if (!scm_is_eq (result
, SCM_UNSPECIFIED
))
1590 return (scm_read_character (chr
, port
, opts
));
1592 return (scm_read_vector (chr
, port
, opts
, line
, column
));
1597 /* This one may return either a boolean or an SRFI-4 vector. */
1598 return (scm_read_srfi4_vector (chr
, port
, opts
, line
, column
));
1600 return (scm_read_bytevector (chr
, port
, opts
, line
, column
));
1602 return (scm_read_guile_bit_vector (chr
, port
, opts
, line
, column
));
1606 return (scm_read_boolean (chr
, port
));
1608 return (scm_read_keyword (chr
, port
, opts
));
1609 case '0': case '1': case '2': case '3': case '4':
1610 case '5': case '6': case '7': case '8': case '9':
1612 return (scm_read_array (chr
, port
, opts
, line
, column
));
1626 return (scm_read_number_and_radix (chr
, port
, opts
));
1628 return (scm_read_extended_symbol (chr
, port
));
1630 return (scm_read_shebang (chr
, port
, opts
));
1632 return (scm_read_commented_expression (chr
, port
, opts
));
1636 return (scm_read_syntax (chr
, port
, opts
));
1638 return (scm_read_nil (chr
, port
, opts
));
1640 result
= scm_read_sharp_extension (chr
, port
, opts
);
1641 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1643 /* To remain compatible with 1.8 and earlier, the following
1644 characters have lower precedence than `read-hash-extend'
1649 return scm_read_r6rs_block_comment (chr
, port
);
1651 scm_i_input_error (FUNC_NAME
, port
, "Unknown # object: ~S",
1652 scm_list_1 (SCM_MAKE_CHAR (chr
)));
1659 return SCM_UNSPECIFIED
;
1664 read_inner_expression (SCM port
, scm_t_read_opts
*opts
)
1665 #define FUNC_NAME "read_inner_expression"
1671 chr
= scm_getc_unlocked (port
);
1675 case SCM_WHITE_SPACES
:
1676 case SCM_LINE_INCREMENTORS
:
1679 (void) scm_read_semicolon_comment (chr
, port
);
1682 if (opts
->curly_infix_p
)
1684 if (opts
->neoteric_p
)
1685 return scm_read_sexp (chr
, port
, opts
);
1690 /* Enable neoteric expressions within curly braces */
1691 opts
->neoteric_p
= 1;
1692 expr
= scm_read_sexp (chr
, port
, opts
);
1693 opts
->neoteric_p
= 0;
1698 return scm_read_mixed_case_symbol (chr
, port
, opts
);
1700 if (opts
->square_brackets_p
)
1701 return scm_read_sexp (chr
, port
, opts
);
1702 else if (opts
->curly_infix_p
)
1704 /* The syntax of neoteric expressions requires that '[' be
1705 a delimiter when curly-infix is enabled, so it cannot
1706 be part of an unescaped symbol. We might as well do
1707 something useful with it, so we adopt Kawa's convention:
1708 [...] => ($bracket-list$ ...) */
1709 long line
= SCM_LINUM (port
);
1710 int column
= SCM_COL (port
) - 1;
1711 return maybe_annotate_source
1712 (scm_cons (sym_bracket_list
, scm_read_sexp (chr
, port
, opts
)),
1713 port
, opts
, line
, column
);
1716 return scm_read_mixed_case_symbol (chr
, port
, opts
);
1718 return (scm_read_sexp (chr
, port
, opts
));
1720 return (scm_read_string (chr
, port
, opts
));
1724 return (scm_read_quote (chr
, port
, opts
));
1727 long line
= SCM_LINUM (port
);
1728 int column
= SCM_COL (port
) - 1;
1729 SCM result
= scm_read_sharp (chr
, port
, opts
, line
, column
);
1730 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1731 /* We read a comment or some such. */
1737 scm_i_input_error (FUNC_NAME
, port
, "unexpected \")\"", SCM_EOL
);
1740 if (opts
->curly_infix_p
)
1741 scm_i_input_error (FUNC_NAME
, port
, "unexpected \"}\"", SCM_EOL
);
1743 return scm_read_mixed_case_symbol (chr
, port
, opts
);
1745 if (opts
->square_brackets_p
)
1746 scm_i_input_error (FUNC_NAME
, port
, "unexpected \"]\"", SCM_EOL
);
1747 /* otherwise fall through */
1751 if (opts
->keyword_style
== KEYWORD_STYLE_PREFIX
)
1752 return scm_symbol_to_keyword (scm_read_expression (port
, opts
));
1757 if (((chr
>= '0') && (chr
<= '9'))
1758 || (strchr ("+-.", chr
)))
1759 return (scm_read_number (chr
, port
, opts
));
1761 return (scm_read_mixed_case_symbol (chr
, port
, opts
));
1769 scm_read_expression (SCM port
, scm_t_read_opts
*opts
)
1770 #define FUNC_NAME "scm_read_expression"
1772 if (!opts
->neoteric_p
)
1773 return read_inner_expression (port
, opts
);
1780 if (opts
->record_positions_p
)
1782 /* We need to get the position of the first non-whitespace
1783 character in order to correctly annotate neoteric
1784 expressions. For example, for the expression 'f(x)', the
1785 first call to 'read_inner_expression' reads the 'f' (which
1786 cannot be annotated), and then we later read the '(x)' and
1787 use it to construct the new list (f x). */
1788 int c
= flush_ws (port
, opts
, (char *) NULL
);
1791 scm_ungetc_unlocked (c
, port
);
1792 line
= SCM_LINUM (port
);
1793 column
= SCM_COL (port
);
1796 expr
= read_inner_expression (port
, opts
);
1798 /* 'expr' is the first component of the neoteric expression. Now
1799 we loop, and as long as the next character is '(', '[', or '{',
1800 (without any intervening whitespace), we use it to construct a
1801 new expression. For example, f{n - 1}(x) => ((f (- n 1)) x). */
1804 int chr
= scm_getc_unlocked (port
);
1807 /* e(...) => (e ...) */
1808 expr
= scm_cons (expr
, scm_read_sexp (chr
, port
, opts
));
1809 else if (chr
== '[')
1810 /* e[...] => ($bracket-apply$ e ...) */
1811 expr
= scm_cons (sym_bracket_apply
,
1813 scm_read_sexp (chr
, port
, opts
)));
1814 else if (chr
== '{')
1816 SCM arg
= scm_read_sexp (chr
, port
, opts
);
1818 if (scm_is_null (arg
))
1819 expr
= scm_list_1 (expr
); /* e{} => (e) */
1821 expr
= scm_list_2 (expr
, arg
); /* e{...} => (e {...}) */
1826 scm_ungetc_unlocked (chr
, port
);
1829 maybe_annotate_source (expr
, port
, opts
, line
, column
);
1837 /* Actual reader. */
1839 static void init_read_options (SCM port
, scm_t_read_opts
*opts
);
1841 SCM_DEFINE (scm_read
, "read", 0, 1, 0,
1843 "Read an s-expression from the input port @var{port}, or from\n"
1844 "the current input port if @var{port} is not specified.\n"
1845 "Any whitespace before the next token is discarded.")
1846 #define FUNC_NAME s_scm_read
1848 scm_t_read_opts opts
;
1851 if (SCM_UNBNDP (port
))
1852 port
= scm_current_input_port ();
1853 SCM_VALIDATE_OPINPORT (1, port
);
1855 init_read_options (port
, &opts
);
1857 c
= flush_ws (port
, &opts
, (char *) NULL
);
1860 scm_ungetc_unlocked (c
, port
);
1862 return (scm_read_expression (port
, &opts
));
1869 /* Manipulate the read-hash-procedures alist. This could be written in
1870 Scheme, but maybe it will also be used by C code during initialisation. */
1871 SCM_DEFINE (scm_read_hash_extend
, "read-hash-extend", 2, 0, 0,
1872 (SCM chr
, SCM proc
),
1873 "Install the procedure @var{proc} for reading expressions\n"
1874 "starting with the character sequence @code{#} and @var{chr}.\n"
1875 "@var{proc} will be called with two arguments: the character\n"
1876 "@var{chr} and the port to read further data from. The object\n"
1877 "returned will be the return value of @code{read}. \n"
1878 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1880 #define FUNC_NAME s_scm_read_hash_extend
1885 SCM_VALIDATE_CHAR (1, chr
);
1886 SCM_ASSERT (scm_is_false (proc
)
1887 || scm_is_eq (scm_procedure_p (proc
), SCM_BOOL_T
),
1888 proc
, SCM_ARG2
, FUNC_NAME
);
1890 /* Check if chr is already in the alist. */
1891 this = scm_i_read_hash_procedures_ref ();
1895 if (scm_is_null (this))
1897 /* not found, so add it to the beginning. */
1898 if (scm_is_true (proc
))
1900 SCM
new = scm_cons (scm_cons (chr
, proc
),
1901 scm_i_read_hash_procedures_ref ());
1902 scm_i_read_hash_procedures_set_x (new);
1906 if (scm_is_eq (chr
, SCM_CAAR (this)))
1908 /* already in the alist. */
1909 if (scm_is_false (proc
))
1912 if (scm_is_false (prev
))
1914 SCM rest
= SCM_CDR (scm_i_read_hash_procedures_ref ());
1915 scm_i_read_hash_procedures_set_x (rest
);
1918 scm_set_cdr_x (prev
, SCM_CDR (this));
1923 scm_set_cdr_x (SCM_CAR (this), proc
);
1928 this = SCM_CDR (this);
1931 return SCM_UNSPECIFIED
;
1935 /* Recover the read-hash procedure corresponding to char c. */
1937 scm_get_hash_procedure (int c
)
1939 SCM rest
= scm_i_read_hash_procedures_ref ();
1943 if (scm_is_null (rest
))
1946 if (SCM_CHAR (SCM_CAAR (rest
)) == c
)
1947 return SCM_CDAR (rest
);
1949 rest
= SCM_CDR (rest
);
1953 #define SCM_ENCODING_SEARCH_SIZE (500)
1956 is_encoding_char (char c
)
1958 if (c
>= 'a' && c
<= 'z') return 1;
1959 if (c
>= 'A' && c
<= 'Z') return 1;
1960 if (c
>= '0' && c
<= '9') return 1;
1961 return strchr ("_-.:/,+=()", c
) != NULL
;
1964 /* Search the first few hundred characters of a file for an Emacs-like coding
1965 declaration. Returns either NULL or a string whose storage has been
1966 allocated with `scm_gc_malloc ()'. */
1968 scm_i_scan_for_encoding (SCM port
)
1971 char header
[SCM_ENCODING_SEARCH_SIZE
+1];
1972 size_t bytes_read
, encoding_length
, i
;
1973 char *encoding
= NULL
;
1974 char *pos
, *encoding_start
;
1977 pt
= SCM_PTAB_ENTRY (port
);
1979 if (pt
->rw_active
== SCM_PORT_WRITE
)
1980 scm_flush_unlocked (port
);
1983 pt
->rw_active
= SCM_PORT_READ
;
1985 if (pt
->read_pos
== pt
->read_end
)
1987 /* We can use the read buffer, and thus avoid a seek. */
1988 if (scm_fill_input_unlocked (port
) == EOF
)
1991 bytes_read
= pt
->read_end
- pt
->read_pos
;
1992 if (bytes_read
> SCM_ENCODING_SEARCH_SIZE
)
1993 bytes_read
= SCM_ENCODING_SEARCH_SIZE
;
1995 if (bytes_read
<= 1)
1996 /* An unbuffered port -- don't scan. */
1999 memcpy (header
, pt
->read_pos
, bytes_read
);
2000 header
[bytes_read
] = '\0';
2004 /* Try to read some bytes and then seek back. Not all ports
2005 support seeking back; and indeed some file ports (like
2006 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
2007 check performed by SCM_FPORT_FDES---but fail to seek
2008 backwards. Hence this block comes second. We prefer to use
2009 the read buffer in-place. */
2010 if (SCM_FPORTP (port
) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port
)))
2013 bytes_read
= scm_c_read_unlocked (port
, header
, SCM_ENCODING_SEARCH_SIZE
);
2014 header
[bytes_read
] = '\0';
2015 scm_seek (port
, scm_from_int (0), scm_from_int (SEEK_SET
));
2018 /* search past "coding[:=]" */
2022 if ((pos
= strstr(pos
, "coding")) == NULL
)
2025 pos
+= strlen("coding");
2026 if (pos
- header
>= SCM_ENCODING_SEARCH_SIZE
||
2027 (*pos
== ':' || *pos
== '='))
2035 while (pos
- header
<= SCM_ENCODING_SEARCH_SIZE
&&
2036 (*pos
== ' ' || *pos
== '\t'))
2039 /* grab the next token */
2040 encoding_start
= pos
;
2042 while (encoding_start
+ i
- header
<= SCM_ENCODING_SEARCH_SIZE
2043 && encoding_start
+ i
- header
< bytes_read
2044 && is_encoding_char (encoding_start
[i
]))
2047 encoding_length
= i
;
2048 if (encoding_length
== 0)
2051 encoding
= scm_gc_strndup (encoding_start
, encoding_length
, "encoding");
2053 /* push backwards to make sure we were in a comment */
2055 pos
= encoding_start
;
2056 while (pos
>= header
)
2063 else if (*pos
== '\n' || pos
== header
)
2065 /* This wasn't in a semicolon comment. Check for a
2066 hash-bang comment. */
2067 char *beg
= strstr (header
, "#!");
2068 char *end
= strstr (header
, "!#");
2069 if (beg
< encoding_start
&& encoding_start
+ encoding_length
<= end
)
2080 /* This wasn't in a comment */
2086 SCM_DEFINE (scm_file_encoding
, "file-encoding", 1, 0, 0,
2088 "Scans the port for an Emacs-like character coding declaration\n"
2089 "near the top of the contents of a port with random-accessible contents.\n"
2090 "The coding declaration is of the form\n"
2091 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
2093 "Returns a string containing the character encoding of the file\n"
2094 "if a declaration was found, or @code{#f} otherwise.\n")
2095 #define FUNC_NAME s_scm_file_encoding
2100 SCM_VALIDATE_OPINPORT (SCM_ARG1
, port
);
2102 enc
= scm_i_scan_for_encoding (port
);
2107 s_enc
= scm_string_upcase (scm_from_locale_string (enc
));
2116 /* Per-port read options.
2118 We store per-port read options in the 'port-read-options' key of the
2119 port's alist, which is stored in the internal port structure. The
2120 value stored in the alist is a single integer that contains a two-bit
2121 field for each read option.
2123 If a bit field contains READ_OPTION_INHERIT (3), that indicates that
2124 the applicable value should be inherited from the corresponding
2125 global read option. Otherwise, the bit field contains the value of
2126 the read option. For boolean read options that have been set
2127 per-port, the possible values are 0 or 1. If the 'keyword_style'
2128 read option has been set per-port, its possible values are those in
2129 'enum t_keyword_style'. */
2131 /* Key to read options in per-port alists. */
2132 SCM_SYMBOL (sym_port_read_options
, "port-read-options");
2134 /* Offsets of bit fields for each per-port override */
2135 #define READ_OPTION_COPY_SOURCE_P 0
2136 #define READ_OPTION_RECORD_POSITIONS_P 2
2137 #define READ_OPTION_CASE_INSENSITIVE_P 4
2138 #define READ_OPTION_KEYWORD_STYLE 6
2139 #define READ_OPTION_R6RS_ESCAPES_P 8
2140 #define READ_OPTION_SQUARE_BRACKETS_P 10
2141 #define READ_OPTION_HUNGRY_EOL_ESCAPES_P 12
2142 #define READ_OPTION_CURLY_INFIX_P 14
2144 /* The total width in bits of the per-port overrides */
2145 #define READ_OPTIONS_NUM_BITS 16
2147 #define READ_OPTIONS_INHERIT_ALL ((1UL << READ_OPTIONS_NUM_BITS) - 1)
2148 #define READ_OPTIONS_MAX_VALUE READ_OPTIONS_INHERIT_ALL
2150 #define READ_OPTION_MASK 3
2151 #define READ_OPTION_INHERIT 3
2154 set_port_read_option (SCM port
, int option
, int new_value
)
2156 SCM alist
, scm_read_options
;
2157 unsigned int read_options
;
2159 new_value
&= READ_OPTION_MASK
;
2160 alist
= scm_i_port_alist (port
);
2161 scm_read_options
= scm_assq_ref (alist
, sym_port_read_options
);
2162 if (scm_is_unsigned_integer (scm_read_options
, 0, READ_OPTIONS_MAX_VALUE
))
2163 read_options
= scm_to_uint (scm_read_options
);
2165 read_options
= READ_OPTIONS_INHERIT_ALL
;
2166 read_options
&= ~(READ_OPTION_MASK
<< option
);
2167 read_options
|= new_value
<< option
;
2168 scm_read_options
= scm_from_uint (read_options
);
2169 alist
= scm_assq_set_x (alist
, sym_port_read_options
, scm_read_options
);
2170 scm_i_set_port_alist_x (port
, alist
);
2173 /* Set OPTS and PORT's case-insensitivity according to VALUE. */
2175 set_port_case_insensitive_p (SCM port
, scm_t_read_opts
*opts
, int value
)
2178 opts
->case_insensitive_p
= value
;
2179 set_port_read_option (port
, READ_OPTION_CASE_INSENSITIVE_P
, value
);
2182 /* Set OPTS and PORT's square_brackets_p option according to VALUE. */
2184 set_port_square_brackets_p (SCM port
, scm_t_read_opts
*opts
, int value
)
2187 opts
->square_brackets_p
= value
;
2188 set_port_read_option (port
, READ_OPTION_SQUARE_BRACKETS_P
, value
);
2191 /* Set OPTS and PORT's curly_infix_p option according to VALUE. */
2193 set_port_curly_infix_p (SCM port
, scm_t_read_opts
*opts
, int value
)
2196 opts
->curly_infix_p
= value
;
2197 set_port_read_option (port
, READ_OPTION_CURLY_INFIX_P
, value
);
2200 /* Initialize OPTS based on PORT's read options and the global read
2203 init_read_options (SCM port
, scm_t_read_opts
*opts
)
2205 SCM alist
, val
, scm_read_options
;
2206 unsigned int read_options
, x
;
2208 alist
= scm_i_port_alist (port
);
2209 scm_read_options
= scm_assq_ref (alist
, sym_port_read_options
);
2211 if (scm_is_unsigned_integer (scm_read_options
, 0, READ_OPTIONS_MAX_VALUE
))
2212 read_options
= scm_to_uint (scm_read_options
);
2214 read_options
= READ_OPTIONS_INHERIT_ALL
;
2216 x
= READ_OPTION_MASK
& (read_options
>> READ_OPTION_KEYWORD_STYLE
);
2217 if (x
== READ_OPTION_INHERIT
)
2219 val
= SCM_PACK (SCM_KEYWORD_STYLE
);
2220 if (scm_is_eq (val
, scm_keyword_prefix
))
2221 x
= KEYWORD_STYLE_PREFIX
;
2222 else if (scm_is_eq (val
, scm_keyword_postfix
))
2223 x
= KEYWORD_STYLE_POSTFIX
;
2225 x
= KEYWORD_STYLE_HASH_PREFIX
;
2227 opts
->keyword_style
= x
;
2229 #define RESOLVE_BOOLEAN_OPTION(NAME, name) \
2232 x = READ_OPTION_MASK & (read_options >> READ_OPTION_ ## NAME); \
2233 if (x == READ_OPTION_INHERIT) \
2234 x = !!SCM_ ## NAME; \
2239 RESOLVE_BOOLEAN_OPTION (COPY_SOURCE_P
, copy_source_p
);
2240 RESOLVE_BOOLEAN_OPTION (RECORD_POSITIONS_P
, record_positions_p
);
2241 RESOLVE_BOOLEAN_OPTION (CASE_INSENSITIVE_P
, case_insensitive_p
);
2242 RESOLVE_BOOLEAN_OPTION (R6RS_ESCAPES_P
, r6rs_escapes_p
);
2243 RESOLVE_BOOLEAN_OPTION (SQUARE_BRACKETS_P
, square_brackets_p
);
2244 RESOLVE_BOOLEAN_OPTION (HUNGRY_EOL_ESCAPES_P
, hungry_eol_escapes_p
);
2245 RESOLVE_BOOLEAN_OPTION (CURLY_INFIX_P
, curly_infix_p
);
2247 #undef RESOLVE_BOOLEAN_OPTION
2249 opts
->neoteric_p
= 0;
2255 SCM read_hash_procs
;
2257 read_hash_procs
= scm_make_fluid_with_default (SCM_EOL
);
2259 scm_i_read_hash_procedures
=
2260 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs
));
2262 scm_init_opts (scm_read_options
, scm_read_opts
);
2263 #include "libguile/read.x"