1 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
2 * 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
33 #include <c-strcase.h>
35 #include "libguile/_scm.h"
36 #include "libguile/bytevectors.h"
37 #include "libguile/chars.h"
38 #include "libguile/eval.h"
39 #include "libguile/arrays.h"
40 #include "libguile/bitvectors.h"
41 #include "libguile/keywords.h"
42 #include "libguile/alist.h"
43 #include "libguile/srcprop.h"
44 #include "libguile/hashtab.h"
45 #include "libguile/hash.h"
46 #include "libguile/ports.h"
47 #include "libguile/fports.h"
48 #include "libguile/root.h"
49 #include "libguile/strings.h"
50 #include "libguile/strports.h"
51 #include "libguile/vectors.h"
52 #include "libguile/validate.h"
53 #include "libguile/srfi-4.h"
54 #include "libguile/srfi-13.h"
56 #include "libguile/read.h"
57 #include "libguile/private-options.h"
62 SCM_GLOBAL_SYMBOL (scm_sym_dot
, ".");
63 SCM_SYMBOL (scm_keyword_prefix
, "prefix");
64 SCM_SYMBOL (scm_keyword_postfix
, "postfix");
65 SCM_SYMBOL (sym_nil
, "nil");
67 /* SRFI-105 curly infix expression support */
68 SCM_SYMBOL (sym_nfx
, "$nfx$");
69 SCM_SYMBOL (sym_bracket_list
, "$bracket-list$");
70 SCM_SYMBOL (sym_bracket_apply
, "$bracket-apply$");
72 scm_t_option scm_read_opts
[] =
74 { SCM_OPTION_BOOLEAN
, "copy", 0,
75 "Copy source code expressions." },
76 { SCM_OPTION_BOOLEAN
, "positions", 1,
77 "Record positions of source code expressions." },
78 { SCM_OPTION_BOOLEAN
, "case-insensitive", 0,
79 "Convert symbols to lower case."},
80 { SCM_OPTION_SCM
, "keywords", (scm_t_bits
) SCM_BOOL_F_BITS
,
81 "Style of keyword recognition: #f, 'prefix or 'postfix."},
82 { SCM_OPTION_BOOLEAN
, "r6rs-hex-escapes", 0,
83 "Use R6RS variable-length character and string hex escapes."},
84 { SCM_OPTION_BOOLEAN
, "square-brackets", 1,
85 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
86 { SCM_OPTION_BOOLEAN
, "hungry-eol-escapes", 0,
87 "In strings, consume leading whitespace after an escaped end-of-line."},
88 { SCM_OPTION_BOOLEAN
, "curly-infix", 0,
89 "Support SRFI-105 curly infix expressions."},
93 /* Internal read options structure. This is initialized by 'scm_read'
94 from the global and per-port read options, and a pointer is passed
95 down to all helper functions. */
99 KEYWORD_STYLE_HASH_PREFIX
,
100 KEYWORD_STYLE_PREFIX
,
101 KEYWORD_STYLE_POSTFIX
106 enum t_keyword_style keyword_style
;
107 unsigned int copy_source_p
: 1;
108 unsigned int record_positions_p
: 1;
109 unsigned int case_insensitive_p
: 1;
110 unsigned int r6rs_escapes_p
: 1;
111 unsigned int square_brackets_p
: 1;
112 unsigned int hungry_eol_escapes_p
: 1;
113 unsigned int curly_infix_p
: 1;
114 unsigned int neoteric_p
: 1;
117 typedef struct t_read_opts scm_t_read_opts
;
121 Give meaningful error messages for errors
125 FILE:LINE:COL: MESSAGE
126 This happened in ....
128 This is not standard GNU format, but the test-suite likes the real
129 message to be in front.
135 scm_i_input_error (char const *function
,
136 SCM port
, const char *message
, SCM arg
)
138 SCM fn
= (scm_is_string (SCM_FILENAME(port
))
140 : scm_from_locale_string ("#<unknown port>"));
142 SCM string_port
= scm_open_output_string ();
143 SCM string
= SCM_EOL
;
144 scm_simple_format (string_port
,
145 scm_from_locale_string ("~A:~S:~S: ~A"),
147 scm_from_long (SCM_LINUM (port
) + 1),
148 scm_from_int (SCM_COL (port
) + 1),
149 scm_from_locale_string (message
)));
151 string
= scm_get_output_string (string_port
);
152 scm_close_output_port (string_port
);
153 scm_error_scm (scm_from_latin1_symbol ("read-error"),
154 function
? scm_from_locale_string (function
) : SCM_BOOL_F
,
161 SCM_DEFINE (scm_read_options
, "read-options-interface", 0, 1, 0,
163 "Option interface for the read options. Instead of using\n"
164 "this procedure directly, use the procedures @code{read-enable},\n"
165 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
166 #define FUNC_NAME s_scm_read_options
168 SCM ans
= scm_options (setting
,
171 if (SCM_COPY_SOURCE_P
)
172 SCM_RECORD_POSITIONS_P
= 1;
177 /* A fluid referring to an association list mapping extra hash
178 characters to procedures. */
179 static SCM
*scm_i_read_hash_procedures
;
182 scm_i_read_hash_procedures_ref (void)
184 return scm_fluid_ref (*scm_i_read_hash_procedures
);
188 scm_i_read_hash_procedures_set_x (SCM value
)
190 scm_fluid_set_x (*scm_i_read_hash_procedures
, value
);
197 /* Size of the C buffer used to read symbols and numbers. */
198 #define READER_BUFFER_SIZE 128
200 /* Number of 32-bit codepoints in the buffer used to read strings. */
201 #define READER_STRING_BUFFER_SIZE 128
203 /* The maximum size of Scheme character names. */
204 #define READER_CHAR_NAME_MAX_SIZE 50
206 /* The maximum size of reader directive names. */
207 #define READER_DIRECTIVE_NAME_MAX_SIZE 50
210 /* `isblank' is only in C99. */
211 #define CHAR_IS_BLANK_(_chr) \
212 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
213 || ((_chr) == '\f') || ((_chr) == '\r'))
216 # define CHAR_IS_BLANK(_chr) \
217 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
219 # define CHAR_IS_BLANK CHAR_IS_BLANK_
223 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
225 #define CHAR_IS_R5RS_DELIMITER(c) \
227 || (c) == ')' || (c) == '(' || (c) == ';' || (c) == '"')
229 #define CHAR_IS_DELIMITER(c) \
230 (CHAR_IS_R5RS_DELIMITER (c) \
231 || (((c) == ']' || (c) == '[') && (opts->square_brackets_p \
232 || opts->curly_infix_p)) \
233 || (((c) == '}' || (c) == '{') && opts->curly_infix_p))
235 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
237 #define CHAR_IS_EXPONENT_MARKER(_chr) \
238 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
239 || ((_chr) == 'd') || ((_chr) == 'l'))
241 /* Read an SCSH block comment. */
242 static SCM
scm_read_scsh_block_comment (scm_t_wchar
, SCM
);
243 static SCM
scm_read_r6rs_block_comment (scm_t_wchar
, SCM
);
244 static SCM
scm_read_commented_expression (scm_t_wchar
, SCM
, scm_t_read_opts
*);
245 static SCM
scm_read_shebang (scm_t_wchar
, SCM
, scm_t_read_opts
*);
246 static SCM
scm_get_hash_procedure (int);
248 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
249 result in the pre-allocated buffer BUF. Return zero if the whole token has
250 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
251 bytes actually read. */
253 read_token (SCM port
, scm_t_read_opts
*opts
,
254 char *buf
, size_t buf_size
, size_t *read
)
258 while (*read
< buf_size
)
262 chr
= scm_get_byte_or_eof (port
);
266 else if (CHAR_IS_DELIMITER (chr
))
268 scm_unget_byte (chr
, port
);
281 /* Like `read_token', but return either BUFFER, or a GC-allocated buffer
282 if the token doesn't fit in BUFFER_SIZE bytes. */
284 read_complete_token (SCM port
, scm_t_read_opts
*opts
,
285 char *buffer
, size_t buffer_size
, size_t *read
)
288 size_t bytes_read
, overflow_size
= 0;
289 char *overflow_buffer
= NULL
;
293 overflow
= read_token (port
, opts
, buffer
, buffer_size
, &bytes_read
);
296 if (overflow
|| overflow_size
!= 0)
298 if (overflow_size
== 0)
300 overflow_buffer
= scm_gc_malloc_pointerless (bytes_read
, "read");
301 memcpy (overflow_buffer
, buffer
, bytes_read
);
302 overflow_size
= bytes_read
;
307 scm_gc_malloc_pointerless (overflow_size
+ bytes_read
, "read");
309 memcpy (new_buf
, overflow_buffer
, overflow_size
);
310 memcpy (new_buf
+ overflow_size
, buffer
, bytes_read
);
312 overflow_buffer
= new_buf
;
313 overflow_size
+= bytes_read
;
320 *read
= overflow_size
;
324 return (overflow_size
> 0 ? overflow_buffer
: buffer
);
327 /* Skip whitespace from PORT and return the first non-whitespace character
328 read. Raise an error on end-of-file. */
330 flush_ws (SCM port
, scm_t_read_opts
*opts
, const char *eoferr
)
334 switch (c
= scm_getc (port
))
340 scm_i_input_error (eoferr
,
349 switch (c
= scm_getc (port
))
355 case SCM_LINE_INCREMENTORS
:
361 switch (c
= scm_getc (port
))
364 eoferr
= "read_sharp";
367 scm_read_shebang (c
, port
, opts
);
370 scm_read_commented_expression (c
, port
, opts
);
373 if (scm_is_false (scm_get_hash_procedure (c
)))
375 scm_read_r6rs_block_comment (c
, port
);
380 scm_ungetc (c
, port
);
385 case SCM_LINE_INCREMENTORS
:
386 case SCM_SINGLE_SPACES
:
401 static SCM
scm_read_expression (SCM port
, scm_t_read_opts
*opts
);
402 static SCM
scm_read_sharp (int chr
, SCM port
, scm_t_read_opts
*opts
,
403 long line
, int column
);
407 maybe_annotate_source (SCM x
, SCM port
, scm_t_read_opts
*opts
,
408 long line
, int column
)
410 if (opts
->record_positions_p
)
411 scm_i_set_source_properties_x (x
, line
, column
, SCM_FILENAME (port
));
416 scm_read_sexp (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
417 #define FUNC_NAME "scm_i_lreadparen"
420 SCM tmp
, tl
, ans
= SCM_EOL
;
421 const int curly_list_p
= (chr
== '{') && opts
->curly_infix_p
;
422 const int terminating_char
= ((chr
== '{') ? '}'
423 : ((chr
== '[') ? ']'
426 /* Need to capture line and column numbers here. */
427 long line
= SCM_LINUM (port
);
428 int column
= SCM_COL (port
) - 1;
430 c
= flush_ws (port
, opts
, FUNC_NAME
);
431 if (terminating_char
== c
)
434 scm_ungetc (c
, port
);
435 tmp
= scm_read_expression (port
, opts
);
437 /* Note that it is possible for scm_read_expression to return
438 scm_sym_dot, but not as part of a dotted pair: as in #{.}#. So
439 check that it's a real dot by checking `c'. */
440 if (c
== '.' && scm_is_eq (scm_sym_dot
, tmp
))
442 ans
= scm_read_expression (port
, opts
);
443 if (terminating_char
!= (c
= flush_ws (port
, opts
, FUNC_NAME
)))
444 scm_i_input_error (FUNC_NAME
, port
, "missing close paren",
449 /* Build the head of the list structure. */
450 ans
= tl
= scm_cons (tmp
, SCM_EOL
);
452 while (terminating_char
!= (c
= flush_ws (port
, opts
, FUNC_NAME
)))
456 if (c
== ')' || (c
== ']' && opts
->square_brackets_p
)
457 || ((c
== '}' || c
== ']') && opts
->curly_infix_p
))
458 scm_i_input_error (FUNC_NAME
, port
,
459 "in pair: mismatched close paren: ~A",
460 scm_list_1 (SCM_MAKE_CHAR (c
)));
462 scm_ungetc (c
, port
);
463 tmp
= scm_read_expression (port
, opts
);
465 /* See above note about scm_sym_dot. */
466 if (c
== '.' && scm_is_eq (scm_sym_dot
, tmp
))
468 SCM_SETCDR (tl
, scm_read_expression (port
, opts
));
470 c
= flush_ws (port
, opts
, FUNC_NAME
);
471 if (terminating_char
!= c
)
472 scm_i_input_error (FUNC_NAME
, port
,
473 "in pair: missing close paren", SCM_EOL
);
477 new_tail
= scm_cons (tmp
, SCM_EOL
);
478 SCM_SETCDR (tl
, new_tail
);
484 /* In addition to finding the length, 'scm_ilength' checks for
485 improper or circular lists, in which case it returns -1. */
486 int len
= scm_ilength (ans
);
488 /* The (len == 0) case is handled above */
490 /* Return directly to avoid re-annotating the element's source
491 location with the position of the outer brace. Also, it
492 might not be possible to annotate the element. */
493 return scm_car (ans
); /* {e} => e */
495 ; /* Leave the list unchanged: {e1 e2} => (e1 e2) */
496 else if (len
>= 3 && (len
& 1))
498 /* It's a proper list whose length is odd and at least 3. If
499 the elements at odd indices (the infix operator positions)
500 are all 'equal?', then it's a simple curly-infix list.
501 Otherwise it's a mixed curly-infix list. */
502 SCM op
= scm_cadr (ans
);
504 /* Check to see if the elements at odd indices are 'equal?' */
505 for (tl
= scm_cdddr (ans
); ; tl
= scm_cddr (tl
))
507 if (scm_is_null (tl
))
509 /* Convert simple curly-infix list to prefix:
510 {a <op> b <op> ...} => (<op> a b ...) */
512 while (scm_is_pair (scm_cdr (tl
)))
515 SCM_SETCDR (tl
, tmp
);
518 ans
= scm_cons (op
, ans
);
521 else if (scm_is_false (scm_equal_p (op
, scm_car (tl
))))
523 /* Mixed curly-infix list: {e ...} => ($nfx$ e ...) */
524 ans
= scm_cons (sym_nfx
, ans
);
530 /* Mixed curly-infix (possibly improper) list:
531 {e . tail} => ($nfx$ e . tail) */
532 ans
= scm_cons (sym_nfx
, ans
);
535 return maybe_annotate_source (ans
, port
, opts
, line
, column
);
540 /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
541 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
543 #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
549 while (i < ndigits) \
551 a = scm_getc (port); \
555 && (a == (scm_t_wchar) terminator) \
558 if ('0' <= a && a <= '9') \
560 else if ('A' <= a && a <= 'F') \
562 else if ('a' <= a && a <= 'f') \
575 skip_intraline_whitespace (SCM port
)
585 while (c
== '\t' || uc_is_general_category (c
, UC_SPACE_SEPARATOR
));
587 scm_ungetc (c
, port
);
591 scm_read_string (int chr
, SCM port
, scm_t_read_opts
*opts
)
592 #define FUNC_NAME "scm_lreadr"
594 /* For strings smaller than C_STR, this function creates only one Scheme
595 object (the string returned). */
598 size_t c_str_len
= 0;
599 scm_t_wchar c
, c_str
[READER_STRING_BUFFER_SIZE
];
601 /* Need to capture line and column numbers here. */
602 long line
= SCM_LINUM (port
);
603 int column
= SCM_COL (port
) - 1;
605 while ('"' != (c
= scm_getc (port
)))
610 scm_i_input_error (FUNC_NAME
, port
,
611 "end of file in string constant", SCM_EOL
);
614 if (c_str_len
+ 1 >= READER_STRING_BUFFER_SIZE
)
616 str
= scm_cons (scm_from_utf32_stringn (c_str
, c_str_len
), str
);
622 switch (c
= scm_getc (port
))
630 if (opts
->hungry_eol_escapes_p
)
631 skip_intraline_whitespace (port
);
658 if (opts
->r6rs_escapes_p
)
659 SCM_READ_HEX_ESCAPE (10, ';');
661 SCM_READ_HEX_ESCAPE (2, '\0');
664 if (!opts
->r6rs_escapes_p
)
666 SCM_READ_HEX_ESCAPE (4, '\0');
670 if (!opts
->r6rs_escapes_p
)
672 SCM_READ_HEX_ESCAPE (6, '\0');
677 scm_i_input_error (FUNC_NAME
, port
,
678 "illegal character in escape sequence: ~S",
679 scm_list_1 (SCM_MAKE_CHAR (c
)));
683 c_str
[c_str_len
++] = c
;
686 if (scm_is_null (str
))
687 /* Fast path: we got a string that fits in C_STR. */
688 str
= scm_from_utf32_stringn (c_str
, c_str_len
);
692 str
= scm_cons (scm_from_utf32_stringn (c_str
, c_str_len
), str
);
694 str
= scm_string_concatenate_reverse (str
, SCM_UNDEFINED
, SCM_UNDEFINED
);
697 return maybe_annotate_source (str
, port
, opts
, line
, column
);
703 scm_read_number (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
705 SCM result
, str
= SCM_EOL
;
706 char local_buffer
[READER_BUFFER_SIZE
], *buffer
;
708 scm_t_port
*pt
= SCM_PTAB_ENTRY (port
);
710 /* Need to capture line and column numbers here. */
711 long line
= SCM_LINUM (port
);
712 int column
= SCM_COL (port
) - 1;
714 scm_ungetc (chr
, port
);
715 buffer
= read_complete_token (port
, opts
, local_buffer
, sizeof local_buffer
,
718 str
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
, pt
->ilseq_handler
);
720 result
= scm_string_to_number (str
, SCM_UNDEFINED
);
721 if (scm_is_false (result
))
723 /* Return a symbol instead of a number */
724 if (opts
->case_insensitive_p
)
725 str
= scm_string_downcase_x (str
);
726 result
= scm_string_to_symbol (str
);
728 else if (SCM_NIMP (result
))
729 result
= maybe_annotate_source (result
, port
, opts
, line
, column
);
731 SCM_COL (port
) += scm_i_string_length (str
);
736 scm_read_mixed_case_symbol (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
739 int ends_with_colon
= 0;
741 int postfix
= (opts
->keyword_style
== KEYWORD_STYLE_POSTFIX
);
742 char local_buffer
[READER_BUFFER_SIZE
], *buffer
;
743 scm_t_port
*pt
= SCM_PTAB_ENTRY (port
);
746 scm_ungetc (chr
, port
);
747 buffer
= read_complete_token (port
, opts
, local_buffer
, sizeof local_buffer
,
750 ends_with_colon
= buffer
[bytes_read
- 1] == ':';
752 if (postfix
&& ends_with_colon
&& (bytes_read
> 1))
754 str
= scm_from_stringn (buffer
, bytes_read
- 1,
755 pt
->encoding
, pt
->ilseq_handler
);
757 if (opts
->case_insensitive_p
)
758 str
= scm_string_downcase_x (str
);
759 result
= scm_symbol_to_keyword (scm_string_to_symbol (str
));
763 str
= scm_from_stringn (buffer
, bytes_read
,
764 pt
->encoding
, pt
->ilseq_handler
);
766 if (opts
->case_insensitive_p
)
767 str
= scm_string_downcase_x (str
);
768 result
= scm_string_to_symbol (str
);
771 SCM_COL (port
) += scm_i_string_length (str
);
776 scm_read_number_and_radix (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
777 #define FUNC_NAME "scm_lreadr"
781 char local_buffer
[READER_BUFFER_SIZE
], *buffer
;
809 scm_ungetc (chr
, port
);
810 scm_ungetc ('#', port
);
814 buffer
= read_complete_token (port
, opts
, local_buffer
, sizeof local_buffer
,
817 pt
= SCM_PTAB_ENTRY (port
);
818 str
= scm_from_stringn (buffer
, read
, pt
->encoding
, pt
->ilseq_handler
);
820 result
= scm_string_to_number (str
, scm_from_uint (radix
));
822 SCM_COL (port
) += scm_i_string_length (str
);
824 if (scm_is_true (result
))
827 scm_i_input_error (FUNC_NAME
, port
, "unknown # object", SCM_EOL
);
834 scm_read_quote (int chr
, SCM port
, scm_t_read_opts
*opts
)
837 long line
= SCM_LINUM (port
);
838 int column
= SCM_COL (port
) - 1;
843 p
= scm_sym_quasiquote
;
856 p
= scm_sym_uq_splicing
;
859 scm_ungetc (c
, port
);
866 fprintf (stderr
, "%s: unhandled quote character (%i)\n",
867 "scm_read_quote", chr
);
871 p
= scm_cons2 (p
, scm_read_expression (port
, opts
), SCM_EOL
);
872 return maybe_annotate_source (p
, port
, opts
, line
, column
);
875 SCM_SYMBOL (sym_syntax
, "syntax");
876 SCM_SYMBOL (sym_quasisyntax
, "quasisyntax");
877 SCM_SYMBOL (sym_unsyntax
, "unsyntax");
878 SCM_SYMBOL (sym_unsyntax_splicing
, "unsyntax-splicing");
881 scm_read_syntax (int chr
, SCM port
, scm_t_read_opts
*opts
)
884 long line
= SCM_LINUM (port
);
885 int column
= SCM_COL (port
) - 1;
903 p
= sym_unsyntax_splicing
;
906 scm_ungetc (c
, port
);
913 fprintf (stderr
, "%s: unhandled syntax character (%i)\n",
914 "scm_read_syntax", chr
);
918 p
= scm_cons2 (p
, scm_read_expression (port
, opts
), SCM_EOL
);
919 return maybe_annotate_source (p
, port
, opts
, line
, column
);
923 scm_read_nil (int chr
, SCM port
, scm_t_read_opts
*opts
)
925 SCM id
= scm_read_mixed_case_symbol (chr
, port
, opts
);
927 if (!scm_is_eq (id
, sym_nil
))
928 scm_i_input_error ("scm_read_nil", port
,
929 "unexpected input while reading #nil: ~a",
932 return SCM_ELISP_NIL
;
936 scm_read_semicolon_comment (int chr
, SCM port
)
940 /* We use the get_byte here because there is no need to get the
941 locale correct with comment input. This presumes that newline
942 always represents itself no matter what the encoding is. */
943 for (c
= scm_get_byte_or_eof (port
);
944 (c
!= EOF
) && (c
!= '\n');
945 c
= scm_get_byte_or_eof (port
));
947 return SCM_UNSPECIFIED
;
951 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
954 scm_read_boolean (int chr
, SCM port
)
967 return SCM_UNSPECIFIED
;
971 scm_read_character (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
972 #define FUNC_NAME "scm_lreadr"
974 char buffer
[READER_CHAR_NAME_MAX_SIZE
];
976 size_t charname_len
, bytes_read
;
981 overflow
= read_token (port
, opts
, buffer
, READER_CHAR_NAME_MAX_SIZE
,
984 scm_i_input_error (FUNC_NAME
, port
, "character name too long", SCM_EOL
);
988 chr
= scm_getc (port
);
990 scm_i_input_error (FUNC_NAME
, port
, "unexpected end of file "
991 "while reading character", SCM_EOL
);
993 /* CHR must be a token delimiter, like a whitespace. */
994 return (SCM_MAKE_CHAR (chr
));
997 pt
= SCM_PTAB_ENTRY (port
);
999 /* Simple ASCII characters can be processed immediately. Also, simple
1000 ISO-8859-1 characters can be processed immediately if the encoding for this
1001 port is ISO-8859-1. */
1002 if (bytes_read
== 1 && ((unsigned char) buffer
[0] <= 127 || pt
->encoding
== NULL
))
1004 SCM_COL (port
) += 1;
1005 return SCM_MAKE_CHAR (buffer
[0]);
1008 /* Otherwise, convert the buffer into a proper scheme string for
1010 charname
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
,
1012 charname_len
= scm_i_string_length (charname
);
1013 SCM_COL (port
) += charname_len
;
1014 cp
= scm_i_string_ref (charname
, 0);
1015 if (charname_len
== 1)
1016 return SCM_MAKE_CHAR (cp
);
1018 /* Ignore dotted circles, which may be used to keep combining characters from
1019 combining with the backslash in #\charname. */
1020 if (cp
== SCM_CODEPOINT_DOTTED_CIRCLE
&& charname_len
== 2)
1021 return SCM_MAKE_CHAR (scm_i_string_ref (charname
, 1));
1023 if (cp
>= '0' && cp
< '8')
1025 /* Dirk:FIXME:: This type of character syntax is not R5RS
1026 * compliant. Further, it should be verified that the constant
1027 * does only consist of octal digits. */
1028 SCM p
= scm_string_to_number (charname
, scm_from_uint (8));
1029 if (SCM_I_INUMP (p
))
1031 scm_t_wchar c
= scm_to_uint32 (p
);
1032 if (SCM_IS_UNICODE_CHAR (c
))
1033 return SCM_MAKE_CHAR (c
);
1035 scm_i_input_error (FUNC_NAME
, port
,
1036 "out-of-range octal character escape: ~a",
1037 scm_list_1 (charname
));
1041 if (cp
== 'x' && (charname_len
> 1))
1045 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1046 p
= scm_string_to_number (scm_c_substring (charname
, 1, charname_len
),
1047 scm_from_uint (16));
1048 if (SCM_I_INUMP (p
))
1050 scm_t_wchar c
= scm_to_uint32 (p
);
1051 if (SCM_IS_UNICODE_CHAR (c
))
1052 return SCM_MAKE_CHAR (c
);
1054 scm_i_input_error (FUNC_NAME
, port
,
1055 "out-of-range hex character escape: ~a",
1056 scm_list_1 (charname
));
1060 /* The names of characters should never have non-Latin1
1062 if (scm_i_is_narrow_string (charname
)
1063 || scm_i_try_narrow_string (charname
))
1064 { SCM ch
= scm_i_charname_to_char (scm_i_string_chars (charname
),
1066 if (scm_is_true (ch
))
1070 scm_i_input_error (FUNC_NAME
, port
, "unknown character name ~a",
1071 scm_list_1 (charname
));
1073 return SCM_UNSPECIFIED
;
1078 scm_read_keyword (int chr
, SCM port
, scm_t_read_opts
*opts
)
1082 /* Read the symbol that comprises the keyword. Doing this instead of
1083 invoking a specific symbol reader function allows `scm_read_keyword ()'
1084 to adapt to the delimiters currently valid of symbols.
1086 XXX: This implementation allows sloppy syntaxes like `#: key'. */
1087 symbol
= scm_read_expression (port
, opts
);
1088 if (!scm_is_symbol (symbol
))
1089 scm_i_input_error ("scm_read_keyword", port
,
1090 "keyword prefix `~a' not followed by a symbol: ~s",
1091 scm_list_2 (SCM_MAKE_CHAR (chr
), symbol
));
1093 return (scm_symbol_to_keyword (symbol
));
1097 scm_read_vector (int chr
, SCM port
, scm_t_read_opts
*opts
,
1098 long line
, int column
)
1100 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1101 guarantee that it's going to do what we want. After all, this is an
1102 implementation detail of `scm_read_vector ()', not a desirable
1104 return maybe_annotate_source (scm_vector (scm_read_sexp (chr
, port
, opts
)),
1105 port
, opts
, line
, column
);
1108 /* Helper used by scm_read_array */
1110 read_decimal_integer (SCM port
, int c
, ssize_t
*resp
)
1119 c
= scm_getc (port
);
1122 while ('0' <= c
&& c
<= '9')
1124 res
= 10*res
+ c
-'0';
1126 c
= scm_getc (port
);
1134 /* Read an array. This function can also read vectors and uniform
1135 vectors. Also, the conflict between '#f' and '#f32' and '#f64' is
1138 C is the first character read after the '#'. */
1140 scm_read_array (int c
, SCM port
, scm_t_read_opts
*opts
, long line
, int column
)
1143 scm_t_wchar tag_buf
[8];
1146 SCM tag
, shape
= SCM_BOOL_F
, elements
, array
;
1148 /* XXX - shortcut for ordinary vectors. Shouldn't be necessary but
1149 the array code can not deal with zero-length dimensions yet, and
1150 we want to allow zero-length vectors, of course. */
1152 return scm_read_vector (c
, port
, opts
, line
, column
);
1154 /* Disambiguate between '#f' and uniform floating point vectors. */
1157 c
= scm_getc (port
);
1158 if (c
!= '3' && c
!= '6')
1161 scm_ungetc (c
, port
);
1167 goto continue_reading_tag
;
1172 c
= read_decimal_integer (port
, c
, &rank
);
1174 scm_i_input_error (NULL
, port
, "array rank must be non-negative",
1179 continue_reading_tag
:
1180 while (c
!= EOF
&& c
!= '(' && c
!= '@' && c
!= ':'
1181 && tag_len
< sizeof tag_buf
/ sizeof tag_buf
[0])
1183 tag_buf
[tag_len
++] = c
;
1184 c
= scm_getc (port
);
1190 tag
= scm_string_to_symbol (scm_from_utf32_stringn (tag_buf
, tag_len
));
1191 if (tag_len
== sizeof tag_buf
/ sizeof tag_buf
[0])
1192 scm_i_input_error (NULL
, port
, "invalid array tag, starting with: ~a",
1197 if (c
== '@' || c
== ':')
1203 ssize_t lbnd
= 0, len
= 0;
1208 c
= scm_getc (port
);
1209 c
= read_decimal_integer (port
, c
, &lbnd
);
1212 s
= scm_from_ssize_t (lbnd
);
1216 c
= scm_getc (port
);
1217 c
= read_decimal_integer (port
, c
, &len
);
1219 scm_i_input_error (NULL
, port
,
1220 "array length must be non-negative",
1223 s
= scm_list_2 (s
, scm_from_ssize_t (lbnd
+len
-1));
1226 shape
= scm_cons (s
, shape
);
1227 } while (c
== '@' || c
== ':');
1229 shape
= scm_reverse_x (shape
, SCM_EOL
);
1232 /* Read nested lists of elements. */
1234 scm_i_input_error (NULL
, port
,
1235 "missing '(' in vector or array literal",
1237 elements
= scm_read_sexp (c
, port
, opts
);
1239 if (scm_is_false (shape
))
1240 shape
= scm_from_ssize_t (rank
);
1241 else if (scm_ilength (shape
) != rank
)
1244 "the number of shape specifications must match the array rank",
1247 /* Handle special print syntax of rank zero arrays; see
1248 scm_i_print_array for a rationale. */
1251 if (!scm_is_pair (elements
))
1252 scm_i_input_error (NULL
, port
,
1253 "too few elements in array literal, need 1",
1255 if (!scm_is_null (SCM_CDR (elements
)))
1256 scm_i_input_error (NULL
, port
,
1257 "too many elements in array literal, want 1",
1259 elements
= SCM_CAR (elements
);
1262 /* Construct array, annotate with source location, and return. */
1263 array
= scm_list_to_typed_array (tag
, shape
, elements
);
1264 return maybe_annotate_source (array
, port
, opts
, line
, column
);
1268 scm_read_srfi4_vector (int chr
, SCM port
, scm_t_read_opts
*opts
,
1269 long line
, int column
)
1271 return scm_read_array (chr
, port
, opts
, line
, column
);
1275 scm_read_bytevector (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
,
1276 long line
, int column
)
1278 chr
= scm_getc (port
);
1282 chr
= scm_getc (port
);
1286 chr
= scm_getc (port
);
1290 return maybe_annotate_source
1291 (scm_u8_list_to_bytevector (scm_read_sexp (chr
, port
, opts
)),
1292 port
, opts
, line
, column
);
1295 scm_i_input_error ("read_bytevector", port
,
1296 "invalid bytevector prefix",
1297 SCM_MAKE_CHAR (chr
));
1298 return SCM_UNSPECIFIED
;
1302 scm_read_guile_bit_vector (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
,
1303 long line
, int column
)
1305 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1306 terribly inefficient but who cares? */
1307 SCM s_bits
= SCM_EOL
;
1309 for (chr
= scm_getc (port
);
1310 (chr
!= EOF
) && ((chr
== '0') || (chr
== '1'));
1311 chr
= scm_getc (port
))
1313 s_bits
= scm_cons ((chr
== '0') ? SCM_BOOL_F
: SCM_BOOL_T
, s_bits
);
1317 scm_ungetc (chr
, port
);
1319 return maybe_annotate_source
1320 (scm_bitvector (scm_reverse_x (s_bits
, SCM_EOL
)),
1321 port
, opts
, line
, column
);
1325 scm_read_scsh_block_comment (scm_t_wchar chr
, SCM port
)
1331 int c
= scm_getc (port
);
1334 scm_i_input_error ("skip_block_comment", port
,
1335 "unterminated `#! ... !#' comment", SCM_EOL
);
1339 else if (c
== '#' && bang_seen
)
1345 return SCM_UNSPECIFIED
;
1348 static void set_port_case_insensitive_p (SCM port
, scm_t_read_opts
*opts
,
1350 static void set_port_square_brackets_p (SCM port
, scm_t_read_opts
*opts
,
1352 static void set_port_curly_infix_p (SCM port
, scm_t_read_opts
*opts
,
1356 scm_read_shebang (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
1358 char name
[READER_DIRECTIVE_NAME_MAX_SIZE
+ 1];
1362 while (i
<= READER_DIRECTIVE_NAME_MAX_SIZE
)
1364 c
= scm_getc (port
);
1366 scm_i_input_error ("skip_block_comment", port
,
1367 "unterminated `#! ... !#' comment", SCM_EOL
);
1368 else if (('a' <= c
&& c
<= 'z') || ('0' <= c
&& c
<= '9') || c
== '-')
1370 else if (CHAR_IS_DELIMITER (c
))
1372 scm_ungetc (c
, port
);
1374 if (0 == strcmp ("r6rs", name
))
1375 ; /* Silently ignore */
1376 else if (0 == strcmp ("fold-case", name
))
1377 set_port_case_insensitive_p (port
, opts
, 1);
1378 else if (0 == strcmp ("no-fold-case", name
))
1379 set_port_case_insensitive_p (port
, opts
, 0);
1380 else if (0 == strcmp ("curly-infix", name
))
1381 set_port_curly_infix_p (port
, opts
, 1);
1382 else if (0 == strcmp ("curly-infix-and-bracket-lists", name
))
1384 set_port_curly_infix_p (port
, opts
, 1);
1385 set_port_square_brackets_p (port
, opts
, 0);
1390 return SCM_UNSPECIFIED
;
1394 scm_ungetc (c
, port
);
1399 scm_ungetc (name
[--i
], port
);
1400 return scm_read_scsh_block_comment (chr
, port
);
1404 scm_read_r6rs_block_comment (scm_t_wchar chr
, SCM port
)
1406 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1407 nested. So care must be taken. */
1408 int nesting_level
= 1;
1410 int a
= scm_getc (port
);
1413 scm_i_input_error ("scm_read_r6rs_block_comment", port
,
1414 "unterminated `#| ... |#' comment", SCM_EOL
);
1416 while (nesting_level
> 0)
1418 int b
= scm_getc (port
);
1421 scm_i_input_error ("scm_read_r6rs_block_comment", port
,
1422 "unterminated `#| ... |#' comment", SCM_EOL
);
1424 if (a
== '|' && b
== '#')
1429 else if (a
== '#' && b
== '|')
1438 return SCM_UNSPECIFIED
;
1442 scm_read_commented_expression (scm_t_wchar chr
, SCM port
,
1443 scm_t_read_opts
*opts
)
1447 c
= flush_ws (port
, opts
, (char *) NULL
);
1449 scm_i_input_error ("read_commented_expression", port
,
1450 "no expression after #; comment", SCM_EOL
);
1451 scm_ungetc (c
, port
);
1452 scm_read_expression (port
, opts
);
1453 return SCM_UNSPECIFIED
;
1457 scm_read_extended_symbol (scm_t_wchar chr
, SCM port
)
1459 /* Guile's extended symbol read syntax looks like this:
1461 #{This is all a symbol name}#
1463 So here, CHR is expected to be `{'. */
1466 SCM buf
= scm_i_make_string (1024, NULL
, 0);
1468 buf
= scm_i_string_start_writing (buf
);
1470 while ((chr
= scm_getc (port
)) != EOF
)
1481 scm_i_string_set_x (buf
, len
++, '}');
1487 else if (chr
== '\\')
1489 /* It used to be that print.c would print extended-read-syntax
1490 symbols with backslashes before "non-standard" chars, but
1491 this routine wouldn't do anything with those escapes.
1492 Bummer. What we've done is to change print.c to output
1493 R6RS hex escapes for those characters, relying on the fact
1494 that the extended read syntax would never put a `\' before
1495 an `x'. For now, we just ignore other instances of
1496 backslash in the string. */
1497 switch ((chr
= scm_getc (port
)))
1505 SCM_READ_HEX_ESCAPE (10, ';');
1506 scm_i_string_set_x (buf
, len
++, c
);
1514 scm_i_string_stop_writing ();
1515 scm_i_input_error ("scm_read_extended_symbol", port
,
1516 "illegal character in escape sequence: ~S",
1517 scm_list_1 (SCM_MAKE_CHAR (c
)));
1521 scm_i_string_set_x (buf
, len
++, chr
);
1526 scm_i_string_set_x (buf
, len
++, chr
);
1528 if (len
>= scm_i_string_length (buf
) - 2)
1532 scm_i_string_stop_writing ();
1533 addy
= scm_i_make_string (1024, NULL
, 0);
1534 buf
= scm_string_append (scm_list_2 (buf
, addy
));
1536 buf
= scm_i_string_start_writing (buf
);
1541 scm_i_string_stop_writing ();
1543 scm_i_input_error ("scm_read_extended_symbol", port
,
1544 "end of file while reading symbol", SCM_EOL
);
1546 return (scm_string_to_symbol (scm_c_substring (buf
, 0, len
)));
1551 /* Top-level token readers, i.e., dispatchers. */
1554 scm_read_sharp_extension (int chr
, SCM port
, scm_t_read_opts
*opts
)
1558 proc
= scm_get_hash_procedure (chr
);
1559 if (scm_is_true (scm_procedure_p (proc
)))
1561 long line
= SCM_LINUM (port
);
1562 int column
= SCM_COL (port
) - 2;
1565 got
= scm_call_2 (proc
, SCM_MAKE_CHAR (chr
), port
);
1567 if (opts
->record_positions_p
&& SCM_NIMP (got
)
1568 && !scm_i_has_source_properties (got
))
1569 scm_i_set_source_properties_x (got
, line
, column
, SCM_FILENAME (port
));
1574 return SCM_UNSPECIFIED
;
1577 /* The reader for the sharp `#' character. It basically dispatches reads
1578 among the above token readers. */
1580 scm_read_sharp (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
,
1581 long line
, int column
)
1582 #define FUNC_NAME "scm_lreadr"
1586 chr
= scm_getc (port
);
1588 result
= scm_read_sharp_extension (chr
, port
, opts
);
1589 if (!scm_is_eq (result
, SCM_UNSPECIFIED
))
1595 return (scm_read_character (chr
, port
, opts
));
1597 return (scm_read_vector (chr
, port
, opts
, line
, column
));
1602 /* This one may return either a boolean or an SRFI-4 vector. */
1603 return (scm_read_srfi4_vector (chr
, port
, opts
, line
, column
));
1605 return (scm_read_bytevector (chr
, port
, opts
, line
, column
));
1607 return (scm_read_guile_bit_vector (chr
, port
, opts
, line
, column
));
1611 return (scm_read_boolean (chr
, port
));
1613 return (scm_read_keyword (chr
, port
, opts
));
1614 case '0': case '1': case '2': case '3': case '4':
1615 case '5': case '6': case '7': case '8': case '9':
1617 #if SCM_ENABLE_DEPRECATED
1618 /* See below for 'i' and 'e'. */
1624 return (scm_read_array (chr
, port
, opts
, line
, column
));
1628 #if SCM_ENABLE_DEPRECATED
1630 /* When next char is '(', it really is an old-style
1632 scm_t_wchar next_c
= scm_getc (port
);
1634 scm_ungetc (next_c
, port
);
1636 return scm_read_array (chr
, port
, opts
, line
, column
);
1650 return (scm_read_number_and_radix (chr
, port
, opts
));
1652 return (scm_read_extended_symbol (chr
, port
));
1654 return (scm_read_shebang (chr
, port
, opts
));
1656 return (scm_read_commented_expression (chr
, port
, opts
));
1660 return (scm_read_syntax (chr
, port
, opts
));
1662 return (scm_read_nil (chr
, port
, opts
));
1664 result
= scm_read_sharp_extension (chr
, port
, opts
);
1665 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1667 /* To remain compatible with 1.8 and earlier, the following
1668 characters have lower precedence than `read-hash-extend'
1673 return scm_read_r6rs_block_comment (chr
, port
);
1675 scm_i_input_error (FUNC_NAME
, port
, "Unknown # object: ~S",
1676 scm_list_1 (SCM_MAKE_CHAR (chr
)));
1683 return SCM_UNSPECIFIED
;
1688 read_inner_expression (SCM port
, scm_t_read_opts
*opts
)
1689 #define FUNC_NAME "read_inner_expression"
1695 chr
= scm_getc (port
);
1699 case SCM_WHITE_SPACES
:
1700 case SCM_LINE_INCREMENTORS
:
1703 (void) scm_read_semicolon_comment (chr
, port
);
1706 if (opts
->curly_infix_p
)
1708 if (opts
->neoteric_p
)
1709 return scm_read_sexp (chr
, port
, opts
);
1714 /* Enable neoteric expressions within curly braces */
1715 opts
->neoteric_p
= 1;
1716 expr
= scm_read_sexp (chr
, port
, opts
);
1717 opts
->neoteric_p
= 0;
1722 return scm_read_mixed_case_symbol (chr
, port
, opts
);
1724 if (opts
->square_brackets_p
)
1725 return scm_read_sexp (chr
, port
, opts
);
1726 else if (opts
->curly_infix_p
)
1728 /* The syntax of neoteric expressions requires that '[' be
1729 a delimiter when curly-infix is enabled, so it cannot
1730 be part of an unescaped symbol. We might as well do
1731 something useful with it, so we adopt Kawa's convention:
1732 [...] => ($bracket-list$ ...) */
1733 long line
= SCM_LINUM (port
);
1734 int column
= SCM_COL (port
) - 1;
1735 return maybe_annotate_source
1736 (scm_cons (sym_bracket_list
, scm_read_sexp (chr
, port
, opts
)),
1737 port
, opts
, line
, column
);
1740 return scm_read_mixed_case_symbol (chr
, port
, opts
);
1742 return (scm_read_sexp (chr
, port
, opts
));
1744 return (scm_read_string (chr
, port
, opts
));
1748 return (scm_read_quote (chr
, port
, opts
));
1751 long line
= SCM_LINUM (port
);
1752 int column
= SCM_COL (port
) - 1;
1753 SCM result
= scm_read_sharp (chr
, port
, opts
, line
, column
);
1754 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1755 /* We read a comment or some such. */
1761 scm_i_input_error (FUNC_NAME
, port
, "unexpected \")\"", SCM_EOL
);
1764 if (opts
->curly_infix_p
)
1765 scm_i_input_error (FUNC_NAME
, port
, "unexpected \"}\"", SCM_EOL
);
1767 return scm_read_mixed_case_symbol (chr
, port
, opts
);
1769 if (opts
->square_brackets_p
)
1770 scm_i_input_error (FUNC_NAME
, port
, "unexpected \"]\"", SCM_EOL
);
1771 /* otherwise fall through */
1775 if (opts
->keyword_style
== KEYWORD_STYLE_PREFIX
)
1776 return scm_symbol_to_keyword (scm_read_expression (port
, opts
));
1781 if (((chr
>= '0') && (chr
<= '9'))
1782 || (strchr ("+-.", chr
)))
1783 return (scm_read_number (chr
, port
, opts
));
1785 return (scm_read_mixed_case_symbol (chr
, port
, opts
));
1793 scm_read_expression (SCM port
, scm_t_read_opts
*opts
)
1794 #define FUNC_NAME "scm_read_expression"
1796 if (!opts
->neoteric_p
)
1797 return read_inner_expression (port
, opts
);
1804 if (opts
->record_positions_p
)
1806 /* We need to get the position of the first non-whitespace
1807 character in order to correctly annotate neoteric
1808 expressions. For example, for the expression 'f(x)', the
1809 first call to 'read_inner_expression' reads the 'f' (which
1810 cannot be annotated), and then we later read the '(x)' and
1811 use it to construct the new list (f x). */
1812 int c
= flush_ws (port
, opts
, (char *) NULL
);
1815 scm_ungetc (c
, port
);
1816 line
= SCM_LINUM (port
);
1817 column
= SCM_COL (port
);
1820 expr
= read_inner_expression (port
, opts
);
1822 /* 'expr' is the first component of the neoteric expression. Now
1823 we loop, and as long as the next character is '(', '[', or '{',
1824 (without any intervening whitespace), we use it to construct a
1825 new expression. For example, f{n - 1}(x) => ((f (- n 1)) x). */
1828 int chr
= scm_getc (port
);
1831 /* e(...) => (e ...) */
1832 expr
= scm_cons (expr
, scm_read_sexp (chr
, port
, opts
));
1833 else if (chr
== '[')
1834 /* e[...] => ($bracket-apply$ e ...) */
1835 expr
= scm_cons (sym_bracket_apply
,
1837 scm_read_sexp (chr
, port
, opts
)));
1838 else if (chr
== '{')
1840 SCM arg
= scm_read_sexp (chr
, port
, opts
);
1842 if (scm_is_null (arg
))
1843 expr
= scm_list_1 (expr
); /* e{} => (e) */
1845 expr
= scm_list_2 (expr
, arg
); /* e{...} => (e {...}) */
1850 scm_ungetc (chr
, port
);
1853 maybe_annotate_source (expr
, port
, opts
, line
, column
);
1861 /* Actual reader. */
1863 static void init_read_options (SCM port
, scm_t_read_opts
*opts
);
1865 SCM_DEFINE (scm_read
, "read", 0, 1, 0,
1867 "Read an s-expression from the input port @var{port}, or from\n"
1868 "the current input port if @var{port} is not specified.\n"
1869 "Any whitespace before the next token is discarded.")
1870 #define FUNC_NAME s_scm_read
1872 scm_t_read_opts opts
;
1875 if (SCM_UNBNDP (port
))
1876 port
= scm_current_input_port ();
1877 SCM_VALIDATE_OPINPORT (1, port
);
1879 init_read_options (port
, &opts
);
1881 c
= flush_ws (port
, &opts
, (char *) NULL
);
1884 scm_ungetc (c
, port
);
1886 return (scm_read_expression (port
, &opts
));
1893 /* Manipulate the read-hash-procedures alist. This could be written in
1894 Scheme, but maybe it will also be used by C code during initialisation. */
1895 SCM_DEFINE (scm_read_hash_extend
, "read-hash-extend", 2, 0, 0,
1896 (SCM chr
, SCM proc
),
1897 "Install the procedure @var{proc} for reading expressions\n"
1898 "starting with the character sequence @code{#} and @var{chr}.\n"
1899 "@var{proc} will be called with two arguments: the character\n"
1900 "@var{chr} and the port to read further data from. The object\n"
1901 "returned will be the return value of @code{read}. \n"
1902 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1904 #define FUNC_NAME s_scm_read_hash_extend
1909 SCM_VALIDATE_CHAR (1, chr
);
1910 SCM_ASSERT (scm_is_false (proc
)
1911 || scm_is_eq (scm_procedure_p (proc
), SCM_BOOL_T
),
1912 proc
, SCM_ARG2
, FUNC_NAME
);
1914 /* Check if chr is already in the alist. */
1915 this = scm_i_read_hash_procedures_ref ();
1919 if (scm_is_null (this))
1921 /* not found, so add it to the beginning. */
1922 if (scm_is_true (proc
))
1924 SCM
new = scm_cons (scm_cons (chr
, proc
),
1925 scm_i_read_hash_procedures_ref ());
1926 scm_i_read_hash_procedures_set_x (new);
1930 if (scm_is_eq (chr
, SCM_CAAR (this)))
1932 /* already in the alist. */
1933 if (scm_is_false (proc
))
1936 if (scm_is_false (prev
))
1938 SCM rest
= SCM_CDR (scm_i_read_hash_procedures_ref ());
1939 scm_i_read_hash_procedures_set_x (rest
);
1942 scm_set_cdr_x (prev
, SCM_CDR (this));
1947 scm_set_cdr_x (SCM_CAR (this), proc
);
1952 this = SCM_CDR (this);
1955 return SCM_UNSPECIFIED
;
1959 /* Recover the read-hash procedure corresponding to char c. */
1961 scm_get_hash_procedure (int c
)
1963 SCM rest
= scm_i_read_hash_procedures_ref ();
1967 if (scm_is_null (rest
))
1970 if (SCM_CHAR (SCM_CAAR (rest
)) == c
)
1971 return SCM_CDAR (rest
);
1973 rest
= SCM_CDR (rest
);
1977 #define SCM_ENCODING_SEARCH_SIZE (500)
1979 /* Search the first few hundred characters of a file for an Emacs-like coding
1980 declaration. Returns either NULL or a string whose storage has been
1981 allocated with `scm_gc_malloc ()'. */
1983 scm_i_scan_for_encoding (SCM port
)
1986 char header
[SCM_ENCODING_SEARCH_SIZE
+1];
1987 size_t bytes_read
, encoding_length
, i
;
1988 char *encoding
= NULL
;
1989 char *pos
, *encoding_start
;
1992 pt
= SCM_PTAB_ENTRY (port
);
1994 if (pt
->rw_active
== SCM_PORT_WRITE
)
1998 pt
->rw_active
= SCM_PORT_READ
;
2000 if (pt
->read_pos
== pt
->read_end
)
2002 /* We can use the read buffer, and thus avoid a seek. */
2003 if (scm_fill_input (port
) == EOF
)
2006 bytes_read
= pt
->read_end
- pt
->read_pos
;
2007 if (bytes_read
> SCM_ENCODING_SEARCH_SIZE
)
2008 bytes_read
= SCM_ENCODING_SEARCH_SIZE
;
2010 if (bytes_read
<= 1)
2011 /* An unbuffered port -- don't scan. */
2014 memcpy (header
, pt
->read_pos
, bytes_read
);
2015 header
[bytes_read
] = '\0';
2019 /* Try to read some bytes and then seek back. Not all ports
2020 support seeking back; and indeed some file ports (like
2021 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
2022 check performed by SCM_FPORT_FDES---but fail to seek
2023 backwards. Hence this block comes second. We prefer to use
2024 the read buffer in-place. */
2025 if (SCM_FPORTP (port
) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port
)))
2028 bytes_read
= scm_c_read (port
, header
, SCM_ENCODING_SEARCH_SIZE
);
2029 header
[bytes_read
] = '\0';
2030 scm_seek (port
, scm_from_int (0), scm_from_int (SEEK_SET
));
2033 /* search past "coding[:=]" */
2037 if ((pos
= strstr(pos
, "coding")) == NULL
)
2040 pos
+= strlen("coding");
2041 if (pos
- header
>= SCM_ENCODING_SEARCH_SIZE
||
2042 (*pos
== ':' || *pos
== '='))
2050 while (pos
- header
<= SCM_ENCODING_SEARCH_SIZE
&&
2051 (*pos
== ' ' || *pos
== '\t'))
2054 /* grab the next token */
2055 encoding_start
= pos
;
2057 while (encoding_start
+ i
- header
<= SCM_ENCODING_SEARCH_SIZE
2058 && encoding_start
+ i
- header
< bytes_read
2059 && (isalnum ((int) encoding_start
[i
])
2060 || strchr ("_-.:/,+=()", encoding_start
[i
]) != NULL
))
2063 encoding_length
= i
;
2064 if (encoding_length
== 0)
2067 encoding
= scm_gc_strndup (encoding_start
, encoding_length
, "encoding");
2068 for (i
= 0; i
< encoding_length
; i
++)
2069 encoding
[i
] = toupper ((int) encoding
[i
]);
2071 /* push backwards to make sure we were in a comment */
2073 pos
= encoding_start
;
2074 while (pos
>= header
)
2081 else if (*pos
== '\n' || pos
== header
)
2083 /* This wasn't in a semicolon comment. Check for a
2084 hash-bang comment. */
2085 char *beg
= strstr (header
, "#!");
2086 char *end
= strstr (header
, "!#");
2087 if (beg
< encoding_start
&& encoding_start
+ encoding_length
<= end
)
2098 /* This wasn't in a comment */
2104 SCM_DEFINE (scm_file_encoding
, "file-encoding", 1, 0, 0,
2106 "Scans the port for an Emacs-like character coding declaration\n"
2107 "near the top of the contents of a port with random-accessible contents.\n"
2108 "The coding declaration is of the form\n"
2109 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
2111 "Returns a string containing the character encoding of the file\n"
2112 "if a declaration was found, or @code{#f} otherwise.\n")
2113 #define FUNC_NAME s_scm_file_encoding
2118 SCM_VALIDATE_OPINPORT (SCM_ARG1
, port
);
2120 enc
= scm_i_scan_for_encoding (port
);
2125 s_enc
= scm_from_locale_string (enc
);
2134 /* Per-port read options.
2136 We store per-port read options in the 'port-read-options' key of the
2137 port's alist, which is stored in the internal port structure. The
2138 value stored in the alist is a single integer that contains a two-bit
2139 field for each read option.
2141 If a bit field contains READ_OPTION_INHERIT (3), that indicates that
2142 the applicable value should be inherited from the corresponding
2143 global read option. Otherwise, the bit field contains the value of
2144 the read option. For boolean read options that have been set
2145 per-port, the possible values are 0 or 1. If the 'keyword_style'
2146 read option has been set per-port, its possible values are those in
2147 'enum t_keyword_style'. */
2149 /* Key to read options in per-port alists. */
2150 SCM_SYMBOL (sym_port_read_options
, "port-read-options");
2152 /* Offsets of bit fields for each per-port override */
2153 #define READ_OPTION_COPY_SOURCE_P 0
2154 #define READ_OPTION_RECORD_POSITIONS_P 2
2155 #define READ_OPTION_CASE_INSENSITIVE_P 4
2156 #define READ_OPTION_KEYWORD_STYLE 6
2157 #define READ_OPTION_R6RS_ESCAPES_P 8
2158 #define READ_OPTION_SQUARE_BRACKETS_P 10
2159 #define READ_OPTION_HUNGRY_EOL_ESCAPES_P 12
2160 #define READ_OPTION_CURLY_INFIX_P 14
2162 /* The total width in bits of the per-port overrides */
2163 #define READ_OPTIONS_NUM_BITS 16
2165 #define READ_OPTIONS_INHERIT_ALL ((1UL << READ_OPTIONS_NUM_BITS) - 1)
2166 #define READ_OPTIONS_MAX_VALUE READ_OPTIONS_INHERIT_ALL
2168 #define READ_OPTION_MASK 3
2169 #define READ_OPTION_INHERIT 3
2172 set_port_read_option (SCM port
, int option
, int new_value
)
2174 SCM alist
, scm_read_options
;
2175 unsigned int read_options
;
2177 new_value
&= READ_OPTION_MASK
;
2178 alist
= scm_i_port_alist (port
);
2179 scm_read_options
= scm_assq_ref (alist
, sym_port_read_options
);
2180 if (scm_is_unsigned_integer (scm_read_options
, 0, READ_OPTIONS_MAX_VALUE
))
2181 read_options
= scm_to_uint (scm_read_options
);
2183 read_options
= READ_OPTIONS_INHERIT_ALL
;
2184 read_options
&= ~(READ_OPTION_MASK
<< option
);
2185 read_options
|= new_value
<< option
;
2186 scm_read_options
= scm_from_uint (read_options
);
2187 alist
= scm_assq_set_x (alist
, sym_port_read_options
, scm_read_options
);
2188 scm_i_set_port_alist_x (port
, alist
);
2191 /* Set OPTS and PORT's case-insensitivity according to VALUE. */
2193 set_port_case_insensitive_p (SCM port
, scm_t_read_opts
*opts
, int value
)
2196 opts
->case_insensitive_p
= value
;
2197 set_port_read_option (port
, READ_OPTION_CASE_INSENSITIVE_P
, value
);
2200 /* Set OPTS and PORT's square_brackets_p option according to VALUE. */
2202 set_port_square_brackets_p (SCM port
, scm_t_read_opts
*opts
, int value
)
2205 opts
->square_brackets_p
= value
;
2206 set_port_read_option (port
, READ_OPTION_SQUARE_BRACKETS_P
, value
);
2209 /* Set OPTS and PORT's curly_infix_p option according to VALUE. */
2211 set_port_curly_infix_p (SCM port
, scm_t_read_opts
*opts
, int value
)
2214 opts
->curly_infix_p
= value
;
2215 set_port_read_option (port
, READ_OPTION_CURLY_INFIX_P
, value
);
2218 /* Initialize OPTS based on PORT's read options and the global read
2221 init_read_options (SCM port
, scm_t_read_opts
*opts
)
2223 SCM alist
, val
, scm_read_options
;
2224 unsigned int read_options
, x
;
2226 alist
= scm_i_port_alist (port
);
2227 scm_read_options
= scm_assq_ref (alist
, sym_port_read_options
);
2229 if (scm_is_unsigned_integer (scm_read_options
, 0, READ_OPTIONS_MAX_VALUE
))
2230 read_options
= scm_to_uint (scm_read_options
);
2232 read_options
= READ_OPTIONS_INHERIT_ALL
;
2234 x
= READ_OPTION_MASK
& (read_options
>> READ_OPTION_KEYWORD_STYLE
);
2235 if (x
== READ_OPTION_INHERIT
)
2237 val
= SCM_PACK (SCM_KEYWORD_STYLE
);
2238 if (scm_is_eq (val
, scm_keyword_prefix
))
2239 x
= KEYWORD_STYLE_PREFIX
;
2240 else if (scm_is_eq (val
, scm_keyword_postfix
))
2241 x
= KEYWORD_STYLE_POSTFIX
;
2243 x
= KEYWORD_STYLE_HASH_PREFIX
;
2245 opts
->keyword_style
= x
;
2247 #define RESOLVE_BOOLEAN_OPTION(NAME, name) \
2250 x = READ_OPTION_MASK & (read_options >> READ_OPTION_ ## NAME); \
2251 if (x == READ_OPTION_INHERIT) \
2252 x = !!SCM_ ## NAME; \
2257 RESOLVE_BOOLEAN_OPTION (COPY_SOURCE_P
, copy_source_p
);
2258 RESOLVE_BOOLEAN_OPTION (RECORD_POSITIONS_P
, record_positions_p
);
2259 RESOLVE_BOOLEAN_OPTION (CASE_INSENSITIVE_P
, case_insensitive_p
);
2260 RESOLVE_BOOLEAN_OPTION (R6RS_ESCAPES_P
, r6rs_escapes_p
);
2261 RESOLVE_BOOLEAN_OPTION (SQUARE_BRACKETS_P
, square_brackets_p
);
2262 RESOLVE_BOOLEAN_OPTION (HUNGRY_EOL_ESCAPES_P
, hungry_eol_escapes_p
);
2263 RESOLVE_BOOLEAN_OPTION (CURLY_INFIX_P
, curly_infix_p
);
2265 #undef RESOLVE_BOOLEAN_OPTION
2267 opts
->neoteric_p
= 0;
2273 SCM read_hash_procs
;
2275 read_hash_procs
= scm_make_fluid_with_default (SCM_EOL
);
2277 scm_i_read_hash_procedures
=
2278 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs
));
2280 scm_init_opts (scm_read_options
, scm_read_opts
);
2281 #include "libguile/read.x"