1 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
2 * 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
34 #include "libguile/_scm.h"
35 #include "libguile/bytevectors.h"
36 #include "libguile/chars.h"
37 #include "libguile/eval.h"
38 #include "libguile/arrays.h"
39 #include "libguile/bitvectors.h"
40 #include "libguile/keywords.h"
41 #include "libguile/alist.h"
42 #include "libguile/srcprop.h"
43 #include "libguile/hashtab.h"
44 #include "libguile/hash.h"
45 #include "libguile/ports.h"
46 #include "libguile/fports.h"
47 #include "libguile/root.h"
48 #include "libguile/strings.h"
49 #include "libguile/strports.h"
50 #include "libguile/vectors.h"
51 #include "libguile/validate.h"
52 #include "libguile/srfi-4.h"
53 #include "libguile/srfi-13.h"
55 #include "libguile/read.h"
56 #include "libguile/private-options.h"
61 SCM_GLOBAL_SYMBOL (scm_sym_dot
, ".");
62 SCM_SYMBOL (scm_keyword_prefix
, "prefix");
63 SCM_SYMBOL (scm_keyword_postfix
, "postfix");
64 SCM_SYMBOL (sym_nil
, "nil");
66 /* SRFI-105 curly infix expression support */
67 SCM_SYMBOL (sym_nfx
, "$nfx$");
68 SCM_SYMBOL (sym_bracket_list
, "$bracket-list$");
69 SCM_SYMBOL (sym_bracket_apply
, "$bracket-apply$");
71 scm_t_option scm_read_opts
[] =
73 { SCM_OPTION_BOOLEAN
, "copy", 0,
74 "Copy source code expressions." },
75 { SCM_OPTION_BOOLEAN
, "positions", 1,
76 "Record positions of source code expressions." },
77 { SCM_OPTION_BOOLEAN
, "case-insensitive", 0,
78 "Convert symbols to lower case."},
79 { SCM_OPTION_SCM
, "keywords", (scm_t_bits
) SCM_BOOL_F_BITS
,
80 "Style of keyword recognition: #f, 'prefix or 'postfix."},
81 { SCM_OPTION_BOOLEAN
, "r6rs-hex-escapes", 0,
82 "Use R6RS variable-length character and string hex escapes."},
83 { SCM_OPTION_BOOLEAN
, "square-brackets", 1,
84 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
85 { SCM_OPTION_BOOLEAN
, "hungry-eol-escapes", 0,
86 "In strings, consume leading whitespace after an escaped end-of-line."},
87 { SCM_OPTION_BOOLEAN
, "curly-infix", 0,
88 "Support SRFI-105 curly infix expressions."},
92 /* Internal read options structure. This is initialized by 'scm_read'
93 from the global and per-port read options, and a pointer is passed
94 down to all helper functions. */
98 KEYWORD_STYLE_HASH_PREFIX
,
100 KEYWORD_STYLE_POSTFIX
105 enum t_keyword_style keyword_style
;
106 unsigned int copy_source_p
: 1;
107 unsigned int record_positions_p
: 1;
108 unsigned int case_insensitive_p
: 1;
109 unsigned int r6rs_escapes_p
: 1;
110 unsigned int square_brackets_p
: 1;
111 unsigned int hungry_eol_escapes_p
: 1;
112 unsigned int curly_infix_p
: 1;
113 unsigned int neoteric_p
: 1;
116 typedef struct t_read_opts scm_t_read_opts
;
120 Give meaningful error messages for errors
124 FILE:LINE:COL: MESSAGE
125 This happened in ....
127 This is not standard GNU format, but the test-suite likes the real
128 message to be in front.
134 scm_i_input_error (char const *function
,
135 SCM port
, const char *message
, SCM arg
)
137 SCM fn
= (scm_is_string (SCM_FILENAME(port
))
139 : scm_from_locale_string ("#<unknown port>"));
141 SCM string_port
= scm_open_output_string ();
142 SCM string
= SCM_EOL
;
143 scm_simple_format (string_port
,
144 scm_from_locale_string ("~A:~S:~S: ~A"),
146 scm_from_long (SCM_LINUM (port
) + 1),
147 scm_from_int (SCM_COL (port
) + 1),
148 scm_from_locale_string (message
)));
150 string
= scm_get_output_string (string_port
);
151 scm_close_output_port (string_port
);
152 scm_error_scm (scm_from_latin1_symbol ("read-error"),
153 function
? scm_from_locale_string (function
) : SCM_BOOL_F
,
160 SCM_DEFINE (scm_read_options
, "read-options-interface", 0, 1, 0,
162 "Option interface for the read options. Instead of using\n"
163 "this procedure directly, use the procedures @code{read-enable},\n"
164 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
165 #define FUNC_NAME s_scm_read_options
167 SCM ans
= scm_options (setting
,
170 if (SCM_COPY_SOURCE_P
)
171 SCM_RECORD_POSITIONS_P
= 1;
176 /* A fluid referring to an association list mapping extra hash
177 characters to procedures. */
178 static SCM
*scm_i_read_hash_procedures
;
181 scm_i_read_hash_procedures_ref (void)
183 return scm_fluid_ref (*scm_i_read_hash_procedures
);
187 scm_i_read_hash_procedures_set_x (SCM value
)
189 scm_fluid_set_x (*scm_i_read_hash_procedures
, value
);
196 /* Size of the C buffer used to read symbols and numbers. */
197 #define READER_BUFFER_SIZE 128
199 /* Number of 32-bit codepoints in the buffer used to read strings. */
200 #define READER_STRING_BUFFER_SIZE 128
202 /* The maximum size of Scheme character names. */
203 #define READER_CHAR_NAME_MAX_SIZE 50
205 /* The maximum size of reader directive names. */
206 #define READER_DIRECTIVE_NAME_MAX_SIZE 50
209 /* `isblank' is only in C99. */
210 #define CHAR_IS_BLANK_(_chr) \
211 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
212 || ((_chr) == '\f') || ((_chr) == '\r'))
215 # define CHAR_IS_BLANK(_chr) \
216 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
218 # define CHAR_IS_BLANK CHAR_IS_BLANK_
222 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
224 #define CHAR_IS_R5RS_DELIMITER(c) \
226 || (c) == ')' || (c) == '(' || (c) == ';' || (c) == '"')
228 #define CHAR_IS_DELIMITER(c) \
229 (CHAR_IS_R5RS_DELIMITER (c) \
230 || (((c) == ']' || (c) == '[') && (opts->square_brackets_p \
231 || opts->curly_infix_p)) \
232 || (((c) == '}' || (c) == '{') && opts->curly_infix_p))
234 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
236 #define CHAR_IS_EXPONENT_MARKER(_chr) \
237 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
238 || ((_chr) == 'd') || ((_chr) == 'l'))
240 /* Read an SCSH block comment. */
241 static SCM
scm_read_scsh_block_comment (scm_t_wchar
, SCM
);
242 static SCM
scm_read_r6rs_block_comment (scm_t_wchar
, SCM
);
243 static SCM
scm_read_commented_expression (scm_t_wchar
, SCM
, scm_t_read_opts
*);
244 static SCM
scm_read_shebang (scm_t_wchar
, SCM
, scm_t_read_opts
*);
245 static SCM
scm_get_hash_procedure (int);
247 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
248 result in the pre-allocated buffer BUF. Return zero if the whole token has
249 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
250 bytes actually read. */
252 read_token (SCM port
, scm_t_read_opts
*opts
,
253 char *buf
, size_t buf_size
, size_t *read
)
257 while (*read
< buf_size
)
261 chr
= scm_get_byte_or_eof (port
);
265 else if (CHAR_IS_DELIMITER (chr
))
267 scm_unget_byte (chr
, port
);
280 /* Like `read_token', but return either BUFFER, or a GC-allocated buffer
281 if the token doesn't fit in BUFFER_SIZE bytes. */
283 read_complete_token (SCM port
, scm_t_read_opts
*opts
,
284 char *buffer
, size_t buffer_size
, size_t *read
)
287 size_t bytes_read
, overflow_size
= 0;
288 char *overflow_buffer
= NULL
;
292 overflow
= read_token (port
, opts
, buffer
, buffer_size
, &bytes_read
);
295 if (overflow
|| overflow_size
!= 0)
297 if (overflow_size
== 0)
299 overflow_buffer
= scm_gc_malloc_pointerless (bytes_read
, "read");
300 memcpy (overflow_buffer
, buffer
, bytes_read
);
301 overflow_size
= bytes_read
;
306 scm_gc_malloc_pointerless (overflow_size
+ bytes_read
, "read");
308 memcpy (new_buf
, overflow_buffer
, overflow_size
);
309 memcpy (new_buf
+ overflow_size
, buffer
, bytes_read
);
311 overflow_buffer
= new_buf
;
312 overflow_size
+= bytes_read
;
319 *read
= overflow_size
;
323 return (overflow_size
> 0 ? overflow_buffer
: buffer
);
326 /* Skip whitespace from PORT and return the first non-whitespace character
327 read. Raise an error on end-of-file. */
329 flush_ws (SCM port
, scm_t_read_opts
*opts
, const char *eoferr
)
333 switch (c
= scm_getc (port
))
339 scm_i_input_error (eoferr
,
348 switch (c
= scm_getc (port
))
354 case SCM_LINE_INCREMENTORS
:
360 switch (c
= scm_getc (port
))
363 eoferr
= "read_sharp";
366 scm_read_shebang (c
, port
, opts
);
369 scm_read_commented_expression (c
, port
, opts
);
372 if (scm_is_false (scm_get_hash_procedure (c
)))
374 scm_read_r6rs_block_comment (c
, port
);
379 scm_ungetc (c
, port
);
384 case SCM_LINE_INCREMENTORS
:
385 case SCM_SINGLE_SPACES
:
400 static SCM
scm_read_expression (SCM port
, scm_t_read_opts
*opts
);
401 static SCM
scm_read_sharp (int chr
, SCM port
, scm_t_read_opts
*opts
,
402 long line
, int column
);
406 maybe_annotate_source (SCM x
, SCM port
, scm_t_read_opts
*opts
,
407 long line
, int column
)
409 if (opts
->record_positions_p
)
410 scm_i_set_source_properties_x (x
, line
, column
, SCM_FILENAME (port
));
415 scm_read_sexp (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
416 #define FUNC_NAME "scm_i_lreadparen"
419 SCM tmp
, tl
, ans
= SCM_EOL
;
420 const int curly_list_p
= (chr
== '{') && opts
->curly_infix_p
;
421 const int terminating_char
= ((chr
== '{') ? '}'
422 : ((chr
== '[') ? ']'
425 /* Need to capture line and column numbers here. */
426 long line
= SCM_LINUM (port
);
427 int column
= SCM_COL (port
) - 1;
429 c
= flush_ws (port
, opts
, FUNC_NAME
);
430 if (terminating_char
== c
)
433 scm_ungetc (c
, port
);
434 tmp
= scm_read_expression (port
, opts
);
436 /* Note that it is possible for scm_read_expression to return
437 scm_sym_dot, but not as part of a dotted pair: as in #{.}#. So
438 check that it's a real dot by checking `c'. */
439 if (c
== '.' && scm_is_eq (scm_sym_dot
, tmp
))
441 ans
= scm_read_expression (port
, opts
);
442 if (terminating_char
!= (c
= flush_ws (port
, opts
, FUNC_NAME
)))
443 scm_i_input_error (FUNC_NAME
, port
, "missing close paren",
448 /* Build the head of the list structure. */
449 ans
= tl
= scm_cons (tmp
, SCM_EOL
);
451 while (terminating_char
!= (c
= flush_ws (port
, opts
, FUNC_NAME
)))
455 if (c
== ')' || (c
== ']' && opts
->square_brackets_p
)
456 || ((c
== '}' || c
== ']') && opts
->curly_infix_p
))
457 scm_i_input_error (FUNC_NAME
, port
,
458 "in pair: mismatched close paren: ~A",
459 scm_list_1 (SCM_MAKE_CHAR (c
)));
461 scm_ungetc (c
, port
);
462 tmp
= scm_read_expression (port
, opts
);
464 /* See above note about scm_sym_dot. */
465 if (c
== '.' && scm_is_eq (scm_sym_dot
, tmp
))
467 SCM_SETCDR (tl
, scm_read_expression (port
, opts
));
469 c
= flush_ws (port
, opts
, FUNC_NAME
);
470 if (terminating_char
!= c
)
471 scm_i_input_error (FUNC_NAME
, port
,
472 "in pair: missing close paren", SCM_EOL
);
476 new_tail
= scm_cons (tmp
, SCM_EOL
);
477 SCM_SETCDR (tl
, new_tail
);
483 /* In addition to finding the length, 'scm_ilength' checks for
484 improper or circular lists, in which case it returns -1. */
485 int len
= scm_ilength (ans
);
487 /* The (len == 0) case is handled above */
489 /* Return directly to avoid re-annotating the element's source
490 location with the position of the outer brace. Also, it
491 might not be possible to annotate the element. */
492 return scm_car (ans
); /* {e} => e */
494 ; /* Leave the list unchanged: {e1 e2} => (e1 e2) */
495 else if (len
>= 3 && (len
& 1))
497 /* It's a proper list whose length is odd and at least 3. If
498 the elements at odd indices (the infix operator positions)
499 are all 'equal?', then it's a simple curly-infix list.
500 Otherwise it's a mixed curly-infix list. */
501 SCM op
= scm_cadr (ans
);
503 /* Check to see if the elements at odd indices are 'equal?' */
504 for (tl
= scm_cdddr (ans
); ; tl
= scm_cddr (tl
))
506 if (scm_is_null (tl
))
508 /* Convert simple curly-infix list to prefix:
509 {a <op> b <op> ...} => (<op> a b ...) */
511 while (scm_is_pair (scm_cdr (tl
)))
514 SCM_SETCDR (tl
, tmp
);
517 ans
= scm_cons (op
, ans
);
520 else if (scm_is_false (scm_equal_p (op
, scm_car (tl
))))
522 /* Mixed curly-infix list: {e ...} => ($nfx$ e ...) */
523 ans
= scm_cons (sym_nfx
, ans
);
529 /* Mixed curly-infix (possibly improper) list:
530 {e . tail} => ($nfx$ e . tail) */
531 ans
= scm_cons (sym_nfx
, ans
);
534 return maybe_annotate_source (ans
, port
, opts
, line
, column
);
539 /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
540 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
542 #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
548 while (i < ndigits) \
550 a = scm_getc (port); \
554 && (a == (scm_t_wchar) terminator) \
557 if ('0' <= a && a <= '9') \
559 else if ('A' <= a && a <= 'F') \
561 else if ('a' <= a && a <= 'f') \
574 skip_intraline_whitespace (SCM port
)
584 while (c
== '\t' || uc_is_general_category (c
, UC_SPACE_SEPARATOR
));
586 scm_ungetc (c
, port
);
590 scm_read_string (int chr
, SCM port
, scm_t_read_opts
*opts
)
591 #define FUNC_NAME "scm_lreadr"
593 /* For strings smaller than C_STR, this function creates only one Scheme
594 object (the string returned). */
597 size_t c_str_len
= 0;
598 scm_t_wchar c
, c_str
[READER_STRING_BUFFER_SIZE
];
600 /* Need to capture line and column numbers here. */
601 long line
= SCM_LINUM (port
);
602 int column
= SCM_COL (port
) - 1;
604 while ('"' != (c
= scm_getc (port
)))
609 scm_i_input_error (FUNC_NAME
, port
,
610 "end of file in string constant", SCM_EOL
);
613 if (c_str_len
+ 1 >= READER_STRING_BUFFER_SIZE
)
615 str
= scm_cons (scm_from_utf32_stringn (c_str
, c_str_len
), str
);
621 switch (c
= scm_getc (port
))
629 if (opts
->hungry_eol_escapes_p
)
630 skip_intraline_whitespace (port
);
657 if (opts
->r6rs_escapes_p
)
658 SCM_READ_HEX_ESCAPE (10, ';');
660 SCM_READ_HEX_ESCAPE (2, '\0');
663 if (!opts
->r6rs_escapes_p
)
665 SCM_READ_HEX_ESCAPE (4, '\0');
669 if (!opts
->r6rs_escapes_p
)
671 SCM_READ_HEX_ESCAPE (6, '\0');
676 scm_i_input_error (FUNC_NAME
, port
,
677 "illegal character in escape sequence: ~S",
678 scm_list_1 (SCM_MAKE_CHAR (c
)));
682 c_str
[c_str_len
++] = c
;
685 if (scm_is_null (str
))
686 /* Fast path: we got a string that fits in C_STR. */
687 str
= scm_from_utf32_stringn (c_str
, c_str_len
);
691 str
= scm_cons (scm_from_utf32_stringn (c_str
, c_str_len
), str
);
693 str
= scm_string_concatenate_reverse (str
, SCM_UNDEFINED
, SCM_UNDEFINED
);
696 return maybe_annotate_source (str
, port
, opts
, line
, column
);
702 scm_read_number (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
704 SCM result
, str
= SCM_EOL
;
705 char local_buffer
[READER_BUFFER_SIZE
], *buffer
;
707 scm_t_port
*pt
= SCM_PTAB_ENTRY (port
);
709 /* Need to capture line and column numbers here. */
710 long line
= SCM_LINUM (port
);
711 int column
= SCM_COL (port
) - 1;
713 scm_ungetc (chr
, port
);
714 buffer
= read_complete_token (port
, opts
, local_buffer
, sizeof local_buffer
,
717 str
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
, pt
->ilseq_handler
);
719 result
= scm_string_to_number (str
, SCM_UNDEFINED
);
720 if (scm_is_false (result
))
722 /* Return a symbol instead of a number */
723 if (opts
->case_insensitive_p
)
724 str
= scm_string_downcase_x (str
);
725 result
= scm_string_to_symbol (str
);
727 else if (SCM_NIMP (result
))
728 result
= maybe_annotate_source (result
, port
, opts
, line
, column
);
730 SCM_COL (port
) += scm_i_string_length (str
);
735 scm_read_mixed_case_symbol (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
738 int ends_with_colon
= 0;
740 int postfix
= (opts
->keyword_style
== KEYWORD_STYLE_POSTFIX
);
741 char local_buffer
[READER_BUFFER_SIZE
], *buffer
;
742 scm_t_port
*pt
= SCM_PTAB_ENTRY (port
);
745 scm_ungetc (chr
, port
);
746 buffer
= read_complete_token (port
, opts
, local_buffer
, sizeof local_buffer
,
749 ends_with_colon
= buffer
[bytes_read
- 1] == ':';
751 if (postfix
&& ends_with_colon
&& (bytes_read
> 1))
753 str
= scm_from_stringn (buffer
, bytes_read
- 1,
754 pt
->encoding
, pt
->ilseq_handler
);
756 if (opts
->case_insensitive_p
)
757 str
= scm_string_downcase_x (str
);
758 result
= scm_symbol_to_keyword (scm_string_to_symbol (str
));
762 str
= scm_from_stringn (buffer
, bytes_read
,
763 pt
->encoding
, pt
->ilseq_handler
);
765 if (opts
->case_insensitive_p
)
766 str
= scm_string_downcase_x (str
);
767 result
= scm_string_to_symbol (str
);
770 SCM_COL (port
) += scm_i_string_length (str
);
775 scm_read_number_and_radix (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
776 #define FUNC_NAME "scm_lreadr"
780 char local_buffer
[READER_BUFFER_SIZE
], *buffer
;
808 scm_ungetc (chr
, port
);
809 scm_ungetc ('#', port
);
813 buffer
= read_complete_token (port
, opts
, local_buffer
, sizeof local_buffer
,
816 pt
= SCM_PTAB_ENTRY (port
);
817 str
= scm_from_stringn (buffer
, read
, pt
->encoding
, pt
->ilseq_handler
);
819 result
= scm_string_to_number (str
, scm_from_uint (radix
));
821 SCM_COL (port
) += scm_i_string_length (str
);
823 if (scm_is_true (result
))
826 scm_i_input_error (FUNC_NAME
, port
, "unknown # object", SCM_EOL
);
833 scm_read_quote (int chr
, SCM port
, scm_t_read_opts
*opts
)
836 long line
= SCM_LINUM (port
);
837 int column
= SCM_COL (port
) - 1;
842 p
= scm_sym_quasiquote
;
855 p
= scm_sym_uq_splicing
;
858 scm_ungetc (c
, port
);
865 fprintf (stderr
, "%s: unhandled quote character (%i)\n",
866 "scm_read_quote", chr
);
870 p
= scm_cons2 (p
, scm_read_expression (port
, opts
), SCM_EOL
);
871 return maybe_annotate_source (p
, port
, opts
, line
, column
);
874 SCM_SYMBOL (sym_syntax
, "syntax");
875 SCM_SYMBOL (sym_quasisyntax
, "quasisyntax");
876 SCM_SYMBOL (sym_unsyntax
, "unsyntax");
877 SCM_SYMBOL (sym_unsyntax_splicing
, "unsyntax-splicing");
880 scm_read_syntax (int chr
, SCM port
, scm_t_read_opts
*opts
)
883 long line
= SCM_LINUM (port
);
884 int column
= SCM_COL (port
) - 1;
902 p
= sym_unsyntax_splicing
;
905 scm_ungetc (c
, port
);
912 fprintf (stderr
, "%s: unhandled syntax character (%i)\n",
913 "scm_read_syntax", chr
);
917 p
= scm_cons2 (p
, scm_read_expression (port
, opts
), SCM_EOL
);
918 return maybe_annotate_source (p
, port
, opts
, line
, column
);
922 scm_read_nil (int chr
, SCM port
, scm_t_read_opts
*opts
)
924 SCM id
= scm_read_mixed_case_symbol (chr
, port
, opts
);
926 if (!scm_is_eq (id
, sym_nil
))
927 scm_i_input_error ("scm_read_nil", port
,
928 "unexpected input while reading #nil: ~a",
931 return SCM_ELISP_NIL
;
935 scm_read_semicolon_comment (int chr
, SCM port
)
939 /* We use the get_byte here because there is no need to get the
940 locale correct with comment input. This presumes that newline
941 always represents itself no matter what the encoding is. */
942 for (c
= scm_get_byte_or_eof (port
);
943 (c
!= EOF
) && (c
!= '\n');
944 c
= scm_get_byte_or_eof (port
));
946 return SCM_UNSPECIFIED
;
950 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
953 scm_read_boolean (int chr
, SCM port
)
966 return SCM_UNSPECIFIED
;
970 scm_read_character (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
971 #define FUNC_NAME "scm_lreadr"
973 char buffer
[READER_CHAR_NAME_MAX_SIZE
];
975 size_t charname_len
, bytes_read
;
980 overflow
= read_token (port
, opts
, buffer
, READER_CHAR_NAME_MAX_SIZE
,
983 scm_i_input_error (FUNC_NAME
, port
, "character name too long", SCM_EOL
);
987 chr
= scm_getc (port
);
989 scm_i_input_error (FUNC_NAME
, port
, "unexpected end of file "
990 "while reading character", SCM_EOL
);
992 /* CHR must be a token delimiter, like a whitespace. */
993 return (SCM_MAKE_CHAR (chr
));
996 pt
= SCM_PTAB_ENTRY (port
);
998 /* Simple ASCII characters can be processed immediately. Also, simple
999 ISO-8859-1 characters can be processed immediately if the encoding for this
1000 port is ISO-8859-1. */
1001 if (bytes_read
== 1 && ((unsigned char) buffer
[0] <= 127 || pt
->encoding
== NULL
))
1003 SCM_COL (port
) += 1;
1004 return SCM_MAKE_CHAR (buffer
[0]);
1007 /* Otherwise, convert the buffer into a proper scheme string for
1009 charname
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
,
1011 charname_len
= scm_i_string_length (charname
);
1012 SCM_COL (port
) += charname_len
;
1013 cp
= scm_i_string_ref (charname
, 0);
1014 if (charname_len
== 1)
1015 return SCM_MAKE_CHAR (cp
);
1017 /* Ignore dotted circles, which may be used to keep combining characters from
1018 combining with the backslash in #\charname. */
1019 if (cp
== SCM_CODEPOINT_DOTTED_CIRCLE
&& charname_len
== 2)
1020 return SCM_MAKE_CHAR (scm_i_string_ref (charname
, 1));
1022 if (cp
>= '0' && cp
< '8')
1024 /* Dirk:FIXME:: This type of character syntax is not R5RS
1025 * compliant. Further, it should be verified that the constant
1026 * does only consist of octal digits. */
1027 SCM p
= scm_string_to_number (charname
, scm_from_uint (8));
1028 if (SCM_I_INUMP (p
))
1030 scm_t_wchar c
= scm_to_uint32 (p
);
1031 if (SCM_IS_UNICODE_CHAR (c
))
1032 return SCM_MAKE_CHAR (c
);
1034 scm_i_input_error (FUNC_NAME
, port
,
1035 "out-of-range octal character escape: ~a",
1036 scm_list_1 (charname
));
1040 if (cp
== 'x' && (charname_len
> 1))
1044 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1045 p
= scm_string_to_number (scm_c_substring (charname
, 1, charname_len
),
1046 scm_from_uint (16));
1047 if (SCM_I_INUMP (p
))
1049 scm_t_wchar c
= scm_to_uint32 (p
);
1050 if (SCM_IS_UNICODE_CHAR (c
))
1051 return SCM_MAKE_CHAR (c
);
1053 scm_i_input_error (FUNC_NAME
, port
,
1054 "out-of-range hex character escape: ~a",
1055 scm_list_1 (charname
));
1059 /* The names of characters should never have non-Latin1
1061 if (scm_i_is_narrow_string (charname
)
1062 || scm_i_try_narrow_string (charname
))
1063 { SCM ch
= scm_i_charname_to_char (scm_i_string_chars (charname
),
1065 if (scm_is_true (ch
))
1069 scm_i_input_error (FUNC_NAME
, port
, "unknown character name ~a",
1070 scm_list_1 (charname
));
1072 return SCM_UNSPECIFIED
;
1077 scm_read_keyword (int chr
, SCM port
, scm_t_read_opts
*opts
)
1081 /* Read the symbol that comprises the keyword. Doing this instead of
1082 invoking a specific symbol reader function allows `scm_read_keyword ()'
1083 to adapt to the delimiters currently valid of symbols.
1085 XXX: This implementation allows sloppy syntaxes like `#: key'. */
1086 symbol
= scm_read_expression (port
, opts
);
1087 if (!scm_is_symbol (symbol
))
1088 scm_i_input_error ("scm_read_keyword", port
,
1089 "keyword prefix `~a' not followed by a symbol: ~s",
1090 scm_list_2 (SCM_MAKE_CHAR (chr
), symbol
));
1092 return (scm_symbol_to_keyword (symbol
));
1096 scm_read_vector (int chr
, SCM port
, scm_t_read_opts
*opts
,
1097 long line
, int column
)
1099 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1100 guarantee that it's going to do what we want. After all, this is an
1101 implementation detail of `scm_read_vector ()', not a desirable
1103 return maybe_annotate_source (scm_vector (scm_read_sexp (chr
, port
, opts
)),
1104 port
, opts
, line
, column
);
1107 /* Helper used by scm_read_array */
1109 read_decimal_integer (SCM port
, int c
, ssize_t
*resp
)
1118 c
= scm_getc (port
);
1121 while ('0' <= c
&& c
<= '9')
1123 res
= 10*res
+ c
-'0';
1125 c
= scm_getc (port
);
1133 /* Read an array. This function can also read vectors and uniform
1134 vectors. Also, the conflict between '#f' and '#f32' and '#f64' is
1137 C is the first character read after the '#'.
1140 scm_read_array (int c
, SCM port
, scm_t_read_opts
*opts
, long line
, int column
)
1143 scm_t_wchar tag_buf
[8];
1146 SCM tag
, shape
= SCM_BOOL_F
, elements
, array
;
1148 /* XXX - shortcut for ordinary vectors. Shouldn't be necessary but
1149 the array code can not deal with zero-length dimensions yet, and
1150 we want to allow zero-length vectors, of course.
1153 return scm_read_vector (c
, port
, opts
, line
, column
);
1155 /* Disambiguate between '#f' and uniform floating point vectors.
1159 c
= scm_getc (port
);
1160 if (c
!= '3' && c
!= '6')
1163 scm_ungetc (c
, port
);
1169 goto continue_reading_tag
;
1174 c
= read_decimal_integer (port
, c
, &rank
);
1176 scm_i_input_error (NULL
, port
, "array rank must be non-negative",
1181 continue_reading_tag
:
1182 while (c
!= EOF
&& c
!= '(' && c
!= '@' && c
!= ':'
1183 && tag_len
< sizeof tag_buf
/ sizeof tag_buf
[0])
1185 tag_buf
[tag_len
++] = c
;
1186 c
= scm_getc (port
);
1192 tag
= scm_string_to_symbol (scm_from_utf32_stringn (tag_buf
, tag_len
));
1193 if (tag_len
== sizeof tag_buf
/ sizeof tag_buf
[0])
1194 scm_i_input_error (NULL
, port
, "invalid array tag, starting with: ~a",
1199 if (c
== '@' || c
== ':')
1205 ssize_t lbnd
= 0, len
= 0;
1210 c
= scm_getc (port
);
1211 c
= read_decimal_integer (port
, c
, &lbnd
);
1214 s
= scm_from_ssize_t (lbnd
);
1218 c
= scm_getc (port
);
1219 c
= read_decimal_integer (port
, c
, &len
);
1221 scm_i_input_error (NULL
, port
,
1222 "array length must be non-negative",
1225 s
= scm_list_2 (s
, scm_from_ssize_t (lbnd
+len
-1));
1228 shape
= scm_cons (s
, shape
);
1229 } while (c
== '@' || c
== ':');
1231 shape
= scm_reverse_x (shape
, SCM_EOL
);
1234 /* Read nested lists of elements. */
1236 scm_i_input_error (NULL
, port
,
1237 "missing '(' in vector or array literal",
1239 elements
= scm_read_sexp (c
, port
, opts
);
1241 if (scm_is_false (shape
))
1242 shape
= scm_from_ssize_t (rank
);
1243 else if (scm_ilength (shape
) != rank
)
1246 "the number of shape specifications must match the array rank",
1249 /* Handle special print syntax of rank zero arrays; see
1250 scm_i_print_array for a rationale. */
1253 if (!scm_is_pair (elements
))
1254 scm_i_input_error (NULL
, port
,
1255 "too few elements in array literal, need 1",
1257 if (!scm_is_null (SCM_CDR (elements
)))
1258 scm_i_input_error (NULL
, port
,
1259 "too many elements in array literal, want 1",
1261 elements
= SCM_CAR (elements
);
1264 /* Construct array, annotate with source location, and return. */
1265 array
= scm_list_to_typed_array (tag
, shape
, elements
);
1266 return maybe_annotate_source (array
, port
, opts
, line
, column
);
1270 scm_read_srfi4_vector (int chr
, SCM port
, scm_t_read_opts
*opts
,
1271 long line
, int column
)
1273 return scm_read_array (chr
, port
, opts
, line
, column
);
1277 scm_read_bytevector (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
,
1278 long line
, int column
)
1280 chr
= scm_getc (port
);
1284 chr
= scm_getc (port
);
1288 chr
= scm_getc (port
);
1292 return maybe_annotate_source
1293 (scm_u8_list_to_bytevector (scm_read_sexp (chr
, port
, opts
)),
1294 port
, opts
, line
, column
);
1297 scm_i_input_error ("read_bytevector", port
,
1298 "invalid bytevector prefix",
1299 SCM_MAKE_CHAR (chr
));
1300 return SCM_UNSPECIFIED
;
1304 scm_read_guile_bit_vector (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
,
1305 long line
, int column
)
1307 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1308 terribly inefficient but who cares? */
1309 SCM s_bits
= SCM_EOL
;
1311 for (chr
= scm_getc (port
);
1312 (chr
!= EOF
) && ((chr
== '0') || (chr
== '1'));
1313 chr
= scm_getc (port
))
1315 s_bits
= scm_cons ((chr
== '0') ? SCM_BOOL_F
: SCM_BOOL_T
, s_bits
);
1319 scm_ungetc (chr
, port
);
1321 return maybe_annotate_source
1322 (scm_bitvector (scm_reverse_x (s_bits
, SCM_EOL
)),
1323 port
, opts
, line
, column
);
1327 scm_read_scsh_block_comment (scm_t_wchar chr
, SCM port
)
1333 int c
= scm_getc (port
);
1336 scm_i_input_error ("skip_block_comment", port
,
1337 "unterminated `#! ... !#' comment", SCM_EOL
);
1341 else if (c
== '#' && bang_seen
)
1347 return SCM_UNSPECIFIED
;
1350 static void set_port_case_insensitive_p (SCM port
, scm_t_read_opts
*opts
,
1352 static void set_port_square_brackets_p (SCM port
, scm_t_read_opts
*opts
,
1354 static void set_port_curly_infix_p (SCM port
, scm_t_read_opts
*opts
,
1358 scm_read_shebang (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
1360 char name
[READER_DIRECTIVE_NAME_MAX_SIZE
+ 1];
1364 while (i
<= READER_DIRECTIVE_NAME_MAX_SIZE
)
1366 c
= scm_getc (port
);
1368 scm_i_input_error ("skip_block_comment", port
,
1369 "unterminated `#! ... !#' comment", SCM_EOL
);
1370 else if (('a' <= c
&& c
<= 'z') || ('0' <= c
&& c
<= '9') || c
== '-')
1372 else if (CHAR_IS_DELIMITER (c
))
1374 scm_ungetc (c
, port
);
1376 if (0 == strcmp ("r6rs", name
))
1377 ; /* Silently ignore */
1378 else if (0 == strcmp ("fold-case", name
))
1379 set_port_case_insensitive_p (port
, opts
, 1);
1380 else if (0 == strcmp ("no-fold-case", name
))
1381 set_port_case_insensitive_p (port
, opts
, 0);
1382 else if (0 == strcmp ("curly-infix", name
))
1383 set_port_curly_infix_p (port
, opts
, 1);
1384 else if (0 == strcmp ("curly-infix-and-bracket-lists", name
))
1386 set_port_curly_infix_p (port
, opts
, 1);
1387 set_port_square_brackets_p (port
, opts
, 0);
1392 return SCM_UNSPECIFIED
;
1396 scm_ungetc (c
, port
);
1401 scm_ungetc (name
[--i
], port
);
1402 return scm_read_scsh_block_comment (chr
, port
);
1406 scm_read_r6rs_block_comment (scm_t_wchar chr
, SCM port
)
1408 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1409 nested. So care must be taken. */
1410 int nesting_level
= 1;
1412 int a
= scm_getc (port
);
1415 scm_i_input_error ("scm_read_r6rs_block_comment", port
,
1416 "unterminated `#| ... |#' comment", SCM_EOL
);
1418 while (nesting_level
> 0)
1420 int b
= scm_getc (port
);
1423 scm_i_input_error ("scm_read_r6rs_block_comment", port
,
1424 "unterminated `#| ... |#' comment", SCM_EOL
);
1426 if (a
== '|' && b
== '#')
1431 else if (a
== '#' && b
== '|')
1440 return SCM_UNSPECIFIED
;
1444 scm_read_commented_expression (scm_t_wchar chr
, SCM port
,
1445 scm_t_read_opts
*opts
)
1449 c
= flush_ws (port
, opts
, (char *) NULL
);
1451 scm_i_input_error ("read_commented_expression", port
,
1452 "no expression after #; comment", SCM_EOL
);
1453 scm_ungetc (c
, port
);
1454 scm_read_expression (port
, opts
);
1455 return SCM_UNSPECIFIED
;
1459 scm_read_extended_symbol (scm_t_wchar chr
, SCM port
)
1461 /* Guile's extended symbol read syntax looks like this:
1463 #{This is all a symbol name}#
1465 So here, CHR is expected to be `{'. */
1468 SCM buf
= scm_i_make_string (1024, NULL
, 0);
1470 buf
= scm_i_string_start_writing (buf
);
1472 while ((chr
= scm_getc (port
)) != EOF
)
1483 scm_i_string_set_x (buf
, len
++, '}');
1489 else if (chr
== '\\')
1491 /* It used to be that print.c would print extended-read-syntax
1492 symbols with backslashes before "non-standard" chars, but
1493 this routine wouldn't do anything with those escapes.
1494 Bummer. What we've done is to change print.c to output
1495 R6RS hex escapes for those characters, relying on the fact
1496 that the extended read syntax would never put a `\' before
1497 an `x'. For now, we just ignore other instances of
1498 backslash in the string. */
1499 switch ((chr
= scm_getc (port
)))
1507 SCM_READ_HEX_ESCAPE (10, ';');
1508 scm_i_string_set_x (buf
, len
++, c
);
1516 scm_i_string_stop_writing ();
1517 scm_i_input_error ("scm_read_extended_symbol", port
,
1518 "illegal character in escape sequence: ~S",
1519 scm_list_1 (SCM_MAKE_CHAR (c
)));
1523 scm_i_string_set_x (buf
, len
++, chr
);
1528 scm_i_string_set_x (buf
, len
++, chr
);
1530 if (len
>= scm_i_string_length (buf
) - 2)
1534 scm_i_string_stop_writing ();
1535 addy
= scm_i_make_string (1024, NULL
, 0);
1536 buf
= scm_string_append (scm_list_2 (buf
, addy
));
1538 buf
= scm_i_string_start_writing (buf
);
1543 scm_i_string_stop_writing ();
1545 scm_i_input_error ("scm_read_extended_symbol", port
,
1546 "end of file while reading symbol", SCM_EOL
);
1548 return (scm_string_to_symbol (scm_c_substring (buf
, 0, len
)));
1553 /* Top-level token readers, i.e., dispatchers. */
1556 scm_read_sharp_extension (int chr
, SCM port
, scm_t_read_opts
*opts
)
1560 proc
= scm_get_hash_procedure (chr
);
1561 if (scm_is_true (scm_procedure_p (proc
)))
1563 long line
= SCM_LINUM (port
);
1564 int column
= SCM_COL (port
) - 2;
1567 got
= scm_call_2 (proc
, SCM_MAKE_CHAR (chr
), port
);
1569 if (opts
->record_positions_p
&& SCM_NIMP (got
)
1570 && !scm_i_has_source_properties (got
))
1571 scm_i_set_source_properties_x (got
, line
, column
, SCM_FILENAME (port
));
1576 return SCM_UNSPECIFIED
;
1579 /* The reader for the sharp `#' character. It basically dispatches reads
1580 among the above token readers. */
1582 scm_read_sharp (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
,
1583 long line
, int column
)
1584 #define FUNC_NAME "scm_lreadr"
1588 chr
= scm_getc (port
);
1590 result
= scm_read_sharp_extension (chr
, port
, opts
);
1591 if (!scm_is_eq (result
, SCM_UNSPECIFIED
))
1597 return (scm_read_character (chr
, port
, opts
));
1599 return (scm_read_vector (chr
, port
, opts
, line
, column
));
1604 /* This one may return either a boolean or an SRFI-4 vector. */
1605 return (scm_read_srfi4_vector (chr
, port
, opts
, line
, column
));
1607 return (scm_read_bytevector (chr
, port
, opts
, line
, column
));
1609 return (scm_read_guile_bit_vector (chr
, port
, opts
, line
, column
));
1613 return (scm_read_boolean (chr
, port
));
1615 return (scm_read_keyword (chr
, port
, opts
));
1616 case '0': case '1': case '2': case '3': case '4':
1617 case '5': case '6': case '7': case '8': case '9':
1619 #if SCM_ENABLE_DEPRECATED
1620 /* See below for 'i' and 'e'. */
1626 return (scm_read_array (chr
, port
, opts
, line
, column
));
1630 #if SCM_ENABLE_DEPRECATED
1632 /* When next char is '(', it really is an old-style
1634 scm_t_wchar next_c
= scm_getc (port
);
1636 scm_ungetc (next_c
, port
);
1638 return scm_read_array (chr
, port
, opts
, line
, column
);
1652 return (scm_read_number_and_radix (chr
, port
, opts
));
1654 return (scm_read_extended_symbol (chr
, port
));
1656 return (scm_read_shebang (chr
, port
, opts
));
1658 return (scm_read_commented_expression (chr
, port
, opts
));
1662 return (scm_read_syntax (chr
, port
, opts
));
1664 return (scm_read_nil (chr
, port
, opts
));
1666 result
= scm_read_sharp_extension (chr
, port
, opts
);
1667 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1669 /* To remain compatible with 1.8 and earlier, the following
1670 characters have lower precedence than `read-hash-extend'
1675 return scm_read_r6rs_block_comment (chr
, port
);
1677 scm_i_input_error (FUNC_NAME
, port
, "Unknown # object: ~S",
1678 scm_list_1 (SCM_MAKE_CHAR (chr
)));
1685 return SCM_UNSPECIFIED
;
1690 read_inner_expression (SCM port
, scm_t_read_opts
*opts
)
1691 #define FUNC_NAME "read_inner_expression"
1697 chr
= scm_getc (port
);
1701 case SCM_WHITE_SPACES
:
1702 case SCM_LINE_INCREMENTORS
:
1705 (void) scm_read_semicolon_comment (chr
, port
);
1708 if (opts
->curly_infix_p
)
1710 if (opts
->neoteric_p
)
1711 return scm_read_sexp (chr
, port
, opts
);
1716 /* Enable neoteric expressions within curly braces */
1717 opts
->neoteric_p
= 1;
1718 expr
= scm_read_sexp (chr
, port
, opts
);
1719 opts
->neoteric_p
= 0;
1724 return scm_read_mixed_case_symbol (chr
, port
, opts
);
1726 if (opts
->square_brackets_p
)
1727 return scm_read_sexp (chr
, port
, opts
);
1728 else if (opts
->curly_infix_p
)
1730 /* The syntax of neoteric expressions requires that '[' be
1731 a delimiter when curly-infix is enabled, so it cannot
1732 be part of an unescaped symbol. We might as well do
1733 something useful with it, so we adopt Kawa's convention:
1734 [...] => ($bracket-list$ ...) */
1735 long line
= SCM_LINUM (port
);
1736 int column
= SCM_COL (port
) - 1;
1737 return maybe_annotate_source
1738 (scm_cons (sym_bracket_list
, scm_read_sexp (chr
, port
, opts
)),
1739 port
, opts
, line
, column
);
1742 return scm_read_mixed_case_symbol (chr
, port
, opts
);
1744 return (scm_read_sexp (chr
, port
, opts
));
1746 return (scm_read_string (chr
, port
, opts
));
1750 return (scm_read_quote (chr
, port
, opts
));
1753 long line
= SCM_LINUM (port
);
1754 int column
= SCM_COL (port
) - 1;
1755 SCM result
= scm_read_sharp (chr
, port
, opts
, line
, column
);
1756 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1757 /* We read a comment or some such. */
1763 scm_i_input_error (FUNC_NAME
, port
, "unexpected \")\"", SCM_EOL
);
1766 if (opts
->curly_infix_p
)
1767 scm_i_input_error (FUNC_NAME
, port
, "unexpected \"}\"", SCM_EOL
);
1769 return scm_read_mixed_case_symbol (chr
, port
, opts
);
1771 if (opts
->square_brackets_p
)
1772 scm_i_input_error (FUNC_NAME
, port
, "unexpected \"]\"", SCM_EOL
);
1773 /* otherwise fall through */
1777 if (opts
->keyword_style
== KEYWORD_STYLE_PREFIX
)
1778 return scm_symbol_to_keyword (scm_read_expression (port
, opts
));
1783 if (((chr
>= '0') && (chr
<= '9'))
1784 || (strchr ("+-.", chr
)))
1785 return (scm_read_number (chr
, port
, opts
));
1787 return (scm_read_mixed_case_symbol (chr
, port
, opts
));
1795 scm_read_expression (SCM port
, scm_t_read_opts
*opts
)
1796 #define FUNC_NAME "scm_read_expression"
1798 if (!opts
->neoteric_p
)
1799 return read_inner_expression (port
, opts
);
1806 if (opts
->record_positions_p
)
1808 /* We need to get the position of the first non-whitespace
1809 character in order to correctly annotate neoteric
1810 expressions. For example, for the expression 'f(x)', the
1811 first call to 'read_inner_expression' reads the 'f' (which
1812 cannot be annotated), and then we later read the '(x)' and
1813 use it to construct the new list (f x). */
1814 int c
= flush_ws (port
, opts
, (char *) NULL
);
1817 scm_ungetc (c
, port
);
1818 line
= SCM_LINUM (port
);
1819 column
= SCM_COL (port
);
1822 expr
= read_inner_expression (port
, opts
);
1824 /* 'expr' is the first component of the neoteric expression. Now
1825 we loop, and as long as the next character is '(', '[', or '{',
1826 (without any intervening whitespace), we use it to construct a
1827 new expression. For example, f{n - 1}(x) => ((f (- n 1)) x). */
1830 int chr
= scm_getc (port
);
1833 /* e(...) => (e ...) */
1834 expr
= scm_cons (expr
, scm_read_sexp (chr
, port
, opts
));
1835 else if (chr
== '[')
1836 /* e[...] => ($bracket-apply$ e ...) */
1837 expr
= scm_cons (sym_bracket_apply
,
1839 scm_read_sexp (chr
, port
, opts
)));
1840 else if (chr
== '{')
1842 SCM arg
= scm_read_sexp (chr
, port
, opts
);
1844 if (scm_is_null (arg
))
1845 expr
= scm_list_1 (expr
); /* e{} => (e) */
1847 expr
= scm_list_2 (expr
, arg
); /* e{...} => (e {...}) */
1852 scm_ungetc (chr
, port
);
1855 maybe_annotate_source (expr
, port
, opts
, line
, column
);
1863 /* Actual reader. */
1865 static void init_read_options (SCM port
, scm_t_read_opts
*opts
);
1867 SCM_DEFINE (scm_read
, "read", 0, 1, 0,
1869 "Read an s-expression from the input port @var{port}, or from\n"
1870 "the current input port if @var{port} is not specified.\n"
1871 "Any whitespace before the next token is discarded.")
1872 #define FUNC_NAME s_scm_read
1874 scm_t_read_opts opts
;
1877 if (SCM_UNBNDP (port
))
1878 port
= scm_current_input_port ();
1879 SCM_VALIDATE_OPINPORT (1, port
);
1881 init_read_options (port
, &opts
);
1883 c
= flush_ws (port
, &opts
, (char *) NULL
);
1886 scm_ungetc (c
, port
);
1888 return (scm_read_expression (port
, &opts
));
1895 /* Manipulate the read-hash-procedures alist. This could be written in
1896 Scheme, but maybe it will also be used by C code during initialisation. */
1897 SCM_DEFINE (scm_read_hash_extend
, "read-hash-extend", 2, 0, 0,
1898 (SCM chr
, SCM proc
),
1899 "Install the procedure @var{proc} for reading expressions\n"
1900 "starting with the character sequence @code{#} and @var{chr}.\n"
1901 "@var{proc} will be called with two arguments: the character\n"
1902 "@var{chr} and the port to read further data from. The object\n"
1903 "returned will be the return value of @code{read}. \n"
1904 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1906 #define FUNC_NAME s_scm_read_hash_extend
1911 SCM_VALIDATE_CHAR (1, chr
);
1912 SCM_ASSERT (scm_is_false (proc
)
1913 || scm_is_eq (scm_procedure_p (proc
), SCM_BOOL_T
),
1914 proc
, SCM_ARG2
, FUNC_NAME
);
1916 /* Check if chr is already in the alist. */
1917 this = scm_i_read_hash_procedures_ref ();
1921 if (scm_is_null (this))
1923 /* not found, so add it to the beginning. */
1924 if (scm_is_true (proc
))
1926 SCM
new = scm_cons (scm_cons (chr
, proc
),
1927 scm_i_read_hash_procedures_ref ());
1928 scm_i_read_hash_procedures_set_x (new);
1932 if (scm_is_eq (chr
, SCM_CAAR (this)))
1934 /* already in the alist. */
1935 if (scm_is_false (proc
))
1938 if (scm_is_false (prev
))
1940 SCM rest
= SCM_CDR (scm_i_read_hash_procedures_ref ());
1941 scm_i_read_hash_procedures_set_x (rest
);
1944 scm_set_cdr_x (prev
, SCM_CDR (this));
1949 scm_set_cdr_x (SCM_CAR (this), proc
);
1954 this = SCM_CDR (this);
1957 return SCM_UNSPECIFIED
;
1961 /* Recover the read-hash procedure corresponding to char c. */
1963 scm_get_hash_procedure (int c
)
1965 SCM rest
= scm_i_read_hash_procedures_ref ();
1969 if (scm_is_null (rest
))
1972 if (SCM_CHAR (SCM_CAAR (rest
)) == c
)
1973 return SCM_CDAR (rest
);
1975 rest
= SCM_CDR (rest
);
1979 #define SCM_ENCODING_SEARCH_SIZE (500)
1981 /* Search the first few hundred characters of a file for an Emacs-like coding
1982 declaration. Returns either NULL or a string whose storage has been
1983 allocated with `scm_gc_malloc ()'. */
1985 scm_i_scan_for_encoding (SCM port
)
1988 char header
[SCM_ENCODING_SEARCH_SIZE
+1];
1989 size_t bytes_read
, encoding_length
, i
;
1990 char *encoding
= NULL
;
1992 char *pos
, *encoding_start
;
1995 pt
= SCM_PTAB_ENTRY (port
);
1997 if (pt
->rw_active
== SCM_PORT_WRITE
)
2001 pt
->rw_active
= SCM_PORT_READ
;
2003 if (pt
->read_pos
== pt
->read_end
)
2005 /* We can use the read buffer, and thus avoid a seek. */
2006 if (scm_fill_input (port
) == EOF
)
2009 bytes_read
= pt
->read_end
- pt
->read_pos
;
2010 if (bytes_read
> SCM_ENCODING_SEARCH_SIZE
)
2011 bytes_read
= SCM_ENCODING_SEARCH_SIZE
;
2013 if (bytes_read
<= 1)
2014 /* An unbuffered port -- don't scan. */
2017 memcpy (header
, pt
->read_pos
, bytes_read
);
2018 header
[bytes_read
] = '\0';
2022 /* Try to read some bytes and then seek back. Not all ports
2023 support seeking back; and indeed some file ports (like
2024 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
2025 check performed by SCM_FPORT_FDES---but fail to seek
2026 backwards. Hence this block comes second. We prefer to use
2027 the read buffer in-place. */
2028 if (SCM_FPORTP (port
) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port
)))
2031 bytes_read
= scm_c_read (port
, header
, SCM_ENCODING_SEARCH_SIZE
);
2032 header
[bytes_read
] = '\0';
2033 scm_seek (port
, scm_from_int (0), scm_from_int (SEEK_SET
));
2037 && header
[0] == '\xef' && header
[1] == '\xbb' && header
[2] == '\xbf')
2040 /* search past "coding[:=]" */
2044 if ((pos
= strstr(pos
, "coding")) == NULL
)
2047 pos
+= strlen("coding");
2048 if (pos
- header
>= SCM_ENCODING_SEARCH_SIZE
||
2049 (*pos
== ':' || *pos
== '='))
2057 while (pos
- header
<= SCM_ENCODING_SEARCH_SIZE
&&
2058 (*pos
== ' ' || *pos
== '\t'))
2061 /* grab the next token */
2062 encoding_start
= pos
;
2064 while (encoding_start
+ i
- header
<= SCM_ENCODING_SEARCH_SIZE
2065 && encoding_start
+ i
- header
< bytes_read
2066 && (isalnum ((int) encoding_start
[i
])
2067 || strchr ("_-.:/,+=()", encoding_start
[i
]) != NULL
))
2070 encoding_length
= i
;
2071 if (encoding_length
== 0)
2074 encoding
= scm_gc_strndup (encoding_start
, encoding_length
, "encoding");
2075 for (i
= 0; i
< encoding_length
; i
++)
2076 encoding
[i
] = toupper ((int) encoding
[i
]);
2078 /* push backwards to make sure we were in a comment */
2080 pos
= encoding_start
;
2081 while (pos
>= header
)
2088 else if (*pos
== '\n' || pos
== header
)
2090 /* This wasn't in a semicolon comment. Check for a
2091 hash-bang comment. */
2092 char *beg
= strstr (header
, "#!");
2093 char *end
= strstr (header
, "!#");
2094 if (beg
< encoding_start
&& encoding_start
+ encoding_length
<= end
)
2105 /* This wasn't in a comment */
2108 if (utf8_bom
&& strcmp(encoding
, "UTF-8"))
2109 scm_misc_error (NULL
,
2110 "the port input declares the encoding ~s but is encoded as UTF-8",
2111 scm_list_1 (scm_from_locale_string (encoding
)));
2116 SCM_DEFINE (scm_file_encoding
, "file-encoding", 1, 0, 0,
2118 "Scans the port for an Emacs-like character coding declaration\n"
2119 "near the top of the contents of a port with random-accessible contents.\n"
2120 "The coding declaration is of the form\n"
2121 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
2123 "Returns a string containing the character encoding of the file\n"
2124 "if a declaration was found, or @code{#f} otherwise.\n")
2125 #define FUNC_NAME s_scm_file_encoding
2130 SCM_VALIDATE_OPINPORT (SCM_ARG1
, port
);
2132 enc
= scm_i_scan_for_encoding (port
);
2137 s_enc
= scm_from_locale_string (enc
);
2146 /* Per-port read options.
2148 We store per-port read options in the 'port-read-options' key of the
2149 port's alist, which is stored in 'scm_i_port_weak_hash'. The value
2150 stored in the alist is a single integer that contains a two-bit field
2151 for each read option.
2153 If a bit field contains READ_OPTION_INHERIT (3), that indicates that
2154 the applicable value should be inherited from the corresponding
2155 global read option. Otherwise, the bit field contains the value of
2156 the read option. For boolean read options that have been set
2157 per-port, the possible values are 0 or 1. If the 'keyword_style'
2158 read option has been set per-port, its possible values are those in
2159 'enum t_keyword_style'. */
2161 /* Key to read options in per-port alists. */
2162 SCM_SYMBOL (sym_port_read_options
, "port-read-options");
2164 /* Offsets of bit fields for each per-port override */
2165 #define READ_OPTION_COPY_SOURCE_P 0
2166 #define READ_OPTION_RECORD_POSITIONS_P 2
2167 #define READ_OPTION_CASE_INSENSITIVE_P 4
2168 #define READ_OPTION_KEYWORD_STYLE 6
2169 #define READ_OPTION_R6RS_ESCAPES_P 8
2170 #define READ_OPTION_SQUARE_BRACKETS_P 10
2171 #define READ_OPTION_HUNGRY_EOL_ESCAPES_P 12
2172 #define READ_OPTION_CURLY_INFIX_P 14
2174 /* The total width in bits of the per-port overrides */
2175 #define READ_OPTIONS_NUM_BITS 16
2177 #define READ_OPTIONS_INHERIT_ALL ((1UL << READ_OPTIONS_NUM_BITS) - 1)
2178 #define READ_OPTIONS_MAX_VALUE READ_OPTIONS_INHERIT_ALL
2180 #define READ_OPTION_MASK 3
2181 #define READ_OPTION_INHERIT 3
2184 set_port_read_option (SCM port
, int option
, int new_value
)
2186 SCM alist
, scm_read_options
;
2187 unsigned int read_options
;
2189 new_value
&= READ_OPTION_MASK
;
2190 scm_i_scm_pthread_mutex_lock (&scm_i_port_table_mutex
);
2191 alist
= scm_hashq_ref (scm_i_port_weak_hash
, port
, SCM_BOOL_F
);
2192 scm_read_options
= scm_assq_ref (alist
, sym_port_read_options
);
2193 if (scm_is_unsigned_integer (scm_read_options
, 0, READ_OPTIONS_MAX_VALUE
))
2194 read_options
= scm_to_uint (scm_read_options
);
2196 read_options
= READ_OPTIONS_INHERIT_ALL
;
2197 read_options
&= ~(READ_OPTION_MASK
<< option
);
2198 read_options
|= new_value
<< option
;
2199 scm_read_options
= scm_from_uint (read_options
);
2200 alist
= scm_assq_set_x (alist
, sym_port_read_options
, scm_read_options
);
2201 scm_hashq_set_x (scm_i_port_weak_hash
, port
, alist
);
2202 scm_i_pthread_mutex_unlock (&scm_i_port_table_mutex
);
2205 /* Set OPTS and PORT's case-insensitivity according to VALUE. */
2207 set_port_case_insensitive_p (SCM port
, scm_t_read_opts
*opts
, int value
)
2210 opts
->case_insensitive_p
= value
;
2211 set_port_read_option (port
, READ_OPTION_CASE_INSENSITIVE_P
, value
);
2214 /* Set OPTS and PORT's square_brackets_p option according to VALUE. */
2216 set_port_square_brackets_p (SCM port
, scm_t_read_opts
*opts
, int value
)
2219 opts
->square_brackets_p
= value
;
2220 set_port_read_option (port
, READ_OPTION_SQUARE_BRACKETS_P
, value
);
2223 /* Set OPTS and PORT's curly_infix_p option according to VALUE. */
2225 set_port_curly_infix_p (SCM port
, scm_t_read_opts
*opts
, int value
)
2228 opts
->curly_infix_p
= value
;
2229 set_port_read_option (port
, READ_OPTION_CURLY_INFIX_P
, value
);
2232 /* Initialize OPTS based on PORT's read options and the global read
2235 init_read_options (SCM port
, scm_t_read_opts
*opts
)
2237 SCM alist
, val
, scm_read_options
;
2238 unsigned int read_options
, x
;
2240 scm_i_scm_pthread_mutex_lock (&scm_i_port_table_mutex
);
2241 alist
= scm_hashq_ref (scm_i_port_weak_hash
, port
, SCM_BOOL_F
);
2242 scm_read_options
= scm_assq_ref (alist
, sym_port_read_options
);
2243 scm_i_pthread_mutex_unlock (&scm_i_port_table_mutex
);
2245 if (scm_is_unsigned_integer (scm_read_options
, 0, READ_OPTIONS_MAX_VALUE
))
2246 read_options
= scm_to_uint (scm_read_options
);
2248 read_options
= READ_OPTIONS_INHERIT_ALL
;
2250 x
= READ_OPTION_MASK
& (read_options
>> READ_OPTION_KEYWORD_STYLE
);
2251 if (x
== READ_OPTION_INHERIT
)
2253 val
= SCM_PACK (SCM_KEYWORD_STYLE
);
2254 if (scm_is_eq (val
, scm_keyword_prefix
))
2255 x
= KEYWORD_STYLE_PREFIX
;
2256 else if (scm_is_eq (val
, scm_keyword_postfix
))
2257 x
= KEYWORD_STYLE_POSTFIX
;
2259 x
= KEYWORD_STYLE_HASH_PREFIX
;
2261 opts
->keyword_style
= x
;
2263 #define RESOLVE_BOOLEAN_OPTION(NAME, name) \
2266 x = READ_OPTION_MASK & (read_options >> READ_OPTION_ ## NAME); \
2267 if (x == READ_OPTION_INHERIT) \
2268 x = !!SCM_ ## NAME; \
2273 RESOLVE_BOOLEAN_OPTION (COPY_SOURCE_P
, copy_source_p
);
2274 RESOLVE_BOOLEAN_OPTION (RECORD_POSITIONS_P
, record_positions_p
);
2275 RESOLVE_BOOLEAN_OPTION (CASE_INSENSITIVE_P
, case_insensitive_p
);
2276 RESOLVE_BOOLEAN_OPTION (R6RS_ESCAPES_P
, r6rs_escapes_p
);
2277 RESOLVE_BOOLEAN_OPTION (SQUARE_BRACKETS_P
, square_brackets_p
);
2278 RESOLVE_BOOLEAN_OPTION (HUNGRY_EOL_ESCAPES_P
, hungry_eol_escapes_p
);
2279 RESOLVE_BOOLEAN_OPTION (CURLY_INFIX_P
, curly_infix_p
);
2281 #undef RESOLVE_BOOLEAN_OPTION
2283 opts
->neoteric_p
= 0;
2289 SCM read_hash_procs
;
2291 read_hash_procs
= scm_make_fluid_with_default (SCM_EOL
);
2293 scm_i_read_hash_procedures
=
2294 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs
));
2296 scm_init_opts (scm_read_options
, scm_read_opts
);
2297 #include "libguile/read.x"