1 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
2 * 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
34 #include "libguile/_scm.h"
35 #include "libguile/bytevectors.h"
36 #include "libguile/chars.h"
37 #include "libguile/eval.h"
38 #include "libguile/arrays.h"
39 #include "libguile/bitvectors.h"
40 #include "libguile/keywords.h"
41 #include "libguile/alist.h"
42 #include "libguile/srcprop.h"
43 #include "libguile/hashtab.h"
44 #include "libguile/hash.h"
45 #include "libguile/ports.h"
46 #include "libguile/fports.h"
47 #include "libguile/root.h"
48 #include "libguile/strings.h"
49 #include "libguile/strports.h"
50 #include "libguile/vectors.h"
51 #include "libguile/validate.h"
52 #include "libguile/srfi-4.h"
53 #include "libguile/srfi-13.h"
55 #include "libguile/read.h"
56 #include "libguile/private-options.h"
61 SCM_GLOBAL_SYMBOL (scm_sym_dot
, ".");
62 SCM_SYMBOL (scm_keyword_prefix
, "prefix");
63 SCM_SYMBOL (scm_keyword_postfix
, "postfix");
64 SCM_SYMBOL (sym_nil
, "nil");
66 /* SRFI-105 curly infix expression support */
67 SCM_SYMBOL (sym_nfx
, "$nfx$");
68 SCM_SYMBOL (sym_bracket_list
, "$bracket-list$");
69 SCM_SYMBOL (sym_bracket_apply
, "$bracket-apply$");
71 scm_t_option scm_read_opts
[] =
73 { SCM_OPTION_BOOLEAN
, "copy", 0,
74 "Copy source code expressions." },
75 { SCM_OPTION_BOOLEAN
, "positions", 1,
76 "Record positions of source code expressions." },
77 { SCM_OPTION_BOOLEAN
, "case-insensitive", 0,
78 "Convert symbols to lower case."},
79 { SCM_OPTION_SCM
, "keywords", (scm_t_bits
) SCM_BOOL_F_BITS
,
80 "Style of keyword recognition: #f, 'prefix or 'postfix."},
81 { SCM_OPTION_BOOLEAN
, "r6rs-hex-escapes", 0,
82 "Use R6RS variable-length character and string hex escapes."},
83 { SCM_OPTION_BOOLEAN
, "square-brackets", 1,
84 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
85 { SCM_OPTION_BOOLEAN
, "hungry-eol-escapes", 0,
86 "In strings, consume leading whitespace after an escaped end-of-line."},
87 { SCM_OPTION_BOOLEAN
, "curly-infix", 0,
88 "Support SRFI-105 curly infix expressions."},
92 /* Internal read options structure. This is initialized by 'scm_read'
93 from the global and per-port read options, and a pointer is passed
94 down to all helper functions. */
98 KEYWORD_STYLE_HASH_PREFIX
,
100 KEYWORD_STYLE_POSTFIX
105 enum t_keyword_style keyword_style
;
106 unsigned int copy_source_p
: 1;
107 unsigned int record_positions_p
: 1;
108 unsigned int case_insensitive_p
: 1;
109 unsigned int r6rs_escapes_p
: 1;
110 unsigned int square_brackets_p
: 1;
111 unsigned int hungry_eol_escapes_p
: 1;
112 unsigned int curly_infix_p
: 1;
113 unsigned int neoteric_p
: 1;
116 typedef struct t_read_opts scm_t_read_opts
;
120 Give meaningful error messages for errors
124 FILE:LINE:COL: MESSAGE
125 This happened in ....
127 This is not standard GNU format, but the test-suite likes the real
128 message to be in front.
134 scm_i_input_error (char const *function
,
135 SCM port
, const char *message
, SCM arg
)
137 SCM fn
= (scm_is_string (SCM_FILENAME(port
))
139 : scm_from_locale_string ("#<unknown port>"));
141 SCM string_port
= scm_open_output_string ();
142 SCM string
= SCM_EOL
;
143 scm_simple_format (string_port
,
144 scm_from_locale_string ("~A:~S:~S: ~A"),
146 scm_from_long (SCM_LINUM (port
) + 1),
147 scm_from_int (SCM_COL (port
) + 1),
148 scm_from_locale_string (message
)));
150 string
= scm_get_output_string (string_port
);
151 scm_close_output_port (string_port
);
152 scm_error_scm (scm_from_latin1_symbol ("read-error"),
153 function
? scm_from_locale_string (function
) : SCM_BOOL_F
,
160 SCM_DEFINE (scm_read_options
, "read-options-interface", 0, 1, 0,
162 "Option interface for the read options. Instead of using\n"
163 "this procedure directly, use the procedures @code{read-enable},\n"
164 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
165 #define FUNC_NAME s_scm_read_options
167 SCM ans
= scm_options (setting
,
170 if (SCM_COPY_SOURCE_P
)
171 SCM_RECORD_POSITIONS_P
= 1;
176 /* A fluid referring to an association list mapping extra hash
177 characters to procedures. */
178 static SCM
*scm_i_read_hash_procedures
;
181 scm_i_read_hash_procedures_ref (void)
183 return scm_fluid_ref (*scm_i_read_hash_procedures
);
187 scm_i_read_hash_procedures_set_x (SCM value
)
189 scm_fluid_set_x (*scm_i_read_hash_procedures
, value
);
196 /* Size of the C buffer used to read symbols and numbers. */
197 #define READER_BUFFER_SIZE 128
199 /* Number of 32-bit codepoints in the buffer used to read strings. */
200 #define READER_STRING_BUFFER_SIZE 128
202 /* The maximum size of Scheme character names. */
203 #define READER_CHAR_NAME_MAX_SIZE 50
205 /* The maximum size of reader directive names. */
206 #define READER_DIRECTIVE_NAME_MAX_SIZE 50
209 /* `isblank' is only in C99. */
210 #define CHAR_IS_BLANK_(_chr) \
211 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
212 || ((_chr) == '\f') || ((_chr) == '\r'))
215 # define CHAR_IS_BLANK(_chr) \
216 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
218 # define CHAR_IS_BLANK CHAR_IS_BLANK_
222 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
224 #define CHAR_IS_R5RS_DELIMITER(c) \
226 || (c) == ')' || (c) == '(' || (c) == ';' || (c) == '"')
228 #define CHAR_IS_DELIMITER(c) \
229 (CHAR_IS_R5RS_DELIMITER (c) \
230 || (((c) == ']' || (c) == '[') && (opts->square_brackets_p \
231 || opts->curly_infix_p)) \
232 || (((c) == '}' || (c) == '{') && opts->curly_infix_p))
234 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
236 #define CHAR_IS_EXPONENT_MARKER(_chr) \
237 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
238 || ((_chr) == 'd') || ((_chr) == 'l'))
240 /* Read an SCSH block comment. */
241 static SCM
scm_read_scsh_block_comment (scm_t_wchar
, SCM
);
242 static SCM
scm_read_r6rs_block_comment (scm_t_wchar
, SCM
);
243 static SCM
scm_read_commented_expression (scm_t_wchar
, SCM
, scm_t_read_opts
*);
244 static SCM
scm_read_shebang (scm_t_wchar
, SCM
, scm_t_read_opts
*);
245 static SCM
scm_get_hash_procedure (int);
247 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
248 result in the pre-allocated buffer BUF. Return zero if the whole token has
249 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
250 bytes actually read. */
252 read_token (SCM port
, scm_t_read_opts
*opts
,
253 char *buf
, size_t buf_size
, size_t *read
)
257 while (*read
< buf_size
)
261 chr
= scm_get_byte_or_eof_unlocked (port
);
265 else if (CHAR_IS_DELIMITER (chr
))
267 scm_unget_byte_unlocked (chr
, port
);
280 /* Like `read_token', but return either BUFFER, or a GC-allocated buffer
281 if the token doesn't fit in BUFFER_SIZE bytes. */
283 read_complete_token (SCM port
, scm_t_read_opts
*opts
,
284 char *buffer
, size_t buffer_size
, size_t *read
)
287 size_t bytes_read
, overflow_size
= 0;
288 char *overflow_buffer
= NULL
;
292 overflow
= read_token (port
, opts
, buffer
, buffer_size
, &bytes_read
);
295 if (overflow
|| overflow_size
!= 0)
297 if (overflow_size
== 0)
299 overflow_buffer
= scm_gc_malloc_pointerless (bytes_read
, "read");
300 memcpy (overflow_buffer
, buffer
, bytes_read
);
301 overflow_size
= bytes_read
;
306 scm_gc_malloc_pointerless (overflow_size
+ bytes_read
, "read");
308 memcpy (new_buf
, overflow_buffer
, overflow_size
);
309 memcpy (new_buf
+ overflow_size
, buffer
, bytes_read
);
311 overflow_buffer
= new_buf
;
312 overflow_size
+= bytes_read
;
319 *read
= overflow_size
;
323 return (overflow_size
> 0 ? overflow_buffer
: buffer
);
326 /* Skip whitespace from PORT and return the first non-whitespace character
327 read. Raise an error on end-of-file. */
329 flush_ws (SCM port
, scm_t_read_opts
*opts
, const char *eoferr
)
333 switch (c
= scm_getc_unlocked (port
))
339 scm_i_input_error (eoferr
,
348 switch (c
= scm_getc_unlocked (port
))
354 case SCM_LINE_INCREMENTORS
:
360 switch (c
= scm_getc_unlocked (port
))
363 eoferr
= "read_sharp";
366 scm_read_shebang (c
, port
, opts
);
369 scm_read_commented_expression (c
, port
, opts
);
372 if (scm_is_false (scm_get_hash_procedure (c
)))
374 scm_read_r6rs_block_comment (c
, port
);
379 scm_ungetc_unlocked (c
, port
);
384 case SCM_LINE_INCREMENTORS
:
385 case SCM_SINGLE_SPACES
:
400 static SCM
scm_read_expression (SCM port
, scm_t_read_opts
*opts
);
401 static SCM
scm_read_sharp (int chr
, SCM port
, scm_t_read_opts
*opts
,
402 long line
, int column
);
406 maybe_annotate_source (SCM x
, SCM port
, scm_t_read_opts
*opts
,
407 long line
, int column
)
409 if (opts
->record_positions_p
)
410 scm_i_set_source_properties_x (x
, line
, column
, SCM_FILENAME (port
));
415 scm_read_sexp (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
416 #define FUNC_NAME "scm_i_lreadparen"
419 SCM tmp
, tl
, ans
= SCM_EOL
;
420 const int curly_list_p
= (chr
== '{') && opts
->curly_infix_p
;
421 const int terminating_char
= ((chr
== '{') ? '}'
422 : ((chr
== '[') ? ']'
425 /* Need to capture line and column numbers here. */
426 long line
= SCM_LINUM (port
);
427 int column
= SCM_COL (port
) - 1;
429 c
= flush_ws (port
, opts
, FUNC_NAME
);
430 if (terminating_char
== c
)
433 scm_ungetc_unlocked (c
, port
);
434 tmp
= scm_read_expression (port
, opts
);
436 /* Note that it is possible for scm_read_expression to return
437 scm_sym_dot, but not as part of a dotted pair: as in #{.}#. So
438 check that it's a real dot by checking `c'. */
439 if (c
== '.' && scm_is_eq (scm_sym_dot
, tmp
))
441 ans
= scm_read_expression (port
, opts
);
442 if (terminating_char
!= (c
= flush_ws (port
, opts
, FUNC_NAME
)))
443 scm_i_input_error (FUNC_NAME
, port
, "missing close paren",
448 /* Build the head of the list structure. */
449 ans
= tl
= scm_cons (tmp
, SCM_EOL
);
451 while (terminating_char
!= (c
= flush_ws (port
, opts
, FUNC_NAME
)))
455 if (c
== ')' || (c
== ']' && opts
->square_brackets_p
)
456 || ((c
== '}' || c
== ']') && opts
->curly_infix_p
))
457 scm_i_input_error (FUNC_NAME
, port
,
458 "in pair: mismatched close paren: ~A",
459 scm_list_1 (SCM_MAKE_CHAR (c
)));
461 scm_ungetc_unlocked (c
, port
);
462 tmp
= scm_read_expression (port
, opts
);
464 /* See above note about scm_sym_dot. */
465 if (c
== '.' && scm_is_eq (scm_sym_dot
, tmp
))
467 SCM_SETCDR (tl
, scm_read_expression (port
, opts
));
469 c
= flush_ws (port
, opts
, FUNC_NAME
);
470 if (terminating_char
!= c
)
471 scm_i_input_error (FUNC_NAME
, port
,
472 "in pair: missing close paren", SCM_EOL
);
476 new_tail
= scm_cons (tmp
, SCM_EOL
);
477 SCM_SETCDR (tl
, new_tail
);
483 /* In addition to finding the length, 'scm_ilength' checks for
484 improper or circular lists, in which case it returns -1. */
485 int len
= scm_ilength (ans
);
487 /* The (len == 0) case is handled above */
489 /* Return directly to avoid re-annotating the element's source
490 location with the position of the outer brace. Also, it
491 might not be possible to annotate the element. */
492 return scm_car (ans
); /* {e} => e */
494 ; /* Leave the list unchanged: {e1 e2} => (e1 e2) */
495 else if (len
>= 3 && (len
& 1))
497 /* It's a proper list whose length is odd and at least 3. If
498 the elements at odd indices (the infix operator positions)
499 are all 'equal?', then it's a simple curly-infix list.
500 Otherwise it's a mixed curly-infix list. */
501 SCM op
= scm_cadr (ans
);
503 /* Check to see if the elements at odd indices are 'equal?' */
504 for (tl
= scm_cdddr (ans
); ; tl
= scm_cddr (tl
))
506 if (scm_is_null (tl
))
508 /* Convert simple curly-infix list to prefix:
509 {a <op> b <op> ...} => (<op> a b ...) */
511 while (scm_is_pair (scm_cdr (tl
)))
514 SCM_SETCDR (tl
, tmp
);
517 ans
= scm_cons (op
, ans
);
520 else if (scm_is_false (scm_equal_p (op
, scm_car (tl
))))
522 /* Mixed curly-infix list: {e ...} => ($nfx$ e ...) */
523 ans
= scm_cons (sym_nfx
, ans
);
529 /* Mixed curly-infix (possibly improper) list:
530 {e . tail} => ($nfx$ e . tail) */
531 ans
= scm_cons (sym_nfx
, ans
);
534 return maybe_annotate_source (ans
, port
, opts
, line
, column
);
539 /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
540 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
542 #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
548 while (i < ndigits) \
550 a = scm_getc_unlocked (port); \
554 && (a == (scm_t_wchar) terminator) \
557 if ('0' <= a && a <= '9') \
559 else if ('A' <= a && a <= 'F') \
561 else if ('a' <= a && a <= 'f') \
574 skip_intraline_whitespace (SCM port
)
580 c
= scm_getc_unlocked (port
);
584 while (c
== '\t' || uc_is_general_category (c
, UC_SPACE_SEPARATOR
));
586 scm_ungetc_unlocked (c
, port
);
590 scm_read_string (int chr
, SCM port
, scm_t_read_opts
*opts
)
591 #define FUNC_NAME "scm_lreadr"
593 /* For strings smaller than C_STR, this function creates only one Scheme
594 object (the string returned). */
597 size_t c_str_len
= 0;
598 scm_t_wchar c
, c_str
[READER_STRING_BUFFER_SIZE
];
600 /* Need to capture line and column numbers here. */
601 long line
= SCM_LINUM (port
);
602 int column
= SCM_COL (port
) - 1;
604 while ('"' != (c
= scm_getc_unlocked (port
)))
609 scm_i_input_error (FUNC_NAME
, port
,
610 "end of file in string constant", SCM_EOL
);
613 if (c_str_len
+ 1 >= READER_STRING_BUFFER_SIZE
)
615 str
= scm_cons (scm_from_utf32_stringn (c_str
, c_str_len
), str
);
621 switch (c
= scm_getc_unlocked (port
))
629 if (opts
->hungry_eol_escapes_p
)
630 skip_intraline_whitespace (port
);
657 if (opts
->r6rs_escapes_p
)
658 SCM_READ_HEX_ESCAPE (10, ';');
660 SCM_READ_HEX_ESCAPE (2, '\0');
663 if (!opts
->r6rs_escapes_p
)
665 SCM_READ_HEX_ESCAPE (4, '\0');
669 if (!opts
->r6rs_escapes_p
)
671 SCM_READ_HEX_ESCAPE (6, '\0');
676 scm_i_input_error (FUNC_NAME
, port
,
677 "illegal character in escape sequence: ~S",
678 scm_list_1 (SCM_MAKE_CHAR (c
)));
682 c_str
[c_str_len
++] = c
;
685 if (scm_is_null (str
))
686 /* Fast path: we got a string that fits in C_STR. */
687 str
= scm_from_utf32_stringn (c_str
, c_str_len
);
691 str
= scm_cons (scm_from_utf32_stringn (c_str
, c_str_len
), str
);
693 str
= scm_string_concatenate_reverse (str
, SCM_UNDEFINED
, SCM_UNDEFINED
);
696 return maybe_annotate_source (str
, port
, opts
, line
, column
);
702 scm_read_number (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
704 SCM result
, str
= SCM_EOL
;
705 char local_buffer
[READER_BUFFER_SIZE
], *buffer
;
708 /* Need to capture line and column numbers here. */
709 long line
= SCM_LINUM (port
);
710 int column
= SCM_COL (port
) - 1;
712 scm_ungetc_unlocked (chr
, port
);
713 buffer
= read_complete_token (port
, opts
, local_buffer
, sizeof local_buffer
,
716 str
= scm_from_port_stringn (buffer
, bytes_read
, port
);
718 result
= scm_string_to_number (str
, SCM_UNDEFINED
);
719 if (scm_is_false (result
))
721 /* Return a symbol instead of a number */
722 if (opts
->case_insensitive_p
)
723 str
= scm_string_downcase_x (str
);
724 result
= scm_string_to_symbol (str
);
726 else if (SCM_NIMP (result
))
727 result
= maybe_annotate_source (result
, port
, opts
, line
, column
);
729 SCM_COL (port
) += scm_i_string_length (str
);
734 scm_read_mixed_case_symbol (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
737 int ends_with_colon
= 0;
739 int postfix
= (opts
->keyword_style
== KEYWORD_STYLE_POSTFIX
);
740 char local_buffer
[READER_BUFFER_SIZE
], *buffer
;
743 scm_ungetc_unlocked (chr
, port
);
744 buffer
= read_complete_token (port
, opts
, local_buffer
, sizeof local_buffer
,
747 ends_with_colon
= buffer
[bytes_read
- 1] == ':';
749 if (postfix
&& ends_with_colon
&& (bytes_read
> 1))
751 str
= scm_from_port_stringn (buffer
, bytes_read
- 1, port
);
753 if (opts
->case_insensitive_p
)
754 str
= scm_string_downcase_x (str
);
755 result
= scm_symbol_to_keyword (scm_string_to_symbol (str
));
759 str
= scm_from_port_stringn (buffer
, bytes_read
, port
);
761 if (opts
->case_insensitive_p
)
762 str
= scm_string_downcase_x (str
);
763 result
= scm_string_to_symbol (str
);
766 SCM_COL (port
) += scm_i_string_length (str
);
771 scm_read_number_and_radix (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
772 #define FUNC_NAME "scm_lreadr"
776 char local_buffer
[READER_BUFFER_SIZE
], *buffer
;
803 scm_ungetc_unlocked (chr
, port
);
804 scm_ungetc_unlocked ('#', port
);
808 buffer
= read_complete_token (port
, opts
, local_buffer
, sizeof local_buffer
,
811 str
= scm_from_port_stringn (buffer
, read
, port
);
813 result
= scm_string_to_number (str
, scm_from_uint (radix
));
815 SCM_COL (port
) += scm_i_string_length (str
);
817 if (scm_is_true (result
))
820 scm_i_input_error (FUNC_NAME
, port
, "unknown # object", SCM_EOL
);
827 scm_read_quote (int chr
, SCM port
, scm_t_read_opts
*opts
)
830 long line
= SCM_LINUM (port
);
831 int column
= SCM_COL (port
) - 1;
836 p
= scm_sym_quasiquote
;
847 c
= scm_getc_unlocked (port
);
849 p
= scm_sym_uq_splicing
;
852 scm_ungetc_unlocked (c
, port
);
859 fprintf (stderr
, "%s: unhandled quote character (%i)\n",
860 "scm_read_quote", chr
);
864 p
= scm_cons2 (p
, scm_read_expression (port
, opts
), SCM_EOL
);
865 return maybe_annotate_source (p
, port
, opts
, line
, column
);
868 SCM_SYMBOL (sym_syntax
, "syntax");
869 SCM_SYMBOL (sym_quasisyntax
, "quasisyntax");
870 SCM_SYMBOL (sym_unsyntax
, "unsyntax");
871 SCM_SYMBOL (sym_unsyntax_splicing
, "unsyntax-splicing");
874 scm_read_syntax (int chr
, SCM port
, scm_t_read_opts
*opts
)
877 long line
= SCM_LINUM (port
);
878 int column
= SCM_COL (port
) - 1;
894 c
= scm_getc_unlocked (port
);
896 p
= sym_unsyntax_splicing
;
899 scm_ungetc_unlocked (c
, port
);
906 fprintf (stderr
, "%s: unhandled syntax character (%i)\n",
907 "scm_read_syntax", chr
);
911 p
= scm_cons2 (p
, scm_read_expression (port
, opts
), SCM_EOL
);
912 return maybe_annotate_source (p
, port
, opts
, line
, column
);
916 scm_read_nil (int chr
, SCM port
, scm_t_read_opts
*opts
)
918 SCM id
= scm_read_mixed_case_symbol (chr
, port
, opts
);
920 if (!scm_is_eq (id
, sym_nil
))
921 scm_i_input_error ("scm_read_nil", port
,
922 "unexpected input while reading #nil: ~a",
925 return SCM_ELISP_NIL
;
929 scm_read_semicolon_comment (int chr
, SCM port
)
933 /* We use the get_byte here because there is no need to get the
934 locale correct with comment input. This presumes that newline
935 always represents itself no matter what the encoding is. */
936 for (c
= scm_get_byte_or_eof_unlocked (port
);
937 (c
!= EOF
) && (c
!= '\n');
938 c
= scm_get_byte_or_eof_unlocked (port
));
940 return SCM_UNSPECIFIED
;
944 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
947 scm_read_boolean (int chr
, SCM port
)
960 return SCM_UNSPECIFIED
;
964 scm_read_character (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
965 #define FUNC_NAME "scm_lreadr"
967 char buffer
[READER_CHAR_NAME_MAX_SIZE
];
969 size_t charname_len
, bytes_read
;
974 overflow
= read_token (port
, opts
, buffer
, READER_CHAR_NAME_MAX_SIZE
,
977 scm_i_input_error (FUNC_NAME
, port
, "character name too long", SCM_EOL
);
981 chr
= scm_getc_unlocked (port
);
983 scm_i_input_error (FUNC_NAME
, port
, "unexpected end of file "
984 "while reading character", SCM_EOL
);
986 /* CHR must be a token delimiter, like a whitespace. */
987 return (SCM_MAKE_CHAR (chr
));
990 pt
= SCM_PTAB_ENTRY (port
);
992 /* Simple ASCII characters can be processed immediately. Also, simple
993 ISO-8859-1 characters can be processed immediately if the encoding for this
994 port is ISO-8859-1. */
995 if (bytes_read
== 1 && ((unsigned char) buffer
[0] <= 127 || pt
->encoding
== NULL
))
998 return SCM_MAKE_CHAR (buffer
[0]);
1001 /* Otherwise, convert the buffer into a proper scheme string for
1003 charname
= scm_from_port_stringn (buffer
, bytes_read
, port
);
1004 charname_len
= scm_i_string_length (charname
);
1005 SCM_COL (port
) += charname_len
;
1006 cp
= scm_i_string_ref (charname
, 0);
1007 if (charname_len
== 1)
1008 return SCM_MAKE_CHAR (cp
);
1010 /* Ignore dotted circles, which may be used to keep combining characters from
1011 combining with the backslash in #\charname. */
1012 if (cp
== SCM_CODEPOINT_DOTTED_CIRCLE
&& charname_len
== 2)
1013 return SCM_MAKE_CHAR (scm_i_string_ref (charname
, 1));
1015 if (cp
>= '0' && cp
< '8')
1017 /* Dirk:FIXME:: This type of character syntax is not R5RS
1018 * compliant. Further, it should be verified that the constant
1019 * does only consist of octal digits. */
1020 SCM p
= scm_string_to_number (charname
, scm_from_uint (8));
1021 if (SCM_I_INUMP (p
))
1023 scm_t_wchar c
= scm_to_uint32 (p
);
1024 if (SCM_IS_UNICODE_CHAR (c
))
1025 return SCM_MAKE_CHAR (c
);
1027 scm_i_input_error (FUNC_NAME
, port
,
1028 "out-of-range octal character escape: ~a",
1029 scm_list_1 (charname
));
1033 if (cp
== 'x' && (charname_len
> 1))
1037 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1038 p
= scm_string_to_number (scm_c_substring (charname
, 1, charname_len
),
1039 scm_from_uint (16));
1040 if (SCM_I_INUMP (p
))
1042 scm_t_wchar c
= scm_to_uint32 (p
);
1043 if (SCM_IS_UNICODE_CHAR (c
))
1044 return SCM_MAKE_CHAR (c
);
1046 scm_i_input_error (FUNC_NAME
, port
,
1047 "out-of-range hex character escape: ~a",
1048 scm_list_1 (charname
));
1052 /* The names of characters should never have non-Latin1
1054 if (scm_i_is_narrow_string (charname
)
1055 || scm_i_try_narrow_string (charname
))
1056 { SCM ch
= scm_i_charname_to_char (scm_i_string_chars (charname
),
1058 if (scm_is_true (ch
))
1062 scm_i_input_error (FUNC_NAME
, port
, "unknown character name ~a",
1063 scm_list_1 (charname
));
1065 return SCM_UNSPECIFIED
;
1070 scm_read_keyword (int chr
, SCM port
, scm_t_read_opts
*opts
)
1074 /* Read the symbol that comprises the keyword. Doing this instead of
1075 invoking a specific symbol reader function allows `scm_read_keyword ()'
1076 to adapt to the delimiters currently valid of symbols.
1078 XXX: This implementation allows sloppy syntaxes like `#: key'. */
1079 symbol
= scm_read_expression (port
, opts
);
1080 if (!scm_is_symbol (symbol
))
1081 scm_i_input_error ("scm_read_keyword", port
,
1082 "keyword prefix `~a' not followed by a symbol: ~s",
1083 scm_list_2 (SCM_MAKE_CHAR (chr
), symbol
));
1085 return (scm_symbol_to_keyword (symbol
));
1089 scm_read_vector (int chr
, SCM port
, scm_t_read_opts
*opts
,
1090 long line
, int column
)
1092 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1093 guarantee that it's going to do what we want. After all, this is an
1094 implementation detail of `scm_read_vector ()', not a desirable
1096 return maybe_annotate_source (scm_vector (scm_read_sexp (chr
, port
, opts
)),
1097 port
, opts
, line
, column
);
1100 /* Helper used by scm_read_array */
1102 read_decimal_integer (SCM port
, int c
, ssize_t
*resp
)
1111 c
= scm_getc_unlocked (port
);
1114 while ('0' <= c
&& c
<= '9')
1116 res
= 10*res
+ c
-'0';
1118 c
= scm_getc_unlocked (port
);
1126 /* Read an array. This function can also read vectors and uniform
1127 vectors. Also, the conflict between '#f' and '#f32' and '#f64' is
1130 C is the first character read after the '#'. */
1132 scm_read_array (int c
, SCM port
, scm_t_read_opts
*opts
, long line
, int column
)
1135 scm_t_wchar tag_buf
[8];
1138 SCM tag
, shape
= SCM_BOOL_F
, elements
, array
;
1140 /* XXX - shortcut for ordinary vectors. Shouldn't be necessary but
1141 the array code can not deal with zero-length dimensions yet, and
1142 we want to allow zero-length vectors, of course. */
1144 return scm_read_vector (c
, port
, opts
, line
, column
);
1146 /* Disambiguate between '#f' and uniform floating point vectors. */
1149 c
= scm_getc_unlocked (port
);
1150 if (c
!= '3' && c
!= '6')
1153 scm_ungetc_unlocked (c
, port
);
1159 goto continue_reading_tag
;
1164 c
= read_decimal_integer (port
, c
, &rank
);
1166 scm_i_input_error (NULL
, port
, "array rank must be non-negative",
1171 continue_reading_tag
:
1172 while (c
!= EOF
&& c
!= '(' && c
!= '@' && c
!= ':'
1173 && tag_len
< sizeof tag_buf
/ sizeof tag_buf
[0])
1175 tag_buf
[tag_len
++] = c
;
1176 c
= scm_getc_unlocked (port
);
1182 tag
= scm_string_to_symbol (scm_from_utf32_stringn (tag_buf
, tag_len
));
1183 if (tag_len
== sizeof tag_buf
/ sizeof tag_buf
[0])
1184 scm_i_input_error (NULL
, port
, "invalid array tag, starting with: ~a",
1189 if (c
== '@' || c
== ':')
1195 ssize_t lbnd
= 0, len
= 0;
1200 c
= scm_getc_unlocked (port
);
1201 c
= read_decimal_integer (port
, c
, &lbnd
);
1204 s
= scm_from_ssize_t (lbnd
);
1208 c
= scm_getc_unlocked (port
);
1209 c
= read_decimal_integer (port
, c
, &len
);
1211 scm_i_input_error (NULL
, port
,
1212 "array length must be non-negative",
1215 s
= scm_list_2 (s
, scm_from_ssize_t (lbnd
+len
-1));
1218 shape
= scm_cons (s
, shape
);
1219 } while (c
== '@' || c
== ':');
1221 shape
= scm_reverse_x (shape
, SCM_EOL
);
1224 /* Read nested lists of elements. */
1226 scm_i_input_error (NULL
, port
,
1227 "missing '(' in vector or array literal",
1229 elements
= scm_read_sexp (c
, port
, opts
);
1231 if (scm_is_false (shape
))
1232 shape
= scm_from_ssize_t (rank
);
1233 else if (scm_ilength (shape
) != rank
)
1236 "the number of shape specifications must match the array rank",
1239 /* Handle special print syntax of rank zero arrays; see
1240 scm_i_print_array for a rationale. */
1243 if (!scm_is_pair (elements
))
1244 scm_i_input_error (NULL
, port
,
1245 "too few elements in array literal, need 1",
1247 if (!scm_is_null (SCM_CDR (elements
)))
1248 scm_i_input_error (NULL
, port
,
1249 "too many elements in array literal, want 1",
1251 elements
= SCM_CAR (elements
);
1254 /* Construct array, annotate with source location, and return. */
1255 array
= scm_list_to_typed_array (tag
, shape
, elements
);
1256 return maybe_annotate_source (array
, port
, opts
, line
, column
);
1260 scm_read_srfi4_vector (int chr
, SCM port
, scm_t_read_opts
*opts
,
1261 long line
, int column
)
1263 return scm_read_array (chr
, port
, opts
, line
, column
);
1267 scm_read_bytevector (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
,
1268 long line
, int column
)
1270 chr
= scm_getc_unlocked (port
);
1274 chr
= scm_getc_unlocked (port
);
1278 chr
= scm_getc_unlocked (port
);
1282 return maybe_annotate_source
1283 (scm_u8_list_to_bytevector (scm_read_sexp (chr
, port
, opts
)),
1284 port
, opts
, line
, column
);
1287 scm_i_input_error ("read_bytevector", port
,
1288 "invalid bytevector prefix",
1289 SCM_MAKE_CHAR (chr
));
1290 return SCM_UNSPECIFIED
;
1294 scm_read_guile_bit_vector (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
,
1295 long line
, int column
)
1297 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1298 terribly inefficient but who cares? */
1299 SCM s_bits
= SCM_EOL
;
1301 for (chr
= scm_getc_unlocked (port
);
1302 (chr
!= EOF
) && ((chr
== '0') || (chr
== '1'));
1303 chr
= scm_getc_unlocked (port
))
1305 s_bits
= scm_cons ((chr
== '0') ? SCM_BOOL_F
: SCM_BOOL_T
, s_bits
);
1309 scm_ungetc_unlocked (chr
, port
);
1311 return maybe_annotate_source
1312 (scm_bitvector (scm_reverse_x (s_bits
, SCM_EOL
)),
1313 port
, opts
, line
, column
);
1317 scm_read_scsh_block_comment (scm_t_wchar chr
, SCM port
)
1323 int c
= scm_getc_unlocked (port
);
1326 scm_i_input_error ("skip_block_comment", port
,
1327 "unterminated `#! ... !#' comment", SCM_EOL
);
1331 else if (c
== '#' && bang_seen
)
1337 return SCM_UNSPECIFIED
;
1340 static void set_port_case_insensitive_p (SCM port
, scm_t_read_opts
*opts
,
1342 static void set_port_square_brackets_p (SCM port
, scm_t_read_opts
*opts
,
1344 static void set_port_curly_infix_p (SCM port
, scm_t_read_opts
*opts
,
1348 scm_read_shebang (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
)
1350 char name
[READER_DIRECTIVE_NAME_MAX_SIZE
+ 1];
1354 while (i
<= READER_DIRECTIVE_NAME_MAX_SIZE
)
1356 c
= scm_getc_unlocked (port
);
1358 scm_i_input_error ("skip_block_comment", port
,
1359 "unterminated `#! ... !#' comment", SCM_EOL
);
1360 else if (('a' <= c
&& c
<= 'z') || ('0' <= c
&& c
<= '9') || c
== '-')
1362 else if (CHAR_IS_DELIMITER (c
))
1364 scm_ungetc_unlocked (c
, port
);
1366 if (0 == strcmp ("r6rs", name
))
1367 ; /* Silently ignore */
1368 else if (0 == strcmp ("fold-case", name
))
1369 set_port_case_insensitive_p (port
, opts
, 1);
1370 else if (0 == strcmp ("no-fold-case", name
))
1371 set_port_case_insensitive_p (port
, opts
, 0);
1372 else if (0 == strcmp ("curly-infix", name
))
1373 set_port_curly_infix_p (port
, opts
, 1);
1374 else if (0 == strcmp ("curly-infix-and-bracket-lists", name
))
1376 set_port_curly_infix_p (port
, opts
, 1);
1377 set_port_square_brackets_p (port
, opts
, 0);
1382 return SCM_UNSPECIFIED
;
1386 scm_ungetc_unlocked (c
, port
);
1391 scm_ungetc_unlocked (name
[--i
], port
);
1392 return scm_read_scsh_block_comment (chr
, port
);
1396 scm_read_r6rs_block_comment (scm_t_wchar chr
, SCM port
)
1398 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1399 nested. So care must be taken. */
1400 int nesting_level
= 1;
1402 int a
= scm_getc_unlocked (port
);
1405 scm_i_input_error ("scm_read_r6rs_block_comment", port
,
1406 "unterminated `#| ... |#' comment", SCM_EOL
);
1408 while (nesting_level
> 0)
1410 int b
= scm_getc_unlocked (port
);
1413 scm_i_input_error ("scm_read_r6rs_block_comment", port
,
1414 "unterminated `#| ... |#' comment", SCM_EOL
);
1416 if (a
== '|' && b
== '#')
1421 else if (a
== '#' && b
== '|')
1430 return SCM_UNSPECIFIED
;
1434 scm_read_commented_expression (scm_t_wchar chr
, SCM port
,
1435 scm_t_read_opts
*opts
)
1439 c
= flush_ws (port
, opts
, (char *) NULL
);
1441 scm_i_input_error ("read_commented_expression", port
,
1442 "no expression after #; comment", SCM_EOL
);
1443 scm_ungetc_unlocked (c
, port
);
1444 scm_read_expression (port
, opts
);
1445 return SCM_UNSPECIFIED
;
1449 scm_read_extended_symbol (scm_t_wchar chr
, SCM port
)
1451 /* Guile's extended symbol read syntax looks like this:
1453 #{This is all a symbol name}#
1455 So here, CHR is expected to be `{'. */
1458 SCM buf
= scm_i_make_string (1024, NULL
, 0);
1460 buf
= scm_i_string_start_writing (buf
);
1462 while ((chr
= scm_getc_unlocked (port
)) != EOF
)
1473 scm_i_string_set_x (buf
, len
++, '}');
1479 else if (chr
== '\\')
1481 /* It used to be that print.c would print extended-read-syntax
1482 symbols with backslashes before "non-standard" chars, but
1483 this routine wouldn't do anything with those escapes.
1484 Bummer. What we've done is to change print.c to output
1485 R6RS hex escapes for those characters, relying on the fact
1486 that the extended read syntax would never put a `\' before
1487 an `x'. For now, we just ignore other instances of
1488 backslash in the string. */
1489 switch ((chr
= scm_getc_unlocked (port
)))
1497 SCM_READ_HEX_ESCAPE (10, ';');
1498 scm_i_string_set_x (buf
, len
++, c
);
1506 scm_i_string_stop_writing ();
1507 scm_i_input_error ("scm_read_extended_symbol", port
,
1508 "illegal character in escape sequence: ~S",
1509 scm_list_1 (SCM_MAKE_CHAR (c
)));
1513 scm_i_string_set_x (buf
, len
++, chr
);
1518 scm_i_string_set_x (buf
, len
++, chr
);
1520 if (len
>= scm_i_string_length (buf
) - 2)
1524 scm_i_string_stop_writing ();
1525 addy
= scm_i_make_string (1024, NULL
, 0);
1526 buf
= scm_string_append (scm_list_2 (buf
, addy
));
1528 buf
= scm_i_string_start_writing (buf
);
1533 scm_i_string_stop_writing ();
1535 scm_i_input_error ("scm_read_extended_symbol", port
,
1536 "end of file while reading symbol", SCM_EOL
);
1538 return (scm_string_to_symbol (scm_c_substring (buf
, 0, len
)));
1543 /* Top-level token readers, i.e., dispatchers. */
1546 scm_read_sharp_extension (int chr
, SCM port
, scm_t_read_opts
*opts
)
1550 proc
= scm_get_hash_procedure (chr
);
1551 if (scm_is_true (scm_procedure_p (proc
)))
1553 long line
= SCM_LINUM (port
);
1554 int column
= SCM_COL (port
) - 2;
1557 got
= scm_call_2 (proc
, SCM_MAKE_CHAR (chr
), port
);
1559 if (opts
->record_positions_p
&& SCM_NIMP (got
)
1560 && !scm_i_has_source_properties (got
))
1561 scm_i_set_source_properties_x (got
, line
, column
, SCM_FILENAME (port
));
1566 return SCM_UNSPECIFIED
;
1569 /* The reader for the sharp `#' character. It basically dispatches reads
1570 among the above token readers. */
1572 scm_read_sharp (scm_t_wchar chr
, SCM port
, scm_t_read_opts
*opts
,
1573 long line
, int column
)
1574 #define FUNC_NAME "scm_lreadr"
1578 chr
= scm_getc_unlocked (port
);
1580 result
= scm_read_sharp_extension (chr
, port
, opts
);
1581 if (!scm_is_eq (result
, SCM_UNSPECIFIED
))
1587 return (scm_read_character (chr
, port
, opts
));
1589 return (scm_read_vector (chr
, port
, opts
, line
, column
));
1594 /* This one may return either a boolean or an SRFI-4 vector. */
1595 return (scm_read_srfi4_vector (chr
, port
, opts
, line
, column
));
1597 return (scm_read_bytevector (chr
, port
, opts
, line
, column
));
1599 return (scm_read_guile_bit_vector (chr
, port
, opts
, line
, column
));
1603 return (scm_read_boolean (chr
, port
));
1605 return (scm_read_keyword (chr
, port
, opts
));
1606 case '0': case '1': case '2': case '3': case '4':
1607 case '5': case '6': case '7': case '8': case '9':
1609 return (scm_read_array (chr
, port
, opts
, line
, column
));
1623 return (scm_read_number_and_radix (chr
, port
, opts
));
1625 return (scm_read_extended_symbol (chr
, port
));
1627 return (scm_read_shebang (chr
, port
, opts
));
1629 return (scm_read_commented_expression (chr
, port
, opts
));
1633 return (scm_read_syntax (chr
, port
, opts
));
1635 return (scm_read_nil (chr
, port
, opts
));
1637 result
= scm_read_sharp_extension (chr
, port
, opts
);
1638 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1640 /* To remain compatible with 1.8 and earlier, the following
1641 characters have lower precedence than `read-hash-extend'
1646 return scm_read_r6rs_block_comment (chr
, port
);
1648 scm_i_input_error (FUNC_NAME
, port
, "Unknown # object: ~S",
1649 scm_list_1 (SCM_MAKE_CHAR (chr
)));
1656 return SCM_UNSPECIFIED
;
1661 read_inner_expression (SCM port
, scm_t_read_opts
*opts
)
1662 #define FUNC_NAME "read_inner_expression"
1668 chr
= scm_getc_unlocked (port
);
1672 case SCM_WHITE_SPACES
:
1673 case SCM_LINE_INCREMENTORS
:
1676 (void) scm_read_semicolon_comment (chr
, port
);
1679 if (opts
->curly_infix_p
)
1681 if (opts
->neoteric_p
)
1682 return scm_read_sexp (chr
, port
, opts
);
1687 /* Enable neoteric expressions within curly braces */
1688 opts
->neoteric_p
= 1;
1689 expr
= scm_read_sexp (chr
, port
, opts
);
1690 opts
->neoteric_p
= 0;
1695 return scm_read_mixed_case_symbol (chr
, port
, opts
);
1697 if (opts
->square_brackets_p
)
1698 return scm_read_sexp (chr
, port
, opts
);
1699 else if (opts
->curly_infix_p
)
1701 /* The syntax of neoteric expressions requires that '[' be
1702 a delimiter when curly-infix is enabled, so it cannot
1703 be part of an unescaped symbol. We might as well do
1704 something useful with it, so we adopt Kawa's convention:
1705 [...] => ($bracket-list$ ...) */
1706 long line
= SCM_LINUM (port
);
1707 int column
= SCM_COL (port
) - 1;
1708 return maybe_annotate_source
1709 (scm_cons (sym_bracket_list
, scm_read_sexp (chr
, port
, opts
)),
1710 port
, opts
, line
, column
);
1713 return scm_read_mixed_case_symbol (chr
, port
, opts
);
1715 return (scm_read_sexp (chr
, port
, opts
));
1717 return (scm_read_string (chr
, port
, opts
));
1721 return (scm_read_quote (chr
, port
, opts
));
1724 long line
= SCM_LINUM (port
);
1725 int column
= SCM_COL (port
) - 1;
1726 SCM result
= scm_read_sharp (chr
, port
, opts
, line
, column
);
1727 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1728 /* We read a comment or some such. */
1734 scm_i_input_error (FUNC_NAME
, port
, "unexpected \")\"", SCM_EOL
);
1737 if (opts
->curly_infix_p
)
1738 scm_i_input_error (FUNC_NAME
, port
, "unexpected \"}\"", SCM_EOL
);
1740 return scm_read_mixed_case_symbol (chr
, port
, opts
);
1742 if (opts
->square_brackets_p
)
1743 scm_i_input_error (FUNC_NAME
, port
, "unexpected \"]\"", SCM_EOL
);
1744 /* otherwise fall through */
1748 if (opts
->keyword_style
== KEYWORD_STYLE_PREFIX
)
1749 return scm_symbol_to_keyword (scm_read_expression (port
, opts
));
1754 if (((chr
>= '0') && (chr
<= '9'))
1755 || (strchr ("+-.", chr
)))
1756 return (scm_read_number (chr
, port
, opts
));
1758 return (scm_read_mixed_case_symbol (chr
, port
, opts
));
1766 scm_read_expression (SCM port
, scm_t_read_opts
*opts
)
1767 #define FUNC_NAME "scm_read_expression"
1769 if (!opts
->neoteric_p
)
1770 return read_inner_expression (port
, opts
);
1777 if (opts
->record_positions_p
)
1779 /* We need to get the position of the first non-whitespace
1780 character in order to correctly annotate neoteric
1781 expressions. For example, for the expression 'f(x)', the
1782 first call to 'read_inner_expression' reads the 'f' (which
1783 cannot be annotated), and then we later read the '(x)' and
1784 use it to construct the new list (f x). */
1785 int c
= flush_ws (port
, opts
, (char *) NULL
);
1788 scm_ungetc_unlocked (c
, port
);
1789 line
= SCM_LINUM (port
);
1790 column
= SCM_COL (port
);
1793 expr
= read_inner_expression (port
, opts
);
1795 /* 'expr' is the first component of the neoteric expression. Now
1796 we loop, and as long as the next character is '(', '[', or '{',
1797 (without any intervening whitespace), we use it to construct a
1798 new expression. For example, f{n - 1}(x) => ((f (- n 1)) x). */
1801 int chr
= scm_getc_unlocked (port
);
1804 /* e(...) => (e ...) */
1805 expr
= scm_cons (expr
, scm_read_sexp (chr
, port
, opts
));
1806 else if (chr
== '[')
1807 /* e[...] => ($bracket-apply$ e ...) */
1808 expr
= scm_cons (sym_bracket_apply
,
1810 scm_read_sexp (chr
, port
, opts
)));
1811 else if (chr
== '{')
1813 SCM arg
= scm_read_sexp (chr
, port
, opts
);
1815 if (scm_is_null (arg
))
1816 expr
= scm_list_1 (expr
); /* e{} => (e) */
1818 expr
= scm_list_2 (expr
, arg
); /* e{...} => (e {...}) */
1823 scm_ungetc_unlocked (chr
, port
);
1826 maybe_annotate_source (expr
, port
, opts
, line
, column
);
1834 /* Actual reader. */
1836 static void init_read_options (SCM port
, scm_t_read_opts
*opts
);
1838 SCM_DEFINE (scm_read
, "read", 0, 1, 0,
1840 "Read an s-expression from the input port @var{port}, or from\n"
1841 "the current input port if @var{port} is not specified.\n"
1842 "Any whitespace before the next token is discarded.")
1843 #define FUNC_NAME s_scm_read
1845 scm_t_read_opts opts
;
1848 if (SCM_UNBNDP (port
))
1849 port
= scm_current_input_port ();
1850 SCM_VALIDATE_OPINPORT (1, port
);
1852 init_read_options (port
, &opts
);
1854 c
= flush_ws (port
, &opts
, (char *) NULL
);
1857 scm_ungetc_unlocked (c
, port
);
1859 return (scm_read_expression (port
, &opts
));
1866 /* Manipulate the read-hash-procedures alist. This could be written in
1867 Scheme, but maybe it will also be used by C code during initialisation. */
1868 SCM_DEFINE (scm_read_hash_extend
, "read-hash-extend", 2, 0, 0,
1869 (SCM chr
, SCM proc
),
1870 "Install the procedure @var{proc} for reading expressions\n"
1871 "starting with the character sequence @code{#} and @var{chr}.\n"
1872 "@var{proc} will be called with two arguments: the character\n"
1873 "@var{chr} and the port to read further data from. The object\n"
1874 "returned will be the return value of @code{read}. \n"
1875 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1877 #define FUNC_NAME s_scm_read_hash_extend
1882 SCM_VALIDATE_CHAR (1, chr
);
1883 SCM_ASSERT (scm_is_false (proc
)
1884 || scm_is_eq (scm_procedure_p (proc
), SCM_BOOL_T
),
1885 proc
, SCM_ARG2
, FUNC_NAME
);
1887 /* Check if chr is already in the alist. */
1888 this = scm_i_read_hash_procedures_ref ();
1892 if (scm_is_null (this))
1894 /* not found, so add it to the beginning. */
1895 if (scm_is_true (proc
))
1897 SCM
new = scm_cons (scm_cons (chr
, proc
),
1898 scm_i_read_hash_procedures_ref ());
1899 scm_i_read_hash_procedures_set_x (new);
1903 if (scm_is_eq (chr
, SCM_CAAR (this)))
1905 /* already in the alist. */
1906 if (scm_is_false (proc
))
1909 if (scm_is_false (prev
))
1911 SCM rest
= SCM_CDR (scm_i_read_hash_procedures_ref ());
1912 scm_i_read_hash_procedures_set_x (rest
);
1915 scm_set_cdr_x (prev
, SCM_CDR (this));
1920 scm_set_cdr_x (SCM_CAR (this), proc
);
1925 this = SCM_CDR (this);
1928 return SCM_UNSPECIFIED
;
1932 /* Recover the read-hash procedure corresponding to char c. */
1934 scm_get_hash_procedure (int c
)
1936 SCM rest
= scm_i_read_hash_procedures_ref ();
1940 if (scm_is_null (rest
))
1943 if (SCM_CHAR (SCM_CAAR (rest
)) == c
)
1944 return SCM_CDAR (rest
);
1946 rest
= SCM_CDR (rest
);
1950 #define SCM_ENCODING_SEARCH_SIZE (500)
1952 /* Search the first few hundred characters of a file for an Emacs-like coding
1953 declaration. Returns either NULL or a string whose storage has been
1954 allocated with `scm_gc_malloc ()'. */
1956 scm_i_scan_for_encoding (SCM port
)
1959 char header
[SCM_ENCODING_SEARCH_SIZE
+1];
1960 size_t bytes_read
, encoding_length
, i
;
1961 char *encoding
= NULL
;
1963 char *pos
, *encoding_start
;
1966 pt
= SCM_PTAB_ENTRY (port
);
1968 if (pt
->rw_active
== SCM_PORT_WRITE
)
1969 scm_flush_unlocked (port
);
1972 pt
->rw_active
= SCM_PORT_READ
;
1974 if (pt
->read_pos
== pt
->read_end
)
1976 /* We can use the read buffer, and thus avoid a seek. */
1977 if (scm_fill_input_unlocked (port
) == EOF
)
1980 bytes_read
= pt
->read_end
- pt
->read_pos
;
1981 if (bytes_read
> SCM_ENCODING_SEARCH_SIZE
)
1982 bytes_read
= SCM_ENCODING_SEARCH_SIZE
;
1984 if (bytes_read
<= 1)
1985 /* An unbuffered port -- don't scan. */
1988 memcpy (header
, pt
->read_pos
, bytes_read
);
1989 header
[bytes_read
] = '\0';
1993 /* Try to read some bytes and then seek back. Not all ports
1994 support seeking back; and indeed some file ports (like
1995 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
1996 check performed by SCM_FPORT_FDES---but fail to seek
1997 backwards. Hence this block comes second. We prefer to use
1998 the read buffer in-place. */
1999 if (SCM_FPORTP (port
) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port
)))
2002 bytes_read
= scm_c_read_unlocked (port
, header
, SCM_ENCODING_SEARCH_SIZE
);
2003 header
[bytes_read
] = '\0';
2004 scm_seek (port
, scm_from_int (0), scm_from_int (SEEK_SET
));
2008 && header
[0] == '\xef' && header
[1] == '\xbb' && header
[2] == '\xbf')
2011 /* search past "coding[:=]" */
2015 if ((pos
= strstr(pos
, "coding")) == NULL
)
2018 pos
+= strlen("coding");
2019 if (pos
- header
>= SCM_ENCODING_SEARCH_SIZE
||
2020 (*pos
== ':' || *pos
== '='))
2028 while (pos
- header
<= SCM_ENCODING_SEARCH_SIZE
&&
2029 (*pos
== ' ' || *pos
== '\t'))
2032 /* grab the next token */
2033 encoding_start
= pos
;
2035 while (encoding_start
+ i
- header
<= SCM_ENCODING_SEARCH_SIZE
2036 && encoding_start
+ i
- header
< bytes_read
2037 && (isalnum ((int) encoding_start
[i
])
2038 || strchr ("_-.:/,+=()", encoding_start
[i
]) != NULL
))
2041 encoding_length
= i
;
2042 if (encoding_length
== 0)
2045 encoding
= scm_gc_strndup (encoding_start
, encoding_length
, "encoding");
2046 for (i
= 0; i
< encoding_length
; i
++)
2047 encoding
[i
] = toupper ((int) encoding
[i
]);
2049 /* push backwards to make sure we were in a comment */
2051 pos
= encoding_start
;
2052 while (pos
>= header
)
2059 else if (*pos
== '\n' || pos
== header
)
2061 /* This wasn't in a semicolon comment. Check for a
2062 hash-bang comment. */
2063 char *beg
= strstr (header
, "#!");
2064 char *end
= strstr (header
, "!#");
2065 if (beg
< encoding_start
&& encoding_start
+ encoding_length
<= end
)
2076 /* This wasn't in a comment */
2079 if (utf8_bom
&& strcmp(encoding
, "UTF-8"))
2080 scm_misc_error (NULL
,
2081 "the port input declares the encoding ~s but is encoded as UTF-8",
2082 scm_list_1 (scm_from_locale_string (encoding
)));
2087 SCM_DEFINE (scm_file_encoding
, "file-encoding", 1, 0, 0,
2089 "Scans the port for an Emacs-like character coding declaration\n"
2090 "near the top of the contents of a port with random-accessible contents.\n"
2091 "The coding declaration is of the form\n"
2092 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
2094 "Returns a string containing the character encoding of the file\n"
2095 "if a declaration was found, or @code{#f} otherwise.\n")
2096 #define FUNC_NAME s_scm_file_encoding
2101 SCM_VALIDATE_OPINPORT (SCM_ARG1
, port
);
2103 enc
= scm_i_scan_for_encoding (port
);
2108 s_enc
= scm_from_locale_string (enc
);
2117 /* Per-port read options.
2119 We store per-port read options in the 'port-read-options' key of the
2120 port's alist. The value stored in the alist is a single integer that
2121 contains a two-bit field for each read option.
2123 If a bit field contains READ_OPTION_INHERIT (3), that indicates that
2124 the applicable value should be inherited from the corresponding
2125 global read option. Otherwise, the bit field contains the value of
2126 the read option. For boolean read options that have been set
2127 per-port, the possible values are 0 or 1. If the 'keyword_style'
2128 read option has been set per-port, its possible values are those in
2129 'enum t_keyword_style'. */
2131 /* Key to read options in per-port alists. */
2132 SCM_SYMBOL (sym_port_read_options
, "port-read-options");
2134 /* Offsets of bit fields for each per-port override */
2135 #define READ_OPTION_COPY_SOURCE_P 0
2136 #define READ_OPTION_RECORD_POSITIONS_P 2
2137 #define READ_OPTION_CASE_INSENSITIVE_P 4
2138 #define READ_OPTION_KEYWORD_STYLE 6
2139 #define READ_OPTION_R6RS_ESCAPES_P 8
2140 #define READ_OPTION_SQUARE_BRACKETS_P 10
2141 #define READ_OPTION_HUNGRY_EOL_ESCAPES_P 12
2142 #define READ_OPTION_CURLY_INFIX_P 14
2144 /* The total width in bits of the per-port overrides */
2145 #define READ_OPTIONS_NUM_BITS 16
2147 #define READ_OPTIONS_INHERIT_ALL ((1UL << READ_OPTIONS_NUM_BITS) - 1)
2148 #define READ_OPTIONS_MAX_VALUE READ_OPTIONS_INHERIT_ALL
2150 #define READ_OPTION_MASK 3
2151 #define READ_OPTION_INHERIT 3
2154 set_port_read_option (SCM port
, int option
, int new_value
)
2156 SCM scm_read_options
;
2157 unsigned int read_options
;
2159 new_value
&= READ_OPTION_MASK
;
2160 scm_read_options
= scm_assq_ref (SCM_PTAB_ENTRY(port
)->alist
,
2161 sym_port_read_options
);
2162 if (scm_is_unsigned_integer (scm_read_options
, 0, READ_OPTIONS_MAX_VALUE
))
2163 read_options
= scm_to_uint (scm_read_options
);
2165 read_options
= READ_OPTIONS_INHERIT_ALL
;
2166 read_options
&= ~(READ_OPTION_MASK
<< option
);
2167 read_options
|= new_value
<< option
;
2168 scm_read_options
= scm_from_uint (read_options
);
2169 SCM_PTAB_ENTRY(port
)->alist
= scm_assq_set_x (SCM_PTAB_ENTRY(port
)->alist
,
2170 sym_port_read_options
,
2174 /* Set OPTS and PORT's case-insensitivity according to VALUE. */
2176 set_port_case_insensitive_p (SCM port
, scm_t_read_opts
*opts
, int value
)
2179 opts
->case_insensitive_p
= value
;
2180 set_port_read_option (port
, READ_OPTION_CASE_INSENSITIVE_P
, value
);
2183 /* Set OPTS and PORT's square_brackets_p option according to VALUE. */
2185 set_port_square_brackets_p (SCM port
, scm_t_read_opts
*opts
, int value
)
2188 opts
->square_brackets_p
= value
;
2189 set_port_read_option (port
, READ_OPTION_SQUARE_BRACKETS_P
, value
);
2192 /* Set OPTS and PORT's curly_infix_p option according to VALUE. */
2194 set_port_curly_infix_p (SCM port
, scm_t_read_opts
*opts
, int value
)
2197 opts
->curly_infix_p
= value
;
2198 set_port_read_option (port
, READ_OPTION_CURLY_INFIX_P
, value
);
2201 /* Initialize OPTS based on PORT's read options and the global read
2204 init_read_options (SCM port
, scm_t_read_opts
*opts
)
2206 SCM val
, scm_read_options
;
2207 unsigned int read_options
, x
;
2209 scm_read_options
= scm_assq_ref (SCM_PTAB_ENTRY(port
)->alist
,
2210 sym_port_read_options
);
2212 if (scm_is_unsigned_integer (scm_read_options
, 0, READ_OPTIONS_MAX_VALUE
))
2213 read_options
= scm_to_uint (scm_read_options
);
2215 read_options
= READ_OPTIONS_INHERIT_ALL
;
2217 x
= READ_OPTION_MASK
& (read_options
>> READ_OPTION_KEYWORD_STYLE
);
2218 if (x
== READ_OPTION_INHERIT
)
2220 val
= SCM_PACK (SCM_KEYWORD_STYLE
);
2221 if (scm_is_eq (val
, scm_keyword_prefix
))
2222 x
= KEYWORD_STYLE_PREFIX
;
2223 else if (scm_is_eq (val
, scm_keyword_postfix
))
2224 x
= KEYWORD_STYLE_POSTFIX
;
2226 x
= KEYWORD_STYLE_HASH_PREFIX
;
2228 opts
->keyword_style
= x
;
2230 #define RESOLVE_BOOLEAN_OPTION(NAME, name) \
2233 x = READ_OPTION_MASK & (read_options >> READ_OPTION_ ## NAME); \
2234 if (x == READ_OPTION_INHERIT) \
2235 x = !!SCM_ ## NAME; \
2240 RESOLVE_BOOLEAN_OPTION (COPY_SOURCE_P
, copy_source_p
);
2241 RESOLVE_BOOLEAN_OPTION (RECORD_POSITIONS_P
, record_positions_p
);
2242 RESOLVE_BOOLEAN_OPTION (CASE_INSENSITIVE_P
, case_insensitive_p
);
2243 RESOLVE_BOOLEAN_OPTION (R6RS_ESCAPES_P
, r6rs_escapes_p
);
2244 RESOLVE_BOOLEAN_OPTION (SQUARE_BRACKETS_P
, square_brackets_p
);
2245 RESOLVE_BOOLEAN_OPTION (HUNGRY_EOL_ESCAPES_P
, hungry_eol_escapes_p
);
2246 RESOLVE_BOOLEAN_OPTION (CURLY_INFIX_P
, curly_infix_p
);
2248 #undef RESOLVE_BOOLEAN_OPTION
2250 opts
->neoteric_p
= 0;
2256 SCM read_hash_procs
;
2258 read_hash_procs
= scm_make_fluid_with_default (SCM_EOL
);
2260 scm_i_read_hash_procedures
=
2261 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs
));
2263 scm_init_opts (scm_read_options
, scm_read_opts
);
2264 #include "libguile/read.x"