1 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
2 * 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
34 #include "libguile/_scm.h"
35 #include "libguile/bytevectors.h"
36 #include "libguile/chars.h"
37 #include "libguile/eval.h"
38 #include "libguile/arrays.h"
39 #include "libguile/bitvectors.h"
40 #include "libguile/keywords.h"
41 #include "libguile/alist.h"
42 #include "libguile/srcprop.h"
43 #include "libguile/hashtab.h"
44 #include "libguile/hash.h"
45 #include "libguile/ports.h"
46 #include "libguile/fports.h"
47 #include "libguile/root.h"
48 #include "libguile/strings.h"
49 #include "libguile/strports.h"
50 #include "libguile/vectors.h"
51 #include "libguile/validate.h"
52 #include "libguile/srfi-4.h"
53 #include "libguile/srfi-13.h"
55 #include "libguile/read.h"
56 #include "libguile/private-options.h"
61 SCM_GLOBAL_SYMBOL (scm_sym_dot
, ".");
62 SCM_SYMBOL (scm_keyword_prefix
, "prefix");
63 SCM_SYMBOL (scm_keyword_postfix
, "postfix");
64 SCM_SYMBOL (sym_nil
, "nil");
66 scm_t_option scm_read_opts
[] = {
67 { SCM_OPTION_BOOLEAN
, "copy", 0,
68 "Copy source code expressions." },
69 { SCM_OPTION_BOOLEAN
, "positions", 1,
70 "Record positions of source code expressions." },
71 { SCM_OPTION_BOOLEAN
, "case-insensitive", 0,
72 "Convert symbols to lower case."},
73 { SCM_OPTION_SCM
, "keywords", (scm_t_bits
) SCM_BOOL_F_BITS
,
74 "Style of keyword recognition: #f, 'prefix or 'postfix."},
75 { SCM_OPTION_BOOLEAN
, "r6rs-hex-escapes", 0,
76 "Use R6RS variable-length character and string hex escapes."},
77 { SCM_OPTION_BOOLEAN
, "square-brackets", 1,
78 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
79 { SCM_OPTION_BOOLEAN
, "hungry-eol-escapes", 0,
80 "In strings, consume leading whitespace after an escaped end-of-line."},
85 Give meaningful error messages for errors
89 FILE:LINE:COL: MESSAGE
92 This is not standard GNU format, but the test-suite likes the real
93 message to be in front.
99 scm_i_input_error (char const *function
,
100 SCM port
, const char *message
, SCM arg
)
102 SCM fn
= (scm_is_string (SCM_FILENAME(port
))
104 : scm_from_locale_string ("#<unknown port>"));
106 SCM string_port
= scm_open_output_string ();
107 SCM string
= SCM_EOL
;
108 scm_simple_format (string_port
,
109 scm_from_locale_string ("~A:~S:~S: ~A"),
111 scm_from_long (SCM_LINUM (port
) + 1),
112 scm_from_int (SCM_COL (port
) + 1),
113 scm_from_locale_string (message
)));
115 string
= scm_get_output_string (string_port
);
116 scm_close_output_port (string_port
);
117 scm_error_scm (scm_from_latin1_symbol ("read-error"),
118 function
? scm_from_locale_string (function
) : SCM_BOOL_F
,
125 SCM_DEFINE (scm_read_options
, "read-options-interface", 0, 1, 0,
127 "Option interface for the read options. Instead of using\n"
128 "this procedure directly, use the procedures @code{read-enable},\n"
129 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
130 #define FUNC_NAME s_scm_read_options
132 SCM ans
= scm_options (setting
,
135 if (SCM_COPY_SOURCE_P
)
136 SCM_RECORD_POSITIONS_P
= 1;
141 /* A fluid referring to an association list mapping extra hash
142 characters to procedures. */
143 static SCM
*scm_i_read_hash_procedures
;
146 scm_i_read_hash_procedures_ref (void)
148 return scm_fluid_ref (*scm_i_read_hash_procedures
);
152 scm_i_read_hash_procedures_set_x (SCM value
)
154 scm_fluid_set_x (*scm_i_read_hash_procedures
, value
);
161 /* Size of the C buffer used to read symbols and numbers. */
162 #define READER_BUFFER_SIZE 128
164 /* Number of 32-bit codepoints in the buffer used to read strings. */
165 #define READER_STRING_BUFFER_SIZE 128
167 /* The maximum size of Scheme character names. */
168 #define READER_CHAR_NAME_MAX_SIZE 50
171 /* `isblank' is only in C99. */
172 #define CHAR_IS_BLANK_(_chr) \
173 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
174 || ((_chr) == '\f') || ((_chr) == '\r'))
177 # define CHAR_IS_BLANK(_chr) \
178 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
180 # define CHAR_IS_BLANK CHAR_IS_BLANK_
184 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
186 #define CHAR_IS_R5RS_DELIMITER(c) \
188 || (c == ')') || (c == '(') || (c == ';') || (c == '"') \
189 || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
191 #define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
193 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
195 #define CHAR_IS_EXPONENT_MARKER(_chr) \
196 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
197 || ((_chr) == 'd') || ((_chr) == 'l'))
199 /* Read an SCSH block comment. */
200 static SCM
scm_read_scsh_block_comment (scm_t_wchar
, SCM
);
201 static SCM
scm_read_r6rs_block_comment (scm_t_wchar
, SCM
);
202 static SCM
scm_read_commented_expression (scm_t_wchar
, SCM
);
203 static SCM
scm_read_shebang (scm_t_wchar
, SCM
);
204 static SCM
scm_get_hash_procedure (int);
206 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
207 result in the pre-allocated buffer BUF. Return zero if the whole token has
208 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
209 bytes actually read. */
211 read_token (SCM port
, char *buf
, size_t buf_size
, size_t *read
)
215 while (*read
< buf_size
)
219 chr
= scm_get_byte_or_eof (port
);
223 else if (CHAR_IS_DELIMITER (chr
))
225 scm_unget_byte (chr
, port
);
238 /* Like `read_token', but return either BUFFER, or a GC-allocated buffer
239 if the token doesn't fit in BUFFER_SIZE bytes. */
241 read_complete_token (SCM port
, char *buffer
, size_t buffer_size
,
245 size_t bytes_read
, overflow_size
= 0;
246 char *overflow_buffer
= NULL
;
250 overflow
= read_token (port
, buffer
, buffer_size
, &bytes_read
);
253 if (overflow
|| overflow_size
!= 0)
255 if (overflow_size
== 0)
257 overflow_buffer
= scm_gc_malloc_pointerless (bytes_read
, "read");
258 memcpy (overflow_buffer
, buffer
, bytes_read
);
259 overflow_size
= bytes_read
;
264 scm_gc_malloc_pointerless (overflow_size
+ bytes_read
, "read");
266 memcpy (new_buf
, overflow_buffer
, overflow_size
);
267 memcpy (new_buf
+ overflow_size
, buffer
, bytes_read
);
269 overflow_buffer
= new_buf
;
270 overflow_size
+= bytes_read
;
277 *read
= overflow_size
;
281 return (overflow_size
> 0 ? overflow_buffer
: buffer
);
284 /* Skip whitespace from PORT and return the first non-whitespace character
285 read. Raise an error on end-of-file. */
287 flush_ws (SCM port
, const char *eoferr
)
291 switch (c
= scm_getc (port
))
297 scm_i_input_error (eoferr
,
306 switch (c
= scm_getc (port
))
312 case SCM_LINE_INCREMENTORS
:
318 switch (c
= scm_getc (port
))
321 eoferr
= "read_sharp";
324 scm_read_shebang (c
, port
);
327 scm_read_commented_expression (c
, port
);
330 if (scm_is_false (scm_get_hash_procedure (c
)))
332 scm_read_r6rs_block_comment (c
, port
);
337 scm_ungetc (c
, port
);
342 case SCM_LINE_INCREMENTORS
:
343 case SCM_SINGLE_SPACES
:
358 static SCM
scm_read_expression (SCM port
);
359 static SCM
scm_read_sharp (int chr
, SCM port
, long line
, int column
);
363 maybe_annotate_source (SCM x
, SCM port
, long line
, int column
)
365 if (SCM_RECORD_POSITIONS_P
)
366 scm_i_set_source_properties_x (x
, line
, column
, SCM_FILENAME (port
));
371 scm_read_sexp (scm_t_wchar chr
, SCM port
)
372 #define FUNC_NAME "scm_i_lreadparen"
375 SCM tmp
, tl
, ans
= SCM_EOL
;
376 const int terminating_char
= ((chr
== '[') ? ']' : ')');
378 /* Need to capture line and column numbers here. */
379 long line
= SCM_LINUM (port
);
380 int column
= SCM_COL (port
) - 1;
382 c
= flush_ws (port
, FUNC_NAME
);
383 if (terminating_char
== c
)
386 scm_ungetc (c
, port
);
387 tmp
= scm_read_expression (port
);
389 /* Note that it is possible for scm_read_expression to return
390 scm_sym_dot, but not as part of a dotted pair: as in #{.}#. So
391 check that it's a real dot by checking `c'. */
392 if (c
== '.' && scm_is_eq (scm_sym_dot
, tmp
))
394 ans
= scm_read_expression (port
);
395 if (terminating_char
!= (c
= flush_ws (port
, FUNC_NAME
)))
396 scm_i_input_error (FUNC_NAME
, port
, "missing close paren",
401 /* Build the head of the list structure. */
402 ans
= tl
= scm_cons (tmp
, SCM_EOL
);
404 while (terminating_char
!= (c
= flush_ws (port
, FUNC_NAME
)))
408 if (c
== ')' || (SCM_SQUARE_BRACKETS_P
&& c
== ']'))
409 scm_i_input_error (FUNC_NAME
, port
,
410 "in pair: mismatched close paren: ~A",
411 scm_list_1 (SCM_MAKE_CHAR (c
)));
413 scm_ungetc (c
, port
);
414 tmp
= scm_read_expression (port
);
416 /* See above note about scm_sym_dot. */
417 if (c
== '.' && scm_is_eq (scm_sym_dot
, tmp
))
419 SCM_SETCDR (tl
, scm_read_expression (port
));
421 c
= flush_ws (port
, FUNC_NAME
);
422 if (terminating_char
!= c
)
423 scm_i_input_error (FUNC_NAME
, port
,
424 "in pair: missing close paren", SCM_EOL
);
428 new_tail
= scm_cons (tmp
, SCM_EOL
);
429 SCM_SETCDR (tl
, new_tail
);
434 return maybe_annotate_source (ans
, port
, line
, column
);
439 /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
440 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
442 #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
448 while (i < ndigits) \
450 a = scm_getc (port); \
454 && (a == (scm_t_wchar) terminator) \
457 if ('0' <= a && a <= '9') \
459 else if ('A' <= a && a <= 'F') \
461 else if ('a' <= a && a <= 'f') \
474 skip_intraline_whitespace (SCM port
)
484 while (c
== '\t' || uc_is_general_category (c
, UC_SPACE_SEPARATOR
));
486 scm_ungetc (c
, port
);
490 scm_read_string (int chr
, SCM port
)
491 #define FUNC_NAME "scm_lreadr"
493 /* For strings smaller than C_STR, this function creates only one Scheme
494 object (the string returned). */
497 size_t c_str_len
= 0;
498 scm_t_wchar c
, c_str
[READER_STRING_BUFFER_SIZE
];
500 /* Need to capture line and column numbers here. */
501 long line
= SCM_LINUM (port
);
502 int column
= SCM_COL (port
) - 1;
504 while ('"' != (c
= scm_getc (port
)))
509 scm_i_input_error (FUNC_NAME
, port
,
510 "end of file in string constant", SCM_EOL
);
513 if (c_str_len
+ 1 >= READER_STRING_BUFFER_SIZE
)
515 str
= scm_cons (scm_from_utf32_stringn (c_str
, c_str_len
), str
);
521 switch (c
= scm_getc (port
))
529 if (SCM_HUNGRY_EOL_ESCAPES_P
)
530 skip_intraline_whitespace (port
);
557 if (SCM_R6RS_ESCAPES_P
)
558 SCM_READ_HEX_ESCAPE (10, ';');
560 SCM_READ_HEX_ESCAPE (2, '\0');
563 if (!SCM_R6RS_ESCAPES_P
)
565 SCM_READ_HEX_ESCAPE (4, '\0');
569 if (!SCM_R6RS_ESCAPES_P
)
571 SCM_READ_HEX_ESCAPE (6, '\0');
576 scm_i_input_error (FUNC_NAME
, port
,
577 "illegal character in escape sequence: ~S",
578 scm_list_1 (SCM_MAKE_CHAR (c
)));
582 c_str
[c_str_len
++] = c
;
585 if (scm_is_null (str
))
586 /* Fast path: we got a string that fits in C_STR. */
587 str
= scm_from_utf32_stringn (c_str
, c_str_len
);
591 str
= scm_cons (scm_from_utf32_stringn (c_str
, c_str_len
), str
);
593 str
= scm_string_concatenate_reverse (str
, SCM_UNDEFINED
, SCM_UNDEFINED
);
596 return maybe_annotate_source (str
, port
, line
, column
);
602 scm_read_number (scm_t_wchar chr
, SCM port
)
604 SCM result
, str
= SCM_EOL
;
605 char local_buffer
[READER_BUFFER_SIZE
], *buffer
;
607 scm_t_port
*pt
= SCM_PTAB_ENTRY (port
);
609 /* Need to capture line and column numbers here. */
610 long line
= SCM_LINUM (port
);
611 int column
= SCM_COL (port
) - 1;
613 scm_ungetc (chr
, port
);
614 buffer
= read_complete_token (port
, local_buffer
, sizeof local_buffer
,
617 str
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
, pt
->ilseq_handler
);
619 result
= scm_string_to_number (str
, SCM_UNDEFINED
);
620 if (scm_is_false (result
))
622 /* Return a symbol instead of a number */
623 if (SCM_CASE_INSENSITIVE_P
)
624 str
= scm_string_downcase_x (str
);
625 result
= scm_string_to_symbol (str
);
627 else if (SCM_NIMP (result
))
628 result
= maybe_annotate_source (result
, port
, line
, column
);
630 SCM_COL (port
) += scm_i_string_length (str
);
635 scm_read_mixed_case_symbol (scm_t_wchar chr
, SCM port
)
638 int ends_with_colon
= 0;
640 int postfix
= scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE
), scm_keyword_postfix
);
641 char local_buffer
[READER_BUFFER_SIZE
], *buffer
;
642 scm_t_port
*pt
= SCM_PTAB_ENTRY (port
);
645 scm_ungetc (chr
, port
);
646 buffer
= read_complete_token (port
, local_buffer
, sizeof local_buffer
,
649 ends_with_colon
= buffer
[bytes_read
- 1] == ':';
651 if (postfix
&& ends_with_colon
&& (bytes_read
> 1))
653 str
= scm_from_stringn (buffer
, bytes_read
- 1,
654 pt
->encoding
, pt
->ilseq_handler
);
656 if (SCM_CASE_INSENSITIVE_P
)
657 str
= scm_string_downcase_x (str
);
658 result
= scm_symbol_to_keyword (scm_string_to_symbol (str
));
662 str
= scm_from_stringn (buffer
, bytes_read
,
663 pt
->encoding
, pt
->ilseq_handler
);
665 if (SCM_CASE_INSENSITIVE_P
)
666 str
= scm_string_downcase_x (str
);
667 result
= scm_string_to_symbol (str
);
670 SCM_COL (port
) += scm_i_string_length (str
);
675 scm_read_number_and_radix (scm_t_wchar chr
, SCM port
)
676 #define FUNC_NAME "scm_lreadr"
680 char local_buffer
[READER_BUFFER_SIZE
], *buffer
;
708 scm_ungetc (chr
, port
);
709 scm_ungetc ('#', port
);
713 buffer
= read_complete_token (port
, local_buffer
, sizeof local_buffer
,
716 pt
= SCM_PTAB_ENTRY (port
);
717 str
= scm_from_stringn (buffer
, read
, pt
->encoding
, pt
->ilseq_handler
);
719 result
= scm_string_to_number (str
, scm_from_uint (radix
));
721 SCM_COL (port
) += scm_i_string_length (str
);
723 if (scm_is_true (result
))
726 scm_i_input_error (FUNC_NAME
, port
, "unknown # object", SCM_EOL
);
733 scm_read_quote (int chr
, SCM port
)
736 long line
= SCM_LINUM (port
);
737 int column
= SCM_COL (port
) - 1;
742 p
= scm_sym_quasiquote
;
755 p
= scm_sym_uq_splicing
;
758 scm_ungetc (c
, port
);
765 fprintf (stderr
, "%s: unhandled quote character (%i)\n",
766 "scm_read_quote", chr
);
770 p
= scm_cons2 (p
, scm_read_expression (port
), SCM_EOL
);
771 return maybe_annotate_source (p
, port
, line
, column
);
774 SCM_SYMBOL (sym_syntax
, "syntax");
775 SCM_SYMBOL (sym_quasisyntax
, "quasisyntax");
776 SCM_SYMBOL (sym_unsyntax
, "unsyntax");
777 SCM_SYMBOL (sym_unsyntax_splicing
, "unsyntax-splicing");
780 scm_read_syntax (int chr
, SCM port
)
783 long line
= SCM_LINUM (port
);
784 int column
= SCM_COL (port
) - 1;
802 p
= sym_unsyntax_splicing
;
805 scm_ungetc (c
, port
);
812 fprintf (stderr
, "%s: unhandled syntax character (%i)\n",
813 "scm_read_syntax", chr
);
817 p
= scm_cons2 (p
, scm_read_expression (port
), SCM_EOL
);
818 return maybe_annotate_source (p
, port
, line
, column
);
822 scm_read_nil (int chr
, SCM port
)
824 SCM id
= scm_read_mixed_case_symbol (chr
, port
);
826 if (!scm_is_eq (id
, sym_nil
))
827 scm_i_input_error ("scm_read_nil", port
,
828 "unexpected input while reading #nil: ~a",
831 return SCM_ELISP_NIL
;
835 scm_read_semicolon_comment (int chr
, SCM port
)
839 /* We use the get_byte here because there is no need to get the
840 locale correct with comment input. This presumes that newline
841 always represents itself no matter what the encoding is. */
842 for (c
= scm_get_byte_or_eof (port
);
843 (c
!= EOF
) && (c
!= '\n');
844 c
= scm_get_byte_or_eof (port
));
846 return SCM_UNSPECIFIED
;
850 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
853 scm_read_boolean (int chr
, SCM port
)
866 return SCM_UNSPECIFIED
;
870 scm_read_character (scm_t_wchar chr
, SCM port
)
871 #define FUNC_NAME "scm_lreadr"
873 char buffer
[READER_CHAR_NAME_MAX_SIZE
];
875 size_t charname_len
, bytes_read
;
880 overflow
= read_token (port
, buffer
, READER_CHAR_NAME_MAX_SIZE
, &bytes_read
);
882 scm_i_input_error (FUNC_NAME
, port
, "character name too long", SCM_EOL
);
886 chr
= scm_getc (port
);
888 scm_i_input_error (FUNC_NAME
, port
, "unexpected end of file "
889 "while reading character", SCM_EOL
);
891 /* CHR must be a token delimiter, like a whitespace. */
892 return (SCM_MAKE_CHAR (chr
));
895 pt
= SCM_PTAB_ENTRY (port
);
897 /* Simple ASCII characters can be processed immediately. Also, simple
898 ISO-8859-1 characters can be processed immediately if the encoding for this
899 port is ISO-8859-1. */
900 if (bytes_read
== 1 && ((unsigned char) buffer
[0] <= 127 || pt
->encoding
== NULL
))
903 return SCM_MAKE_CHAR (buffer
[0]);
906 /* Otherwise, convert the buffer into a proper scheme string for
908 charname
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
,
910 charname_len
= scm_i_string_length (charname
);
911 SCM_COL (port
) += charname_len
;
912 cp
= scm_i_string_ref (charname
, 0);
913 if (charname_len
== 1)
914 return SCM_MAKE_CHAR (cp
);
916 /* Ignore dotted circles, which may be used to keep combining characters from
917 combining with the backslash in #\charname. */
918 if (cp
== SCM_CODEPOINT_DOTTED_CIRCLE
&& charname_len
== 2)
919 return SCM_MAKE_CHAR (scm_i_string_ref (charname
, 1));
921 if (cp
>= '0' && cp
< '8')
923 /* Dirk:FIXME:: This type of character syntax is not R5RS
924 * compliant. Further, it should be verified that the constant
925 * does only consist of octal digits. */
926 SCM p
= scm_string_to_number (charname
, scm_from_uint (8));
929 scm_t_wchar c
= scm_to_uint32 (p
);
930 if (SCM_IS_UNICODE_CHAR (c
))
931 return SCM_MAKE_CHAR (c
);
933 scm_i_input_error (FUNC_NAME
, port
,
934 "out-of-range octal character escape: ~a",
935 scm_list_1 (charname
));
939 if (cp
== 'x' && (charname_len
> 1))
943 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
944 p
= scm_string_to_number (scm_c_substring (charname
, 1, charname_len
),
948 scm_t_wchar c
= scm_to_uint32 (p
);
949 if (SCM_IS_UNICODE_CHAR (c
))
950 return SCM_MAKE_CHAR (c
);
952 scm_i_input_error (FUNC_NAME
, port
,
953 "out-of-range hex character escape: ~a",
954 scm_list_1 (charname
));
958 /* The names of characters should never have non-Latin1
960 if (scm_i_is_narrow_string (charname
)
961 || scm_i_try_narrow_string (charname
))
962 { SCM ch
= scm_i_charname_to_char (scm_i_string_chars (charname
),
964 if (scm_is_true (ch
))
968 scm_i_input_error (FUNC_NAME
, port
, "unknown character name ~a",
969 scm_list_1 (charname
));
971 return SCM_UNSPECIFIED
;
976 scm_read_keyword (int chr
, SCM port
)
980 /* Read the symbol that comprises the keyword. Doing this instead of
981 invoking a specific symbol reader function allows `scm_read_keyword ()'
982 to adapt to the delimiters currently valid of symbols.
984 XXX: This implementation allows sloppy syntaxes like `#: key'. */
985 symbol
= scm_read_expression (port
);
986 if (!scm_is_symbol (symbol
))
987 scm_i_input_error ("scm_read_keyword", port
,
988 "keyword prefix `~a' not followed by a symbol: ~s",
989 scm_list_2 (SCM_MAKE_CHAR (chr
), symbol
));
991 return (scm_symbol_to_keyword (symbol
));
995 scm_read_vector (int chr
, SCM port
, long line
, int column
)
997 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
998 guarantee that it's going to do what we want. After all, this is an
999 implementation detail of `scm_read_vector ()', not a desirable
1001 return maybe_annotate_source (scm_vector (scm_read_sexp (chr
, port
)),
1002 port
, line
, column
);
1006 scm_read_array (int chr
, SCM port
, long line
, int column
)
1008 SCM result
= scm_i_read_array (port
, chr
);
1009 if (scm_is_false (result
))
1012 return maybe_annotate_source (result
, port
, line
, column
);
1016 scm_read_srfi4_vector (int chr
, SCM port
, long line
, int column
)
1018 return scm_read_array (chr
, port
, line
, column
);
1022 scm_read_bytevector (scm_t_wchar chr
, SCM port
, long line
, int column
)
1024 chr
= scm_getc (port
);
1028 chr
= scm_getc (port
);
1032 chr
= scm_getc (port
);
1036 return maybe_annotate_source
1037 (scm_u8_list_to_bytevector (scm_read_sexp (chr
, port
)),
1038 port
, line
, column
);
1041 scm_i_input_error ("read_bytevector", port
,
1042 "invalid bytevector prefix",
1043 SCM_MAKE_CHAR (chr
));
1044 return SCM_UNSPECIFIED
;
1048 scm_read_guile_bit_vector (scm_t_wchar chr
, SCM port
, long line
, int column
)
1050 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1051 terribly inefficient but who cares? */
1052 SCM s_bits
= SCM_EOL
;
1054 for (chr
= scm_getc (port
);
1055 (chr
!= EOF
) && ((chr
== '0') || (chr
== '1'));
1056 chr
= scm_getc (port
))
1058 s_bits
= scm_cons ((chr
== '0') ? SCM_BOOL_F
: SCM_BOOL_T
, s_bits
);
1062 scm_ungetc (chr
, port
);
1064 return maybe_annotate_source
1065 (scm_bitvector (scm_reverse_x (s_bits
, SCM_EOL
)),
1066 port
, line
, column
);
1070 scm_read_scsh_block_comment (scm_t_wchar chr
, SCM port
)
1076 int c
= scm_getc (port
);
1079 scm_i_input_error ("skip_block_comment", port
,
1080 "unterminated `#! ... !#' comment", SCM_EOL
);
1084 else if (c
== '#' && bang_seen
)
1090 return SCM_UNSPECIFIED
;
1094 scm_read_shebang (scm_t_wchar chr
, SCM port
)
1097 if ((c
= scm_get_byte_or_eof (port
)) != 'r')
1099 scm_ungetc (c
, port
);
1100 return scm_read_scsh_block_comment (chr
, port
);
1102 if ((c
= scm_get_byte_or_eof (port
)) != '6')
1104 scm_ungetc (c
, port
);
1105 scm_ungetc ('r', port
);
1106 return scm_read_scsh_block_comment (chr
, port
);
1108 if ((c
= scm_get_byte_or_eof (port
)) != 'r')
1110 scm_ungetc (c
, port
);
1111 scm_ungetc ('6', port
);
1112 scm_ungetc ('r', port
);
1113 return scm_read_scsh_block_comment (chr
, port
);
1115 if ((c
= scm_get_byte_or_eof (port
)) != 's')
1117 scm_ungetc (c
, port
);
1118 scm_ungetc ('r', port
);
1119 scm_ungetc ('6', port
);
1120 scm_ungetc ('r', port
);
1121 return scm_read_scsh_block_comment (chr
, port
);
1124 return SCM_UNSPECIFIED
;
1128 scm_read_r6rs_block_comment (scm_t_wchar chr
, SCM port
)
1130 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1131 nested. So care must be taken. */
1132 int nesting_level
= 1;
1134 int a
= scm_getc (port
);
1137 scm_i_input_error ("scm_read_r6rs_block_comment", port
,
1138 "unterminated `#| ... |#' comment", SCM_EOL
);
1140 while (nesting_level
> 0)
1142 int b
= scm_getc (port
);
1145 scm_i_input_error ("scm_read_r6rs_block_comment", port
,
1146 "unterminated `#| ... |#' comment", SCM_EOL
);
1148 if (a
== '|' && b
== '#')
1153 else if (a
== '#' && b
== '|')
1162 return SCM_UNSPECIFIED
;
1166 scm_read_commented_expression (scm_t_wchar chr
, SCM port
)
1170 c
= flush_ws (port
, (char *) NULL
);
1172 scm_i_input_error ("read_commented_expression", port
,
1173 "no expression after #; comment", SCM_EOL
);
1174 scm_ungetc (c
, port
);
1175 scm_read_expression (port
);
1176 return SCM_UNSPECIFIED
;
1180 scm_read_extended_symbol (scm_t_wchar chr
, SCM port
)
1182 /* Guile's extended symbol read syntax looks like this:
1184 #{This is all a symbol name}#
1186 So here, CHR is expected to be `{'. */
1189 SCM buf
= scm_i_make_string (1024, NULL
, 0);
1191 buf
= scm_i_string_start_writing (buf
);
1193 while ((chr
= scm_getc (port
)) != EOF
)
1204 scm_i_string_set_x (buf
, len
++, '}');
1210 else if (chr
== '\\')
1212 /* It used to be that print.c would print extended-read-syntax
1213 symbols with backslashes before "non-standard" chars, but
1214 this routine wouldn't do anything with those escapes.
1215 Bummer. What we've done is to change print.c to output
1216 R6RS hex escapes for those characters, relying on the fact
1217 that the extended read syntax would never put a `\' before
1218 an `x'. For now, we just ignore other instances of
1219 backslash in the string. */
1220 switch ((chr
= scm_getc (port
)))
1228 SCM_READ_HEX_ESCAPE (10, ';');
1229 scm_i_string_set_x (buf
, len
++, c
);
1237 scm_i_string_stop_writing ();
1238 scm_i_input_error ("scm_read_extended_symbol", port
,
1239 "illegal character in escape sequence: ~S",
1240 scm_list_1 (SCM_MAKE_CHAR (c
)));
1244 scm_i_string_set_x (buf
, len
++, chr
);
1249 scm_i_string_set_x (buf
, len
++, chr
);
1251 if (len
>= scm_i_string_length (buf
) - 2)
1255 scm_i_string_stop_writing ();
1256 addy
= scm_i_make_string (1024, NULL
, 0);
1257 buf
= scm_string_append (scm_list_2 (buf
, addy
));
1259 buf
= scm_i_string_start_writing (buf
);
1264 scm_i_string_stop_writing ();
1266 scm_i_input_error ("scm_read_extended_symbol", port
,
1267 "end of file while reading symbol", SCM_EOL
);
1269 return (scm_string_to_symbol (scm_c_substring (buf
, 0, len
)));
1274 /* Top-level token readers, i.e., dispatchers. */
1277 scm_read_sharp_extension (int chr
, SCM port
)
1281 proc
= scm_get_hash_procedure (chr
);
1282 if (scm_is_true (scm_procedure_p (proc
)))
1284 long line
= SCM_LINUM (port
);
1285 int column
= SCM_COL (port
) - 2;
1288 got
= scm_call_2 (proc
, SCM_MAKE_CHAR (chr
), port
);
1290 if (scm_is_pair (got
) && !scm_i_has_source_properties (got
))
1291 scm_i_set_source_properties_x (got
, line
, column
, SCM_FILENAME (port
));
1296 return SCM_UNSPECIFIED
;
1299 /* The reader for the sharp `#' character. It basically dispatches reads
1300 among the above token readers. */
1302 scm_read_sharp (scm_t_wchar chr
, SCM port
, long line
, int column
)
1303 #define FUNC_NAME "scm_lreadr"
1307 chr
= scm_getc (port
);
1309 result
= scm_read_sharp_extension (chr
, port
);
1310 if (!scm_is_eq (result
, SCM_UNSPECIFIED
))
1316 return (scm_read_character (chr
, port
));
1318 return (scm_read_vector (chr
, port
, line
, column
));
1323 /* This one may return either a boolean or an SRFI-4 vector. */
1324 return (scm_read_srfi4_vector (chr
, port
, line
, column
));
1326 return (scm_read_bytevector (chr
, port
, line
, column
));
1328 return (scm_read_guile_bit_vector (chr
, port
, line
, column
));
1332 return (scm_read_boolean (chr
, port
));
1334 return (scm_read_keyword (chr
, port
));
1335 case '0': case '1': case '2': case '3': case '4':
1336 case '5': case '6': case '7': case '8': case '9':
1338 #if SCM_ENABLE_DEPRECATED
1339 /* See below for 'i' and 'e'. */
1345 return (scm_read_array (chr
, port
, line
, column
));
1349 #if SCM_ENABLE_DEPRECATED
1351 /* When next char is '(', it really is an old-style
1353 scm_t_wchar next_c
= scm_getc (port
);
1355 scm_ungetc (next_c
, port
);
1357 return scm_read_array (chr
, port
, line
, column
);
1371 return (scm_read_number_and_radix (chr
, port
));
1373 return (scm_read_extended_symbol (chr
, port
));
1375 return (scm_read_shebang (chr
, port
));
1377 return (scm_read_commented_expression (chr
, port
));
1381 return (scm_read_syntax (chr
, port
));
1383 return (scm_read_nil (chr
, port
));
1385 result
= scm_read_sharp_extension (chr
, port
);
1386 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1388 /* To remain compatible with 1.8 and earlier, the following
1389 characters have lower precedence than `read-hash-extend'
1394 return scm_read_r6rs_block_comment (chr
, port
);
1396 scm_i_input_error (FUNC_NAME
, port
, "Unknown # object: ~S",
1397 scm_list_1 (SCM_MAKE_CHAR (chr
)));
1404 return SCM_UNSPECIFIED
;
1409 scm_read_expression (SCM port
)
1410 #define FUNC_NAME "scm_read_expression"
1416 chr
= scm_getc (port
);
1420 case SCM_WHITE_SPACES
:
1421 case SCM_LINE_INCREMENTORS
:
1424 (void) scm_read_semicolon_comment (chr
, port
);
1427 if (!SCM_SQUARE_BRACKETS_P
)
1428 return (scm_read_mixed_case_symbol (chr
, port
));
1429 /* otherwise fall through */
1431 return (scm_read_sexp (chr
, port
));
1433 return (scm_read_string (chr
, port
));
1437 return (scm_read_quote (chr
, port
));
1440 long line
= SCM_LINUM (port
);
1441 int column
= SCM_COL (port
) - 1;
1442 SCM result
= scm_read_sharp (chr
, port
, line
, column
);
1443 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1444 /* We read a comment or some such. */
1450 scm_i_input_error (FUNC_NAME
, port
, "unexpected \")\"", SCM_EOL
);
1453 if (SCM_SQUARE_BRACKETS_P
)
1454 scm_i_input_error (FUNC_NAME
, port
, "unexpected \"]\"", SCM_EOL
);
1455 /* otherwise fall through */
1459 if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE
), scm_keyword_prefix
))
1460 return scm_symbol_to_keyword (scm_read_expression (port
));
1465 if (((chr
>= '0') && (chr
<= '9'))
1466 || (strchr ("+-.", chr
)))
1467 return (scm_read_number (chr
, port
));
1469 return (scm_read_mixed_case_symbol (chr
, port
));
1477 /* Actual reader. */
1479 SCM_DEFINE (scm_read
, "read", 0, 1, 0,
1481 "Read an s-expression from the input port @var{port}, or from\n"
1482 "the current input port if @var{port} is not specified.\n"
1483 "Any whitespace before the next token is discarded.")
1484 #define FUNC_NAME s_scm_read
1488 if (SCM_UNBNDP (port
))
1489 port
= scm_current_input_port ();
1490 SCM_VALIDATE_OPINPORT (1, port
);
1492 c
= flush_ws (port
, (char *) NULL
);
1495 scm_ungetc (c
, port
);
1497 return (scm_read_expression (port
));
1504 /* Manipulate the read-hash-procedures alist. This could be written in
1505 Scheme, but maybe it will also be used by C code during initialisation. */
1506 SCM_DEFINE (scm_read_hash_extend
, "read-hash-extend", 2, 0, 0,
1507 (SCM chr
, SCM proc
),
1508 "Install the procedure @var{proc} for reading expressions\n"
1509 "starting with the character sequence @code{#} and @var{chr}.\n"
1510 "@var{proc} will be called with two arguments: the character\n"
1511 "@var{chr} and the port to read further data from. The object\n"
1512 "returned will be the return value of @code{read}. \n"
1513 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1515 #define FUNC_NAME s_scm_read_hash_extend
1520 SCM_VALIDATE_CHAR (1, chr
);
1521 SCM_ASSERT (scm_is_false (proc
)
1522 || scm_is_eq (scm_procedure_p (proc
), SCM_BOOL_T
),
1523 proc
, SCM_ARG2
, FUNC_NAME
);
1525 /* Check if chr is already in the alist. */
1526 this = scm_i_read_hash_procedures_ref ();
1530 if (scm_is_null (this))
1532 /* not found, so add it to the beginning. */
1533 if (scm_is_true (proc
))
1535 SCM
new = scm_cons (scm_cons (chr
, proc
),
1536 scm_i_read_hash_procedures_ref ());
1537 scm_i_read_hash_procedures_set_x (new);
1541 if (scm_is_eq (chr
, SCM_CAAR (this)))
1543 /* already in the alist. */
1544 if (scm_is_false (proc
))
1547 if (scm_is_false (prev
))
1549 SCM rest
= SCM_CDR (scm_i_read_hash_procedures_ref ());
1550 scm_i_read_hash_procedures_set_x (rest
);
1553 scm_set_cdr_x (prev
, SCM_CDR (this));
1558 scm_set_cdr_x (SCM_CAR (this), proc
);
1563 this = SCM_CDR (this);
1566 return SCM_UNSPECIFIED
;
1570 /* Recover the read-hash procedure corresponding to char c. */
1572 scm_get_hash_procedure (int c
)
1574 SCM rest
= scm_i_read_hash_procedures_ref ();
1578 if (scm_is_null (rest
))
1581 if (SCM_CHAR (SCM_CAAR (rest
)) == c
)
1582 return SCM_CDAR (rest
);
1584 rest
= SCM_CDR (rest
);
1588 #define SCM_ENCODING_SEARCH_SIZE (500)
1590 /* Search the first few hundred characters of a file for an Emacs-like coding
1591 declaration. Returns either NULL or a string whose storage has been
1592 allocated with `scm_gc_malloc ()'. */
1594 scm_i_scan_for_encoding (SCM port
)
1597 char header
[SCM_ENCODING_SEARCH_SIZE
+1];
1598 size_t bytes_read
, encoding_length
, i
;
1599 char *encoding
= NULL
;
1601 char *pos
, *encoding_start
;
1604 pt
= SCM_PTAB_ENTRY (port
);
1606 if (pt
->rw_active
== SCM_PORT_WRITE
)
1610 pt
->rw_active
= SCM_PORT_READ
;
1612 if (pt
->read_pos
== pt
->read_end
)
1614 /* We can use the read buffer, and thus avoid a seek. */
1615 if (scm_fill_input (port
) == EOF
)
1618 bytes_read
= pt
->read_end
- pt
->read_pos
;
1619 if (bytes_read
> SCM_ENCODING_SEARCH_SIZE
)
1620 bytes_read
= SCM_ENCODING_SEARCH_SIZE
;
1622 if (bytes_read
<= 1)
1623 /* An unbuffered port -- don't scan. */
1626 memcpy (header
, pt
->read_pos
, bytes_read
);
1627 header
[bytes_read
] = '\0';
1631 /* Try to read some bytes and then seek back. Not all ports
1632 support seeking back; and indeed some file ports (like
1633 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
1634 check performed by SCM_FPORT_FDES---but fail to seek
1635 backwards. Hence this block comes second. We prefer to use
1636 the read buffer in-place. */
1637 if (SCM_FPORTP (port
) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port
)))
1640 bytes_read
= scm_c_read (port
, header
, SCM_ENCODING_SEARCH_SIZE
);
1641 header
[bytes_read
] = '\0';
1642 scm_seek (port
, scm_from_int (0), scm_from_int (SEEK_SET
));
1646 && header
[0] == '\xef' && header
[1] == '\xbb' && header
[2] == '\xbf')
1649 /* search past "coding[:=]" */
1653 if ((pos
= strstr(pos
, "coding")) == NULL
)
1656 pos
+= strlen("coding");
1657 if (pos
- header
>= SCM_ENCODING_SEARCH_SIZE
||
1658 (*pos
== ':' || *pos
== '='))
1666 while (pos
- header
<= SCM_ENCODING_SEARCH_SIZE
&&
1667 (*pos
== ' ' || *pos
== '\t'))
1670 /* grab the next token */
1671 encoding_start
= pos
;
1673 while (encoding_start
+ i
- header
<= SCM_ENCODING_SEARCH_SIZE
1674 && encoding_start
+ i
- header
< bytes_read
1675 && (isalnum ((int) encoding_start
[i
])
1676 || strchr ("_-.:/,+=()", encoding_start
[i
]) != NULL
))
1679 encoding_length
= i
;
1680 if (encoding_length
== 0)
1683 encoding
= scm_gc_strndup (encoding_start
, encoding_length
, "encoding");
1684 for (i
= 0; i
< encoding_length
; i
++)
1685 encoding
[i
] = toupper ((int) encoding
[i
]);
1687 /* push backwards to make sure we were in a comment */
1689 pos
= encoding_start
;
1690 while (pos
>= header
)
1697 else if (*pos
== '\n' || pos
== header
)
1699 /* This wasn't in a semicolon comment. Check for a
1700 hash-bang comment. */
1701 char *beg
= strstr (header
, "#!");
1702 char *end
= strstr (header
, "!#");
1703 if (beg
< encoding_start
&& encoding_start
+ encoding_length
<= end
)
1714 /* This wasn't in a comment */
1717 if (utf8_bom
&& strcmp(encoding
, "UTF-8"))
1718 scm_misc_error (NULL
,
1719 "the port input declares the encoding ~s but is encoded as UTF-8",
1720 scm_list_1 (scm_from_locale_string (encoding
)));
1725 SCM_DEFINE (scm_file_encoding
, "file-encoding", 1, 0, 0,
1727 "Scans the port for an Emacs-like character coding declaration\n"
1728 "near the top of the contents of a port with random-accessible contents.\n"
1729 "The coding declaration is of the form\n"
1730 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1732 "Returns a string containing the character encoding of the file\n"
1733 "if a declaration was found, or @code{#f} otherwise.\n")
1734 #define FUNC_NAME s_scm_file_encoding
1739 SCM_VALIDATE_OPINPORT (SCM_ARG1
, port
);
1741 enc
= scm_i_scan_for_encoding (port
);
1746 s_enc
= scm_from_locale_string (enc
);
1757 SCM read_hash_procs
;
1759 read_hash_procs
= scm_make_fluid_with_default (SCM_EOL
);
1761 scm_i_read_hash_procedures
=
1762 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs
));
1764 scm_init_opts (scm_read_options
, scm_read_opts
);
1765 #include "libguile/read.x"