1 /* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009, 2010 Free Software
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
33 #include "libguile/_scm.h"
34 #include "libguile/bytevectors.h"
35 #include "libguile/chars.h"
36 #include "libguile/eval.h"
37 #include "libguile/arrays.h"
38 #include "libguile/bitvectors.h"
39 #include "libguile/keywords.h"
40 #include "libguile/alist.h"
41 #include "libguile/srcprop.h"
42 #include "libguile/hashtab.h"
43 #include "libguile/hash.h"
44 #include "libguile/ports.h"
45 #include "libguile/fports.h"
46 #include "libguile/root.h"
47 #include "libguile/strings.h"
48 #include "libguile/strports.h"
49 #include "libguile/vectors.h"
50 #include "libguile/validate.h"
51 #include "libguile/srfi-4.h"
52 #include "libguile/srfi-13.h"
54 #include "libguile/read.h"
55 #include "libguile/private-options.h"
60 SCM_GLOBAL_SYMBOL (scm_sym_dot
, ".");
61 SCM_SYMBOL (scm_keyword_prefix
, "prefix");
62 SCM_SYMBOL (scm_keyword_postfix
, "postfix");
63 SCM_SYMBOL (sym_nil
, "nil");
65 scm_t_option scm_read_opts
[] = {
66 { SCM_OPTION_BOOLEAN
, "copy", 0,
67 "Copy source code expressions." },
68 { SCM_OPTION_BOOLEAN
, "positions", 0,
69 "Record positions of source code expressions." },
70 { SCM_OPTION_BOOLEAN
, "case-insensitive", 0,
71 "Convert symbols to lower case."},
72 { SCM_OPTION_SCM
, "keywords", (unsigned long) SCM_BOOL_F
,
73 "Style of keyword recognition: #f, 'prefix or 'postfix."},
74 { SCM_OPTION_BOOLEAN
, "elisp-vectors", 0,
75 "Support Elisp vector syntax, namely `[...]'."},
76 { SCM_OPTION_BOOLEAN
, "elisp-strings", 0,
77 "Support `\\(' and `\\)' in strings."},
78 { SCM_OPTION_BOOLEAN
, "r6rs-hex-escapes", 0,
79 "Use R6RS variable-length character and string hex escapes."},
80 { SCM_OPTION_BOOLEAN
, "square-brackets", 1,
81 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
86 Give meaningful error messages for errors
90 FILE:LINE:COL: MESSAGE
93 This is not standard GNU format, but the test-suite likes the real
94 message to be in front.
100 scm_i_input_error (char const *function
,
101 SCM port
, const char *message
, SCM arg
)
103 SCM fn
= (scm_is_string (SCM_FILENAME(port
))
105 : scm_from_locale_string ("#<unknown port>"));
107 SCM string_port
= scm_open_output_string ();
108 SCM string
= SCM_EOL
;
109 scm_simple_format (string_port
,
110 scm_from_locale_string ("~A:~S:~S: ~A"),
112 scm_from_long (SCM_LINUM (port
) + 1),
113 scm_from_int (SCM_COL (port
) + 1),
114 scm_from_locale_string (message
)));
116 string
= scm_get_output_string (string_port
);
117 scm_close_output_port (string_port
);
118 scm_error_scm (scm_from_locale_symbol ("read-error"),
119 function
? scm_from_locale_string (function
) : SCM_BOOL_F
,
126 SCM_DEFINE (scm_read_options
, "read-options-interface", 0, 1, 0,
128 "Option interface for the read options. Instead of using\n"
129 "this procedure directly, use the procedures @code{read-enable},\n"
130 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
131 #define FUNC_NAME s_scm_read_options
133 SCM ans
= scm_options (setting
,
136 if (SCM_COPY_SOURCE_P
)
137 SCM_RECORD_POSITIONS_P
= 1;
142 /* An association list mapping extra hash characters to procedures. */
143 static SCM
*scm_read_hash_procedures
;
150 /* Size of the C buffer used to read symbols and numbers. */
151 #define READER_BUFFER_SIZE 128
153 /* Size of the C buffer used to read strings. */
154 #define READER_STRING_BUFFER_SIZE 512
156 /* The maximum size of Scheme character names. */
157 #define READER_CHAR_NAME_MAX_SIZE 50
160 /* `isblank' is only in C99. */
161 #define CHAR_IS_BLANK_(_chr) \
162 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
163 || ((_chr) == '\f') || ((_chr) == '\r'))
166 # define CHAR_IS_BLANK(_chr) \
167 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
169 # define CHAR_IS_BLANK CHAR_IS_BLANK_
173 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
175 #define CHAR_IS_R5RS_DELIMITER(c) \
177 || (c == ')') || (c == '(') || (c == ';') || (c == '"') \
178 || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
180 #define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
182 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
184 #define CHAR_IS_EXPONENT_MARKER(_chr) \
185 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
186 || ((_chr) == 'd') || ((_chr) == 'l'))
188 /* Read an SCSH block comment. */
189 static inline SCM
scm_read_scsh_block_comment (scm_t_wchar
, SCM
);
190 static SCM
scm_read_r6rs_block_comment (scm_t_wchar
, SCM
);
191 static SCM
scm_read_commented_expression (scm_t_wchar
, SCM
);
192 static SCM
scm_read_shebang (scm_t_wchar
, SCM
);
193 static SCM
scm_get_hash_procedure (int);
195 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
196 result in the pre-allocated buffer BUF. Return zero if the whole token has
197 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
198 bytes actually read. */
200 read_token (SCM port
, char *buf
, const size_t buf_size
, size_t *read
)
204 while (*read
< buf_size
)
208 chr
= scm_get_byte_or_eof (port
);
212 else if (CHAR_IS_DELIMITER (chr
))
214 scm_unget_byte (chr
, port
);
227 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
228 result in the pre-allocated buffer BUFFER, if the whole token has fewer than
229 BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
230 caller. Return zero if the token fits in BUFFER, non-zero otherwise. READ
231 will be set the number of bytes actually read. */
233 read_complete_token (SCM port
, char *buffer
, const size_t buffer_size
,
234 char **overflow_buffer
, size_t *read
)
237 size_t bytes_read
, overflow_size
;
239 *overflow_buffer
= NULL
;
244 overflow
= read_token (port
, buffer
, buffer_size
, &bytes_read
);
247 if (overflow
|| overflow_size
!= 0)
249 if (overflow_size
== 0)
251 *overflow_buffer
= scm_malloc (bytes_read
);
252 memcpy (*overflow_buffer
, buffer
, bytes_read
);
253 overflow_size
= bytes_read
;
257 *overflow_buffer
= scm_realloc (*overflow_buffer
, overflow_size
+ bytes_read
);
258 memcpy (*overflow_buffer
+ overflow_size
, buffer
, bytes_read
);
259 overflow_size
+= bytes_read
;
266 *read
= overflow_size
;
270 return (overflow_size
!= 0);
273 /* Skip whitespace from PORT and return the first non-whitespace character
274 read. Raise an error on end-of-file. */
276 flush_ws (SCM port
, const char *eoferr
)
278 register scm_t_wchar c
;
280 switch (c
= scm_getc (port
))
286 scm_i_input_error (eoferr
,
295 switch (c
= scm_getc (port
))
301 case SCM_LINE_INCREMENTORS
:
307 switch (c
= scm_getc (port
))
310 eoferr
= "read_sharp";
313 scm_read_shebang (c
, port
);
316 scm_read_commented_expression (c
, port
);
319 if (scm_is_false (scm_get_hash_procedure (c
)))
321 scm_read_r6rs_block_comment (c
, port
);
326 scm_ungetc (c
, port
);
331 case SCM_LINE_INCREMENTORS
:
332 case SCM_SINGLE_SPACES
:
347 static SCM
scm_read_expression (SCM port
);
348 static SCM
scm_read_sharp (int chr
, SCM port
);
349 static SCM
recsexpr (SCM obj
, long line
, int column
, SCM filename
);
353 scm_read_sexp (scm_t_wchar chr
, SCM port
)
354 #define FUNC_NAME "scm_i_lreadparen"
358 register SCM tl
, ans
= SCM_EOL
;
359 SCM tl2
= SCM_EOL
, ans2
= SCM_EOL
, copy
= SCM_BOOL_F
;
360 const int terminating_char
= ((chr
== '[') ? ']' : ')');
362 /* Need to capture line and column numbers here. */
363 long line
= SCM_LINUM (port
);
364 int column
= SCM_COL (port
) - 1;
367 c
= flush_ws (port
, FUNC_NAME
);
368 if (terminating_char
== c
)
371 scm_ungetc (c
, port
);
372 if (scm_is_eq (scm_sym_dot
,
373 (tmp
= scm_read_expression (port
))))
375 ans
= scm_read_expression (port
);
376 if (terminating_char
!= (c
= flush_ws (port
, FUNC_NAME
)))
377 scm_i_input_error (FUNC_NAME
, port
, "missing close paren",
382 /* Build the head of the list structure. */
383 ans
= tl
= scm_cons (tmp
, SCM_EOL
);
385 if (SCM_COPY_SOURCE_P
)
386 ans2
= tl2
= scm_cons (scm_is_pair (tmp
)
391 while (terminating_char
!= (c
= flush_ws (port
, FUNC_NAME
)))
395 scm_ungetc (c
, port
);
396 if (scm_is_eq (scm_sym_dot
,
397 (tmp
= scm_read_expression (port
))))
399 SCM_SETCDR (tl
, tmp
= scm_read_expression (port
));
401 if (SCM_COPY_SOURCE_P
)
402 SCM_SETCDR (tl2
, scm_cons (scm_is_pair (tmp
) ? copy
: tmp
,
405 c
= flush_ws (port
, FUNC_NAME
);
406 if (terminating_char
!= c
)
407 scm_i_input_error (FUNC_NAME
, port
,
408 "in pair: missing close paren", SCM_EOL
);
412 new_tail
= scm_cons (tmp
, SCM_EOL
);
413 SCM_SETCDR (tl
, new_tail
);
416 if (SCM_COPY_SOURCE_P
)
418 SCM new_tail2
= scm_cons (scm_is_pair (tmp
)
421 SCM_SETCDR (tl2
, new_tail2
);
427 if (SCM_RECORD_POSITIONS_P
)
428 scm_whash_insert (scm_source_whash
,
430 scm_make_srcprops (line
, column
,
441 /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
442 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
444 #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
450 while (i < ndigits) \
452 a = scm_getc (port); \
456 && (a == (scm_t_wchar) terminator) \
459 if ('0' <= a && a <= '9') \
461 else if ('A' <= a && a <= 'F') \
463 else if ('a' <= a && a <= 'f') \
476 scm_read_string (int chr
, SCM port
)
477 #define FUNC_NAME "scm_lreadr"
479 /* For strings smaller than C_STR, this function creates only one Scheme
480 object (the string returned). */
482 SCM str
= SCM_BOOL_F
;
483 unsigned c_str_len
= 0;
486 str
= scm_i_make_string (READER_STRING_BUFFER_SIZE
, NULL
);
487 while ('"' != (c
= scm_getc (port
)))
492 scm_i_input_error (FUNC_NAME
, port
,
493 "end of file in string constant", SCM_EOL
);
496 if (c_str_len
+ 1 >= scm_i_string_length (str
))
498 SCM addy
= scm_i_make_string (READER_STRING_BUFFER_SIZE
, NULL
);
500 str
= scm_string_append (scm_list_2 (str
, addy
));
505 switch (c
= scm_getc (port
))
514 if (SCM_ESCAPED_PARENS_P
)
544 if (SCM_R6RS_ESCAPES_P
)
545 SCM_READ_HEX_ESCAPE (10, ';');
547 SCM_READ_HEX_ESCAPE (2, '\0');
550 if (!SCM_R6RS_ESCAPES_P
)
552 SCM_READ_HEX_ESCAPE (4, '\0');
556 if (!SCM_R6RS_ESCAPES_P
)
558 SCM_READ_HEX_ESCAPE (6, '\0');
563 scm_i_input_error (FUNC_NAME
, port
,
564 "illegal character in escape sequence: ~S",
565 scm_list_1 (SCM_MAKE_CHAR (c
)));
568 str
= scm_i_string_start_writing (str
);
569 scm_i_string_set_x (str
, c_str_len
++, c
);
570 scm_i_string_stop_writing ();
575 return scm_i_substring_copy (str
, 0, c_str_len
);
584 scm_read_number (scm_t_wchar chr
, SCM port
)
586 SCM result
, str
= SCM_EOL
;
587 char buffer
[READER_BUFFER_SIZE
];
588 char *overflow_buffer
= NULL
;
591 scm_t_port
*pt
= SCM_PTAB_ENTRY (port
);
593 scm_ungetc (chr
, port
);
594 overflow
= read_complete_token (port
, buffer
, sizeof (buffer
),
595 &overflow_buffer
, &bytes_read
);
598 str
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
, pt
->ilseq_handler
);
600 str
= scm_from_stringn (overflow_buffer
, bytes_read
, pt
->encoding
,
603 result
= scm_string_to_number (str
, SCM_UNDEFINED
);
604 if (!scm_is_true (result
))
606 /* Return a symbol instead of a number */
607 if (SCM_CASE_INSENSITIVE_P
)
608 str
= scm_string_downcase_x (str
);
609 result
= scm_string_to_symbol (str
);
613 free (overflow_buffer
);
614 SCM_COL (port
) += scm_i_string_length (str
);
619 scm_read_mixed_case_symbol (scm_t_wchar chr
, SCM port
)
622 int ends_with_colon
= 0;
624 int postfix
= scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE
), scm_keyword_postfix
);
626 char buffer
[READER_BUFFER_SIZE
], *overflow_buffer
;
627 scm_t_port
*pt
= SCM_PTAB_ENTRY (port
);
630 scm_ungetc (chr
, port
);
631 overflow
= read_complete_token (port
, buffer
, READER_BUFFER_SIZE
,
632 &overflow_buffer
, &bytes_read
);
636 ends_with_colon
= buffer
[bytes_read
- 1] == ':';
638 ends_with_colon
= overflow_buffer
[bytes_read
- 1] == ':';
641 if (postfix
&& ends_with_colon
&& (bytes_read
> 1))
644 str
= scm_from_stringn (buffer
, bytes_read
- 1, pt
->encoding
, pt
->ilseq_handler
);
646 str
= scm_from_stringn (overflow_buffer
, bytes_read
- 1, pt
->encoding
,
649 if (SCM_CASE_INSENSITIVE_P
)
650 str
= scm_string_downcase_x (str
);
651 result
= scm_symbol_to_keyword (scm_string_to_symbol (str
));
656 str
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
, pt
->ilseq_handler
);
658 str
= scm_from_stringn (overflow_buffer
, bytes_read
, pt
->encoding
,
661 if (SCM_CASE_INSENSITIVE_P
)
662 str
= scm_string_downcase_x (str
);
663 result
= scm_string_to_symbol (str
);
667 free (overflow_buffer
);
668 SCM_COL (port
) += scm_i_string_length (str
);
673 scm_read_number_and_radix (scm_t_wchar chr
, SCM port
)
674 #define FUNC_NAME "scm_lreadr"
678 char buffer
[READER_BUFFER_SIZE
], *overflow_buffer
;
707 scm_ungetc (chr
, port
);
708 scm_ungetc ('#', port
);
712 overflow
= read_complete_token (port
, buffer
, sizeof (buffer
),
713 &overflow_buffer
, &read
);
715 pt
= SCM_PTAB_ENTRY (port
);
717 str
= scm_from_stringn (buffer
, read
, pt
->encoding
, pt
->ilseq_handler
);
719 str
= scm_from_stringn (overflow_buffer
, read
, pt
->encoding
,
722 result
= scm_string_to_number (str
, scm_from_uint (radix
));
725 free (overflow_buffer
);
727 SCM_COL (port
) += scm_i_string_length (str
);
729 if (scm_is_true (result
))
732 scm_i_input_error (FUNC_NAME
, port
, "unknown # object", SCM_EOL
);
739 scm_read_quote (int chr
, SCM port
)
742 long line
= SCM_LINUM (port
);
743 int column
= SCM_COL (port
) - 1;
748 p
= scm_sym_quasiquote
;
761 p
= scm_sym_uq_splicing
;
764 scm_ungetc (c
, port
);
771 fprintf (stderr
, "%s: unhandled quote character (%i)\n",
772 "scm_read_quote", chr
);
776 p
= scm_cons2 (p
, scm_read_expression (port
), SCM_EOL
);
777 if (SCM_RECORD_POSITIONS_P
)
778 scm_whash_insert (scm_source_whash
, p
,
779 scm_make_srcprops (line
, column
,
782 ? (scm_cons2 (SCM_CAR (p
),
783 SCM_CAR (SCM_CDR (p
)),
792 SCM_SYMBOL (sym_syntax
, "syntax");
793 SCM_SYMBOL (sym_quasisyntax
, "quasisyntax");
794 SCM_SYMBOL (sym_unsyntax
, "unsyntax");
795 SCM_SYMBOL (sym_unsyntax_splicing
, "unsyntax-splicing");
798 scm_read_syntax (int chr
, SCM port
)
801 long line
= SCM_LINUM (port
);
802 int column
= SCM_COL (port
) - 1;
820 p
= sym_unsyntax_splicing
;
823 scm_ungetc (c
, port
);
830 fprintf (stderr
, "%s: unhandled syntax character (%i)\n",
831 "scm_read_syntax", chr
);
835 p
= scm_cons2 (p
, scm_read_expression (port
), SCM_EOL
);
836 if (SCM_RECORD_POSITIONS_P
)
837 scm_whash_insert (scm_source_whash
, p
,
838 scm_make_srcprops (line
, column
,
841 ? (scm_cons2 (SCM_CAR (p
),
842 SCM_CAR (SCM_CDR (p
)),
852 scm_read_nil (int chr
, SCM port
)
854 SCM id
= scm_read_mixed_case_symbol (chr
, port
);
856 if (!scm_is_eq (id
, sym_nil
))
857 scm_i_input_error ("scm_read_nil", port
,
858 "unexpected input while reading #nil: ~a",
861 return SCM_ELISP_NIL
;
865 scm_read_semicolon_comment (int chr
, SCM port
)
869 /* We use the get_byte here because there is no need to get the
870 locale correct with comment input. This presumes that newline
871 always represents itself no matter what the encoding is. */
872 for (c
= scm_get_byte_or_eof (port
);
873 (c
!= EOF
) && (c
!= '\n');
874 c
= scm_get_byte_or_eof (port
));
876 return SCM_UNSPECIFIED
;
880 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
883 scm_read_boolean (int chr
, SCM port
)
896 return SCM_UNSPECIFIED
;
900 scm_read_character (scm_t_wchar chr
, SCM port
)
901 #define FUNC_NAME "scm_lreadr"
903 char buffer
[READER_CHAR_NAME_MAX_SIZE
];
905 size_t charname_len
, bytes_read
;
910 overflow
= read_token (port
, buffer
, READER_CHAR_NAME_MAX_SIZE
, &bytes_read
);
916 chr
= scm_getc (port
);
918 scm_i_input_error (FUNC_NAME
, port
, "unexpected end of file "
919 "while reading character", SCM_EOL
);
921 /* CHR must be a token delimiter, like a whitespace. */
922 return (SCM_MAKE_CHAR (chr
));
925 pt
= SCM_PTAB_ENTRY (port
);
927 /* Simple ASCII characters can be processed immediately. Also, simple
928 ISO-8859-1 characters can be processed immediately if the encoding for this
929 port is ISO-8859-1. */
930 if (bytes_read
== 1 && ((unsigned char) buffer
[0] <= 127 || pt
->encoding
== NULL
))
933 return SCM_MAKE_CHAR (buffer
[0]);
936 /* Otherwise, convert the buffer into a proper scheme string for
938 charname
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
,
940 charname_len
= scm_i_string_length (charname
);
941 SCM_COL (port
) += charname_len
;
942 cp
= scm_i_string_ref (charname
, 0);
943 if (charname_len
== 1)
944 return SCM_MAKE_CHAR (cp
);
946 /* Ignore dotted circles, which may be used to keep combining characters from
947 combining with the backslash in #\charname. */
948 if (cp
== SCM_CODEPOINT_DOTTED_CIRCLE
&& charname_len
== 2)
949 return SCM_MAKE_CHAR (scm_i_string_ref (charname
, 1));
951 if (cp
>= '0' && cp
< '8')
953 /* Dirk:FIXME:: This type of character syntax is not R5RS
954 * compliant. Further, it should be verified that the constant
955 * does only consist of octal digits. */
956 SCM p
= scm_string_to_number (charname
, scm_from_uint (8));
959 scm_t_wchar c
= SCM_I_INUM (p
);
960 if (SCM_IS_UNICODE_CHAR (c
))
961 return SCM_MAKE_CHAR (c
);
963 scm_i_input_error (FUNC_NAME
, port
,
964 "out-of-range octal character escape: ~a",
965 scm_list_1 (charname
));
969 if (cp
== 'x' && (charname_len
> 1) && SCM_R6RS_ESCAPES_P
)
973 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
974 p
= scm_string_to_number (scm_c_substring (charname
, 1, charname_len
),
978 scm_t_wchar c
= SCM_I_INUM (p
);
979 if (SCM_IS_UNICODE_CHAR (c
))
980 return SCM_MAKE_CHAR (c
);
982 scm_i_input_error (FUNC_NAME
, port
,
983 "out-of-range hex character escape: ~a",
984 scm_list_1 (charname
));
988 /* The names of characters should never have non-Latin1
990 if (scm_i_is_narrow_string (charname
)
991 || scm_i_try_narrow_string (charname
))
992 { SCM ch
= scm_i_charname_to_char (scm_i_string_chars (charname
),
994 if (scm_is_true (ch
))
999 scm_i_input_error (FUNC_NAME
, port
, "unknown character name ~a",
1000 scm_list_1 (charname
));
1002 return SCM_UNSPECIFIED
;
1007 scm_read_keyword (int chr
, SCM port
)
1011 /* Read the symbol that comprises the keyword. Doing this instead of
1012 invoking a specific symbol reader function allows `scm_read_keyword ()'
1013 to adapt to the delimiters currently valid of symbols.
1015 XXX: This implementation allows sloppy syntaxes like `#: key'. */
1016 symbol
= scm_read_expression (port
);
1017 if (!scm_is_symbol (symbol
))
1018 scm_i_input_error ("scm_read_keyword", port
,
1019 "keyword prefix `~a' not followed by a symbol: ~s",
1020 scm_list_2 (SCM_MAKE_CHAR (chr
), symbol
));
1022 return (scm_symbol_to_keyword (symbol
));
1026 scm_read_vector (int chr
, SCM port
)
1028 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1029 guarantee that it's going to do what we want. After all, this is an
1030 implementation detail of `scm_read_vector ()', not a desirable
1032 return (scm_vector (scm_read_sexp (chr
, port
)));
1036 scm_read_srfi4_vector (int chr
, SCM port
)
1038 return scm_i_read_array (port
, chr
);
1042 scm_read_bytevector (scm_t_wchar chr
, SCM port
)
1044 chr
= scm_getc (port
);
1048 chr
= scm_getc (port
);
1052 chr
= scm_getc (port
);
1056 return scm_u8_list_to_bytevector (scm_read_sexp (chr
, port
));
1059 scm_i_input_error ("read_bytevector", port
,
1060 "invalid bytevector prefix",
1061 SCM_MAKE_CHAR (chr
));
1062 return SCM_UNSPECIFIED
;
1066 scm_read_guile_bit_vector (scm_t_wchar chr
, SCM port
)
1068 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1069 terribly inefficient but who cares? */
1070 SCM s_bits
= SCM_EOL
;
1072 for (chr
= scm_getc (port
);
1073 (chr
!= EOF
) && ((chr
== '0') || (chr
== '1'));
1074 chr
= scm_getc (port
))
1076 s_bits
= scm_cons ((chr
== '0') ? SCM_BOOL_F
: SCM_BOOL_T
, s_bits
);
1080 scm_ungetc (chr
, port
);
1082 return scm_bitvector (scm_reverse_x (s_bits
, SCM_EOL
));
1086 scm_read_scsh_block_comment (scm_t_wchar chr
, SCM port
)
1090 /* We can use the get_byte here because there is no need to get the
1091 locale correct when reading comments. This presumes that
1092 hash and exclamation points always represent themselves no
1093 matter what the source encoding is.*/
1096 int c
= scm_get_byte_or_eof (port
);
1099 scm_i_input_error ("skip_block_comment", port
,
1100 "unterminated `#! ... !#' comment", SCM_EOL
);
1104 else if (c
== '#' && bang_seen
)
1110 return SCM_UNSPECIFIED
;
1114 scm_read_shebang (scm_t_wchar chr
, SCM port
)
1117 if ((c
= scm_get_byte_or_eof (port
)) != 'r')
1119 scm_ungetc (c
, port
);
1120 return scm_read_scsh_block_comment (chr
, port
);
1122 if ((c
= scm_get_byte_or_eof (port
)) != '6')
1124 scm_ungetc (c
, port
);
1125 scm_ungetc ('r', port
);
1126 return scm_read_scsh_block_comment (chr
, port
);
1128 if ((c
= scm_get_byte_or_eof (port
)) != 'r')
1130 scm_ungetc (c
, port
);
1131 scm_ungetc ('6', port
);
1132 scm_ungetc ('r', port
);
1133 return scm_read_scsh_block_comment (chr
, port
);
1135 if ((c
= scm_get_byte_or_eof (port
)) != 's')
1137 scm_ungetc (c
, port
);
1138 scm_ungetc ('r', port
);
1139 scm_ungetc ('6', port
);
1140 scm_ungetc ('r', port
);
1141 return scm_read_scsh_block_comment (chr
, port
);
1144 return SCM_UNSPECIFIED
;
1148 scm_read_r6rs_block_comment (scm_t_wchar chr
, SCM port
)
1150 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1151 nested. So care must be taken. */
1152 int nesting_level
= 1;
1153 int opening_seen
= 0, closing_seen
= 0;
1155 while (nesting_level
> 0)
1157 int c
= scm_getc (port
);
1160 scm_i_input_error ("scm_read_r6rs_block_comment", port
,
1161 "unterminated `#| ... |#' comment", SCM_EOL
);
1169 else if (closing_seen
)
1180 opening_seen
= closing_seen
= 0;
1183 return SCM_UNSPECIFIED
;
1187 scm_read_commented_expression (scm_t_wchar chr
, SCM port
)
1191 c
= flush_ws (port
, (char *) NULL
);
1193 scm_i_input_error ("read_commented_expression", port
,
1194 "no expression after #; comment", SCM_EOL
);
1195 scm_ungetc (c
, port
);
1196 scm_read_expression (port
);
1197 return SCM_UNSPECIFIED
;
1201 scm_read_extended_symbol (scm_t_wchar chr
, SCM port
)
1203 /* Guile's extended symbol read syntax looks like this:
1205 #{This is all a symbol name}#
1207 So here, CHR is expected to be `{'. */
1208 int saw_brace
= 0, finished
= 0;
1210 SCM buf
= scm_i_make_string (1024, NULL
);
1212 buf
= scm_i_string_start_writing (buf
);
1214 while ((chr
= scm_getc (port
)) != EOF
)
1226 scm_i_string_set_x (buf
, len
++, '}');
1227 scm_i_string_set_x (buf
, len
++, chr
);
1230 else if (chr
== '}')
1233 scm_i_string_set_x (buf
, len
++, chr
);
1235 if (len
>= scm_i_string_length (buf
) - 2)
1239 scm_i_string_stop_writing ();
1240 addy
= scm_i_make_string (1024, NULL
);
1241 buf
= scm_string_append (scm_list_2 (buf
, addy
));
1243 buf
= scm_i_string_start_writing (buf
);
1249 scm_i_string_stop_writing ();
1251 return (scm_string_to_symbol (scm_c_substring (buf
, 0, len
)));
1256 /* Top-level token readers, i.e., dispatchers. */
1259 scm_read_sharp_extension (int chr
, SCM port
)
1263 proc
= scm_get_hash_procedure (chr
);
1264 if (scm_is_true (scm_procedure_p (proc
)))
1266 long line
= SCM_LINUM (port
);
1267 int column
= SCM_COL (port
) - 2;
1270 got
= scm_call_2 (proc
, SCM_MAKE_CHAR (chr
), port
);
1271 if (!scm_is_eq (got
, SCM_UNSPECIFIED
))
1273 if (SCM_RECORD_POSITIONS_P
)
1274 return (recsexpr (got
, line
, column
,
1275 SCM_FILENAME (port
)));
1281 return SCM_UNSPECIFIED
;
1284 /* The reader for the sharp `#' character. It basically dispatches reads
1285 among the above token readers. */
1287 scm_read_sharp (scm_t_wchar chr
, SCM port
)
1288 #define FUNC_NAME "scm_lreadr"
1292 chr
= scm_getc (port
);
1294 result
= scm_read_sharp_extension (chr
, port
);
1295 if (!scm_is_eq (result
, SCM_UNSPECIFIED
))
1301 return (scm_read_character (chr
, port
));
1303 return (scm_read_vector (chr
, port
));
1307 /* This one may return either a boolean or an SRFI-4 vector. */
1308 return (scm_read_srfi4_vector (chr
, port
));
1310 return (scm_read_bytevector (chr
, port
));
1312 return (scm_read_guile_bit_vector (chr
, port
));
1316 /* This one may return either a boolean or an SRFI-4 vector. */
1317 return (scm_read_boolean (chr
, port
));
1319 return (scm_read_keyword (chr
, port
));
1320 case '0': case '1': case '2': case '3': case '4':
1321 case '5': case '6': case '7': case '8': case '9':
1323 #if SCM_ENABLE_DEPRECATED
1324 /* See below for 'i' and 'e'. */
1331 return (scm_i_read_array (port
, chr
));
1335 #if SCM_ENABLE_DEPRECATED
1337 /* When next char is '(', it really is an old-style
1339 scm_t_wchar next_c
= scm_getc (port
);
1341 scm_ungetc (next_c
, port
);
1343 return scm_i_read_array (port
, chr
);
1357 return (scm_read_number_and_radix (chr
, port
));
1359 return (scm_read_extended_symbol (chr
, port
));
1361 return (scm_read_shebang (chr
, port
));
1363 return (scm_read_commented_expression (chr
, port
));
1367 return (scm_read_syntax (chr
, port
));
1369 return (scm_read_nil (chr
, port
));
1371 result
= scm_read_sharp_extension (chr
, port
);
1372 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1374 /* To remain compatible with 1.8 and earlier, the following
1375 characters have lower precedence than `read-hash-extend'
1380 return scm_read_r6rs_block_comment (chr
, port
);
1382 scm_i_input_error (FUNC_NAME
, port
, "Unknown # object: ~S",
1383 scm_list_1 (SCM_MAKE_CHAR (chr
)));
1390 return SCM_UNSPECIFIED
;
1395 scm_read_expression (SCM port
)
1396 #define FUNC_NAME "scm_read_expression"
1400 register scm_t_wchar chr
;
1402 chr
= scm_getc (port
);
1406 case SCM_WHITE_SPACES
:
1407 case SCM_LINE_INCREMENTORS
:
1410 (void) scm_read_semicolon_comment (chr
, port
);
1413 if (!SCM_SQUARE_BRACKETS_P
)
1414 return (scm_read_mixed_case_symbol (chr
, port
));
1415 /* otherwise fall through */
1417 return (scm_read_sexp (chr
, port
));
1419 return (scm_read_string (chr
, port
));
1423 return (scm_read_quote (chr
, port
));
1427 result
= scm_read_sharp (chr
, port
);
1428 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1429 /* We read a comment or some such. */
1435 scm_i_input_error (FUNC_NAME
, port
, "unexpected \")\"", SCM_EOL
);
1440 if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE
), scm_keyword_prefix
))
1441 return scm_symbol_to_keyword (scm_read_expression (port
));
1446 if (((chr
>= '0') && (chr
<= '9'))
1447 || (strchr ("+-.", chr
)))
1448 return (scm_read_number (chr
, port
));
1450 return (scm_read_mixed_case_symbol (chr
, port
));
1458 /* Actual reader. */
1460 SCM_DEFINE (scm_read
, "read", 0, 1, 0,
1462 "Read an s-expression from the input port @var{port}, or from\n"
1463 "the current input port if @var{port} is not specified.\n"
1464 "Any whitespace before the next token is discarded.")
1465 #define FUNC_NAME s_scm_read
1469 if (SCM_UNBNDP (port
))
1470 port
= scm_current_input_port ();
1471 SCM_VALIDATE_OPINPORT (1, port
);
1473 c
= flush_ws (port
, (char *) NULL
);
1476 scm_ungetc (c
, port
);
1478 return (scm_read_expression (port
));
1485 /* Used when recording expressions constructed by `scm_read_sharp ()'. */
1487 recsexpr (SCM obj
, long line
, int column
, SCM filename
)
1489 if (!scm_is_pair(obj
)) {
1492 SCM tmp
= obj
, copy
;
1493 /* If this sexpr is visible in the read:sharp source, we want to
1494 keep that information, so only record non-constant cons cells
1495 which haven't previously been read by the reader. */
1496 if (scm_is_false (scm_whash_lookup (scm_source_whash
, obj
)))
1498 if (SCM_COPY_SOURCE_P
)
1500 copy
= scm_cons (recsexpr (SCM_CAR (obj
), line
, column
, filename
),
1502 while ((tmp
= SCM_CDR (tmp
)) && scm_is_pair (tmp
))
1504 SCM_SETCDR (copy
, scm_cons (recsexpr (SCM_CAR (tmp
),
1509 copy
= SCM_CDR (copy
);
1511 SCM_SETCDR (copy
, tmp
);
1515 recsexpr (SCM_CAR (obj
), line
, column
, filename
);
1516 while ((tmp
= SCM_CDR (tmp
)) && scm_is_pair (tmp
))
1517 recsexpr (SCM_CAR (tmp
), line
, column
, filename
);
1518 copy
= SCM_UNDEFINED
;
1520 scm_whash_insert (scm_source_whash
,
1522 scm_make_srcprops (line
,
1532 /* Manipulate the read-hash-procedures alist. This could be written in
1533 Scheme, but maybe it will also be used by C code during initialisation. */
1534 SCM_DEFINE (scm_read_hash_extend
, "read-hash-extend", 2, 0, 0,
1535 (SCM chr
, SCM proc
),
1536 "Install the procedure @var{proc} for reading expressions\n"
1537 "starting with the character sequence @code{#} and @var{chr}.\n"
1538 "@var{proc} will be called with two arguments: the character\n"
1539 "@var{chr} and the port to read further data from. The object\n"
1540 "returned will be the return value of @code{read}. \n"
1541 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1543 #define FUNC_NAME s_scm_read_hash_extend
1548 SCM_VALIDATE_CHAR (1, chr
);
1549 SCM_ASSERT (scm_is_false (proc
)
1550 || scm_is_eq (scm_procedure_p (proc
), SCM_BOOL_T
),
1551 proc
, SCM_ARG2
, FUNC_NAME
);
1553 /* Check if chr is already in the alist. */
1554 this = *scm_read_hash_procedures
;
1558 if (scm_is_null (this))
1560 /* not found, so add it to the beginning. */
1561 if (scm_is_true (proc
))
1563 *scm_read_hash_procedures
=
1564 scm_cons (scm_cons (chr
, proc
), *scm_read_hash_procedures
);
1568 if (scm_is_eq (chr
, SCM_CAAR (this)))
1570 /* already in the alist. */
1571 if (scm_is_false (proc
))
1574 if (scm_is_false (prev
))
1576 *scm_read_hash_procedures
=
1577 SCM_CDR (*scm_read_hash_procedures
);
1580 scm_set_cdr_x (prev
, SCM_CDR (this));
1585 scm_set_cdr_x (SCM_CAR (this), proc
);
1590 this = SCM_CDR (this);
1593 return SCM_UNSPECIFIED
;
1597 /* Recover the read-hash procedure corresponding to char c. */
1599 scm_get_hash_procedure (int c
)
1601 SCM rest
= *scm_read_hash_procedures
;
1605 if (scm_is_null (rest
))
1608 if (SCM_CHAR (SCM_CAAR (rest
)) == c
)
1609 return SCM_CDAR (rest
);
1611 rest
= SCM_CDR (rest
);
1615 #define SCM_ENCODING_SEARCH_SIZE (500)
1617 /* Search the first few hundred characters of a file for an Emacs-like coding
1618 declaration. Returns either NULL or a string whose storage has been
1619 allocated with `scm_gc_malloc ()'. */
1621 scm_i_scan_for_encoding (SCM port
)
1623 char header
[SCM_ENCODING_SEARCH_SIZE
+1];
1625 char *encoding
= NULL
;
1631 if (SCM_FPORTP (port
) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port
)))
1632 /* PORT is a non-seekable file port (e.g., as created by Bash when using
1633 "guile <(echo '(display "hello")')") so bail out. */
1636 bytes_read
= scm_c_read (port
, header
, SCM_ENCODING_SEARCH_SIZE
);
1638 scm_seek (port
, scm_from_int (0), scm_from_int (SEEK_SET
));
1641 && header
[0] == '\xef' && header
[1] == '\xbb' && header
[2] == '\xbf')
1644 /* search past "coding[:=]" */
1648 if ((pos
= strstr(pos
, "coding")) == NULL
)
1651 pos
+= strlen("coding");
1652 if (pos
- header
>= SCM_ENCODING_SEARCH_SIZE
||
1653 (*pos
== ':' || *pos
== '='))
1661 while (pos
- header
<= SCM_ENCODING_SEARCH_SIZE
&&
1662 (*pos
== ' ' || *pos
== '\t'))
1665 /* grab the next token */
1667 while (pos
+ i
- header
<= SCM_ENCODING_SEARCH_SIZE
1668 && pos
+ i
- header
< bytes_read
1669 && (isalnum ((int) pos
[i
]) || strchr ("_-.:/,+=()", pos
[i
]) != NULL
))
1675 encoding
= scm_gc_strndup (pos
, i
, "encoding");
1676 for (i
= 0; i
< strlen (encoding
); i
++)
1677 encoding
[i
] = toupper ((int) encoding
[i
]);
1679 /* push backwards to make sure we were in a comment */
1681 while (pos
- i
- header
> 0)
1683 if (*(pos
- i
) == '\n')
1685 /* This wasn't in a semicolon comment. Check for a
1686 hash-bang comment. */
1687 char *beg
= strstr (header
, "#!");
1688 char *end
= strstr (header
, "!#");
1689 if (beg
< pos
&& pos
< end
)
1693 if (*(pos
- i
) == ';')
1701 /* This wasn't in a comment */
1704 if (utf8_bom
&& strcmp(encoding
, "UTF-8"))
1705 scm_misc_error (NULL
,
1706 "the port input declares the encoding ~s but is encoded as UTF-8",
1707 scm_list_1 (scm_from_locale_string (encoding
)));
1712 SCM_DEFINE (scm_file_encoding
, "file-encoding", 1, 0, 0,
1714 "Scans the port for an Emacs-like character coding declaration\n"
1715 "near the top of the contents of a port with random-acessible contents.\n"
1716 "The coding declaration is of the form\n"
1717 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1719 "Returns a string containing the character encoding of the file\n"
1720 "if a declaration was found, or @code{#f} otherwise.\n")
1721 #define FUNC_NAME s_scm_file_encoding
1726 enc
= scm_i_scan_for_encoding (port
);
1731 s_enc
= scm_from_locale_string (enc
);
1742 scm_read_hash_procedures
=
1743 SCM_VARIABLE_LOC (scm_c_define ("read-hash-procedures", SCM_EOL
));
1745 scm_init_opts (scm_read_options
, scm_read_opts
);
1746 #include "libguile/read.x"