1 /* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009 Free Software
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
33 #include "libguile/_scm.h"
34 #include "libguile/bytevectors.h"
35 #include "libguile/chars.h"
36 #include "libguile/eval.h"
37 #include "libguile/arrays.h"
38 #include "libguile/bitvectors.h"
39 #include "libguile/keywords.h"
40 #include "libguile/alist.h"
41 #include "libguile/srcprop.h"
42 #include "libguile/hashtab.h"
43 #include "libguile/hash.h"
44 #include "libguile/ports.h"
45 #include "libguile/fports.h"
46 #include "libguile/root.h"
47 #include "libguile/strings.h"
48 #include "libguile/strports.h"
49 #include "libguile/vectors.h"
50 #include "libguile/validate.h"
51 #include "libguile/srfi-4.h"
52 #include "libguile/srfi-13.h"
54 #include "libguile/read.h"
55 #include "libguile/private-options.h"
60 SCM_GLOBAL_SYMBOL (scm_sym_dot
, ".");
61 SCM_SYMBOL (scm_keyword_prefix
, "prefix");
62 SCM_SYMBOL (scm_keyword_postfix
, "postfix");
64 scm_t_option scm_read_opts
[] = {
65 { SCM_OPTION_BOOLEAN
, "copy", 0,
66 "Copy source code expressions." },
67 { SCM_OPTION_BOOLEAN
, "positions", 0,
68 "Record positions of source code expressions." },
69 { SCM_OPTION_BOOLEAN
, "case-insensitive", 0,
70 "Convert symbols to lower case."},
71 { SCM_OPTION_SCM
, "keywords", (unsigned long) SCM_BOOL_F
,
72 "Style of keyword recognition: #f, 'prefix or 'postfix."},
74 { SCM_OPTION_BOOLEAN
, "elisp-vectors", 0,
75 "Support Elisp vector syntax, namely `[...]'."},
76 { SCM_OPTION_BOOLEAN
, "elisp-strings", 0,
77 "Support `\\(' and `\\)' in strings."},
83 Give meaningful error messages for errors
87 FILE:LINE:COL: MESSAGE
90 This is not standard GNU format, but the test-suite likes the real
91 message to be in front.
97 scm_i_input_error (char const *function
,
98 SCM port
, const char *message
, SCM arg
)
100 SCM fn
= (scm_is_string (SCM_FILENAME(port
))
102 : scm_from_locale_string ("#<unknown port>"));
104 SCM string_port
= scm_open_output_string ();
105 SCM string
= SCM_EOL
;
106 scm_simple_format (string_port
,
107 scm_from_locale_string ("~A:~S:~S: ~A"),
109 scm_from_long (SCM_LINUM (port
) + 1),
110 scm_from_int (SCM_COL (port
) + 1),
111 scm_from_locale_string (message
)));
113 string
= scm_get_output_string (string_port
);
114 scm_close_output_port (string_port
);
115 scm_error_scm (scm_from_locale_symbol ("read-error"),
116 function
? scm_from_locale_string (function
) : SCM_BOOL_F
,
123 SCM_DEFINE (scm_read_options
, "read-options-interface", 0, 1, 0,
125 "Option interface for the read options. Instead of using\n"
126 "this procedure directly, use the procedures @code{read-enable},\n"
127 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
128 #define FUNC_NAME s_scm_read_options
130 SCM ans
= scm_options (setting
,
133 if (SCM_COPY_SOURCE_P
)
134 SCM_RECORD_POSITIONS_P
= 1;
139 /* An association list mapping extra hash characters to procedures. */
140 static SCM
*scm_read_hash_procedures
;
147 /* Size of the C buffer used to read symbols and numbers. */
148 #define READER_BUFFER_SIZE 128
150 /* Size of the C buffer used to read strings. */
151 #define READER_STRING_BUFFER_SIZE 512
153 /* The maximum size of Scheme character names. */
154 #define READER_CHAR_NAME_MAX_SIZE 50
157 /* `isblank' is only in C99. */
158 #define CHAR_IS_BLANK_(_chr) \
159 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
160 || ((_chr) == '\f') || ((_chr) == '\r'))
163 # define CHAR_IS_BLANK(_chr) \
164 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
166 # define CHAR_IS_BLANK CHAR_IS_BLANK_
170 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
172 #define CHAR_IS_R5RS_DELIMITER(c) \
174 || (c == ')') || (c == '(') || (c == ';') || (c == '"'))
176 #define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
178 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
180 #define CHAR_IS_EXPONENT_MARKER(_chr) \
181 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
182 || ((_chr) == 'd') || ((_chr) == 'l'))
184 /* Read an SCSH block comment. */
185 static inline SCM
scm_read_scsh_block_comment (scm_t_wchar
, SCM
);
186 static SCM
scm_read_r6rs_block_comment (scm_t_wchar
, SCM
);
187 static SCM
scm_read_commented_expression (scm_t_wchar
, SCM
);
188 static SCM
scm_get_hash_procedure (int);
190 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Return
191 zero if the whole token fits in BUF, non-zero otherwise. */
193 read_token (SCM port
, SCM buf
, size_t *read
)
198 buf
= scm_i_string_start_writing (buf
);
199 while (*read
< scm_i_string_length (buf
))
201 chr
= scm_getc (port
);
205 scm_i_string_stop_writing ();
209 chr
= (SCM_CASE_INSENSITIVE_P
? uc_tolower (chr
) : chr
);
211 if (CHAR_IS_DELIMITER (chr
))
213 scm_i_string_stop_writing ();
214 scm_ungetc (chr
, port
);
218 scm_i_string_set_x (buf
, *read
, chr
);
221 scm_i_string_stop_writing ();
227 read_complete_token (SCM port
, size_t *read
)
229 SCM buffer
, str
= SCM_EOL
;
233 buffer
= scm_i_make_string (READER_BUFFER_SIZE
, NULL
);
234 overflow
= read_token (port
, buffer
, read
);
236 return scm_i_substring (buffer
, 0, *read
);
238 str
= scm_string_copy (buffer
);
241 overflow
= read_token (port
, buffer
, &len
);
242 str
= scm_string_append (scm_list_2 (str
, buffer
));
247 return scm_i_substring (str
, 0, *read
);
250 /* Skip whitespace from PORT and return the first non-whitespace character
251 read. Raise an error on end-of-file. */
253 flush_ws (SCM port
, const char *eoferr
)
255 register scm_t_wchar c
;
257 switch (c
= scm_getc (port
))
263 scm_i_input_error (eoferr
,
272 switch (c
= scm_getc (port
))
278 case SCM_LINE_INCREMENTORS
:
284 switch (c
= scm_getc (port
))
287 eoferr
= "read_sharp";
290 scm_read_scsh_block_comment (c
, port
);
293 scm_read_commented_expression (c
, port
);
296 if (scm_is_false (scm_get_hash_procedure (c
)))
298 scm_read_r6rs_block_comment (c
, port
);
303 scm_ungetc (c
, port
);
308 case SCM_LINE_INCREMENTORS
:
309 case SCM_SINGLE_SPACES
:
324 static SCM
scm_read_expression (SCM port
);
325 static SCM
scm_read_sharp (int chr
, SCM port
);
326 static SCM
recsexpr (SCM obj
, long line
, int column
, SCM filename
);
330 scm_read_sexp (scm_t_wchar chr
, SCM port
)
331 #define FUNC_NAME "scm_i_lreadparen"
335 register SCM tl
, ans
= SCM_EOL
;
336 SCM tl2
= SCM_EOL
, ans2
= SCM_EOL
, copy
= SCM_BOOL_F
;
337 static const int terminating_char
= ')';
339 /* Need to capture line and column numbers here. */
340 long line
= SCM_LINUM (port
);
341 int column
= SCM_COL (port
) - 1;
344 c
= flush_ws (port
, FUNC_NAME
);
345 if (terminating_char
== c
)
348 scm_ungetc (c
, port
);
349 if (scm_is_eq (scm_sym_dot
,
350 (tmp
= scm_read_expression (port
))))
352 ans
= scm_read_expression (port
);
353 if (terminating_char
!= (c
= flush_ws (port
, FUNC_NAME
)))
354 scm_i_input_error (FUNC_NAME
, port
, "missing close paren",
359 /* Build the head of the list structure. */
360 ans
= tl
= scm_cons (tmp
, SCM_EOL
);
362 if (SCM_COPY_SOURCE_P
)
363 ans2
= tl2
= scm_cons (scm_is_pair (tmp
)
368 while (terminating_char
!= (c
= flush_ws (port
, FUNC_NAME
)))
372 scm_ungetc (c
, port
);
373 if (scm_is_eq (scm_sym_dot
,
374 (tmp
= scm_read_expression (port
))))
376 SCM_SETCDR (tl
, tmp
= scm_read_expression (port
));
378 if (SCM_COPY_SOURCE_P
)
379 SCM_SETCDR (tl2
, scm_cons (scm_is_pair (tmp
) ? copy
: tmp
,
382 c
= flush_ws (port
, FUNC_NAME
);
383 if (terminating_char
!= c
)
384 scm_i_input_error (FUNC_NAME
, port
,
385 "in pair: missing close paren", SCM_EOL
);
389 new_tail
= scm_cons (tmp
, SCM_EOL
);
390 SCM_SETCDR (tl
, new_tail
);
393 if (SCM_COPY_SOURCE_P
)
395 SCM new_tail2
= scm_cons (scm_is_pair (tmp
)
398 SCM_SETCDR (tl2
, new_tail2
);
404 if (SCM_RECORD_POSITIONS_P
)
405 scm_whash_insert (scm_source_whash
,
407 scm_make_srcprops (line
, column
,
418 scm_read_string (int chr
, SCM port
)
419 #define FUNC_NAME "scm_lreadr"
421 /* For strings smaller than C_STR, this function creates only one Scheme
422 object (the string returned). */
424 SCM str
= SCM_BOOL_F
;
425 unsigned c_str_len
= 0;
428 str
= scm_i_make_string (READER_STRING_BUFFER_SIZE
, NULL
);
429 while ('"' != (c
= scm_getc (port
)))
434 scm_i_input_error (FUNC_NAME
, port
,
435 "end of file in string constant", SCM_EOL
);
438 if (c_str_len
+ 1 >= scm_i_string_length (str
))
440 SCM addy
= scm_i_make_string (READER_STRING_BUFFER_SIZE
, NULL
);
442 str
= scm_string_append (scm_list_2 (str
, addy
));
447 switch (c
= scm_getc (port
))
457 if (SCM_ESCAPED_PARENS_P
)
493 if ('0' <= a
&& a
<= '9')
495 else if ('A' <= a
&& a
<= 'F')
497 else if ('a' <= a
&& a
<= 'f')
504 if ('0' <= b
&& b
<= '9')
506 else if ('A' <= b
&& b
<= 'F')
508 else if ('a' <= b
&& b
<= 'f')
523 for (i
= 0; i
< 4; i
++)
528 if ('0' <= a
&& a
<= '9')
530 else if ('A' <= a
&& a
<= 'F')
532 else if ('a' <= a
&& a
<= 'f')
548 for (i
= 0; i
< 6; i
++)
553 if ('0' <= a
&& a
<= '9')
555 else if ('A' <= a
&& a
<= 'F')
557 else if ('a' <= a
&& a
<= 'f')
570 scm_i_input_error (FUNC_NAME
, port
,
571 "illegal character in escape sequence: ~S",
572 scm_list_1 (SCM_MAKE_CHAR (c
)));
575 str
= scm_i_string_start_writing (str
);
576 scm_i_string_set_x (str
, c_str_len
++, c
);
577 scm_i_string_stop_writing ();
582 return scm_i_substring_copy (str
, 0, c_str_len
);
591 scm_read_number (scm_t_wchar chr
, SCM port
)
597 scm_ungetc (chr
, port
);
598 buffer
= read_complete_token (port
, &read
);
599 result
= scm_string_to_number (buffer
, SCM_UNDEFINED
);
600 if (!scm_is_true (result
))
601 /* Return a symbol instead of a number. */
602 result
= scm_string_to_symbol (buffer
);
608 scm_read_mixed_case_symbol (scm_t_wchar chr
, SCM port
)
611 int ends_with_colon
= 0;
614 int postfix
= scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE
), scm_keyword_postfix
);
616 scm_ungetc (chr
, port
);
617 buffer
= read_complete_token (port
, &read
);
619 ends_with_colon
= scm_i_string_ref (buffer
, read
- 1) == ':';
621 if (postfix
&& ends_with_colon
&& (read
> 1))
622 result
= scm_symbol_to_keyword (scm_string_to_symbol (scm_i_substring (buffer
, 0, read
- 1)));
624 result
= scm_string_to_symbol (buffer
);
630 scm_read_number_and_radix (scm_t_wchar chr
, SCM port
)
631 #define FUNC_NAME "scm_lreadr"
635 SCM buffer
= scm_i_make_string (READER_BUFFER_SIZE
, NULL
);
661 scm_ungetc (chr
, port
);
662 scm_ungetc ('#', port
);
666 buffer
= read_complete_token (port
, &read
);
667 result
= scm_string_to_number (buffer
, scm_from_uint (radix
));
669 if (scm_is_true (result
))
672 scm_i_input_error (FUNC_NAME
, port
, "unknown # object", SCM_EOL
);
679 scm_read_quote (int chr
, SCM port
)
682 long line
= SCM_LINUM (port
);
683 int column
= SCM_COL (port
) - 1;
688 p
= scm_sym_quasiquote
;
701 p
= scm_sym_uq_splicing
;
704 scm_ungetc (c
, port
);
711 fprintf (stderr
, "%s: unhandled quote character (%i)\n",
712 "scm_read_quote", chr
);
716 p
= scm_cons2 (p
, scm_read_expression (port
), SCM_EOL
);
717 if (SCM_RECORD_POSITIONS_P
)
718 scm_whash_insert (scm_source_whash
, p
,
719 scm_make_srcprops (line
, column
,
722 ? (scm_cons2 (SCM_CAR (p
),
723 SCM_CAR (SCM_CDR (p
)),
732 SCM_SYMBOL (sym_syntax
, "syntax");
733 SCM_SYMBOL (sym_quasisyntax
, "quasisyntax");
734 SCM_SYMBOL (sym_unsyntax
, "unsyntax");
735 SCM_SYMBOL (sym_unsyntax_splicing
, "unsyntax-splicing");
738 scm_read_syntax (int chr
, SCM port
)
741 long line
= SCM_LINUM (port
);
742 int column
= SCM_COL (port
) - 1;
760 p
= sym_unsyntax_splicing
;
763 scm_ungetc (c
, port
);
770 fprintf (stderr
, "%s: unhandled syntax character (%i)\n",
771 "scm_read_syntax", chr
);
775 p
= scm_cons2 (p
, scm_read_expression (port
), SCM_EOL
);
776 if (SCM_RECORD_POSITIONS_P
)
777 scm_whash_insert (scm_source_whash
, p
,
778 scm_make_srcprops (line
, column
,
781 ? (scm_cons2 (SCM_CAR (p
),
782 SCM_CAR (SCM_CDR (p
)),
792 scm_read_semicolon_comment (int chr
, SCM port
)
796 /* We use the get_byte here because there is no need to get the
797 locale correct with comment input. This presumes that newline
798 always represents itself no matter what the encoding is. */
799 for (c
= scm_get_byte_or_eof (port
);
800 (c
!= EOF
) && (c
!= '\n');
801 c
= scm_getc (port
));
803 return SCM_UNSPECIFIED
;
807 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
810 scm_read_boolean (int chr
, SCM port
)
823 return SCM_UNSPECIFIED
;
827 scm_read_character (scm_t_wchar chr
, SCM port
)
828 #define FUNC_NAME "scm_lreadr"
830 SCM charname
= scm_i_make_string (READER_CHAR_NAME_MAX_SIZE
, NULL
);
835 overflow
= read_token (port
, charname
, &charname_len
);
836 charname
= scm_c_substring (charname
, 0, charname_len
);
841 if (charname_len
== 0)
843 chr
= scm_getc (port
);
845 scm_i_input_error (FUNC_NAME
, port
, "unexpected end of file "
846 "while reading character", SCM_EOL
);
848 /* CHR must be a token delimiter, like a whitespace. */
849 return (SCM_MAKE_CHAR (chr
));
852 if (charname_len
== 1)
853 return SCM_MAKE_CHAR (scm_i_string_ref (charname
, 0));
855 cp
= scm_i_string_ref (charname
, 0);
856 if (cp
== SCM_CODEPOINT_DOTTED_CIRCLE
&& charname_len
== 2)
857 return SCM_MAKE_CHAR (scm_i_string_ref (charname
, 1));
859 if (cp
>= '0' && cp
< '8')
861 /* Dirk:FIXME:: This type of character syntax is not R5RS
862 * compliant. Further, it should be verified that the constant
863 * does only consist of octal digits. */
864 SCM p
= scm_string_to_number (charname
, scm_from_uint (8));
867 scm_t_wchar c
= SCM_I_INUM (p
);
868 if (SCM_IS_UNICODE_CHAR (c
))
869 return SCM_MAKE_CHAR (c
);
871 scm_i_input_error (FUNC_NAME
, port
,
872 "out-of-range octal character escape: ~a",
873 scm_list_1 (charname
));
877 /* The names of characters should never have non-Latin1
879 if (scm_i_is_narrow_string (charname
)
880 || scm_i_try_narrow_string (charname
))
881 { SCM ch
= scm_i_charname_to_char (scm_i_string_chars (charname
),
883 if (scm_is_true (ch
))
888 scm_i_input_error (FUNC_NAME
, port
, "unknown character name ~a",
889 scm_list_1 (charname
));
891 return SCM_UNSPECIFIED
;
896 scm_read_keyword (int chr
, SCM port
)
900 /* Read the symbol that comprises the keyword. Doing this instead of
901 invoking a specific symbol reader function allows `scm_read_keyword ()'
902 to adapt to the delimiters currently valid of symbols.
904 XXX: This implementation allows sloppy syntaxes like `#: key'. */
905 symbol
= scm_read_expression (port
);
906 if (!scm_is_symbol (symbol
))
907 scm_i_input_error ("scm_read_keyword", port
,
908 "keyword prefix `~a' not followed by a symbol: ~s",
909 scm_list_2 (SCM_MAKE_CHAR (chr
), symbol
));
911 return (scm_symbol_to_keyword (symbol
));
915 scm_read_vector (int chr
, SCM port
)
917 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
918 guarantee that it's going to do what we want. After all, this is an
919 implementation detail of `scm_read_vector ()', not a desirable
921 return (scm_vector (scm_read_sexp (chr
, port
)));
925 scm_read_srfi4_vector (int chr
, SCM port
)
927 return scm_i_read_array (port
, chr
);
931 scm_read_bytevector (scm_t_wchar chr
, SCM port
)
933 chr
= scm_getc (port
);
937 chr
= scm_getc (port
);
941 chr
= scm_getc (port
);
945 return scm_u8_list_to_bytevector (scm_read_sexp (chr
, port
));
948 scm_i_input_error ("read_bytevector", port
,
949 "invalid bytevector prefix",
950 SCM_MAKE_CHAR (chr
));
951 return SCM_UNSPECIFIED
;
955 scm_read_guile_bit_vector (scm_t_wchar chr
, SCM port
)
957 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
958 terribly inefficient but who cares? */
959 SCM s_bits
= SCM_EOL
;
961 for (chr
= scm_getc (port
);
962 (chr
!= EOF
) && ((chr
== '0') || (chr
== '1'));
963 chr
= scm_getc (port
))
965 s_bits
= scm_cons ((chr
== '0') ? SCM_BOOL_F
: SCM_BOOL_T
, s_bits
);
969 scm_ungetc (chr
, port
);
971 return scm_bitvector (scm_reverse_x (s_bits
, SCM_EOL
));
975 scm_read_scsh_block_comment (scm_t_wchar chr
, SCM port
)
979 /* We can use the get_byte here because there is no need to get the
980 locale correct when reading comments. This presumes that
981 hash and exclamation points always represent themselves no
982 matter what the source encoding is.*/
985 int c
= scm_get_byte_or_eof (port
);
988 scm_i_input_error ("skip_block_comment", port
,
989 "unterminated `#! ... !#' comment", SCM_EOL
);
993 else if (c
== '#' && bang_seen
)
999 return SCM_UNSPECIFIED
;
1003 scm_read_r6rs_block_comment (scm_t_wchar chr
, SCM port
)
1005 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1006 nested. So care must be taken. */
1007 int nesting_level
= 1;
1008 int opening_seen
= 0, closing_seen
= 0;
1010 while (nesting_level
> 0)
1012 int c
= scm_getc (port
);
1015 scm_i_input_error ("scm_read_r6rs_block_comment", port
,
1016 "unterminated `#| ... |#' comment", SCM_EOL
);
1024 else if (closing_seen
)
1035 opening_seen
= closing_seen
= 0;
1038 return SCM_UNSPECIFIED
;
1042 scm_read_commented_expression (scm_t_wchar chr
, SCM port
)
1046 c
= flush_ws (port
, (char *) NULL
);
1048 scm_i_input_error ("read_commented_expression", port
,
1049 "no expression after #; comment", SCM_EOL
);
1050 scm_ungetc (c
, port
);
1051 scm_read_expression (port
);
1052 return SCM_UNSPECIFIED
;
1056 scm_read_extended_symbol (scm_t_wchar chr
, SCM port
)
1058 /* Guile's extended symbol read syntax looks like this:
1060 #{This is all a symbol name}#
1062 So here, CHR is expected to be `{'. */
1063 int saw_brace
= 0, finished
= 0;
1065 SCM buf
= scm_i_make_string (1024, NULL
);
1067 buf
= scm_i_string_start_writing (buf
);
1069 while ((chr
= scm_getc (port
)) != EOF
)
1081 scm_i_string_set_x (buf
, len
++, '}');
1082 scm_i_string_set_x (buf
, len
++, chr
);
1085 else if (chr
== '}')
1088 scm_i_string_set_x (buf
, len
++, chr
);
1090 if (len
>= scm_i_string_length (buf
) - 2)
1094 scm_i_string_stop_writing ();
1095 addy
= scm_i_make_string (1024, NULL
);
1096 buf
= scm_string_append (scm_list_2 (buf
, addy
));
1098 buf
= scm_i_string_start_writing (buf
);
1104 scm_i_string_stop_writing ();
1106 return (scm_string_to_symbol (scm_c_substring (buf
, 0, len
)));
1111 /* Top-level token readers, i.e., dispatchers. */
1114 scm_read_sharp_extension (int chr
, SCM port
)
1118 proc
= scm_get_hash_procedure (chr
);
1119 if (scm_is_true (scm_procedure_p (proc
)))
1121 long line
= SCM_LINUM (port
);
1122 int column
= SCM_COL (port
) - 2;
1125 got
= scm_call_2 (proc
, SCM_MAKE_CHAR (chr
), port
);
1126 if (!scm_is_eq (got
, SCM_UNSPECIFIED
))
1128 if (SCM_RECORD_POSITIONS_P
)
1129 return (recsexpr (got
, line
, column
,
1130 SCM_FILENAME (port
)));
1136 return SCM_UNSPECIFIED
;
1139 /* The reader for the sharp `#' character. It basically dispatches reads
1140 among the above token readers. */
1142 scm_read_sharp (scm_t_wchar chr
, SCM port
)
1143 #define FUNC_NAME "scm_lreadr"
1147 chr
= scm_getc (port
);
1149 result
= scm_read_sharp_extension (chr
, port
);
1150 if (!scm_is_eq (result
, SCM_UNSPECIFIED
))
1156 return (scm_read_character (chr
, port
));
1158 return (scm_read_vector (chr
, port
));
1162 /* This one may return either a boolean or an SRFI-4 vector. */
1163 return (scm_read_srfi4_vector (chr
, port
));
1165 return (scm_read_bytevector (chr
, port
));
1167 return (scm_read_guile_bit_vector (chr
, port
));
1171 /* This one may return either a boolean or an SRFI-4 vector. */
1172 return (scm_read_boolean (chr
, port
));
1174 return (scm_read_keyword (chr
, port
));
1175 case '0': case '1': case '2': case '3': case '4':
1176 case '5': case '6': case '7': case '8': case '9':
1178 #if SCM_ENABLE_DEPRECATED
1179 /* See below for 'i' and 'e'. */
1186 return (scm_i_read_array (port
, chr
));
1190 #if SCM_ENABLE_DEPRECATED
1192 /* When next char is '(', it really is an old-style
1194 scm_t_wchar next_c
= scm_getc (port
);
1196 scm_ungetc (next_c
, port
);
1198 return scm_i_read_array (port
, chr
);
1212 return (scm_read_number_and_radix (chr
, port
));
1214 return (scm_read_extended_symbol (chr
, port
));
1216 return (scm_read_scsh_block_comment (chr
, port
));
1218 return (scm_read_commented_expression (chr
, port
));
1222 return (scm_read_syntax (chr
, port
));
1224 result
= scm_read_sharp_extension (chr
, port
);
1225 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1227 /* To remain compatible with 1.8 and earlier, the following
1228 characters have lower precedence than `read-hash-extend'
1233 return scm_read_r6rs_block_comment (chr
, port
);
1235 scm_i_input_error (FUNC_NAME
, port
, "Unknown # object: ~S",
1236 scm_list_1 (SCM_MAKE_CHAR (chr
)));
1243 return SCM_UNSPECIFIED
;
1248 scm_read_expression (SCM port
)
1249 #define FUNC_NAME "scm_read_expression"
1253 register scm_t_wchar chr
;
1255 chr
= scm_getc (port
);
1259 case SCM_WHITE_SPACES
:
1260 case SCM_LINE_INCREMENTORS
:
1263 (void) scm_read_semicolon_comment (chr
, port
);
1266 return (scm_read_sexp (chr
, port
));
1268 return (scm_read_string (chr
, port
));
1272 return (scm_read_quote (chr
, port
));
1276 result
= scm_read_sharp (chr
, port
);
1277 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1278 /* We read a comment or some such. */
1284 scm_i_input_error (FUNC_NAME
, port
, "unexpected \")\"", SCM_EOL
);
1289 if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE
), scm_keyword_prefix
))
1290 return scm_symbol_to_keyword (scm_read_expression (port
));
1295 if (((chr
>= '0') && (chr
<= '9'))
1296 || (strchr ("+-.", chr
)))
1297 return (scm_read_number (chr
, port
));
1299 return (scm_read_mixed_case_symbol (chr
, port
));
1307 /* Actual reader. */
1309 SCM_DEFINE (scm_read
, "read", 0, 1, 0,
1311 "Read an s-expression from the input port @var{port}, or from\n"
1312 "the current input port if @var{port} is not specified.\n"
1313 "Any whitespace before the next token is discarded.")
1314 #define FUNC_NAME s_scm_read
1318 if (SCM_UNBNDP (port
))
1319 port
= scm_current_input_port ();
1320 SCM_VALIDATE_OPINPORT (1, port
);
1322 c
= flush_ws (port
, (char *) NULL
);
1325 scm_ungetc (c
, port
);
1327 return (scm_read_expression (port
));
1334 /* Used when recording expressions constructed by `scm_read_sharp ()'. */
1336 recsexpr (SCM obj
, long line
, int column
, SCM filename
)
1338 if (!scm_is_pair(obj
)) {
1341 SCM tmp
= obj
, copy
;
1342 /* If this sexpr is visible in the read:sharp source, we want to
1343 keep that information, so only record non-constant cons cells
1344 which haven't previously been read by the reader. */
1345 if (scm_is_false (scm_whash_lookup (scm_source_whash
, obj
)))
1347 if (SCM_COPY_SOURCE_P
)
1349 copy
= scm_cons (recsexpr (SCM_CAR (obj
), line
, column
, filename
),
1351 while ((tmp
= SCM_CDR (tmp
)) && scm_is_pair (tmp
))
1353 SCM_SETCDR (copy
, scm_cons (recsexpr (SCM_CAR (tmp
),
1358 copy
= SCM_CDR (copy
);
1360 SCM_SETCDR (copy
, tmp
);
1364 recsexpr (SCM_CAR (obj
), line
, column
, filename
);
1365 while ((tmp
= SCM_CDR (tmp
)) && scm_is_pair (tmp
))
1366 recsexpr (SCM_CAR (tmp
), line
, column
, filename
);
1367 copy
= SCM_UNDEFINED
;
1369 scm_whash_insert (scm_source_whash
,
1371 scm_make_srcprops (line
,
1381 /* Manipulate the read-hash-procedures alist. This could be written in
1382 Scheme, but maybe it will also be used by C code during initialisation. */
1383 SCM_DEFINE (scm_read_hash_extend
, "read-hash-extend", 2, 0, 0,
1384 (SCM chr
, SCM proc
),
1385 "Install the procedure @var{proc} for reading expressions\n"
1386 "starting with the character sequence @code{#} and @var{chr}.\n"
1387 "@var{proc} will be called with two arguments: the character\n"
1388 "@var{chr} and the port to read further data from. The object\n"
1389 "returned will be the return value of @code{read}. \n"
1390 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1392 #define FUNC_NAME s_scm_read_hash_extend
1397 SCM_VALIDATE_CHAR (1, chr
);
1398 SCM_ASSERT (scm_is_false (proc
)
1399 || scm_is_eq (scm_procedure_p (proc
), SCM_BOOL_T
),
1400 proc
, SCM_ARG2
, FUNC_NAME
);
1402 /* Check if chr is already in the alist. */
1403 this = *scm_read_hash_procedures
;
1407 if (scm_is_null (this))
1409 /* not found, so add it to the beginning. */
1410 if (scm_is_true (proc
))
1412 *scm_read_hash_procedures
=
1413 scm_cons (scm_cons (chr
, proc
), *scm_read_hash_procedures
);
1417 if (scm_is_eq (chr
, SCM_CAAR (this)))
1419 /* already in the alist. */
1420 if (scm_is_false (proc
))
1423 if (scm_is_false (prev
))
1425 *scm_read_hash_procedures
=
1426 SCM_CDR (*scm_read_hash_procedures
);
1429 scm_set_cdr_x (prev
, SCM_CDR (this));
1434 scm_set_cdr_x (SCM_CAR (this), proc
);
1439 this = SCM_CDR (this);
1442 return SCM_UNSPECIFIED
;
1446 /* Recover the read-hash procedure corresponding to char c. */
1448 scm_get_hash_procedure (int c
)
1450 SCM rest
= *scm_read_hash_procedures
;
1454 if (scm_is_null (rest
))
1457 if (SCM_CHAR (SCM_CAAR (rest
)) == c
)
1458 return SCM_CDAR (rest
);
1460 rest
= SCM_CDR (rest
);
1464 #define SCM_ENCODING_SEARCH_SIZE (500)
1466 /* Search the first few hundred characters of a file for an Emacs-like coding
1467 declaration. Returns either NULL or a string whose storage has been
1468 allocated with `scm_gc_malloc ()'. */
1470 scm_i_scan_for_encoding (SCM port
)
1472 char header
[SCM_ENCODING_SEARCH_SIZE
+1];
1474 char *encoding
= NULL
;
1480 if (SCM_FPORTP (port
) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port
)))
1481 /* PORT is a non-seekable file port (e.g., as created by Bash when using
1482 "guile <(echo '(display "hello")')") so bail out. */
1485 bytes_read
= scm_c_read (port
, header
, SCM_ENCODING_SEARCH_SIZE
);
1487 scm_seek (port
, scm_from_int (0), scm_from_int (SEEK_SET
));
1490 && header
[0] == '\xef' && header
[1] == '\xbb' && header
[2] == '\xbf')
1493 /* search past "coding[:=]" */
1497 if ((pos
= strstr(pos
, "coding")) == NULL
)
1500 pos
+= strlen("coding");
1501 if (pos
- header
>= SCM_ENCODING_SEARCH_SIZE
||
1502 (*pos
== ':' || *pos
== '='))
1510 while (pos
- header
<= SCM_ENCODING_SEARCH_SIZE
&&
1511 (*pos
== ' ' || *pos
== '\t'))
1514 /* grab the next token */
1516 while (pos
+ i
- header
<= SCM_ENCODING_SEARCH_SIZE
1517 && pos
+ i
- header
< bytes_read
1518 && (isalnum((int) pos
[i
]) || pos
[i
] == '_' || pos
[i
] == '-'
1525 encoding
= scm_gc_strndup (pos
, i
, "encoding");
1526 for (i
= 0; i
< strlen (encoding
); i
++)
1527 encoding
[i
] = toupper ((int) encoding
[i
]);
1529 /* push backwards to make sure we were in a comment */
1531 while (pos
- i
- header
> 0)
1533 if (*(pos
- i
) == '\n')
1535 /* This wasn't in a semicolon comment. Check for a
1536 hash-bang comment. */
1537 char *beg
= strstr (header
, "#!");
1538 char *end
= strstr (header
, "!#");
1539 if (beg
< pos
&& pos
< end
)
1543 if (*(pos
- i
) == ';')
1551 /* This wasn't in a comment */
1554 if (utf8_bom
&& strcmp(encoding
, "UTF-8"))
1555 scm_misc_error (NULL
,
1556 "the port input declares the encoding ~s but is encoded as UTF-8",
1557 scm_list_1 (scm_from_locale_string (encoding
)));
1562 SCM_DEFINE (scm_file_encoding
, "file-encoding", 1, 0, 0,
1564 "Scans the port for an Emacs-like character coding declaration\n"
1565 "near the top of the contents of a port with random-acessible contents.\n"
1566 "The coding declaration is of the form\n"
1567 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1569 "Returns a string containing the character encoding of the file\n"
1570 "if a declaration was found, or @code{#f} otherwise.\n")
1571 #define FUNC_NAME s_scm_file_encoding
1576 enc
= scm_i_scan_for_encoding (port
);
1581 s_enc
= scm_from_locale_string (enc
);
1592 scm_read_hash_procedures
=
1593 SCM_VARIABLE_LOC (scm_c_define ("read-hash-procedures", SCM_EOL
));
1595 scm_init_opts (scm_read_options
, scm_read_opts
);
1596 #include "libguile/read.x"