1 /* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
34 #include "libguile/_scm.h"
35 #include "libguile/bytevectors.h"
36 #include "libguile/chars.h"
37 #include "libguile/eval.h"
38 #include "libguile/arrays.h"
39 #include "libguile/bitvectors.h"
40 #include "libguile/keywords.h"
41 #include "libguile/alist.h"
42 #include "libguile/srcprop.h"
43 #include "libguile/hashtab.h"
44 #include "libguile/hash.h"
45 #include "libguile/ports.h"
46 #include "libguile/fports.h"
47 #include "libguile/root.h"
48 #include "libguile/strings.h"
49 #include "libguile/strports.h"
50 #include "libguile/vectors.h"
51 #include "libguile/validate.h"
52 #include "libguile/srfi-4.h"
53 #include "libguile/srfi-13.h"
55 #include "libguile/read.h"
56 #include "libguile/private-options.h"
61 SCM_GLOBAL_SYMBOL (scm_sym_dot
, ".");
62 SCM_SYMBOL (scm_keyword_prefix
, "prefix");
63 SCM_SYMBOL (scm_keyword_postfix
, "postfix");
64 SCM_SYMBOL (sym_nil
, "nil");
66 scm_t_option scm_read_opts
[] = {
67 { SCM_OPTION_BOOLEAN
, "copy", 0,
68 "Copy source code expressions." },
69 { SCM_OPTION_BOOLEAN
, "positions", 1,
70 "Record positions of source code expressions." },
71 { SCM_OPTION_BOOLEAN
, "case-insensitive", 0,
72 "Convert symbols to lower case."},
73 { SCM_OPTION_SCM
, "keywords", (scm_t_bits
) SCM_BOOL_F_BITS
,
74 "Style of keyword recognition: #f, 'prefix or 'postfix."},
75 { SCM_OPTION_BOOLEAN
, "r6rs-hex-escapes", 0,
76 "Use R6RS variable-length character and string hex escapes."},
77 { SCM_OPTION_BOOLEAN
, "square-brackets", 1,
78 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
79 { SCM_OPTION_BOOLEAN
, "hungry-eol-escapes", 0,
80 "In strings, consume leading whitespace after an escaped end-of-line."},
85 Give meaningful error messages for errors
89 FILE:LINE:COL: MESSAGE
92 This is not standard GNU format, but the test-suite likes the real
93 message to be in front.
99 scm_i_input_error (char const *function
,
100 SCM port
, const char *message
, SCM arg
)
102 SCM fn
= (scm_is_string (SCM_FILENAME(port
))
104 : scm_from_locale_string ("#<unknown port>"));
106 SCM string_port
= scm_open_output_string ();
107 SCM string
= SCM_EOL
;
108 scm_simple_format (string_port
,
109 scm_from_locale_string ("~A:~S:~S: ~A"),
111 scm_from_long (SCM_LINUM (port
) + 1),
112 scm_from_int (SCM_COL (port
) + 1),
113 scm_from_locale_string (message
)));
115 string
= scm_get_output_string (string_port
);
116 scm_close_output_port (string_port
);
117 scm_error_scm (scm_from_latin1_symbol ("read-error"),
118 function
? scm_from_locale_string (function
) : SCM_BOOL_F
,
125 SCM_DEFINE (scm_read_options
, "read-options-interface", 0, 1, 0,
127 "Option interface for the read options. Instead of using\n"
128 "this procedure directly, use the procedures @code{read-enable},\n"
129 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
130 #define FUNC_NAME s_scm_read_options
132 SCM ans
= scm_options (setting
,
135 if (SCM_COPY_SOURCE_P
)
136 SCM_RECORD_POSITIONS_P
= 1;
141 /* A fluid referring to an association list mapping extra hash
142 characters to procedures. */
143 static SCM
*scm_i_read_hash_procedures
;
146 scm_i_read_hash_procedures_ref (void)
148 return scm_fluid_ref (*scm_i_read_hash_procedures
);
152 scm_i_read_hash_procedures_set_x (SCM value
)
154 scm_fluid_set_x (*scm_i_read_hash_procedures
, value
);
161 /* Size of the C buffer used to read symbols and numbers. */
162 #define READER_BUFFER_SIZE 128
164 /* Size of the C buffer used to read strings. */
165 #define READER_STRING_BUFFER_SIZE 512
167 /* The maximum size of Scheme character names. */
168 #define READER_CHAR_NAME_MAX_SIZE 50
171 /* `isblank' is only in C99. */
172 #define CHAR_IS_BLANK_(_chr) \
173 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
174 || ((_chr) == '\f') || ((_chr) == '\r'))
177 # define CHAR_IS_BLANK(_chr) \
178 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
180 # define CHAR_IS_BLANK CHAR_IS_BLANK_
184 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
186 #define CHAR_IS_R5RS_DELIMITER(c) \
188 || (c == ')') || (c == '(') || (c == ';') || (c == '"') \
189 || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
191 #define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
193 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
195 #define CHAR_IS_EXPONENT_MARKER(_chr) \
196 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
197 || ((_chr) == 'd') || ((_chr) == 'l'))
199 /* Read an SCSH block comment. */
200 static inline SCM
scm_read_scsh_block_comment (scm_t_wchar
, SCM
);
201 static SCM
scm_read_r6rs_block_comment (scm_t_wchar
, SCM
);
202 static SCM
scm_read_commented_expression (scm_t_wchar
, SCM
);
203 static SCM
scm_read_shebang (scm_t_wchar
, SCM
);
204 static SCM
scm_get_hash_procedure (int);
206 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
207 result in the pre-allocated buffer BUF. Return zero if the whole token has
208 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
209 bytes actually read. */
211 read_token (SCM port
, char *buf
, const size_t buf_size
, size_t *read
)
215 while (*read
< buf_size
)
219 chr
= scm_get_byte_or_eof (port
);
223 else if (CHAR_IS_DELIMITER (chr
))
225 scm_unget_byte (chr
, port
);
238 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
239 result in the pre-allocated buffer BUFFER, if the whole token has fewer than
240 BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
241 caller. Return zero if the token fits in BUFFER, non-zero otherwise. READ
242 will be set the number of bytes actually read. */
244 read_complete_token (SCM port
, char *buffer
, const size_t buffer_size
,
245 char **overflow_buffer
, size_t *read
)
248 size_t bytes_read
, overflow_size
;
250 *overflow_buffer
= NULL
;
255 overflow
= read_token (port
, buffer
, buffer_size
, &bytes_read
);
258 if (overflow
|| overflow_size
!= 0)
260 if (overflow_size
== 0)
262 *overflow_buffer
= scm_malloc (bytes_read
);
263 memcpy (*overflow_buffer
, buffer
, bytes_read
);
264 overflow_size
= bytes_read
;
268 *overflow_buffer
= scm_realloc (*overflow_buffer
, overflow_size
+ bytes_read
);
269 memcpy (*overflow_buffer
+ overflow_size
, buffer
, bytes_read
);
270 overflow_size
+= bytes_read
;
277 *read
= overflow_size
;
281 return (overflow_size
!= 0);
284 /* Skip whitespace from PORT and return the first non-whitespace character
285 read. Raise an error on end-of-file. */
287 flush_ws (SCM port
, const char *eoferr
)
289 register scm_t_wchar c
;
291 switch (c
= scm_getc (port
))
297 scm_i_input_error (eoferr
,
306 switch (c
= scm_getc (port
))
312 case SCM_LINE_INCREMENTORS
:
318 switch (c
= scm_getc (port
))
321 eoferr
= "read_sharp";
324 scm_read_shebang (c
, port
);
327 scm_read_commented_expression (c
, port
);
330 if (scm_is_false (scm_get_hash_procedure (c
)))
332 scm_read_r6rs_block_comment (c
, port
);
337 scm_ungetc (c
, port
);
342 case SCM_LINE_INCREMENTORS
:
343 case SCM_SINGLE_SPACES
:
358 static SCM
scm_read_expression (SCM port
);
359 static SCM
scm_read_sharp (int chr
, SCM port
);
363 scm_read_sexp (scm_t_wchar chr
, SCM port
)
364 #define FUNC_NAME "scm_i_lreadparen"
367 SCM tmp
, tl
, ans
= SCM_EOL
;
368 const int terminating_char
= ((chr
== '[') ? ']' : ')');
370 /* Need to capture line and column numbers here. */
371 long line
= SCM_LINUM (port
);
372 int column
= SCM_COL (port
) - 1;
374 c
= flush_ws (port
, FUNC_NAME
);
375 if (terminating_char
== c
)
378 scm_ungetc (c
, port
);
379 if (scm_is_eq (scm_sym_dot
,
380 (tmp
= scm_read_expression (port
))))
382 ans
= scm_read_expression (port
);
383 if (terminating_char
!= (c
= flush_ws (port
, FUNC_NAME
)))
384 scm_i_input_error (FUNC_NAME
, port
, "missing close paren",
389 /* Build the head of the list structure. */
390 ans
= tl
= scm_cons (tmp
, SCM_EOL
);
392 while (terminating_char
!= (c
= flush_ws (port
, FUNC_NAME
)))
396 if (c
== ')' || (SCM_SQUARE_BRACKETS_P
&& c
== ']'))
397 scm_i_input_error (FUNC_NAME
, port
,
398 "in pair: mismatched close paren: ~A",
399 scm_list_1 (SCM_MAKE_CHAR (c
)));
401 scm_ungetc (c
, port
);
402 tmp
= scm_read_expression (port
);
404 if (scm_is_eq (scm_sym_dot
, tmp
))
406 SCM_SETCDR (tl
, tmp
= scm_read_expression (port
));
408 c
= flush_ws (port
, FUNC_NAME
);
409 if (terminating_char
!= c
)
410 scm_i_input_error (FUNC_NAME
, port
,
411 "in pair: missing close paren", SCM_EOL
);
415 new_tail
= scm_cons (tmp
, SCM_EOL
);
416 SCM_SETCDR (tl
, new_tail
);
421 if (SCM_RECORD_POSITIONS_P
)
422 scm_i_set_source_properties_x (ans
, line
, column
, SCM_FILENAME (port
));
429 /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
430 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
432 #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
438 while (i < ndigits) \
440 a = scm_getc (port); \
444 && (a == (scm_t_wchar) terminator) \
447 if ('0' <= a && a <= '9') \
449 else if ('A' <= a && a <= 'F') \
451 else if ('a' <= a && a <= 'f') \
464 skip_intraline_whitespace (SCM port
)
474 while (c
== '\t' || uc_is_general_category (c
, UC_SPACE_SEPARATOR
));
476 scm_ungetc (c
, port
);
480 scm_read_string (int chr
, SCM port
)
481 #define FUNC_NAME "scm_lreadr"
483 /* For strings smaller than C_STR, this function creates only one Scheme
484 object (the string returned). */
486 SCM str
= SCM_BOOL_F
;
487 unsigned c_str_len
= 0;
490 str
= scm_i_make_string (READER_STRING_BUFFER_SIZE
, NULL
, 0);
491 while ('"' != (c
= scm_getc (port
)))
496 scm_i_input_error (FUNC_NAME
, port
,
497 "end of file in string constant", SCM_EOL
);
500 if (c_str_len
+ 1 >= scm_i_string_length (str
))
502 SCM addy
= scm_i_make_string (READER_STRING_BUFFER_SIZE
, NULL
, 0);
504 str
= scm_string_append (scm_list_2 (str
, addy
));
509 switch (c
= scm_getc (port
))
517 if (SCM_HUNGRY_EOL_ESCAPES_P
)
518 skip_intraline_whitespace (port
);
545 if (SCM_R6RS_ESCAPES_P
)
546 SCM_READ_HEX_ESCAPE (10, ';');
548 SCM_READ_HEX_ESCAPE (2, '\0');
551 if (!SCM_R6RS_ESCAPES_P
)
553 SCM_READ_HEX_ESCAPE (4, '\0');
557 if (!SCM_R6RS_ESCAPES_P
)
559 SCM_READ_HEX_ESCAPE (6, '\0');
564 scm_i_input_error (FUNC_NAME
, port
,
565 "illegal character in escape sequence: ~S",
566 scm_list_1 (SCM_MAKE_CHAR (c
)));
569 str
= scm_i_string_start_writing (str
);
570 scm_i_string_set_x (str
, c_str_len
++, c
);
571 scm_i_string_stop_writing ();
576 return scm_i_substring_copy (str
, 0, c_str_len
);
585 scm_read_number (scm_t_wchar chr
, SCM port
)
587 SCM result
, str
= SCM_EOL
;
588 char buffer
[READER_BUFFER_SIZE
];
589 char *overflow_buffer
= NULL
;
592 scm_t_port
*pt
= SCM_PTAB_ENTRY (port
);
594 scm_ungetc (chr
, port
);
595 overflow
= read_complete_token (port
, buffer
, sizeof (buffer
),
596 &overflow_buffer
, &bytes_read
);
599 str
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
, pt
->ilseq_handler
);
601 str
= scm_from_stringn (overflow_buffer
, bytes_read
, pt
->encoding
,
604 result
= scm_string_to_number (str
, SCM_UNDEFINED
);
605 if (!scm_is_true (result
))
607 /* Return a symbol instead of a number */
608 if (SCM_CASE_INSENSITIVE_P
)
609 str
= scm_string_downcase_x (str
);
610 result
= scm_string_to_symbol (str
);
614 free (overflow_buffer
);
615 SCM_COL (port
) += scm_i_string_length (str
);
620 scm_read_mixed_case_symbol (scm_t_wchar chr
, SCM port
)
623 int ends_with_colon
= 0;
625 int postfix
= scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE
), scm_keyword_postfix
);
627 char buffer
[READER_BUFFER_SIZE
], *overflow_buffer
;
628 scm_t_port
*pt
= SCM_PTAB_ENTRY (port
);
631 scm_ungetc (chr
, port
);
632 overflow
= read_complete_token (port
, buffer
, READER_BUFFER_SIZE
,
633 &overflow_buffer
, &bytes_read
);
637 ends_with_colon
= buffer
[bytes_read
- 1] == ':';
639 ends_with_colon
= overflow_buffer
[bytes_read
- 1] == ':';
642 if (postfix
&& ends_with_colon
&& (bytes_read
> 1))
645 str
= scm_from_stringn (buffer
, bytes_read
- 1, pt
->encoding
, pt
->ilseq_handler
);
647 str
= scm_from_stringn (overflow_buffer
, bytes_read
- 1, pt
->encoding
,
650 if (SCM_CASE_INSENSITIVE_P
)
651 str
= scm_string_downcase_x (str
);
652 result
= scm_symbol_to_keyword (scm_string_to_symbol (str
));
657 str
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
, pt
->ilseq_handler
);
659 str
= scm_from_stringn (overflow_buffer
, bytes_read
, pt
->encoding
,
662 if (SCM_CASE_INSENSITIVE_P
)
663 str
= scm_string_downcase_x (str
);
664 result
= scm_string_to_symbol (str
);
668 free (overflow_buffer
);
669 SCM_COL (port
) += scm_i_string_length (str
);
674 scm_read_number_and_radix (scm_t_wchar chr
, SCM port
)
675 #define FUNC_NAME "scm_lreadr"
679 char buffer
[READER_BUFFER_SIZE
], *overflow_buffer
;
708 scm_ungetc (chr
, port
);
709 scm_ungetc ('#', port
);
713 overflow
= read_complete_token (port
, buffer
, sizeof (buffer
),
714 &overflow_buffer
, &read
);
716 pt
= SCM_PTAB_ENTRY (port
);
718 str
= scm_from_stringn (buffer
, read
, pt
->encoding
, pt
->ilseq_handler
);
720 str
= scm_from_stringn (overflow_buffer
, read
, pt
->encoding
,
723 result
= scm_string_to_number (str
, scm_from_uint (radix
));
726 free (overflow_buffer
);
728 SCM_COL (port
) += scm_i_string_length (str
);
730 if (scm_is_true (result
))
733 scm_i_input_error (FUNC_NAME
, port
, "unknown # object", SCM_EOL
);
740 scm_read_quote (int chr
, SCM port
)
743 long line
= SCM_LINUM (port
);
744 int column
= SCM_COL (port
) - 1;
749 p
= scm_sym_quasiquote
;
762 p
= scm_sym_uq_splicing
;
765 scm_ungetc (c
, port
);
772 fprintf (stderr
, "%s: unhandled quote character (%i)\n",
773 "scm_read_quote", chr
);
777 p
= scm_cons2 (p
, scm_read_expression (port
), SCM_EOL
);
778 if (SCM_RECORD_POSITIONS_P
)
779 scm_i_set_source_properties_x (p
, line
, column
, SCM_FILENAME (port
));
784 SCM_SYMBOL (sym_syntax
, "syntax");
785 SCM_SYMBOL (sym_quasisyntax
, "quasisyntax");
786 SCM_SYMBOL (sym_unsyntax
, "unsyntax");
787 SCM_SYMBOL (sym_unsyntax_splicing
, "unsyntax-splicing");
790 scm_read_syntax (int chr
, SCM port
)
793 long line
= SCM_LINUM (port
);
794 int column
= SCM_COL (port
) - 1;
812 p
= sym_unsyntax_splicing
;
815 scm_ungetc (c
, port
);
822 fprintf (stderr
, "%s: unhandled syntax character (%i)\n",
823 "scm_read_syntax", chr
);
827 p
= scm_cons2 (p
, scm_read_expression (port
), SCM_EOL
);
828 if (SCM_RECORD_POSITIONS_P
)
829 scm_i_set_source_properties_x (p
, line
, column
, SCM_FILENAME (port
));
835 scm_read_nil (int chr
, SCM port
)
837 SCM id
= scm_read_mixed_case_symbol (chr
, port
);
839 if (!scm_is_eq (id
, sym_nil
))
840 scm_i_input_error ("scm_read_nil", port
,
841 "unexpected input while reading #nil: ~a",
844 return SCM_ELISP_NIL
;
848 scm_read_semicolon_comment (int chr
, SCM port
)
852 /* We use the get_byte here because there is no need to get the
853 locale correct with comment input. This presumes that newline
854 always represents itself no matter what the encoding is. */
855 for (c
= scm_get_byte_or_eof (port
);
856 (c
!= EOF
) && (c
!= '\n');
857 c
= scm_get_byte_or_eof (port
));
859 return SCM_UNSPECIFIED
;
863 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
866 scm_read_boolean (int chr
, SCM port
)
879 return SCM_UNSPECIFIED
;
883 scm_read_character (scm_t_wchar chr
, SCM port
)
884 #define FUNC_NAME "scm_lreadr"
886 char buffer
[READER_CHAR_NAME_MAX_SIZE
];
888 size_t charname_len
, bytes_read
;
893 overflow
= read_token (port
, buffer
, READER_CHAR_NAME_MAX_SIZE
, &bytes_read
);
895 scm_i_input_error (FUNC_NAME
, port
, "character name too long", SCM_EOL
);
899 chr
= scm_getc (port
);
901 scm_i_input_error (FUNC_NAME
, port
, "unexpected end of file "
902 "while reading character", SCM_EOL
);
904 /* CHR must be a token delimiter, like a whitespace. */
905 return (SCM_MAKE_CHAR (chr
));
908 pt
= SCM_PTAB_ENTRY (port
);
910 /* Simple ASCII characters can be processed immediately. Also, simple
911 ISO-8859-1 characters can be processed immediately if the encoding for this
912 port is ISO-8859-1. */
913 if (bytes_read
== 1 && ((unsigned char) buffer
[0] <= 127 || pt
->encoding
== NULL
))
916 return SCM_MAKE_CHAR (buffer
[0]);
919 /* Otherwise, convert the buffer into a proper scheme string for
921 charname
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
,
923 charname_len
= scm_i_string_length (charname
);
924 SCM_COL (port
) += charname_len
;
925 cp
= scm_i_string_ref (charname
, 0);
926 if (charname_len
== 1)
927 return SCM_MAKE_CHAR (cp
);
929 /* Ignore dotted circles, which may be used to keep combining characters from
930 combining with the backslash in #\charname. */
931 if (cp
== SCM_CODEPOINT_DOTTED_CIRCLE
&& charname_len
== 2)
932 return SCM_MAKE_CHAR (scm_i_string_ref (charname
, 1));
934 if (cp
>= '0' && cp
< '8')
936 /* Dirk:FIXME:: This type of character syntax is not R5RS
937 * compliant. Further, it should be verified that the constant
938 * does only consist of octal digits. */
939 SCM p
= scm_string_to_number (charname
, scm_from_uint (8));
942 scm_t_wchar c
= scm_to_uint32 (p
);
943 if (SCM_IS_UNICODE_CHAR (c
))
944 return SCM_MAKE_CHAR (c
);
946 scm_i_input_error (FUNC_NAME
, port
,
947 "out-of-range octal character escape: ~a",
948 scm_list_1 (charname
));
952 if (cp
== 'x' && (charname_len
> 1))
956 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
957 p
= scm_string_to_number (scm_c_substring (charname
, 1, charname_len
),
961 scm_t_wchar c
= scm_to_uint32 (p
);
962 if (SCM_IS_UNICODE_CHAR (c
))
963 return SCM_MAKE_CHAR (c
);
965 scm_i_input_error (FUNC_NAME
, port
,
966 "out-of-range hex character escape: ~a",
967 scm_list_1 (charname
));
971 /* The names of characters should never have non-Latin1
973 if (scm_i_is_narrow_string (charname
)
974 || scm_i_try_narrow_string (charname
))
975 { SCM ch
= scm_i_charname_to_char (scm_i_string_chars (charname
),
977 if (scm_is_true (ch
))
981 scm_i_input_error (FUNC_NAME
, port
, "unknown character name ~a",
982 scm_list_1 (charname
));
984 return SCM_UNSPECIFIED
;
989 scm_read_keyword (int chr
, SCM port
)
993 /* Read the symbol that comprises the keyword. Doing this instead of
994 invoking a specific symbol reader function allows `scm_read_keyword ()'
995 to adapt to the delimiters currently valid of symbols.
997 XXX: This implementation allows sloppy syntaxes like `#: key'. */
998 symbol
= scm_read_expression (port
);
999 if (!scm_is_symbol (symbol
))
1000 scm_i_input_error ("scm_read_keyword", port
,
1001 "keyword prefix `~a' not followed by a symbol: ~s",
1002 scm_list_2 (SCM_MAKE_CHAR (chr
), symbol
));
1004 return (scm_symbol_to_keyword (symbol
));
1008 scm_read_vector (int chr
, SCM port
)
1010 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1011 guarantee that it's going to do what we want. After all, this is an
1012 implementation detail of `scm_read_vector ()', not a desirable
1014 return (scm_vector (scm_read_sexp (chr
, port
)));
1018 scm_read_srfi4_vector (int chr
, SCM port
)
1020 return scm_i_read_array (port
, chr
);
1024 scm_read_bytevector (scm_t_wchar chr
, SCM port
)
1026 chr
= scm_getc (port
);
1030 chr
= scm_getc (port
);
1034 chr
= scm_getc (port
);
1038 return scm_u8_list_to_bytevector (scm_read_sexp (chr
, port
));
1041 scm_i_input_error ("read_bytevector", port
,
1042 "invalid bytevector prefix",
1043 SCM_MAKE_CHAR (chr
));
1044 return SCM_UNSPECIFIED
;
1048 scm_read_guile_bit_vector (scm_t_wchar chr
, SCM port
)
1050 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1051 terribly inefficient but who cares? */
1052 SCM s_bits
= SCM_EOL
;
1054 for (chr
= scm_getc (port
);
1055 (chr
!= EOF
) && ((chr
== '0') || (chr
== '1'));
1056 chr
= scm_getc (port
))
1058 s_bits
= scm_cons ((chr
== '0') ? SCM_BOOL_F
: SCM_BOOL_T
, s_bits
);
1062 scm_ungetc (chr
, port
);
1064 return scm_bitvector (scm_reverse_x (s_bits
, SCM_EOL
));
1068 scm_read_scsh_block_comment (scm_t_wchar chr
, SCM port
)
1074 int c
= scm_getc (port
);
1077 scm_i_input_error ("skip_block_comment", port
,
1078 "unterminated `#! ... !#' comment", SCM_EOL
);
1082 else if (c
== '#' && bang_seen
)
1088 return SCM_UNSPECIFIED
;
1092 scm_read_shebang (scm_t_wchar chr
, SCM port
)
1095 if ((c
= scm_get_byte_or_eof (port
)) != 'r')
1097 scm_ungetc (c
, port
);
1098 return scm_read_scsh_block_comment (chr
, port
);
1100 if ((c
= scm_get_byte_or_eof (port
)) != '6')
1102 scm_ungetc (c
, port
);
1103 scm_ungetc ('r', port
);
1104 return scm_read_scsh_block_comment (chr
, port
);
1106 if ((c
= scm_get_byte_or_eof (port
)) != 'r')
1108 scm_ungetc (c
, port
);
1109 scm_ungetc ('6', port
);
1110 scm_ungetc ('r', port
);
1111 return scm_read_scsh_block_comment (chr
, port
);
1113 if ((c
= scm_get_byte_or_eof (port
)) != 's')
1115 scm_ungetc (c
, port
);
1116 scm_ungetc ('r', port
);
1117 scm_ungetc ('6', port
);
1118 scm_ungetc ('r', port
);
1119 return scm_read_scsh_block_comment (chr
, port
);
1122 return SCM_UNSPECIFIED
;
1126 scm_read_r6rs_block_comment (scm_t_wchar chr
, SCM port
)
1128 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1129 nested. So care must be taken. */
1130 int nesting_level
= 1;
1131 int opening_seen
= 0, closing_seen
= 0;
1133 while (nesting_level
> 0)
1135 int c
= scm_getc (port
);
1138 scm_i_input_error ("scm_read_r6rs_block_comment", port
,
1139 "unterminated `#| ... |#' comment", SCM_EOL
);
1147 else if (closing_seen
)
1158 opening_seen
= closing_seen
= 0;
1161 return SCM_UNSPECIFIED
;
1165 scm_read_commented_expression (scm_t_wchar chr
, SCM port
)
1169 c
= flush_ws (port
, (char *) NULL
);
1171 scm_i_input_error ("read_commented_expression", port
,
1172 "no expression after #; comment", SCM_EOL
);
1173 scm_ungetc (c
, port
);
1174 scm_read_expression (port
);
1175 return SCM_UNSPECIFIED
;
1179 scm_read_extended_symbol (scm_t_wchar chr
, SCM port
)
1181 /* Guile's extended symbol read syntax looks like this:
1183 #{This is all a symbol name}#
1185 So here, CHR is expected to be `{'. */
1188 SCM buf
= scm_i_make_string (1024, NULL
, 0);
1190 buf
= scm_i_string_start_writing (buf
);
1192 while ((chr
= scm_getc (port
)) != EOF
)
1203 scm_i_string_set_x (buf
, len
++, '}');
1209 else if (chr
== '\\')
1211 /* It used to be that print.c would print extended-read-syntax
1212 symbols with backslashes before "non-standard" chars, but
1213 this routine wouldn't do anything with those escapes.
1214 Bummer. What we've done is to change print.c to output
1215 R6RS hex escapes for those characters, relying on the fact
1216 that the extended read syntax would never put a `\' before
1217 an `x'. For now, we just ignore other instances of
1218 backslash in the string. */
1219 switch ((chr
= scm_getc (port
)))
1227 SCM_READ_HEX_ESCAPE (10, ';');
1228 scm_i_string_set_x (buf
, len
++, c
);
1236 scm_i_string_stop_writing ();
1237 scm_i_input_error ("scm_read_extended_symbol", port
,
1238 "illegal character in escape sequence: ~S",
1239 scm_list_1 (SCM_MAKE_CHAR (c
)));
1243 scm_i_string_set_x (buf
, len
++, chr
);
1248 scm_i_string_set_x (buf
, len
++, chr
);
1250 if (len
>= scm_i_string_length (buf
) - 2)
1254 scm_i_string_stop_writing ();
1255 addy
= scm_i_make_string (1024, NULL
, 0);
1256 buf
= scm_string_append (scm_list_2 (buf
, addy
));
1258 buf
= scm_i_string_start_writing (buf
);
1263 scm_i_string_stop_writing ();
1265 scm_i_input_error ("scm_read_extended_symbol", port
,
1266 "end of file while reading symbol", SCM_EOL
);
1268 return (scm_string_to_symbol (scm_c_substring (buf
, 0, len
)));
1273 /* Top-level token readers, i.e., dispatchers. */
1276 scm_read_sharp_extension (int chr
, SCM port
)
1280 proc
= scm_get_hash_procedure (chr
);
1281 if (scm_is_true (scm_procedure_p (proc
)))
1283 long line
= SCM_LINUM (port
);
1284 int column
= SCM_COL (port
) - 2;
1287 got
= scm_call_2 (proc
, SCM_MAKE_CHAR (chr
), port
);
1289 if (scm_is_pair (got
) && !scm_i_has_source_properties (got
))
1290 scm_i_set_source_properties_x (got
, line
, column
, SCM_FILENAME (port
));
1295 return SCM_UNSPECIFIED
;
1298 /* The reader for the sharp `#' character. It basically dispatches reads
1299 among the above token readers. */
1301 scm_read_sharp (scm_t_wchar chr
, SCM port
)
1302 #define FUNC_NAME "scm_lreadr"
1306 chr
= scm_getc (port
);
1308 result
= scm_read_sharp_extension (chr
, port
);
1309 if (!scm_is_eq (result
, SCM_UNSPECIFIED
))
1315 return (scm_read_character (chr
, port
));
1317 return (scm_read_vector (chr
, port
));
1322 /* This one may return either a boolean or an SRFI-4 vector. */
1323 return (scm_read_srfi4_vector (chr
, port
));
1325 return (scm_read_bytevector (chr
, port
));
1327 return (scm_read_guile_bit_vector (chr
, port
));
1331 /* This one may return either a boolean or an SRFI-4 vector. */
1332 return (scm_read_boolean (chr
, port
));
1334 return (scm_read_keyword (chr
, port
));
1335 case '0': case '1': case '2': case '3': case '4':
1336 case '5': case '6': case '7': case '8': case '9':
1338 #if SCM_ENABLE_DEPRECATED
1339 /* See below for 'i' and 'e'. */
1345 return (scm_i_read_array (port
, chr
));
1349 #if SCM_ENABLE_DEPRECATED
1351 /* When next char is '(', it really is an old-style
1353 scm_t_wchar next_c
= scm_getc (port
);
1355 scm_ungetc (next_c
, port
);
1357 return scm_i_read_array (port
, chr
);
1371 return (scm_read_number_and_radix (chr
, port
));
1373 return (scm_read_extended_symbol (chr
, port
));
1375 return (scm_read_shebang (chr
, port
));
1377 return (scm_read_commented_expression (chr
, port
));
1381 return (scm_read_syntax (chr
, port
));
1383 return (scm_read_nil (chr
, port
));
1385 result
= scm_read_sharp_extension (chr
, port
);
1386 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1388 /* To remain compatible with 1.8 and earlier, the following
1389 characters have lower precedence than `read-hash-extend'
1394 return scm_read_r6rs_block_comment (chr
, port
);
1396 scm_i_input_error (FUNC_NAME
, port
, "Unknown # object: ~S",
1397 scm_list_1 (SCM_MAKE_CHAR (chr
)));
1404 return SCM_UNSPECIFIED
;
1409 scm_read_expression (SCM port
)
1410 #define FUNC_NAME "scm_read_expression"
1414 register scm_t_wchar chr
;
1416 chr
= scm_getc (port
);
1420 case SCM_WHITE_SPACES
:
1421 case SCM_LINE_INCREMENTORS
:
1424 (void) scm_read_semicolon_comment (chr
, port
);
1427 if (!SCM_SQUARE_BRACKETS_P
)
1428 return (scm_read_mixed_case_symbol (chr
, port
));
1429 /* otherwise fall through */
1431 return (scm_read_sexp (chr
, port
));
1433 return (scm_read_string (chr
, port
));
1437 return (scm_read_quote (chr
, port
));
1441 result
= scm_read_sharp (chr
, port
);
1442 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1443 /* We read a comment or some such. */
1449 scm_i_input_error (FUNC_NAME
, port
, "unexpected \")\"", SCM_EOL
);
1452 if (SCM_SQUARE_BRACKETS_P
)
1453 scm_i_input_error (FUNC_NAME
, port
, "unexpected \"]\"", SCM_EOL
);
1454 /* otherwise fall through */
1458 if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE
), scm_keyword_prefix
))
1459 return scm_symbol_to_keyword (scm_read_expression (port
));
1464 if (((chr
>= '0') && (chr
<= '9'))
1465 || (strchr ("+-.", chr
)))
1466 return (scm_read_number (chr
, port
));
1468 return (scm_read_mixed_case_symbol (chr
, port
));
1476 /* Actual reader. */
1478 SCM_DEFINE (scm_read
, "read", 0, 1, 0,
1480 "Read an s-expression from the input port @var{port}, or from\n"
1481 "the current input port if @var{port} is not specified.\n"
1482 "Any whitespace before the next token is discarded.")
1483 #define FUNC_NAME s_scm_read
1487 if (SCM_UNBNDP (port
))
1488 port
= scm_current_input_port ();
1489 SCM_VALIDATE_OPINPORT (1, port
);
1491 c
= flush_ws (port
, (char *) NULL
);
1494 scm_ungetc (c
, port
);
1496 return (scm_read_expression (port
));
1503 /* Manipulate the read-hash-procedures alist. This could be written in
1504 Scheme, but maybe it will also be used by C code during initialisation. */
1505 SCM_DEFINE (scm_read_hash_extend
, "read-hash-extend", 2, 0, 0,
1506 (SCM chr
, SCM proc
),
1507 "Install the procedure @var{proc} for reading expressions\n"
1508 "starting with the character sequence @code{#} and @var{chr}.\n"
1509 "@var{proc} will be called with two arguments: the character\n"
1510 "@var{chr} and the port to read further data from. The object\n"
1511 "returned will be the return value of @code{read}. \n"
1512 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1514 #define FUNC_NAME s_scm_read_hash_extend
1519 SCM_VALIDATE_CHAR (1, chr
);
1520 SCM_ASSERT (scm_is_false (proc
)
1521 || scm_is_eq (scm_procedure_p (proc
), SCM_BOOL_T
),
1522 proc
, SCM_ARG2
, FUNC_NAME
);
1524 /* Check if chr is already in the alist. */
1525 this = scm_i_read_hash_procedures_ref ();
1529 if (scm_is_null (this))
1531 /* not found, so add it to the beginning. */
1532 if (scm_is_true (proc
))
1534 SCM
new = scm_cons (scm_cons (chr
, proc
),
1535 scm_i_read_hash_procedures_ref ());
1536 scm_i_read_hash_procedures_set_x (new);
1540 if (scm_is_eq (chr
, SCM_CAAR (this)))
1542 /* already in the alist. */
1543 if (scm_is_false (proc
))
1546 if (scm_is_false (prev
))
1548 SCM rest
= SCM_CDR (scm_i_read_hash_procedures_ref ());
1549 scm_i_read_hash_procedures_set_x (rest
);
1552 scm_set_cdr_x (prev
, SCM_CDR (this));
1557 scm_set_cdr_x (SCM_CAR (this), proc
);
1562 this = SCM_CDR (this);
1565 return SCM_UNSPECIFIED
;
1569 /* Recover the read-hash procedure corresponding to char c. */
1571 scm_get_hash_procedure (int c
)
1573 SCM rest
= scm_i_read_hash_procedures_ref ();
1577 if (scm_is_null (rest
))
1580 if (SCM_CHAR (SCM_CAAR (rest
)) == c
)
1581 return SCM_CDAR (rest
);
1583 rest
= SCM_CDR (rest
);
1587 #define SCM_ENCODING_SEARCH_SIZE (500)
1589 /* Search the first few hundred characters of a file for an Emacs-like coding
1590 declaration. Returns either NULL or a string whose storage has been
1591 allocated with `scm_gc_malloc ()'. */
1593 scm_i_scan_for_encoding (SCM port
)
1596 char header
[SCM_ENCODING_SEARCH_SIZE
+1];
1597 size_t bytes_read
, encoding_length
, i
;
1598 char *encoding
= NULL
;
1600 char *pos
, *encoding_start
;
1603 pt
= SCM_PTAB_ENTRY (port
);
1605 if (pt
->rw_active
== SCM_PORT_WRITE
)
1609 pt
->rw_active
= SCM_PORT_READ
;
1611 if (pt
->read_pos
== pt
->read_end
)
1613 /* We can use the read buffer, and thus avoid a seek. */
1614 if (scm_fill_input (port
) == EOF
)
1617 bytes_read
= pt
->read_end
- pt
->read_pos
;
1618 if (bytes_read
> SCM_ENCODING_SEARCH_SIZE
)
1619 bytes_read
= SCM_ENCODING_SEARCH_SIZE
;
1621 if (bytes_read
<= 1)
1622 /* An unbuffered port -- don't scan. */
1625 memcpy (header
, pt
->read_pos
, bytes_read
);
1626 header
[bytes_read
] = '\0';
1630 /* Try to read some bytes and then seek back. Not all ports
1631 support seeking back; and indeed some file ports (like
1632 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
1633 check performed by SCM_FPORT_FDES---but fail to seek
1634 backwards. Hence this block comes second. We prefer to use
1635 the read buffer in-place. */
1636 if (SCM_FPORTP (port
) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port
)))
1639 bytes_read
= scm_c_read (port
, header
, SCM_ENCODING_SEARCH_SIZE
);
1640 header
[bytes_read
] = '\0';
1641 scm_seek (port
, scm_from_int (0), scm_from_int (SEEK_SET
));
1645 && header
[0] == '\xef' && header
[1] == '\xbb' && header
[2] == '\xbf')
1648 /* search past "coding[:=]" */
1652 if ((pos
= strstr(pos
, "coding")) == NULL
)
1655 pos
+= strlen("coding");
1656 if (pos
- header
>= SCM_ENCODING_SEARCH_SIZE
||
1657 (*pos
== ':' || *pos
== '='))
1665 while (pos
- header
<= SCM_ENCODING_SEARCH_SIZE
&&
1666 (*pos
== ' ' || *pos
== '\t'))
1669 /* grab the next token */
1670 encoding_start
= pos
;
1672 while (encoding_start
+ i
- header
<= SCM_ENCODING_SEARCH_SIZE
1673 && encoding_start
+ i
- header
< bytes_read
1674 && (isalnum ((int) encoding_start
[i
])
1675 || strchr ("_-.:/,+=()", encoding_start
[i
]) != NULL
))
1678 encoding_length
= i
;
1679 if (encoding_length
== 0)
1682 encoding
= scm_gc_strndup (encoding_start
, encoding_length
, "encoding");
1683 for (i
= 0; i
< encoding_length
; i
++)
1684 encoding
[i
] = toupper ((int) encoding
[i
]);
1686 /* push backwards to make sure we were in a comment */
1688 pos
= encoding_start
;
1689 while (pos
>= header
)
1696 else if (*pos
== '\n' || pos
== header
)
1698 /* This wasn't in a semicolon comment. Check for a
1699 hash-bang comment. */
1700 char *beg
= strstr (header
, "#!");
1701 char *end
= strstr (header
, "!#");
1702 if (beg
< encoding_start
&& encoding_start
+ encoding_length
<= end
)
1713 /* This wasn't in a comment */
1716 if (utf8_bom
&& strcmp(encoding
, "UTF-8"))
1717 scm_misc_error (NULL
,
1718 "the port input declares the encoding ~s but is encoded as UTF-8",
1719 scm_list_1 (scm_from_locale_string (encoding
)));
1724 SCM_DEFINE (scm_file_encoding
, "file-encoding", 1, 0, 0,
1726 "Scans the port for an Emacs-like character coding declaration\n"
1727 "near the top of the contents of a port with random-accessible contents.\n"
1728 "The coding declaration is of the form\n"
1729 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1731 "Returns a string containing the character encoding of the file\n"
1732 "if a declaration was found, or @code{#f} otherwise.\n")
1733 #define FUNC_NAME s_scm_file_encoding
1738 SCM_VALIDATE_OPINPORT (SCM_ARG1
, port
);
1740 enc
= scm_i_scan_for_encoding (port
);
1745 s_enc
= scm_from_locale_string (enc
);
1756 SCM read_hash_procs
;
1758 read_hash_procs
= scm_make_fluid ();
1759 scm_fluid_set_x (read_hash_procs
, SCM_EOL
);
1761 scm_i_read_hash_procedures
=
1762 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs
));
1764 scm_init_opts (scm_read_options
, scm_read_opts
);
1765 #include "libguile/read.x"