1 /* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
34 #include "libguile/_scm.h"
35 #include "libguile/bytevectors.h"
36 #include "libguile/chars.h"
37 #include "libguile/eval.h"
38 #include "libguile/arrays.h"
39 #include "libguile/bitvectors.h"
40 #include "libguile/keywords.h"
41 #include "libguile/alist.h"
42 #include "libguile/srcprop.h"
43 #include "libguile/hashtab.h"
44 #include "libguile/hash.h"
45 #include "libguile/ports.h"
46 #include "libguile/fports.h"
47 #include "libguile/root.h"
48 #include "libguile/strings.h"
49 #include "libguile/strports.h"
50 #include "libguile/vectors.h"
51 #include "libguile/validate.h"
52 #include "libguile/srfi-4.h"
53 #include "libguile/srfi-13.h"
55 #include "libguile/read.h"
56 #include "libguile/private-options.h"
61 SCM_GLOBAL_SYMBOL (scm_sym_dot
, ".");
62 SCM_SYMBOL (scm_keyword_prefix
, "prefix");
63 SCM_SYMBOL (scm_keyword_postfix
, "postfix");
64 SCM_SYMBOL (sym_nil
, "nil");
66 scm_t_option scm_read_opts
[] = {
67 { SCM_OPTION_BOOLEAN
, "copy", 0,
68 "Copy source code expressions." },
69 { SCM_OPTION_BOOLEAN
, "positions", 1,
70 "Record positions of source code expressions." },
71 { SCM_OPTION_BOOLEAN
, "case-insensitive", 0,
72 "Convert symbols to lower case."},
73 { SCM_OPTION_SCM
, "keywords", (scm_t_bits
) SCM_BOOL_F_BITS
,
74 "Style of keyword recognition: #f, 'prefix or 'postfix."},
75 { SCM_OPTION_BOOLEAN
, "r6rs-hex-escapes", 0,
76 "Use R6RS variable-length character and string hex escapes."},
77 { SCM_OPTION_BOOLEAN
, "square-brackets", 1,
78 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
79 { SCM_OPTION_BOOLEAN
, "hungry-eol-escapes", 0,
80 "In strings, consume leading whitespace after an escaped end-of-line."},
85 Give meaningful error messages for errors
89 FILE:LINE:COL: MESSAGE
92 This is not standard GNU format, but the test-suite likes the real
93 message to be in front.
99 scm_i_input_error (char const *function
,
100 SCM port
, const char *message
, SCM arg
)
102 SCM fn
= (scm_is_string (SCM_FILENAME(port
))
104 : scm_from_locale_string ("#<unknown port>"));
106 SCM string_port
= scm_open_output_string ();
107 SCM string
= SCM_EOL
;
108 scm_simple_format (string_port
,
109 scm_from_locale_string ("~A:~S:~S: ~A"),
111 scm_from_long (SCM_LINUM (port
) + 1),
112 scm_from_int (SCM_COL (port
) + 1),
113 scm_from_locale_string (message
)));
115 string
= scm_get_output_string (string_port
);
116 scm_close_output_port (string_port
);
117 scm_error_scm (scm_from_latin1_symbol ("read-error"),
118 function
? scm_from_locale_string (function
) : SCM_BOOL_F
,
125 SCM_DEFINE (scm_read_options
, "read-options-interface", 0, 1, 0,
127 "Option interface for the read options. Instead of using\n"
128 "this procedure directly, use the procedures @code{read-enable},\n"
129 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
130 #define FUNC_NAME s_scm_read_options
132 SCM ans
= scm_options (setting
,
135 if (SCM_COPY_SOURCE_P
)
136 SCM_RECORD_POSITIONS_P
= 1;
141 /* A fluid referring to an association list mapping extra hash
142 characters to procedures. */
143 static SCM
*scm_i_read_hash_procedures
;
146 scm_i_read_hash_procedures_ref (void)
148 return scm_fluid_ref (*scm_i_read_hash_procedures
);
152 scm_i_read_hash_procedures_set_x (SCM value
)
154 scm_fluid_set_x (*scm_i_read_hash_procedures
, value
);
161 /* Size of the C buffer used to read symbols and numbers. */
162 #define READER_BUFFER_SIZE 128
164 /* Size of the C buffer used to read strings. */
165 #define READER_STRING_BUFFER_SIZE 512
167 /* The maximum size of Scheme character names. */
168 #define READER_CHAR_NAME_MAX_SIZE 50
171 /* `isblank' is only in C99. */
172 #define CHAR_IS_BLANK_(_chr) \
173 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
174 || ((_chr) == '\f') || ((_chr) == '\r'))
177 # define CHAR_IS_BLANK(_chr) \
178 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
180 # define CHAR_IS_BLANK CHAR_IS_BLANK_
184 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
186 #define CHAR_IS_R5RS_DELIMITER(c) \
188 || (c == ')') || (c == '(') || (c == ';') || (c == '"') \
189 || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
191 #define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
193 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
195 #define CHAR_IS_EXPONENT_MARKER(_chr) \
196 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
197 || ((_chr) == 'd') || ((_chr) == 'l'))
199 /* Read an SCSH block comment. */
200 static inline SCM
scm_read_scsh_block_comment (scm_t_wchar
, SCM
);
201 static SCM
scm_read_r6rs_block_comment (scm_t_wchar
, SCM
);
202 static SCM
scm_read_commented_expression (scm_t_wchar
, SCM
);
203 static SCM
scm_read_shebang (scm_t_wchar
, SCM
);
204 static SCM
scm_get_hash_procedure (int);
206 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
207 result in the pre-allocated buffer BUF. Return zero if the whole token has
208 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
209 bytes actually read. */
211 read_token (SCM port
, char *buf
, const size_t buf_size
, size_t *read
)
215 while (*read
< buf_size
)
219 chr
= scm_get_byte_or_eof (port
);
223 else if (CHAR_IS_DELIMITER (chr
))
225 scm_unget_byte (chr
, port
);
238 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
239 result in the pre-allocated buffer BUFFER, if the whole token has fewer than
240 BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
241 caller. Return zero if the token fits in BUFFER, non-zero otherwise. READ
242 will be set the number of bytes actually read. */
244 read_complete_token (SCM port
, char *buffer
, const size_t buffer_size
,
245 char **overflow_buffer
, size_t *read
)
248 size_t bytes_read
, overflow_size
;
250 *overflow_buffer
= NULL
;
255 overflow
= read_token (port
, buffer
, buffer_size
, &bytes_read
);
258 if (overflow
|| overflow_size
!= 0)
260 if (overflow_size
== 0)
262 *overflow_buffer
= scm_malloc (bytes_read
);
263 memcpy (*overflow_buffer
, buffer
, bytes_read
);
264 overflow_size
= bytes_read
;
268 *overflow_buffer
= scm_realloc (*overflow_buffer
, overflow_size
+ bytes_read
);
269 memcpy (*overflow_buffer
+ overflow_size
, buffer
, bytes_read
);
270 overflow_size
+= bytes_read
;
277 *read
= overflow_size
;
281 return (overflow_size
!= 0);
284 /* Skip whitespace from PORT and return the first non-whitespace character
285 read. Raise an error on end-of-file. */
287 flush_ws (SCM port
, const char *eoferr
)
289 register scm_t_wchar c
;
291 switch (c
= scm_getc (port
))
297 scm_i_input_error (eoferr
,
306 switch (c
= scm_getc (port
))
312 case SCM_LINE_INCREMENTORS
:
318 switch (c
= scm_getc (port
))
321 eoferr
= "read_sharp";
324 scm_read_shebang (c
, port
);
327 scm_read_commented_expression (c
, port
);
330 if (scm_is_false (scm_get_hash_procedure (c
)))
332 scm_read_r6rs_block_comment (c
, port
);
337 scm_ungetc (c
, port
);
342 case SCM_LINE_INCREMENTORS
:
343 case SCM_SINGLE_SPACES
:
358 static SCM
scm_read_expression (SCM port
);
359 static SCM
scm_read_sharp (int chr
, SCM port
);
363 scm_read_sexp (scm_t_wchar chr
, SCM port
)
364 #define FUNC_NAME "scm_i_lreadparen"
367 SCM tmp
, tl
, ans
= SCM_EOL
;
368 const int terminating_char
= ((chr
== '[') ? ']' : ')');
370 /* Need to capture line and column numbers here. */
371 long line
= SCM_LINUM (port
);
372 int column
= SCM_COL (port
) - 1;
374 c
= flush_ws (port
, FUNC_NAME
);
375 if (terminating_char
== c
)
378 scm_ungetc (c
, port
);
379 tmp
= scm_read_expression (port
);
381 /* Note that it is possible for scm_read_expression to return
382 scm_sym_dot, but not as part of a dotted pair: as in #{.}#. So
383 check that it's a real dot by checking `c'. */
384 if (c
== '.' && scm_is_eq (scm_sym_dot
, tmp
))
386 ans
= scm_read_expression (port
);
387 if (terminating_char
!= (c
= flush_ws (port
, FUNC_NAME
)))
388 scm_i_input_error (FUNC_NAME
, port
, "missing close paren",
393 /* Build the head of the list structure. */
394 ans
= tl
= scm_cons (tmp
, SCM_EOL
);
396 while (terminating_char
!= (c
= flush_ws (port
, FUNC_NAME
)))
400 if (c
== ')' || (SCM_SQUARE_BRACKETS_P
&& c
== ']'))
401 scm_i_input_error (FUNC_NAME
, port
,
402 "in pair: mismatched close paren: ~A",
403 scm_list_1 (SCM_MAKE_CHAR (c
)));
405 scm_ungetc (c
, port
);
406 tmp
= scm_read_expression (port
);
408 /* See above note about scm_sym_dot. */
409 if (c
== '.' && scm_is_eq (scm_sym_dot
, tmp
))
411 SCM_SETCDR (tl
, scm_read_expression (port
));
413 c
= flush_ws (port
, FUNC_NAME
);
414 if (terminating_char
!= c
)
415 scm_i_input_error (FUNC_NAME
, port
,
416 "in pair: missing close paren", SCM_EOL
);
420 new_tail
= scm_cons (tmp
, SCM_EOL
);
421 SCM_SETCDR (tl
, new_tail
);
426 if (SCM_RECORD_POSITIONS_P
)
427 scm_i_set_source_properties_x (ans
, line
, column
, SCM_FILENAME (port
));
434 /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
435 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
437 #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
443 while (i < ndigits) \
445 a = scm_getc (port); \
449 && (a == (scm_t_wchar) terminator) \
452 if ('0' <= a && a <= '9') \
454 else if ('A' <= a && a <= 'F') \
456 else if ('a' <= a && a <= 'f') \
469 skip_intraline_whitespace (SCM port
)
479 while (c
== '\t' || uc_is_general_category (c
, UC_SPACE_SEPARATOR
));
481 scm_ungetc (c
, port
);
485 scm_read_string (int chr
, SCM port
)
486 #define FUNC_NAME "scm_lreadr"
488 /* For strings smaller than C_STR, this function creates only one Scheme
489 object (the string returned). */
491 SCM str
= SCM_BOOL_F
;
492 unsigned c_str_len
= 0;
495 str
= scm_i_make_string (READER_STRING_BUFFER_SIZE
, NULL
, 0);
496 while ('"' != (c
= scm_getc (port
)))
501 scm_i_input_error (FUNC_NAME
, port
,
502 "end of file in string constant", SCM_EOL
);
505 if (c_str_len
+ 1 >= scm_i_string_length (str
))
507 SCM addy
= scm_i_make_string (READER_STRING_BUFFER_SIZE
, NULL
, 0);
509 str
= scm_string_append (scm_list_2 (str
, addy
));
514 switch (c
= scm_getc (port
))
522 if (SCM_HUNGRY_EOL_ESCAPES_P
)
523 skip_intraline_whitespace (port
);
550 if (SCM_R6RS_ESCAPES_P
)
551 SCM_READ_HEX_ESCAPE (10, ';');
553 SCM_READ_HEX_ESCAPE (2, '\0');
556 if (!SCM_R6RS_ESCAPES_P
)
558 SCM_READ_HEX_ESCAPE (4, '\0');
562 if (!SCM_R6RS_ESCAPES_P
)
564 SCM_READ_HEX_ESCAPE (6, '\0');
569 scm_i_input_error (FUNC_NAME
, port
,
570 "illegal character in escape sequence: ~S",
571 scm_list_1 (SCM_MAKE_CHAR (c
)));
574 str
= scm_i_string_start_writing (str
);
575 scm_i_string_set_x (str
, c_str_len
++, c
);
576 scm_i_string_stop_writing ();
581 return scm_i_substring_copy (str
, 0, c_str_len
);
590 scm_read_number (scm_t_wchar chr
, SCM port
)
592 SCM result
, str
= SCM_EOL
;
593 char buffer
[READER_BUFFER_SIZE
];
594 char *overflow_buffer
= NULL
;
597 scm_t_port
*pt
= SCM_PTAB_ENTRY (port
);
599 scm_ungetc (chr
, port
);
600 overflow
= read_complete_token (port
, buffer
, sizeof (buffer
),
601 &overflow_buffer
, &bytes_read
);
604 str
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
, pt
->ilseq_handler
);
606 str
= scm_from_stringn (overflow_buffer
, bytes_read
, pt
->encoding
,
609 result
= scm_string_to_number (str
, SCM_UNDEFINED
);
610 if (!scm_is_true (result
))
612 /* Return a symbol instead of a number */
613 if (SCM_CASE_INSENSITIVE_P
)
614 str
= scm_string_downcase_x (str
);
615 result
= scm_string_to_symbol (str
);
619 free (overflow_buffer
);
620 SCM_COL (port
) += scm_i_string_length (str
);
625 scm_read_mixed_case_symbol (scm_t_wchar chr
, SCM port
)
628 int ends_with_colon
= 0;
630 int postfix
= scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE
), scm_keyword_postfix
);
632 char buffer
[READER_BUFFER_SIZE
], *overflow_buffer
;
633 scm_t_port
*pt
= SCM_PTAB_ENTRY (port
);
636 scm_ungetc (chr
, port
);
637 overflow
= read_complete_token (port
, buffer
, READER_BUFFER_SIZE
,
638 &overflow_buffer
, &bytes_read
);
642 ends_with_colon
= buffer
[bytes_read
- 1] == ':';
644 ends_with_colon
= overflow_buffer
[bytes_read
- 1] == ':';
647 if (postfix
&& ends_with_colon
&& (bytes_read
> 1))
650 str
= scm_from_stringn (buffer
, bytes_read
- 1, pt
->encoding
, pt
->ilseq_handler
);
652 str
= scm_from_stringn (overflow_buffer
, bytes_read
- 1, pt
->encoding
,
655 if (SCM_CASE_INSENSITIVE_P
)
656 str
= scm_string_downcase_x (str
);
657 result
= scm_symbol_to_keyword (scm_string_to_symbol (str
));
662 str
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
, pt
->ilseq_handler
);
664 str
= scm_from_stringn (overflow_buffer
, bytes_read
, pt
->encoding
,
667 if (SCM_CASE_INSENSITIVE_P
)
668 str
= scm_string_downcase_x (str
);
669 result
= scm_string_to_symbol (str
);
673 free (overflow_buffer
);
674 SCM_COL (port
) += scm_i_string_length (str
);
679 scm_read_number_and_radix (scm_t_wchar chr
, SCM port
)
680 #define FUNC_NAME "scm_lreadr"
684 char buffer
[READER_BUFFER_SIZE
], *overflow_buffer
;
713 scm_ungetc (chr
, port
);
714 scm_ungetc ('#', port
);
718 overflow
= read_complete_token (port
, buffer
, sizeof (buffer
),
719 &overflow_buffer
, &read
);
721 pt
= SCM_PTAB_ENTRY (port
);
723 str
= scm_from_stringn (buffer
, read
, pt
->encoding
, pt
->ilseq_handler
);
725 str
= scm_from_stringn (overflow_buffer
, read
, pt
->encoding
,
728 result
= scm_string_to_number (str
, scm_from_uint (radix
));
731 free (overflow_buffer
);
733 SCM_COL (port
) += scm_i_string_length (str
);
735 if (scm_is_true (result
))
738 scm_i_input_error (FUNC_NAME
, port
, "unknown # object", SCM_EOL
);
745 scm_read_quote (int chr
, SCM port
)
748 long line
= SCM_LINUM (port
);
749 int column
= SCM_COL (port
) - 1;
754 p
= scm_sym_quasiquote
;
767 p
= scm_sym_uq_splicing
;
770 scm_ungetc (c
, port
);
777 fprintf (stderr
, "%s: unhandled quote character (%i)\n",
778 "scm_read_quote", chr
);
782 p
= scm_cons2 (p
, scm_read_expression (port
), SCM_EOL
);
783 if (SCM_RECORD_POSITIONS_P
)
784 scm_i_set_source_properties_x (p
, line
, column
, SCM_FILENAME (port
));
789 SCM_SYMBOL (sym_syntax
, "syntax");
790 SCM_SYMBOL (sym_quasisyntax
, "quasisyntax");
791 SCM_SYMBOL (sym_unsyntax
, "unsyntax");
792 SCM_SYMBOL (sym_unsyntax_splicing
, "unsyntax-splicing");
795 scm_read_syntax (int chr
, SCM port
)
798 long line
= SCM_LINUM (port
);
799 int column
= SCM_COL (port
) - 1;
817 p
= sym_unsyntax_splicing
;
820 scm_ungetc (c
, port
);
827 fprintf (stderr
, "%s: unhandled syntax character (%i)\n",
828 "scm_read_syntax", chr
);
832 p
= scm_cons2 (p
, scm_read_expression (port
), SCM_EOL
);
833 if (SCM_RECORD_POSITIONS_P
)
834 scm_i_set_source_properties_x (p
, line
, column
, SCM_FILENAME (port
));
840 scm_read_nil (int chr
, SCM port
)
842 SCM id
= scm_read_mixed_case_symbol (chr
, port
);
844 if (!scm_is_eq (id
, sym_nil
))
845 scm_i_input_error ("scm_read_nil", port
,
846 "unexpected input while reading #nil: ~a",
849 return SCM_ELISP_NIL
;
853 scm_read_semicolon_comment (int chr
, SCM port
)
857 /* We use the get_byte here because there is no need to get the
858 locale correct with comment input. This presumes that newline
859 always represents itself no matter what the encoding is. */
860 for (c
= scm_get_byte_or_eof (port
);
861 (c
!= EOF
) && (c
!= '\n');
862 c
= scm_get_byte_or_eof (port
));
864 return SCM_UNSPECIFIED
;
868 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
871 scm_read_boolean (int chr
, SCM port
)
884 return SCM_UNSPECIFIED
;
888 scm_read_character (scm_t_wchar chr
, SCM port
)
889 #define FUNC_NAME "scm_lreadr"
891 char buffer
[READER_CHAR_NAME_MAX_SIZE
];
893 size_t charname_len
, bytes_read
;
898 overflow
= read_token (port
, buffer
, READER_CHAR_NAME_MAX_SIZE
, &bytes_read
);
900 scm_i_input_error (FUNC_NAME
, port
, "character name too long", SCM_EOL
);
904 chr
= scm_getc (port
);
906 scm_i_input_error (FUNC_NAME
, port
, "unexpected end of file "
907 "while reading character", SCM_EOL
);
909 /* CHR must be a token delimiter, like a whitespace. */
910 return (SCM_MAKE_CHAR (chr
));
913 pt
= SCM_PTAB_ENTRY (port
);
915 /* Simple ASCII characters can be processed immediately. Also, simple
916 ISO-8859-1 characters can be processed immediately if the encoding for this
917 port is ISO-8859-1. */
918 if (bytes_read
== 1 && ((unsigned char) buffer
[0] <= 127 || pt
->encoding
== NULL
))
921 return SCM_MAKE_CHAR (buffer
[0]);
924 /* Otherwise, convert the buffer into a proper scheme string for
926 charname
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
,
928 charname_len
= scm_i_string_length (charname
);
929 SCM_COL (port
) += charname_len
;
930 cp
= scm_i_string_ref (charname
, 0);
931 if (charname_len
== 1)
932 return SCM_MAKE_CHAR (cp
);
934 /* Ignore dotted circles, which may be used to keep combining characters from
935 combining with the backslash in #\charname. */
936 if (cp
== SCM_CODEPOINT_DOTTED_CIRCLE
&& charname_len
== 2)
937 return SCM_MAKE_CHAR (scm_i_string_ref (charname
, 1));
939 if (cp
>= '0' && cp
< '8')
941 /* Dirk:FIXME:: This type of character syntax is not R5RS
942 * compliant. Further, it should be verified that the constant
943 * does only consist of octal digits. */
944 SCM p
= scm_string_to_number (charname
, scm_from_uint (8));
947 scm_t_wchar c
= scm_to_uint32 (p
);
948 if (SCM_IS_UNICODE_CHAR (c
))
949 return SCM_MAKE_CHAR (c
);
951 scm_i_input_error (FUNC_NAME
, port
,
952 "out-of-range octal character escape: ~a",
953 scm_list_1 (charname
));
957 if (cp
== 'x' && (charname_len
> 1))
961 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
962 p
= scm_string_to_number (scm_c_substring (charname
, 1, charname_len
),
966 scm_t_wchar c
= scm_to_uint32 (p
);
967 if (SCM_IS_UNICODE_CHAR (c
))
968 return SCM_MAKE_CHAR (c
);
970 scm_i_input_error (FUNC_NAME
, port
,
971 "out-of-range hex character escape: ~a",
972 scm_list_1 (charname
));
976 /* The names of characters should never have non-Latin1
978 if (scm_i_is_narrow_string (charname
)
979 || scm_i_try_narrow_string (charname
))
980 { SCM ch
= scm_i_charname_to_char (scm_i_string_chars (charname
),
982 if (scm_is_true (ch
))
986 scm_i_input_error (FUNC_NAME
, port
, "unknown character name ~a",
987 scm_list_1 (charname
));
989 return SCM_UNSPECIFIED
;
994 scm_read_keyword (int chr
, SCM port
)
998 /* Read the symbol that comprises the keyword. Doing this instead of
999 invoking a specific symbol reader function allows `scm_read_keyword ()'
1000 to adapt to the delimiters currently valid of symbols.
1002 XXX: This implementation allows sloppy syntaxes like `#: key'. */
1003 symbol
= scm_read_expression (port
);
1004 if (!scm_is_symbol (symbol
))
1005 scm_i_input_error ("scm_read_keyword", port
,
1006 "keyword prefix `~a' not followed by a symbol: ~s",
1007 scm_list_2 (SCM_MAKE_CHAR (chr
), symbol
));
1009 return (scm_symbol_to_keyword (symbol
));
1013 scm_read_vector (int chr
, SCM port
)
1015 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1016 guarantee that it's going to do what we want. After all, this is an
1017 implementation detail of `scm_read_vector ()', not a desirable
1019 return (scm_vector (scm_read_sexp (chr
, port
)));
1023 scm_read_srfi4_vector (int chr
, SCM port
)
1025 return scm_i_read_array (port
, chr
);
1029 scm_read_bytevector (scm_t_wchar chr
, SCM port
)
1031 chr
= scm_getc (port
);
1035 chr
= scm_getc (port
);
1039 chr
= scm_getc (port
);
1043 return scm_u8_list_to_bytevector (scm_read_sexp (chr
, port
));
1046 scm_i_input_error ("read_bytevector", port
,
1047 "invalid bytevector prefix",
1048 SCM_MAKE_CHAR (chr
));
1049 return SCM_UNSPECIFIED
;
1053 scm_read_guile_bit_vector (scm_t_wchar chr
, SCM port
)
1055 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1056 terribly inefficient but who cares? */
1057 SCM s_bits
= SCM_EOL
;
1059 for (chr
= scm_getc (port
);
1060 (chr
!= EOF
) && ((chr
== '0') || (chr
== '1'));
1061 chr
= scm_getc (port
))
1063 s_bits
= scm_cons ((chr
== '0') ? SCM_BOOL_F
: SCM_BOOL_T
, s_bits
);
1067 scm_ungetc (chr
, port
);
1069 return scm_bitvector (scm_reverse_x (s_bits
, SCM_EOL
));
1073 scm_read_scsh_block_comment (scm_t_wchar chr
, SCM port
)
1079 int c
= scm_getc (port
);
1082 scm_i_input_error ("skip_block_comment", port
,
1083 "unterminated `#! ... !#' comment", SCM_EOL
);
1087 else if (c
== '#' && bang_seen
)
1093 return SCM_UNSPECIFIED
;
1097 scm_read_shebang (scm_t_wchar chr
, SCM port
)
1100 if ((c
= scm_get_byte_or_eof (port
)) != 'r')
1102 scm_ungetc (c
, port
);
1103 return scm_read_scsh_block_comment (chr
, port
);
1105 if ((c
= scm_get_byte_or_eof (port
)) != '6')
1107 scm_ungetc (c
, port
);
1108 scm_ungetc ('r', port
);
1109 return scm_read_scsh_block_comment (chr
, port
);
1111 if ((c
= scm_get_byte_or_eof (port
)) != 'r')
1113 scm_ungetc (c
, port
);
1114 scm_ungetc ('6', port
);
1115 scm_ungetc ('r', port
);
1116 return scm_read_scsh_block_comment (chr
, port
);
1118 if ((c
= scm_get_byte_or_eof (port
)) != 's')
1120 scm_ungetc (c
, port
);
1121 scm_ungetc ('r', port
);
1122 scm_ungetc ('6', port
);
1123 scm_ungetc ('r', port
);
1124 return scm_read_scsh_block_comment (chr
, port
);
1127 return SCM_UNSPECIFIED
;
1131 scm_read_r6rs_block_comment (scm_t_wchar chr
, SCM port
)
1133 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1134 nested. So care must be taken. */
1135 int nesting_level
= 1;
1136 int opening_seen
= 0, closing_seen
= 0;
1138 while (nesting_level
> 0)
1140 int c
= scm_getc (port
);
1143 scm_i_input_error ("scm_read_r6rs_block_comment", port
,
1144 "unterminated `#| ... |#' comment", SCM_EOL
);
1152 else if (closing_seen
)
1163 opening_seen
= closing_seen
= 0;
1166 return SCM_UNSPECIFIED
;
1170 scm_read_commented_expression (scm_t_wchar chr
, SCM port
)
1174 c
= flush_ws (port
, (char *) NULL
);
1176 scm_i_input_error ("read_commented_expression", port
,
1177 "no expression after #; comment", SCM_EOL
);
1178 scm_ungetc (c
, port
);
1179 scm_read_expression (port
);
1180 return SCM_UNSPECIFIED
;
1184 scm_read_extended_symbol (scm_t_wchar chr
, SCM port
)
1186 /* Guile's extended symbol read syntax looks like this:
1188 #{This is all a symbol name}#
1190 So here, CHR is expected to be `{'. */
1193 SCM buf
= scm_i_make_string (1024, NULL
, 0);
1195 buf
= scm_i_string_start_writing (buf
);
1197 while ((chr
= scm_getc (port
)) != EOF
)
1208 scm_i_string_set_x (buf
, len
++, '}');
1214 else if (chr
== '\\')
1216 /* It used to be that print.c would print extended-read-syntax
1217 symbols with backslashes before "non-standard" chars, but
1218 this routine wouldn't do anything with those escapes.
1219 Bummer. What we've done is to change print.c to output
1220 R6RS hex escapes for those characters, relying on the fact
1221 that the extended read syntax would never put a `\' before
1222 an `x'. For now, we just ignore other instances of
1223 backslash in the string. */
1224 switch ((chr
= scm_getc (port
)))
1232 SCM_READ_HEX_ESCAPE (10, ';');
1233 scm_i_string_set_x (buf
, len
++, c
);
1241 scm_i_string_stop_writing ();
1242 scm_i_input_error ("scm_read_extended_symbol", port
,
1243 "illegal character in escape sequence: ~S",
1244 scm_list_1 (SCM_MAKE_CHAR (c
)));
1248 scm_i_string_set_x (buf
, len
++, chr
);
1253 scm_i_string_set_x (buf
, len
++, chr
);
1255 if (len
>= scm_i_string_length (buf
) - 2)
1259 scm_i_string_stop_writing ();
1260 addy
= scm_i_make_string (1024, NULL
, 0);
1261 buf
= scm_string_append (scm_list_2 (buf
, addy
));
1263 buf
= scm_i_string_start_writing (buf
);
1268 scm_i_string_stop_writing ();
1270 scm_i_input_error ("scm_read_extended_symbol", port
,
1271 "end of file while reading symbol", SCM_EOL
);
1273 return (scm_string_to_symbol (scm_c_substring (buf
, 0, len
)));
1278 /* Top-level token readers, i.e., dispatchers. */
1281 scm_read_sharp_extension (int chr
, SCM port
)
1285 proc
= scm_get_hash_procedure (chr
);
1286 if (scm_is_true (scm_procedure_p (proc
)))
1288 long line
= SCM_LINUM (port
);
1289 int column
= SCM_COL (port
) - 2;
1292 got
= scm_call_2 (proc
, SCM_MAKE_CHAR (chr
), port
);
1294 if (scm_is_pair (got
) && !scm_i_has_source_properties (got
))
1295 scm_i_set_source_properties_x (got
, line
, column
, SCM_FILENAME (port
));
1300 return SCM_UNSPECIFIED
;
1303 /* The reader for the sharp `#' character. It basically dispatches reads
1304 among the above token readers. */
1306 scm_read_sharp (scm_t_wchar chr
, SCM port
)
1307 #define FUNC_NAME "scm_lreadr"
1311 chr
= scm_getc (port
);
1313 result
= scm_read_sharp_extension (chr
, port
);
1314 if (!scm_is_eq (result
, SCM_UNSPECIFIED
))
1320 return (scm_read_character (chr
, port
));
1322 return (scm_read_vector (chr
, port
));
1327 /* This one may return either a boolean or an SRFI-4 vector. */
1328 return (scm_read_srfi4_vector (chr
, port
));
1330 return (scm_read_bytevector (chr
, port
));
1332 return (scm_read_guile_bit_vector (chr
, port
));
1336 /* This one may return either a boolean or an SRFI-4 vector. */
1337 return (scm_read_boolean (chr
, port
));
1339 return (scm_read_keyword (chr
, port
));
1340 case '0': case '1': case '2': case '3': case '4':
1341 case '5': case '6': case '7': case '8': case '9':
1343 #if SCM_ENABLE_DEPRECATED
1344 /* See below for 'i' and 'e'. */
1350 return (scm_i_read_array (port
, chr
));
1354 #if SCM_ENABLE_DEPRECATED
1356 /* When next char is '(', it really is an old-style
1358 scm_t_wchar next_c
= scm_getc (port
);
1360 scm_ungetc (next_c
, port
);
1362 return scm_i_read_array (port
, chr
);
1376 return (scm_read_number_and_radix (chr
, port
));
1378 return (scm_read_extended_symbol (chr
, port
));
1380 return (scm_read_shebang (chr
, port
));
1382 return (scm_read_commented_expression (chr
, port
));
1386 return (scm_read_syntax (chr
, port
));
1388 return (scm_read_nil (chr
, port
));
1390 result
= scm_read_sharp_extension (chr
, port
);
1391 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1393 /* To remain compatible with 1.8 and earlier, the following
1394 characters have lower precedence than `read-hash-extend'
1399 return scm_read_r6rs_block_comment (chr
, port
);
1401 scm_i_input_error (FUNC_NAME
, port
, "Unknown # object: ~S",
1402 scm_list_1 (SCM_MAKE_CHAR (chr
)));
1409 return SCM_UNSPECIFIED
;
1414 scm_read_expression (SCM port
)
1415 #define FUNC_NAME "scm_read_expression"
1419 register scm_t_wchar chr
;
1421 chr
= scm_getc (port
);
1425 case SCM_WHITE_SPACES
:
1426 case SCM_LINE_INCREMENTORS
:
1429 (void) scm_read_semicolon_comment (chr
, port
);
1432 if (!SCM_SQUARE_BRACKETS_P
)
1433 return (scm_read_mixed_case_symbol (chr
, port
));
1434 /* otherwise fall through */
1436 return (scm_read_sexp (chr
, port
));
1438 return (scm_read_string (chr
, port
));
1442 return (scm_read_quote (chr
, port
));
1446 result
= scm_read_sharp (chr
, port
);
1447 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1448 /* We read a comment or some such. */
1454 scm_i_input_error (FUNC_NAME
, port
, "unexpected \")\"", SCM_EOL
);
1457 if (SCM_SQUARE_BRACKETS_P
)
1458 scm_i_input_error (FUNC_NAME
, port
, "unexpected \"]\"", SCM_EOL
);
1459 /* otherwise fall through */
1463 if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE
), scm_keyword_prefix
))
1464 return scm_symbol_to_keyword (scm_read_expression (port
));
1469 if (((chr
>= '0') && (chr
<= '9'))
1470 || (strchr ("+-.", chr
)))
1471 return (scm_read_number (chr
, port
));
1473 return (scm_read_mixed_case_symbol (chr
, port
));
1481 /* Actual reader. */
1483 SCM_DEFINE (scm_read
, "read", 0, 1, 0,
1485 "Read an s-expression from the input port @var{port}, or from\n"
1486 "the current input port if @var{port} is not specified.\n"
1487 "Any whitespace before the next token is discarded.")
1488 #define FUNC_NAME s_scm_read
1492 if (SCM_UNBNDP (port
))
1493 port
= scm_current_input_port ();
1494 SCM_VALIDATE_OPINPORT (1, port
);
1496 c
= flush_ws (port
, (char *) NULL
);
1499 scm_ungetc (c
, port
);
1501 return (scm_read_expression (port
));
1508 /* Manipulate the read-hash-procedures alist. This could be written in
1509 Scheme, but maybe it will also be used by C code during initialisation. */
1510 SCM_DEFINE (scm_read_hash_extend
, "read-hash-extend", 2, 0, 0,
1511 (SCM chr
, SCM proc
),
1512 "Install the procedure @var{proc} for reading expressions\n"
1513 "starting with the character sequence @code{#} and @var{chr}.\n"
1514 "@var{proc} will be called with two arguments: the character\n"
1515 "@var{chr} and the port to read further data from. The object\n"
1516 "returned will be the return value of @code{read}. \n"
1517 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1519 #define FUNC_NAME s_scm_read_hash_extend
1524 SCM_VALIDATE_CHAR (1, chr
);
1525 SCM_ASSERT (scm_is_false (proc
)
1526 || scm_is_eq (scm_procedure_p (proc
), SCM_BOOL_T
),
1527 proc
, SCM_ARG2
, FUNC_NAME
);
1529 /* Check if chr is already in the alist. */
1530 this = scm_i_read_hash_procedures_ref ();
1534 if (scm_is_null (this))
1536 /* not found, so add it to the beginning. */
1537 if (scm_is_true (proc
))
1539 SCM
new = scm_cons (scm_cons (chr
, proc
),
1540 scm_i_read_hash_procedures_ref ());
1541 scm_i_read_hash_procedures_set_x (new);
1545 if (scm_is_eq (chr
, SCM_CAAR (this)))
1547 /* already in the alist. */
1548 if (scm_is_false (proc
))
1551 if (scm_is_false (prev
))
1553 SCM rest
= SCM_CDR (scm_i_read_hash_procedures_ref ());
1554 scm_i_read_hash_procedures_set_x (rest
);
1557 scm_set_cdr_x (prev
, SCM_CDR (this));
1562 scm_set_cdr_x (SCM_CAR (this), proc
);
1567 this = SCM_CDR (this);
1570 return SCM_UNSPECIFIED
;
1574 /* Recover the read-hash procedure corresponding to char c. */
1576 scm_get_hash_procedure (int c
)
1578 SCM rest
= scm_i_read_hash_procedures_ref ();
1582 if (scm_is_null (rest
))
1585 if (SCM_CHAR (SCM_CAAR (rest
)) == c
)
1586 return SCM_CDAR (rest
);
1588 rest
= SCM_CDR (rest
);
1592 #define SCM_ENCODING_SEARCH_SIZE (500)
1594 /* Search the first few hundred characters of a file for an Emacs-like coding
1595 declaration. Returns either NULL or a string whose storage has been
1596 allocated with `scm_gc_malloc ()'. */
1598 scm_i_scan_for_encoding (SCM port
)
1601 char header
[SCM_ENCODING_SEARCH_SIZE
+1];
1602 size_t bytes_read
, encoding_length
, i
;
1603 char *encoding
= NULL
;
1605 char *pos
, *encoding_start
;
1608 pt
= SCM_PTAB_ENTRY (port
);
1610 if (pt
->rw_active
== SCM_PORT_WRITE
)
1614 pt
->rw_active
= SCM_PORT_READ
;
1616 if (pt
->read_pos
== pt
->read_end
)
1618 /* We can use the read buffer, and thus avoid a seek. */
1619 if (scm_fill_input (port
) == EOF
)
1622 bytes_read
= pt
->read_end
- pt
->read_pos
;
1623 if (bytes_read
> SCM_ENCODING_SEARCH_SIZE
)
1624 bytes_read
= SCM_ENCODING_SEARCH_SIZE
;
1626 if (bytes_read
<= 1)
1627 /* An unbuffered port -- don't scan. */
1630 memcpy (header
, pt
->read_pos
, bytes_read
);
1631 header
[bytes_read
] = '\0';
1635 /* Try to read some bytes and then seek back. Not all ports
1636 support seeking back; and indeed some file ports (like
1637 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
1638 check performed by SCM_FPORT_FDES---but fail to seek
1639 backwards. Hence this block comes second. We prefer to use
1640 the read buffer in-place. */
1641 if (SCM_FPORTP (port
) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port
)))
1644 bytes_read
= scm_c_read (port
, header
, SCM_ENCODING_SEARCH_SIZE
);
1645 header
[bytes_read
] = '\0';
1646 scm_seek (port
, scm_from_int (0), scm_from_int (SEEK_SET
));
1650 && header
[0] == '\xef' && header
[1] == '\xbb' && header
[2] == '\xbf')
1653 /* search past "coding[:=]" */
1657 if ((pos
= strstr(pos
, "coding")) == NULL
)
1660 pos
+= strlen("coding");
1661 if (pos
- header
>= SCM_ENCODING_SEARCH_SIZE
||
1662 (*pos
== ':' || *pos
== '='))
1670 while (pos
- header
<= SCM_ENCODING_SEARCH_SIZE
&&
1671 (*pos
== ' ' || *pos
== '\t'))
1674 /* grab the next token */
1675 encoding_start
= pos
;
1677 while (encoding_start
+ i
- header
<= SCM_ENCODING_SEARCH_SIZE
1678 && encoding_start
+ i
- header
< bytes_read
1679 && (isalnum ((int) encoding_start
[i
])
1680 || strchr ("_-.:/,+=()", encoding_start
[i
]) != NULL
))
1683 encoding_length
= i
;
1684 if (encoding_length
== 0)
1687 encoding
= scm_gc_strndup (encoding_start
, encoding_length
, "encoding");
1688 for (i
= 0; i
< encoding_length
; i
++)
1689 encoding
[i
] = toupper ((int) encoding
[i
]);
1691 /* push backwards to make sure we were in a comment */
1693 pos
= encoding_start
;
1694 while (pos
>= header
)
1701 else if (*pos
== '\n' || pos
== header
)
1703 /* This wasn't in a semicolon comment. Check for a
1704 hash-bang comment. */
1705 char *beg
= strstr (header
, "#!");
1706 char *end
= strstr (header
, "!#");
1707 if (beg
< encoding_start
&& encoding_start
+ encoding_length
<= end
)
1718 /* This wasn't in a comment */
1721 if (utf8_bom
&& strcmp(encoding
, "UTF-8"))
1722 scm_misc_error (NULL
,
1723 "the port input declares the encoding ~s but is encoded as UTF-8",
1724 scm_list_1 (scm_from_locale_string (encoding
)));
1729 SCM_DEFINE (scm_file_encoding
, "file-encoding", 1, 0, 0,
1731 "Scans the port for an Emacs-like character coding declaration\n"
1732 "near the top of the contents of a port with random-accessible contents.\n"
1733 "The coding declaration is of the form\n"
1734 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1736 "Returns a string containing the character encoding of the file\n"
1737 "if a declaration was found, or @code{#f} otherwise.\n")
1738 #define FUNC_NAME s_scm_file_encoding
1743 SCM_VALIDATE_OPINPORT (SCM_ARG1
, port
);
1745 enc
= scm_i_scan_for_encoding (port
);
1750 s_enc
= scm_from_locale_string (enc
);
1761 SCM read_hash_procs
;
1763 read_hash_procs
= scm_make_fluid ();
1764 scm_fluid_set_x (read_hash_procs
, SCM_EOL
);
1766 scm_i_read_hash_procedures
=
1767 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs
));
1769 scm_init_opts (scm_read_options
, scm_read_opts
);
1770 #include "libguile/read.x"