1 /* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009, 2010 Free Software
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
33 #include "libguile/_scm.h"
34 #include "libguile/bytevectors.h"
35 #include "libguile/chars.h"
36 #include "libguile/eval.h"
37 #include "libguile/arrays.h"
38 #include "libguile/bitvectors.h"
39 #include "libguile/keywords.h"
40 #include "libguile/alist.h"
41 #include "libguile/srcprop.h"
42 #include "libguile/hashtab.h"
43 #include "libguile/hash.h"
44 #include "libguile/ports.h"
45 #include "libguile/fports.h"
46 #include "libguile/root.h"
47 #include "libguile/strings.h"
48 #include "libguile/strports.h"
49 #include "libguile/vectors.h"
50 #include "libguile/validate.h"
51 #include "libguile/srfi-4.h"
52 #include "libguile/srfi-13.h"
54 #include "libguile/read.h"
55 #include "libguile/private-options.h"
60 SCM_GLOBAL_SYMBOL (scm_sym_dot
, ".");
61 SCM_SYMBOL (scm_keyword_prefix
, "prefix");
62 SCM_SYMBOL (scm_keyword_postfix
, "postfix");
63 SCM_SYMBOL (sym_nil
, "nil");
65 scm_t_option scm_read_opts
[] = {
66 { SCM_OPTION_BOOLEAN
, "copy", 0,
67 "Copy source code expressions." },
68 { SCM_OPTION_BOOLEAN
, "positions", 0,
69 "Record positions of source code expressions." },
70 { SCM_OPTION_BOOLEAN
, "case-insensitive", 0,
71 "Convert symbols to lower case."},
72 { SCM_OPTION_SCM
, "keywords", (unsigned long) SCM_BOOL_F
,
73 "Style of keyword recognition: #f, 'prefix or 'postfix."},
74 { SCM_OPTION_BOOLEAN
, "elisp-vectors", 0,
75 "Support Elisp vector syntax, namely `[...]'."},
76 { SCM_OPTION_BOOLEAN
, "elisp-strings", 0,
77 "Support `\\(' and `\\)' in strings."},
78 { SCM_OPTION_BOOLEAN
, "r6rs-hex-escapes", 0,
79 "Use R6RS variable-length character and string hex escapes."},
80 { SCM_OPTION_BOOLEAN
, "square-brackets", 1,
81 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
86 Give meaningful error messages for errors
90 FILE:LINE:COL: MESSAGE
93 This is not standard GNU format, but the test-suite likes the real
94 message to be in front.
100 scm_i_input_error (char const *function
,
101 SCM port
, const char *message
, SCM arg
)
103 SCM fn
= (scm_is_string (SCM_FILENAME(port
))
105 : scm_from_locale_string ("#<unknown port>"));
107 SCM string_port
= scm_open_output_string ();
108 SCM string
= SCM_EOL
;
109 scm_simple_format (string_port
,
110 scm_from_locale_string ("~A:~S:~S: ~A"),
112 scm_from_long (SCM_LINUM (port
) + 1),
113 scm_from_int (SCM_COL (port
) + 1),
114 scm_from_locale_string (message
)));
116 string
= scm_get_output_string (string_port
);
117 scm_close_output_port (string_port
);
118 scm_error_scm (scm_from_locale_symbol ("read-error"),
119 function
? scm_from_locale_string (function
) : SCM_BOOL_F
,
126 SCM_DEFINE (scm_read_options
, "read-options-interface", 0, 1, 0,
128 "Option interface for the read options. Instead of using\n"
129 "this procedure directly, use the procedures @code{read-enable},\n"
130 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
131 #define FUNC_NAME s_scm_read_options
133 SCM ans
= scm_options (setting
,
136 if (SCM_COPY_SOURCE_P
)
137 SCM_RECORD_POSITIONS_P
= 1;
142 /* An association list mapping extra hash characters to procedures. */
143 static SCM
*scm_read_hash_procedures
;
150 /* Size of the C buffer used to read symbols and numbers. */
151 #define READER_BUFFER_SIZE 128
153 /* Size of the C buffer used to read strings. */
154 #define READER_STRING_BUFFER_SIZE 512
156 /* The maximum size of Scheme character names. */
157 #define READER_CHAR_NAME_MAX_SIZE 50
160 /* `isblank' is only in C99. */
161 #define CHAR_IS_BLANK_(_chr) \
162 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
163 || ((_chr) == '\f') || ((_chr) == '\r'))
166 # define CHAR_IS_BLANK(_chr) \
167 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
169 # define CHAR_IS_BLANK CHAR_IS_BLANK_
173 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
175 #define CHAR_IS_R5RS_DELIMITER(c) \
177 || (c == ')') || (c == '(') || (c == ';') || (c == '"') \
178 || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
180 #define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
182 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
184 #define CHAR_IS_EXPONENT_MARKER(_chr) \
185 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
186 || ((_chr) == 'd') || ((_chr) == 'l'))
188 /* Read an SCSH block comment. */
189 static inline SCM
scm_read_scsh_block_comment (scm_t_wchar
, SCM
);
190 static SCM
scm_read_r6rs_block_comment (scm_t_wchar
, SCM
);
191 static SCM
scm_read_commented_expression (scm_t_wchar
, SCM
);
192 static SCM
scm_read_shebang (scm_t_wchar
, SCM
);
193 static SCM
scm_get_hash_procedure (int);
195 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
196 result in the pre-allocated buffer BUF. Return zero if the whole token has
197 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
198 bytes actually read. */
200 read_token (SCM port
, char *buf
, const size_t buf_size
, size_t *read
)
204 while (*read
< buf_size
)
208 chr
= scm_get_byte_or_eof (port
);
212 else if (CHAR_IS_DELIMITER (chr
))
214 scm_unget_byte (chr
, port
);
227 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
228 result in the pre-allocated buffer BUFFER, if the whole token has fewer than
229 BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
230 caller. Return zero if the token fits in BUFFER, non-zero otherwise. READ
231 will be set the number of bytes actually read. */
233 read_complete_token (SCM port
, char *buffer
, const size_t buffer_size
,
234 char **overflow_buffer
, size_t *read
)
237 size_t bytes_read
, overflow_size
;
239 *overflow_buffer
= NULL
;
244 overflow
= read_token (port
, buffer
, buffer_size
, &bytes_read
);
247 if (overflow
|| overflow_size
!= 0)
249 if (overflow_size
== 0)
251 *overflow_buffer
= scm_malloc (bytes_read
);
252 memcpy (*overflow_buffer
, buffer
, bytes_read
);
253 overflow_size
= bytes_read
;
257 *overflow_buffer
= scm_realloc (*overflow_buffer
, overflow_size
+ bytes_read
);
258 memcpy (*overflow_buffer
+ overflow_size
, buffer
, bytes_read
);
259 overflow_size
+= bytes_read
;
266 *read
= overflow_size
;
270 return (overflow_size
!= 0);
273 /* Skip whitespace from PORT and return the first non-whitespace character
274 read. Raise an error on end-of-file. */
276 flush_ws (SCM port
, const char *eoferr
)
278 register scm_t_wchar c
;
280 switch (c
= scm_getc (port
))
286 scm_i_input_error (eoferr
,
295 switch (c
= scm_getc (port
))
301 case SCM_LINE_INCREMENTORS
:
307 switch (c
= scm_getc (port
))
310 eoferr
= "read_sharp";
313 scm_read_shebang (c
, port
);
316 scm_read_commented_expression (c
, port
);
319 if (scm_is_false (scm_get_hash_procedure (c
)))
321 scm_read_r6rs_block_comment (c
, port
);
326 scm_ungetc (c
, port
);
331 case SCM_LINE_INCREMENTORS
:
332 case SCM_SINGLE_SPACES
:
347 static SCM
scm_read_expression (SCM port
);
348 static SCM
scm_read_sharp (int chr
, SCM port
);
349 static SCM
recsexpr (SCM obj
, long line
, int column
, SCM filename
);
353 scm_read_sexp (scm_t_wchar chr
, SCM port
)
354 #define FUNC_NAME "scm_i_lreadparen"
358 register SCM tl
, ans
= SCM_EOL
;
359 SCM tl2
= SCM_EOL
, ans2
= SCM_EOL
, copy
= SCM_BOOL_F
;
360 const int terminating_char
= ((chr
== '[') ? ']' : ')');
362 /* Need to capture line and column numbers here. */
363 long line
= SCM_LINUM (port
);
364 int column
= SCM_COL (port
) - 1;
367 c
= flush_ws (port
, FUNC_NAME
);
368 if (terminating_char
== c
)
371 scm_ungetc (c
, port
);
372 if (scm_is_eq (scm_sym_dot
,
373 (tmp
= scm_read_expression (port
))))
375 ans
= scm_read_expression (port
);
376 if (terminating_char
!= (c
= flush_ws (port
, FUNC_NAME
)))
377 scm_i_input_error (FUNC_NAME
, port
, "missing close paren",
382 /* Build the head of the list structure. */
383 ans
= tl
= scm_cons (tmp
, SCM_EOL
);
385 if (SCM_COPY_SOURCE_P
)
386 ans2
= tl2
= scm_cons (scm_is_pair (tmp
)
391 while (terminating_char
!= (c
= flush_ws (port
, FUNC_NAME
)))
395 if (c
== ')' || (SCM_SQUARE_BRACKETS_P
&& c
== ']'))
396 scm_i_input_error (FUNC_NAME
, port
,
397 "in pair: mismatched close paren: ~A",
398 scm_list_1 (SCM_MAKE_CHAR (c
)));
400 scm_ungetc (c
, port
);
401 tmp
= scm_read_expression (port
);
403 if (scm_is_eq (scm_sym_dot
, tmp
))
405 SCM_SETCDR (tl
, tmp
= scm_read_expression (port
));
407 if (SCM_COPY_SOURCE_P
)
408 SCM_SETCDR (tl2
, scm_cons (scm_is_pair (tmp
) ? copy
: tmp
,
411 c
= flush_ws (port
, FUNC_NAME
);
412 if (terminating_char
!= c
)
413 scm_i_input_error (FUNC_NAME
, port
,
414 "in pair: missing close paren", SCM_EOL
);
418 new_tail
= scm_cons (tmp
, SCM_EOL
);
419 SCM_SETCDR (tl
, new_tail
);
422 if (SCM_COPY_SOURCE_P
)
424 SCM new_tail2
= scm_cons (scm_is_pair (tmp
)
427 SCM_SETCDR (tl2
, new_tail2
);
433 if (SCM_RECORD_POSITIONS_P
)
434 scm_whash_insert (scm_source_whash
,
436 scm_make_srcprops (line
, column
,
447 /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
448 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
450 #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
456 while (i < ndigits) \
458 a = scm_getc (port); \
462 && (a == (scm_t_wchar) terminator) \
465 if ('0' <= a && a <= '9') \
467 else if ('A' <= a && a <= 'F') \
469 else if ('a' <= a && a <= 'f') \
482 scm_read_string (int chr
, SCM port
)
483 #define FUNC_NAME "scm_lreadr"
485 /* For strings smaller than C_STR, this function creates only one Scheme
486 object (the string returned). */
488 SCM str
= SCM_BOOL_F
;
489 unsigned c_str_len
= 0;
492 str
= scm_i_make_string (READER_STRING_BUFFER_SIZE
, NULL
);
493 while ('"' != (c
= scm_getc (port
)))
498 scm_i_input_error (FUNC_NAME
, port
,
499 "end of file in string constant", SCM_EOL
);
502 if (c_str_len
+ 1 >= scm_i_string_length (str
))
504 SCM addy
= scm_i_make_string (READER_STRING_BUFFER_SIZE
, NULL
);
506 str
= scm_string_append (scm_list_2 (str
, addy
));
511 switch (c
= scm_getc (port
))
520 if (SCM_ESCAPED_PARENS_P
)
550 if (SCM_R6RS_ESCAPES_P
)
551 SCM_READ_HEX_ESCAPE (10, ';');
553 SCM_READ_HEX_ESCAPE (2, '\0');
556 if (!SCM_R6RS_ESCAPES_P
)
558 SCM_READ_HEX_ESCAPE (4, '\0');
562 if (!SCM_R6RS_ESCAPES_P
)
564 SCM_READ_HEX_ESCAPE (6, '\0');
569 scm_i_input_error (FUNC_NAME
, port
,
570 "illegal character in escape sequence: ~S",
571 scm_list_1 (SCM_MAKE_CHAR (c
)));
574 str
= scm_i_string_start_writing (str
);
575 scm_i_string_set_x (str
, c_str_len
++, c
);
576 scm_i_string_stop_writing ();
581 return scm_i_substring_copy (str
, 0, c_str_len
);
590 scm_read_number (scm_t_wchar chr
, SCM port
)
592 SCM result
, str
= SCM_EOL
;
593 char buffer
[READER_BUFFER_SIZE
];
594 char *overflow_buffer
= NULL
;
597 scm_t_port
*pt
= SCM_PTAB_ENTRY (port
);
599 scm_ungetc (chr
, port
);
600 overflow
= read_complete_token (port
, buffer
, sizeof (buffer
),
601 &overflow_buffer
, &bytes_read
);
604 str
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
, pt
->ilseq_handler
);
606 str
= scm_from_stringn (overflow_buffer
, bytes_read
, pt
->encoding
,
609 result
= scm_string_to_number (str
, SCM_UNDEFINED
);
610 if (!scm_is_true (result
))
612 /* Return a symbol instead of a number */
613 if (SCM_CASE_INSENSITIVE_P
)
614 str
= scm_string_downcase_x (str
);
615 result
= scm_string_to_symbol (str
);
619 free (overflow_buffer
);
620 SCM_COL (port
) += scm_i_string_length (str
);
625 scm_read_mixed_case_symbol (scm_t_wchar chr
, SCM port
)
628 int ends_with_colon
= 0;
630 int postfix
= scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE
), scm_keyword_postfix
);
632 char buffer
[READER_BUFFER_SIZE
], *overflow_buffer
;
633 scm_t_port
*pt
= SCM_PTAB_ENTRY (port
);
636 scm_ungetc (chr
, port
);
637 overflow
= read_complete_token (port
, buffer
, READER_BUFFER_SIZE
,
638 &overflow_buffer
, &bytes_read
);
642 ends_with_colon
= buffer
[bytes_read
- 1] == ':';
644 ends_with_colon
= overflow_buffer
[bytes_read
- 1] == ':';
647 if (postfix
&& ends_with_colon
&& (bytes_read
> 1))
650 str
= scm_from_stringn (buffer
, bytes_read
- 1, pt
->encoding
, pt
->ilseq_handler
);
652 str
= scm_from_stringn (overflow_buffer
, bytes_read
- 1, pt
->encoding
,
655 if (SCM_CASE_INSENSITIVE_P
)
656 str
= scm_string_downcase_x (str
);
657 result
= scm_symbol_to_keyword (scm_string_to_symbol (str
));
662 str
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
, pt
->ilseq_handler
);
664 str
= scm_from_stringn (overflow_buffer
, bytes_read
, pt
->encoding
,
667 if (SCM_CASE_INSENSITIVE_P
)
668 str
= scm_string_downcase_x (str
);
669 result
= scm_string_to_symbol (str
);
673 free (overflow_buffer
);
674 SCM_COL (port
) += scm_i_string_length (str
);
679 scm_read_number_and_radix (scm_t_wchar chr
, SCM port
)
680 #define FUNC_NAME "scm_lreadr"
684 char buffer
[READER_BUFFER_SIZE
], *overflow_buffer
;
713 scm_ungetc (chr
, port
);
714 scm_ungetc ('#', port
);
718 overflow
= read_complete_token (port
, buffer
, sizeof (buffer
),
719 &overflow_buffer
, &read
);
721 pt
= SCM_PTAB_ENTRY (port
);
723 str
= scm_from_stringn (buffer
, read
, pt
->encoding
, pt
->ilseq_handler
);
725 str
= scm_from_stringn (overflow_buffer
, read
, pt
->encoding
,
728 result
= scm_string_to_number (str
, scm_from_uint (radix
));
731 free (overflow_buffer
);
733 SCM_COL (port
) += scm_i_string_length (str
);
735 if (scm_is_true (result
))
738 scm_i_input_error (FUNC_NAME
, port
, "unknown # object", SCM_EOL
);
745 scm_read_quote (int chr
, SCM port
)
748 long line
= SCM_LINUM (port
);
749 int column
= SCM_COL (port
) - 1;
754 p
= scm_sym_quasiquote
;
767 p
= scm_sym_uq_splicing
;
770 scm_ungetc (c
, port
);
777 fprintf (stderr
, "%s: unhandled quote character (%i)\n",
778 "scm_read_quote", chr
);
782 p
= scm_cons2 (p
, scm_read_expression (port
), SCM_EOL
);
783 if (SCM_RECORD_POSITIONS_P
)
784 scm_whash_insert (scm_source_whash
, p
,
785 scm_make_srcprops (line
, column
,
788 ? (scm_cons2 (SCM_CAR (p
),
789 SCM_CAR (SCM_CDR (p
)),
798 SCM_SYMBOL (sym_syntax
, "syntax");
799 SCM_SYMBOL (sym_quasisyntax
, "quasisyntax");
800 SCM_SYMBOL (sym_unsyntax
, "unsyntax");
801 SCM_SYMBOL (sym_unsyntax_splicing
, "unsyntax-splicing");
804 scm_read_syntax (int chr
, SCM port
)
807 long line
= SCM_LINUM (port
);
808 int column
= SCM_COL (port
) - 1;
826 p
= sym_unsyntax_splicing
;
829 scm_ungetc (c
, port
);
836 fprintf (stderr
, "%s: unhandled syntax character (%i)\n",
837 "scm_read_syntax", chr
);
841 p
= scm_cons2 (p
, scm_read_expression (port
), SCM_EOL
);
842 if (SCM_RECORD_POSITIONS_P
)
843 scm_whash_insert (scm_source_whash
, p
,
844 scm_make_srcprops (line
, column
,
847 ? (scm_cons2 (SCM_CAR (p
),
848 SCM_CAR (SCM_CDR (p
)),
858 scm_read_nil (int chr
, SCM port
)
860 SCM id
= scm_read_mixed_case_symbol (chr
, port
);
862 if (!scm_is_eq (id
, sym_nil
))
863 scm_i_input_error ("scm_read_nil", port
,
864 "unexpected input while reading #nil: ~a",
867 return SCM_ELISP_NIL
;
871 scm_read_semicolon_comment (int chr
, SCM port
)
875 /* We use the get_byte here because there is no need to get the
876 locale correct with comment input. This presumes that newline
877 always represents itself no matter what the encoding is. */
878 for (c
= scm_get_byte_or_eof (port
);
879 (c
!= EOF
) && (c
!= '\n');
880 c
= scm_get_byte_or_eof (port
));
882 return SCM_UNSPECIFIED
;
886 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
889 scm_read_boolean (int chr
, SCM port
)
902 return SCM_UNSPECIFIED
;
906 scm_read_character (scm_t_wchar chr
, SCM port
)
907 #define FUNC_NAME "scm_lreadr"
909 char buffer
[READER_CHAR_NAME_MAX_SIZE
];
911 size_t charname_len
, bytes_read
;
916 overflow
= read_token (port
, buffer
, READER_CHAR_NAME_MAX_SIZE
, &bytes_read
);
922 chr
= scm_getc (port
);
924 scm_i_input_error (FUNC_NAME
, port
, "unexpected end of file "
925 "while reading character", SCM_EOL
);
927 /* CHR must be a token delimiter, like a whitespace. */
928 return (SCM_MAKE_CHAR (chr
));
931 pt
= SCM_PTAB_ENTRY (port
);
933 /* Simple ASCII characters can be processed immediately. Also, simple
934 ISO-8859-1 characters can be processed immediately if the encoding for this
935 port is ISO-8859-1. */
936 if (bytes_read
== 1 && ((unsigned char) buffer
[0] <= 127 || pt
->encoding
== NULL
))
939 return SCM_MAKE_CHAR (buffer
[0]);
942 /* Otherwise, convert the buffer into a proper scheme string for
944 charname
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
,
946 charname_len
= scm_i_string_length (charname
);
947 SCM_COL (port
) += charname_len
;
948 cp
= scm_i_string_ref (charname
, 0);
949 if (charname_len
== 1)
950 return SCM_MAKE_CHAR (cp
);
952 /* Ignore dotted circles, which may be used to keep combining characters from
953 combining with the backslash in #\charname. */
954 if (cp
== SCM_CODEPOINT_DOTTED_CIRCLE
&& charname_len
== 2)
955 return SCM_MAKE_CHAR (scm_i_string_ref (charname
, 1));
957 if (cp
>= '0' && cp
< '8')
959 /* Dirk:FIXME:: This type of character syntax is not R5RS
960 * compliant. Further, it should be verified that the constant
961 * does only consist of octal digits. */
962 SCM p
= scm_string_to_number (charname
, scm_from_uint (8));
965 scm_t_wchar c
= SCM_I_INUM (p
);
966 if (SCM_IS_UNICODE_CHAR (c
))
967 return SCM_MAKE_CHAR (c
);
969 scm_i_input_error (FUNC_NAME
, port
,
970 "out-of-range octal character escape: ~a",
971 scm_list_1 (charname
));
975 if (cp
== 'x' && (charname_len
> 1))
979 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
980 p
= scm_string_to_number (scm_c_substring (charname
, 1, charname_len
),
984 scm_t_wchar c
= SCM_I_INUM (p
);
985 if (SCM_IS_UNICODE_CHAR (c
))
986 return SCM_MAKE_CHAR (c
);
988 scm_i_input_error (FUNC_NAME
, port
,
989 "out-of-range hex character escape: ~a",
990 scm_list_1 (charname
));
994 /* The names of characters should never have non-Latin1
996 if (scm_i_is_narrow_string (charname
)
997 || scm_i_try_narrow_string (charname
))
998 { SCM ch
= scm_i_charname_to_char (scm_i_string_chars (charname
),
1000 if (scm_is_true (ch
))
1005 scm_i_input_error (FUNC_NAME
, port
, "unknown character name ~a",
1006 scm_list_1 (charname
));
1008 return SCM_UNSPECIFIED
;
1013 scm_read_keyword (int chr
, SCM port
)
1017 /* Read the symbol that comprises the keyword. Doing this instead of
1018 invoking a specific symbol reader function allows `scm_read_keyword ()'
1019 to adapt to the delimiters currently valid of symbols.
1021 XXX: This implementation allows sloppy syntaxes like `#: key'. */
1022 symbol
= scm_read_expression (port
);
1023 if (!scm_is_symbol (symbol
))
1024 scm_i_input_error ("scm_read_keyword", port
,
1025 "keyword prefix `~a' not followed by a symbol: ~s",
1026 scm_list_2 (SCM_MAKE_CHAR (chr
), symbol
));
1028 return (scm_symbol_to_keyword (symbol
));
1032 scm_read_vector (int chr
, SCM port
)
1034 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1035 guarantee that it's going to do what we want. After all, this is an
1036 implementation detail of `scm_read_vector ()', not a desirable
1038 return (scm_vector (scm_read_sexp (chr
, port
)));
1042 scm_read_srfi4_vector (int chr
, SCM port
)
1044 return scm_i_read_array (port
, chr
);
1048 scm_read_bytevector (scm_t_wchar chr
, SCM port
)
1050 chr
= scm_getc (port
);
1054 chr
= scm_getc (port
);
1058 chr
= scm_getc (port
);
1062 return scm_u8_list_to_bytevector (scm_read_sexp (chr
, port
));
1065 scm_i_input_error ("read_bytevector", port
,
1066 "invalid bytevector prefix",
1067 SCM_MAKE_CHAR (chr
));
1068 return SCM_UNSPECIFIED
;
1072 scm_read_guile_bit_vector (scm_t_wchar chr
, SCM port
)
1074 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1075 terribly inefficient but who cares? */
1076 SCM s_bits
= SCM_EOL
;
1078 for (chr
= scm_getc (port
);
1079 (chr
!= EOF
) && ((chr
== '0') || (chr
== '1'));
1080 chr
= scm_getc (port
))
1082 s_bits
= scm_cons ((chr
== '0') ? SCM_BOOL_F
: SCM_BOOL_T
, s_bits
);
1086 scm_ungetc (chr
, port
);
1088 return scm_bitvector (scm_reverse_x (s_bits
, SCM_EOL
));
1092 scm_read_scsh_block_comment (scm_t_wchar chr
, SCM port
)
1096 /* We can use the get_byte here because there is no need to get the
1097 locale correct when reading comments. This presumes that
1098 hash and exclamation points always represent themselves no
1099 matter what the source encoding is.*/
1102 int c
= scm_get_byte_or_eof (port
);
1105 scm_i_input_error ("skip_block_comment", port
,
1106 "unterminated `#! ... !#' comment", SCM_EOL
);
1110 else if (c
== '#' && bang_seen
)
1116 return SCM_UNSPECIFIED
;
1120 scm_read_shebang (scm_t_wchar chr
, SCM port
)
1123 if ((c
= scm_get_byte_or_eof (port
)) != 'r')
1125 scm_ungetc (c
, port
);
1126 return scm_read_scsh_block_comment (chr
, port
);
1128 if ((c
= scm_get_byte_or_eof (port
)) != '6')
1130 scm_ungetc (c
, port
);
1131 scm_ungetc ('r', port
);
1132 return scm_read_scsh_block_comment (chr
, port
);
1134 if ((c
= scm_get_byte_or_eof (port
)) != 'r')
1136 scm_ungetc (c
, port
);
1137 scm_ungetc ('6', port
);
1138 scm_ungetc ('r', port
);
1139 return scm_read_scsh_block_comment (chr
, port
);
1141 if ((c
= scm_get_byte_or_eof (port
)) != 's')
1143 scm_ungetc (c
, port
);
1144 scm_ungetc ('r', port
);
1145 scm_ungetc ('6', port
);
1146 scm_ungetc ('r', port
);
1147 return scm_read_scsh_block_comment (chr
, port
);
1150 return SCM_UNSPECIFIED
;
1154 scm_read_r6rs_block_comment (scm_t_wchar chr
, SCM port
)
1156 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1157 nested. So care must be taken. */
1158 int nesting_level
= 1;
1159 int opening_seen
= 0, closing_seen
= 0;
1161 while (nesting_level
> 0)
1163 int c
= scm_getc (port
);
1166 scm_i_input_error ("scm_read_r6rs_block_comment", port
,
1167 "unterminated `#| ... |#' comment", SCM_EOL
);
1175 else if (closing_seen
)
1186 opening_seen
= closing_seen
= 0;
1189 return SCM_UNSPECIFIED
;
1193 scm_read_commented_expression (scm_t_wchar chr
, SCM port
)
1197 c
= flush_ws (port
, (char *) NULL
);
1199 scm_i_input_error ("read_commented_expression", port
,
1200 "no expression after #; comment", SCM_EOL
);
1201 scm_ungetc (c
, port
);
1202 scm_read_expression (port
);
1203 return SCM_UNSPECIFIED
;
1207 scm_read_extended_symbol (scm_t_wchar chr
, SCM port
)
1209 /* Guile's extended symbol read syntax looks like this:
1211 #{This is all a symbol name}#
1213 So here, CHR is expected to be `{'. */
1214 int saw_brace
= 0, finished
= 0;
1216 SCM buf
= scm_i_make_string (1024, NULL
);
1218 buf
= scm_i_string_start_writing (buf
);
1220 while ((chr
= scm_getc (port
)) != EOF
)
1232 scm_i_string_set_x (buf
, len
++, '}');
1233 scm_i_string_set_x (buf
, len
++, chr
);
1236 else if (chr
== '}')
1239 scm_i_string_set_x (buf
, len
++, chr
);
1241 if (len
>= scm_i_string_length (buf
) - 2)
1245 scm_i_string_stop_writing ();
1246 addy
= scm_i_make_string (1024, NULL
);
1247 buf
= scm_string_append (scm_list_2 (buf
, addy
));
1249 buf
= scm_i_string_start_writing (buf
);
1255 scm_i_string_stop_writing ();
1257 return (scm_string_to_symbol (scm_c_substring (buf
, 0, len
)));
1262 /* Top-level token readers, i.e., dispatchers. */
1265 scm_read_sharp_extension (int chr
, SCM port
)
1269 proc
= scm_get_hash_procedure (chr
);
1270 if (scm_is_true (scm_procedure_p (proc
)))
1272 long line
= SCM_LINUM (port
);
1273 int column
= SCM_COL (port
) - 2;
1276 got
= scm_call_2 (proc
, SCM_MAKE_CHAR (chr
), port
);
1277 if (!scm_is_eq (got
, SCM_UNSPECIFIED
))
1279 if (SCM_RECORD_POSITIONS_P
)
1280 return (recsexpr (got
, line
, column
,
1281 SCM_FILENAME (port
)));
1287 return SCM_UNSPECIFIED
;
1290 /* The reader for the sharp `#' character. It basically dispatches reads
1291 among the above token readers. */
1293 scm_read_sharp (scm_t_wchar chr
, SCM port
)
1294 #define FUNC_NAME "scm_lreadr"
1298 chr
= scm_getc (port
);
1300 result
= scm_read_sharp_extension (chr
, port
);
1301 if (!scm_is_eq (result
, SCM_UNSPECIFIED
))
1307 return (scm_read_character (chr
, port
));
1309 return (scm_read_vector (chr
, port
));
1313 /* This one may return either a boolean or an SRFI-4 vector. */
1314 return (scm_read_srfi4_vector (chr
, port
));
1316 return (scm_read_bytevector (chr
, port
));
1318 return (scm_read_guile_bit_vector (chr
, port
));
1322 /* This one may return either a boolean or an SRFI-4 vector. */
1323 return (scm_read_boolean (chr
, port
));
1325 return (scm_read_keyword (chr
, port
));
1326 case '0': case '1': case '2': case '3': case '4':
1327 case '5': case '6': case '7': case '8': case '9':
1329 #if SCM_ENABLE_DEPRECATED
1330 /* See below for 'i' and 'e'. */
1337 return (scm_i_read_array (port
, chr
));
1341 #if SCM_ENABLE_DEPRECATED
1343 /* When next char is '(', it really is an old-style
1345 scm_t_wchar next_c
= scm_getc (port
);
1347 scm_ungetc (next_c
, port
);
1349 return scm_i_read_array (port
, chr
);
1363 return (scm_read_number_and_radix (chr
, port
));
1365 return (scm_read_extended_symbol (chr
, port
));
1367 return (scm_read_shebang (chr
, port
));
1369 return (scm_read_commented_expression (chr
, port
));
1373 return (scm_read_syntax (chr
, port
));
1375 return (scm_read_nil (chr
, port
));
1377 result
= scm_read_sharp_extension (chr
, port
);
1378 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1380 /* To remain compatible with 1.8 and earlier, the following
1381 characters have lower precedence than `read-hash-extend'
1386 return scm_read_r6rs_block_comment (chr
, port
);
1388 scm_i_input_error (FUNC_NAME
, port
, "Unknown # object: ~S",
1389 scm_list_1 (SCM_MAKE_CHAR (chr
)));
1396 return SCM_UNSPECIFIED
;
1401 scm_read_expression (SCM port
)
1402 #define FUNC_NAME "scm_read_expression"
1406 register scm_t_wchar chr
;
1408 chr
= scm_getc (port
);
1412 case SCM_WHITE_SPACES
:
1413 case SCM_LINE_INCREMENTORS
:
1416 (void) scm_read_semicolon_comment (chr
, port
);
1419 if (!SCM_SQUARE_BRACKETS_P
)
1420 return (scm_read_mixed_case_symbol (chr
, port
));
1421 /* otherwise fall through */
1423 return (scm_read_sexp (chr
, port
));
1425 return (scm_read_string (chr
, port
));
1429 return (scm_read_quote (chr
, port
));
1433 result
= scm_read_sharp (chr
, port
);
1434 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1435 /* We read a comment or some such. */
1441 scm_i_input_error (FUNC_NAME
, port
, "unexpected \")\"", SCM_EOL
);
1446 if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE
), scm_keyword_prefix
))
1447 return scm_symbol_to_keyword (scm_read_expression (port
));
1452 if (((chr
>= '0') && (chr
<= '9'))
1453 || (strchr ("+-.", chr
)))
1454 return (scm_read_number (chr
, port
));
1456 return (scm_read_mixed_case_symbol (chr
, port
));
1464 /* Actual reader. */
1466 SCM_DEFINE (scm_read
, "read", 0, 1, 0,
1468 "Read an s-expression from the input port @var{port}, or from\n"
1469 "the current input port if @var{port} is not specified.\n"
1470 "Any whitespace before the next token is discarded.")
1471 #define FUNC_NAME s_scm_read
1475 if (SCM_UNBNDP (port
))
1476 port
= scm_current_input_port ();
1477 SCM_VALIDATE_OPINPORT (1, port
);
1479 c
= flush_ws (port
, (char *) NULL
);
1482 scm_ungetc (c
, port
);
1484 return (scm_read_expression (port
));
1491 /* Used when recording expressions constructed by `scm_read_sharp ()'. */
1493 recsexpr (SCM obj
, long line
, int column
, SCM filename
)
1495 if (!scm_is_pair(obj
)) {
1498 SCM tmp
= obj
, copy
;
1499 /* If this sexpr is visible in the read:sharp source, we want to
1500 keep that information, so only record non-constant cons cells
1501 which haven't previously been read by the reader. */
1502 if (scm_is_false (scm_whash_lookup (scm_source_whash
, obj
)))
1504 if (SCM_COPY_SOURCE_P
)
1506 copy
= scm_cons (recsexpr (SCM_CAR (obj
), line
, column
, filename
),
1508 while ((tmp
= SCM_CDR (tmp
)) && scm_is_pair (tmp
))
1510 SCM_SETCDR (copy
, scm_cons (recsexpr (SCM_CAR (tmp
),
1515 copy
= SCM_CDR (copy
);
1517 SCM_SETCDR (copy
, tmp
);
1521 recsexpr (SCM_CAR (obj
), line
, column
, filename
);
1522 while ((tmp
= SCM_CDR (tmp
)) && scm_is_pair (tmp
))
1523 recsexpr (SCM_CAR (tmp
), line
, column
, filename
);
1524 copy
= SCM_UNDEFINED
;
1526 scm_whash_insert (scm_source_whash
,
1528 scm_make_srcprops (line
,
1538 /* Manipulate the read-hash-procedures alist. This could be written in
1539 Scheme, but maybe it will also be used by C code during initialisation. */
1540 SCM_DEFINE (scm_read_hash_extend
, "read-hash-extend", 2, 0, 0,
1541 (SCM chr
, SCM proc
),
1542 "Install the procedure @var{proc} for reading expressions\n"
1543 "starting with the character sequence @code{#} and @var{chr}.\n"
1544 "@var{proc} will be called with two arguments: the character\n"
1545 "@var{chr} and the port to read further data from. The object\n"
1546 "returned will be the return value of @code{read}. \n"
1547 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1549 #define FUNC_NAME s_scm_read_hash_extend
1554 SCM_VALIDATE_CHAR (1, chr
);
1555 SCM_ASSERT (scm_is_false (proc
)
1556 || scm_is_eq (scm_procedure_p (proc
), SCM_BOOL_T
),
1557 proc
, SCM_ARG2
, FUNC_NAME
);
1559 /* Check if chr is already in the alist. */
1560 this = *scm_read_hash_procedures
;
1564 if (scm_is_null (this))
1566 /* not found, so add it to the beginning. */
1567 if (scm_is_true (proc
))
1569 *scm_read_hash_procedures
=
1570 scm_cons (scm_cons (chr
, proc
), *scm_read_hash_procedures
);
1574 if (scm_is_eq (chr
, SCM_CAAR (this)))
1576 /* already in the alist. */
1577 if (scm_is_false (proc
))
1580 if (scm_is_false (prev
))
1582 *scm_read_hash_procedures
=
1583 SCM_CDR (*scm_read_hash_procedures
);
1586 scm_set_cdr_x (prev
, SCM_CDR (this));
1591 scm_set_cdr_x (SCM_CAR (this), proc
);
1596 this = SCM_CDR (this);
1599 return SCM_UNSPECIFIED
;
1603 /* Recover the read-hash procedure corresponding to char c. */
1605 scm_get_hash_procedure (int c
)
1607 SCM rest
= *scm_read_hash_procedures
;
1611 if (scm_is_null (rest
))
1614 if (SCM_CHAR (SCM_CAAR (rest
)) == c
)
1615 return SCM_CDAR (rest
);
1617 rest
= SCM_CDR (rest
);
1621 #define SCM_ENCODING_SEARCH_SIZE (500)
1623 /* Search the first few hundred characters of a file for an Emacs-like coding
1624 declaration. Returns either NULL or a string whose storage has been
1625 allocated with `scm_gc_malloc ()'. */
1627 scm_i_scan_for_encoding (SCM port
)
1629 char header
[SCM_ENCODING_SEARCH_SIZE
+1];
1630 size_t bytes_read
, encoding_length
, i
;
1631 char *encoding
= NULL
;
1633 char *pos
, *encoding_start
;
1636 if (SCM_FPORTP (port
) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port
)))
1637 /* PORT is a non-seekable file port (e.g., as created by Bash when using
1638 "guile <(echo '(display "hello")')") so bail out. */
1641 bytes_read
= scm_c_read (port
, header
, SCM_ENCODING_SEARCH_SIZE
);
1643 scm_seek (port
, scm_from_int (0), scm_from_int (SEEK_SET
));
1646 && header
[0] == '\xef' && header
[1] == '\xbb' && header
[2] == '\xbf')
1649 /* search past "coding[:=]" */
1653 if ((pos
= strstr(pos
, "coding")) == NULL
)
1656 pos
+= strlen("coding");
1657 if (pos
- header
>= SCM_ENCODING_SEARCH_SIZE
||
1658 (*pos
== ':' || *pos
== '='))
1666 while (pos
- header
<= SCM_ENCODING_SEARCH_SIZE
&&
1667 (*pos
== ' ' || *pos
== '\t'))
1670 /* grab the next token */
1671 encoding_start
= pos
;
1673 while (encoding_start
+ i
- header
<= SCM_ENCODING_SEARCH_SIZE
1674 && encoding_start
+ i
- header
< bytes_read
1675 && (isalnum ((int) encoding_start
[i
])
1676 || strchr ("_-.:/,+=()", encoding_start
[i
]) != NULL
))
1679 encoding_length
= i
;
1680 if (encoding_length
== 0)
1683 encoding
= scm_gc_strndup (encoding_start
, encoding_length
, "encoding");
1684 for (i
= 0; i
< encoding_length
; i
++)
1685 encoding
[i
] = toupper ((int) encoding
[i
]);
1687 /* push backwards to make sure we were in a comment */
1689 pos
= encoding_start
;
1690 while (pos
>= header
)
1694 /* This wasn't in a semicolon comment. Check for a
1695 hash-bang comment. */
1696 char *beg
= strstr (header
, "#!");
1697 char *end
= strstr (header
, "!#");
1698 if (beg
< encoding_start
&& encoding_start
+ encoding_length
< end
)
1710 /* This wasn't in a comment */
1713 if (utf8_bom
&& strcmp(encoding
, "UTF-8"))
1714 scm_misc_error (NULL
,
1715 "the port input declares the encoding ~s but is encoded as UTF-8",
1716 scm_list_1 (scm_from_locale_string (encoding
)));
1721 SCM_DEFINE (scm_file_encoding
, "file-encoding", 1, 0, 0,
1723 "Scans the port for an Emacs-like character coding declaration\n"
1724 "near the top of the contents of a port with random-acessible contents.\n"
1725 "The coding declaration is of the form\n"
1726 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1728 "Returns a string containing the character encoding of the file\n"
1729 "if a declaration was found, or @code{#f} otherwise.\n")
1730 #define FUNC_NAME s_scm_file_encoding
1735 enc
= scm_i_scan_for_encoding (port
);
1740 s_enc
= scm_from_locale_string (enc
);
1751 scm_read_hash_procedures
=
1752 SCM_VARIABLE_LOC (scm_c_define ("read-hash-procedures", SCM_EOL
));
1754 scm_init_opts (scm_read_options
, scm_read_opts
);
1755 #include "libguile/read.x"