1 /* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009, 2010 Free Software
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
33 #include "libguile/_scm.h"
34 #include "libguile/bytevectors.h"
35 #include "libguile/chars.h"
36 #include "libguile/eval.h"
37 #include "libguile/arrays.h"
38 #include "libguile/bitvectors.h"
39 #include "libguile/keywords.h"
40 #include "libguile/alist.h"
41 #include "libguile/srcprop.h"
42 #include "libguile/hashtab.h"
43 #include "libguile/hash.h"
44 #include "libguile/ports.h"
45 #include "libguile/fports.h"
46 #include "libguile/root.h"
47 #include "libguile/strings.h"
48 #include "libguile/strports.h"
49 #include "libguile/vectors.h"
50 #include "libguile/validate.h"
51 #include "libguile/srfi-4.h"
52 #include "libguile/srfi-13.h"
54 #include "libguile/read.h"
55 #include "libguile/private-options.h"
60 SCM_GLOBAL_SYMBOL (scm_sym_dot
, ".");
61 SCM_SYMBOL (scm_keyword_prefix
, "prefix");
62 SCM_SYMBOL (scm_keyword_postfix
, "postfix");
63 SCM_SYMBOL (sym_nil
, "nil");
65 scm_t_option scm_read_opts
[] = {
66 { SCM_OPTION_BOOLEAN
, "copy", 0,
67 "Copy source code expressions." },
68 { SCM_OPTION_BOOLEAN
, "positions", 0,
69 "Record positions of source code expressions." },
70 { SCM_OPTION_BOOLEAN
, "case-insensitive", 0,
71 "Convert symbols to lower case."},
72 { SCM_OPTION_SCM
, "keywords", (scm_t_bits
) SCM_BOOL_F
,
73 "Style of keyword recognition: #f, 'prefix or 'postfix."},
74 { SCM_OPTION_BOOLEAN
, "r6rs-hex-escapes", 0,
75 "Use R6RS variable-length character and string hex escapes."},
76 { SCM_OPTION_BOOLEAN
, "square-brackets", 1,
77 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
82 Give meaningful error messages for errors
86 FILE:LINE:COL: MESSAGE
89 This is not standard GNU format, but the test-suite likes the real
90 message to be in front.
96 scm_i_input_error (char const *function
,
97 SCM port
, const char *message
, SCM arg
)
99 SCM fn
= (scm_is_string (SCM_FILENAME(port
))
101 : scm_from_locale_string ("#<unknown port>"));
103 SCM string_port
= scm_open_output_string ();
104 SCM string
= SCM_EOL
;
105 scm_simple_format (string_port
,
106 scm_from_locale_string ("~A:~S:~S: ~A"),
108 scm_from_long (SCM_LINUM (port
) + 1),
109 scm_from_int (SCM_COL (port
) + 1),
110 scm_from_locale_string (message
)));
112 string
= scm_get_output_string (string_port
);
113 scm_close_output_port (string_port
);
114 scm_error_scm (scm_from_locale_symbol ("read-error"),
115 function
? scm_from_locale_string (function
) : SCM_BOOL_F
,
122 SCM_DEFINE (scm_read_options
, "read-options-interface", 0, 1, 0,
124 "Option interface for the read options. Instead of using\n"
125 "this procedure directly, use the procedures @code{read-enable},\n"
126 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
127 #define FUNC_NAME s_scm_read_options
129 SCM ans
= scm_options (setting
,
132 if (SCM_COPY_SOURCE_P
)
133 SCM_RECORD_POSITIONS_P
= 1;
138 /* A fluid referring to an association list mapping extra hash
139 characters to procedures. */
140 static SCM
*scm_i_read_hash_procedures
;
143 scm_i_read_hash_procedures_ref (void)
145 return scm_fluid_ref (*scm_i_read_hash_procedures
);
149 scm_i_read_hash_procedures_set_x (SCM value
)
151 scm_fluid_set_x (*scm_i_read_hash_procedures
, value
);
158 /* Size of the C buffer used to read symbols and numbers. */
159 #define READER_BUFFER_SIZE 128
161 /* Size of the C buffer used to read strings. */
162 #define READER_STRING_BUFFER_SIZE 512
164 /* The maximum size of Scheme character names. */
165 #define READER_CHAR_NAME_MAX_SIZE 50
168 /* `isblank' is only in C99. */
169 #define CHAR_IS_BLANK_(_chr) \
170 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
171 || ((_chr) == '\f') || ((_chr) == '\r'))
174 # define CHAR_IS_BLANK(_chr) \
175 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
177 # define CHAR_IS_BLANK CHAR_IS_BLANK_
181 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
183 #define CHAR_IS_R5RS_DELIMITER(c) \
185 || (c == ')') || (c == '(') || (c == ';') || (c == '"') \
186 || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
188 #define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
190 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
192 #define CHAR_IS_EXPONENT_MARKER(_chr) \
193 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
194 || ((_chr) == 'd') || ((_chr) == 'l'))
196 /* Read an SCSH block comment. */
197 static inline SCM
scm_read_scsh_block_comment (scm_t_wchar
, SCM
);
198 static SCM
scm_read_r6rs_block_comment (scm_t_wchar
, SCM
);
199 static SCM
scm_read_commented_expression (scm_t_wchar
, SCM
);
200 static SCM
scm_read_shebang (scm_t_wchar
, SCM
);
201 static SCM
scm_get_hash_procedure (int);
203 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
204 result in the pre-allocated buffer BUF. Return zero if the whole token has
205 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
206 bytes actually read. */
208 read_token (SCM port
, char *buf
, const size_t buf_size
, size_t *read
)
212 while (*read
< buf_size
)
216 chr
= scm_get_byte_or_eof (port
);
220 else if (CHAR_IS_DELIMITER (chr
))
222 scm_unget_byte (chr
, port
);
235 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
236 result in the pre-allocated buffer BUFFER, if the whole token has fewer than
237 BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
238 caller. Return zero if the token fits in BUFFER, non-zero otherwise. READ
239 will be set the number of bytes actually read. */
241 read_complete_token (SCM port
, char *buffer
, const size_t buffer_size
,
242 char **overflow_buffer
, size_t *read
)
245 size_t bytes_read
, overflow_size
;
247 *overflow_buffer
= NULL
;
252 overflow
= read_token (port
, buffer
, buffer_size
, &bytes_read
);
255 if (overflow
|| overflow_size
!= 0)
257 if (overflow_size
== 0)
259 *overflow_buffer
= scm_malloc (bytes_read
);
260 memcpy (*overflow_buffer
, buffer
, bytes_read
);
261 overflow_size
= bytes_read
;
265 *overflow_buffer
= scm_realloc (*overflow_buffer
, overflow_size
+ bytes_read
);
266 memcpy (*overflow_buffer
+ overflow_size
, buffer
, bytes_read
);
267 overflow_size
+= bytes_read
;
274 *read
= overflow_size
;
278 return (overflow_size
!= 0);
281 /* Skip whitespace from PORT and return the first non-whitespace character
282 read. Raise an error on end-of-file. */
284 flush_ws (SCM port
, const char *eoferr
)
286 register scm_t_wchar c
;
288 switch (c
= scm_getc (port
))
294 scm_i_input_error (eoferr
,
303 switch (c
= scm_getc (port
))
309 case SCM_LINE_INCREMENTORS
:
315 switch (c
= scm_getc (port
))
318 eoferr
= "read_sharp";
321 scm_read_shebang (c
, port
);
324 scm_read_commented_expression (c
, port
);
327 if (scm_is_false (scm_get_hash_procedure (c
)))
329 scm_read_r6rs_block_comment (c
, port
);
334 scm_ungetc (c
, port
);
339 case SCM_LINE_INCREMENTORS
:
340 case SCM_SINGLE_SPACES
:
355 static SCM
scm_read_expression (SCM port
);
356 static SCM
scm_read_sharp (int chr
, SCM port
);
357 static SCM
recsexpr (SCM obj
, long line
, int column
, SCM filename
);
361 scm_read_sexp (scm_t_wchar chr
, SCM port
)
362 #define FUNC_NAME "scm_i_lreadparen"
366 register SCM tl
, ans
= SCM_EOL
;
367 SCM tl2
= SCM_EOL
, ans2
= SCM_EOL
, copy
= SCM_BOOL_F
;
368 const int terminating_char
= ((chr
== '[') ? ']' : ')');
370 /* Need to capture line and column numbers here. */
371 long line
= SCM_LINUM (port
);
372 int column
= SCM_COL (port
) - 1;
375 c
= flush_ws (port
, FUNC_NAME
);
376 if (terminating_char
== c
)
379 scm_ungetc (c
, port
);
380 if (scm_is_eq (scm_sym_dot
,
381 (tmp
= scm_read_expression (port
))))
383 ans
= scm_read_expression (port
);
384 if (terminating_char
!= (c
= flush_ws (port
, FUNC_NAME
)))
385 scm_i_input_error (FUNC_NAME
, port
, "missing close paren",
390 /* Build the head of the list structure. */
391 ans
= tl
= scm_cons (tmp
, SCM_EOL
);
393 if (SCM_COPY_SOURCE_P
)
394 ans2
= tl2
= scm_cons (scm_is_pair (tmp
)
399 while (terminating_char
!= (c
= flush_ws (port
, FUNC_NAME
)))
403 if (c
== ')' || (SCM_SQUARE_BRACKETS_P
&& c
== ']'))
404 scm_i_input_error (FUNC_NAME
, port
,
405 "in pair: mismatched close paren: ~A",
406 scm_list_1 (SCM_MAKE_CHAR (c
)));
408 scm_ungetc (c
, port
);
409 tmp
= scm_read_expression (port
);
411 if (scm_is_eq (scm_sym_dot
, tmp
))
413 SCM_SETCDR (tl
, tmp
= scm_read_expression (port
));
415 if (SCM_COPY_SOURCE_P
)
416 SCM_SETCDR (tl2
, scm_cons (scm_is_pair (tmp
) ? copy
: tmp
,
419 c
= flush_ws (port
, FUNC_NAME
);
420 if (terminating_char
!= c
)
421 scm_i_input_error (FUNC_NAME
, port
,
422 "in pair: missing close paren", SCM_EOL
);
426 new_tail
= scm_cons (tmp
, SCM_EOL
);
427 SCM_SETCDR (tl
, new_tail
);
430 if (SCM_COPY_SOURCE_P
)
432 SCM new_tail2
= scm_cons (scm_is_pair (tmp
)
435 SCM_SETCDR (tl2
, new_tail2
);
441 if (SCM_RECORD_POSITIONS_P
)
442 scm_whash_insert (scm_source_whash
,
444 scm_make_srcprops (line
, column
,
455 /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
456 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
458 #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
464 while (i < ndigits) \
466 a = scm_getc (port); \
470 && (a == (scm_t_wchar) terminator) \
473 if ('0' <= a && a <= '9') \
475 else if ('A' <= a && a <= 'F') \
477 else if ('a' <= a && a <= 'f') \
490 scm_read_string (int chr
, SCM port
)
491 #define FUNC_NAME "scm_lreadr"
493 /* For strings smaller than C_STR, this function creates only one Scheme
494 object (the string returned). */
496 SCM str
= SCM_BOOL_F
;
497 unsigned c_str_len
= 0;
500 str
= scm_i_make_string (READER_STRING_BUFFER_SIZE
, NULL
);
501 while ('"' != (c
= scm_getc (port
)))
506 scm_i_input_error (FUNC_NAME
, port
,
507 "end of file in string constant", SCM_EOL
);
510 if (c_str_len
+ 1 >= scm_i_string_length (str
))
512 SCM addy
= scm_i_make_string (READER_STRING_BUFFER_SIZE
, NULL
);
514 str
= scm_string_append (scm_list_2 (str
, addy
));
519 switch (c
= scm_getc (port
))
553 if (SCM_R6RS_ESCAPES_P
)
554 SCM_READ_HEX_ESCAPE (10, ';');
556 SCM_READ_HEX_ESCAPE (2, '\0');
559 if (!SCM_R6RS_ESCAPES_P
)
561 SCM_READ_HEX_ESCAPE (4, '\0');
565 if (!SCM_R6RS_ESCAPES_P
)
567 SCM_READ_HEX_ESCAPE (6, '\0');
572 scm_i_input_error (FUNC_NAME
, port
,
573 "illegal character in escape sequence: ~S",
574 scm_list_1 (SCM_MAKE_CHAR (c
)));
577 str
= scm_i_string_start_writing (str
);
578 scm_i_string_set_x (str
, c_str_len
++, c
);
579 scm_i_string_stop_writing ();
584 return scm_i_substring_copy (str
, 0, c_str_len
);
593 scm_read_number (scm_t_wchar chr
, SCM port
)
595 SCM result
, str
= SCM_EOL
;
596 char buffer
[READER_BUFFER_SIZE
];
597 char *overflow_buffer
= NULL
;
600 scm_t_port
*pt
= SCM_PTAB_ENTRY (port
);
602 scm_ungetc (chr
, port
);
603 overflow
= read_complete_token (port
, buffer
, sizeof (buffer
),
604 &overflow_buffer
, &bytes_read
);
607 str
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
, pt
->ilseq_handler
);
609 str
= scm_from_stringn (overflow_buffer
, bytes_read
, pt
->encoding
,
612 result
= scm_string_to_number (str
, SCM_UNDEFINED
);
613 if (!scm_is_true (result
))
615 /* Return a symbol instead of a number */
616 if (SCM_CASE_INSENSITIVE_P
)
617 str
= scm_string_downcase_x (str
);
618 result
= scm_string_to_symbol (str
);
622 free (overflow_buffer
);
623 SCM_COL (port
) += scm_i_string_length (str
);
628 scm_read_mixed_case_symbol (scm_t_wchar chr
, SCM port
)
631 int ends_with_colon
= 0;
633 int postfix
= scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE
), scm_keyword_postfix
);
635 char buffer
[READER_BUFFER_SIZE
], *overflow_buffer
;
636 scm_t_port
*pt
= SCM_PTAB_ENTRY (port
);
639 scm_ungetc (chr
, port
);
640 overflow
= read_complete_token (port
, buffer
, READER_BUFFER_SIZE
,
641 &overflow_buffer
, &bytes_read
);
645 ends_with_colon
= buffer
[bytes_read
- 1] == ':';
647 ends_with_colon
= overflow_buffer
[bytes_read
- 1] == ':';
650 if (postfix
&& ends_with_colon
&& (bytes_read
> 1))
653 str
= scm_from_stringn (buffer
, bytes_read
- 1, pt
->encoding
, pt
->ilseq_handler
);
655 str
= scm_from_stringn (overflow_buffer
, bytes_read
- 1, pt
->encoding
,
658 if (SCM_CASE_INSENSITIVE_P
)
659 str
= scm_string_downcase_x (str
);
660 result
= scm_symbol_to_keyword (scm_string_to_symbol (str
));
665 str
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
, pt
->ilseq_handler
);
667 str
= scm_from_stringn (overflow_buffer
, bytes_read
, pt
->encoding
,
670 if (SCM_CASE_INSENSITIVE_P
)
671 str
= scm_string_downcase_x (str
);
672 result
= scm_string_to_symbol (str
);
676 free (overflow_buffer
);
677 SCM_COL (port
) += scm_i_string_length (str
);
682 scm_read_number_and_radix (scm_t_wchar chr
, SCM port
)
683 #define FUNC_NAME "scm_lreadr"
687 char buffer
[READER_BUFFER_SIZE
], *overflow_buffer
;
716 scm_ungetc (chr
, port
);
717 scm_ungetc ('#', port
);
721 overflow
= read_complete_token (port
, buffer
, sizeof (buffer
),
722 &overflow_buffer
, &read
);
724 pt
= SCM_PTAB_ENTRY (port
);
726 str
= scm_from_stringn (buffer
, read
, pt
->encoding
, pt
->ilseq_handler
);
728 str
= scm_from_stringn (overflow_buffer
, read
, pt
->encoding
,
731 result
= scm_string_to_number (str
, scm_from_uint (radix
));
734 free (overflow_buffer
);
736 SCM_COL (port
) += scm_i_string_length (str
);
738 if (scm_is_true (result
))
741 scm_i_input_error (FUNC_NAME
, port
, "unknown # object", SCM_EOL
);
748 scm_read_quote (int chr
, SCM port
)
751 long line
= SCM_LINUM (port
);
752 int column
= SCM_COL (port
) - 1;
757 p
= scm_sym_quasiquote
;
770 p
= scm_sym_uq_splicing
;
773 scm_ungetc (c
, port
);
780 fprintf (stderr
, "%s: unhandled quote character (%i)\n",
781 "scm_read_quote", chr
);
785 p
= scm_cons2 (p
, scm_read_expression (port
), SCM_EOL
);
786 if (SCM_RECORD_POSITIONS_P
)
787 scm_whash_insert (scm_source_whash
, p
,
788 scm_make_srcprops (line
, column
,
791 ? (scm_cons2 (SCM_CAR (p
),
792 SCM_CAR (SCM_CDR (p
)),
801 SCM_SYMBOL (sym_syntax
, "syntax");
802 SCM_SYMBOL (sym_quasisyntax
, "quasisyntax");
803 SCM_SYMBOL (sym_unsyntax
, "unsyntax");
804 SCM_SYMBOL (sym_unsyntax_splicing
, "unsyntax-splicing");
807 scm_read_syntax (int chr
, SCM port
)
810 long line
= SCM_LINUM (port
);
811 int column
= SCM_COL (port
) - 1;
829 p
= sym_unsyntax_splicing
;
832 scm_ungetc (c
, port
);
839 fprintf (stderr
, "%s: unhandled syntax character (%i)\n",
840 "scm_read_syntax", chr
);
844 p
= scm_cons2 (p
, scm_read_expression (port
), SCM_EOL
);
845 if (SCM_RECORD_POSITIONS_P
)
846 scm_whash_insert (scm_source_whash
, p
,
847 scm_make_srcprops (line
, column
,
850 ? (scm_cons2 (SCM_CAR (p
),
851 SCM_CAR (SCM_CDR (p
)),
861 scm_read_nil (int chr
, SCM port
)
863 SCM id
= scm_read_mixed_case_symbol (chr
, port
);
865 if (!scm_is_eq (id
, sym_nil
))
866 scm_i_input_error ("scm_read_nil", port
,
867 "unexpected input while reading #nil: ~a",
870 return SCM_ELISP_NIL
;
874 scm_read_semicolon_comment (int chr
, SCM port
)
878 /* We use the get_byte here because there is no need to get the
879 locale correct with comment input. This presumes that newline
880 always represents itself no matter what the encoding is. */
881 for (c
= scm_get_byte_or_eof (port
);
882 (c
!= EOF
) && (c
!= '\n');
883 c
= scm_get_byte_or_eof (port
));
885 return SCM_UNSPECIFIED
;
889 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
892 scm_read_boolean (int chr
, SCM port
)
905 return SCM_UNSPECIFIED
;
909 scm_read_character (scm_t_wchar chr
, SCM port
)
910 #define FUNC_NAME "scm_lreadr"
912 char buffer
[READER_CHAR_NAME_MAX_SIZE
];
914 size_t charname_len
, bytes_read
;
919 overflow
= read_token (port
, buffer
, READER_CHAR_NAME_MAX_SIZE
, &bytes_read
);
921 scm_i_input_error (FUNC_NAME
, port
, "character name too long", SCM_EOL
);
925 chr
= scm_getc (port
);
927 scm_i_input_error (FUNC_NAME
, port
, "unexpected end of file "
928 "while reading character", SCM_EOL
);
930 /* CHR must be a token delimiter, like a whitespace. */
931 return (SCM_MAKE_CHAR (chr
));
934 pt
= SCM_PTAB_ENTRY (port
);
936 /* Simple ASCII characters can be processed immediately. Also, simple
937 ISO-8859-1 characters can be processed immediately if the encoding for this
938 port is ISO-8859-1. */
939 if (bytes_read
== 1 && ((unsigned char) buffer
[0] <= 127 || pt
->encoding
== NULL
))
942 return SCM_MAKE_CHAR (buffer
[0]);
945 /* Otherwise, convert the buffer into a proper scheme string for
947 charname
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
,
949 charname_len
= scm_i_string_length (charname
);
950 SCM_COL (port
) += charname_len
;
951 cp
= scm_i_string_ref (charname
, 0);
952 if (charname_len
== 1)
953 return SCM_MAKE_CHAR (cp
);
955 /* Ignore dotted circles, which may be used to keep combining characters from
956 combining with the backslash in #\charname. */
957 if (cp
== SCM_CODEPOINT_DOTTED_CIRCLE
&& charname_len
== 2)
958 return SCM_MAKE_CHAR (scm_i_string_ref (charname
, 1));
960 if (cp
>= '0' && cp
< '8')
962 /* Dirk:FIXME:: This type of character syntax is not R5RS
963 * compliant. Further, it should be verified that the constant
964 * does only consist of octal digits. */
965 SCM p
= scm_string_to_number (charname
, scm_from_uint (8));
968 scm_t_wchar c
= scm_to_uint32 (p
);
969 if (SCM_IS_UNICODE_CHAR (c
))
970 return SCM_MAKE_CHAR (c
);
972 scm_i_input_error (FUNC_NAME
, port
,
973 "out-of-range octal character escape: ~a",
974 scm_list_1 (charname
));
978 if (cp
== 'x' && (charname_len
> 1))
982 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
983 p
= scm_string_to_number (scm_c_substring (charname
, 1, charname_len
),
987 scm_t_wchar c
= scm_to_uint32 (p
);
988 if (SCM_IS_UNICODE_CHAR (c
))
989 return SCM_MAKE_CHAR (c
);
991 scm_i_input_error (FUNC_NAME
, port
,
992 "out-of-range hex character escape: ~a",
993 scm_list_1 (charname
));
997 /* The names of characters should never have non-Latin1
999 if (scm_i_is_narrow_string (charname
)
1000 || scm_i_try_narrow_string (charname
))
1001 { SCM ch
= scm_i_charname_to_char (scm_i_string_chars (charname
),
1003 if (scm_is_true (ch
))
1007 scm_i_input_error (FUNC_NAME
, port
, "unknown character name ~a",
1008 scm_list_1 (charname
));
1010 return SCM_UNSPECIFIED
;
1015 scm_read_keyword (int chr
, SCM port
)
1019 /* Read the symbol that comprises the keyword. Doing this instead of
1020 invoking a specific symbol reader function allows `scm_read_keyword ()'
1021 to adapt to the delimiters currently valid of symbols.
1023 XXX: This implementation allows sloppy syntaxes like `#: key'. */
1024 symbol
= scm_read_expression (port
);
1025 if (!scm_is_symbol (symbol
))
1026 scm_i_input_error ("scm_read_keyword", port
,
1027 "keyword prefix `~a' not followed by a symbol: ~s",
1028 scm_list_2 (SCM_MAKE_CHAR (chr
), symbol
));
1030 return (scm_symbol_to_keyword (symbol
));
1034 scm_read_vector (int chr
, SCM port
)
1036 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1037 guarantee that it's going to do what we want. After all, this is an
1038 implementation detail of `scm_read_vector ()', not a desirable
1040 return (scm_vector (scm_read_sexp (chr
, port
)));
1044 scm_read_srfi4_vector (int chr
, SCM port
)
1046 return scm_i_read_array (port
, chr
);
1050 scm_read_bytevector (scm_t_wchar chr
, SCM port
)
1052 chr
= scm_getc (port
);
1056 chr
= scm_getc (port
);
1060 chr
= scm_getc (port
);
1064 return scm_u8_list_to_bytevector (scm_read_sexp (chr
, port
));
1067 scm_i_input_error ("read_bytevector", port
,
1068 "invalid bytevector prefix",
1069 SCM_MAKE_CHAR (chr
));
1070 return SCM_UNSPECIFIED
;
1074 scm_read_guile_bit_vector (scm_t_wchar chr
, SCM port
)
1076 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1077 terribly inefficient but who cares? */
1078 SCM s_bits
= SCM_EOL
;
1080 for (chr
= scm_getc (port
);
1081 (chr
!= EOF
) && ((chr
== '0') || (chr
== '1'));
1082 chr
= scm_getc (port
))
1084 s_bits
= scm_cons ((chr
== '0') ? SCM_BOOL_F
: SCM_BOOL_T
, s_bits
);
1088 scm_ungetc (chr
, port
);
1090 return scm_bitvector (scm_reverse_x (s_bits
, SCM_EOL
));
1094 scm_read_scsh_block_comment (scm_t_wchar chr
, SCM port
)
1098 /* We can use the get_byte here because there is no need to get the
1099 locale correct when reading comments. This presumes that
1100 hash and exclamation points always represent themselves no
1101 matter what the source encoding is.*/
1104 int c
= scm_get_byte_or_eof (port
);
1107 scm_i_input_error ("skip_block_comment", port
,
1108 "unterminated `#! ... !#' comment", SCM_EOL
);
1112 else if (c
== '#' && bang_seen
)
1118 return SCM_UNSPECIFIED
;
1122 scm_read_shebang (scm_t_wchar chr
, SCM port
)
1125 if ((c
= scm_get_byte_or_eof (port
)) != 'r')
1127 scm_ungetc (c
, port
);
1128 return scm_read_scsh_block_comment (chr
, port
);
1130 if ((c
= scm_get_byte_or_eof (port
)) != '6')
1132 scm_ungetc (c
, port
);
1133 scm_ungetc ('r', port
);
1134 return scm_read_scsh_block_comment (chr
, port
);
1136 if ((c
= scm_get_byte_or_eof (port
)) != 'r')
1138 scm_ungetc (c
, port
);
1139 scm_ungetc ('6', port
);
1140 scm_ungetc ('r', port
);
1141 return scm_read_scsh_block_comment (chr
, port
);
1143 if ((c
= scm_get_byte_or_eof (port
)) != 's')
1145 scm_ungetc (c
, port
);
1146 scm_ungetc ('r', port
);
1147 scm_ungetc ('6', port
);
1148 scm_ungetc ('r', port
);
1149 return scm_read_scsh_block_comment (chr
, port
);
1152 return SCM_UNSPECIFIED
;
1156 scm_read_r6rs_block_comment (scm_t_wchar chr
, SCM port
)
1158 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1159 nested. So care must be taken. */
1160 int nesting_level
= 1;
1161 int opening_seen
= 0, closing_seen
= 0;
1163 while (nesting_level
> 0)
1165 int c
= scm_getc (port
);
1168 scm_i_input_error ("scm_read_r6rs_block_comment", port
,
1169 "unterminated `#| ... |#' comment", SCM_EOL
);
1177 else if (closing_seen
)
1188 opening_seen
= closing_seen
= 0;
1191 return SCM_UNSPECIFIED
;
1195 scm_read_commented_expression (scm_t_wchar chr
, SCM port
)
1199 c
= flush_ws (port
, (char *) NULL
);
1201 scm_i_input_error ("read_commented_expression", port
,
1202 "no expression after #; comment", SCM_EOL
);
1203 scm_ungetc (c
, port
);
1204 scm_read_expression (port
);
1205 return SCM_UNSPECIFIED
;
1209 scm_read_extended_symbol (scm_t_wchar chr
, SCM port
)
1211 /* Guile's extended symbol read syntax looks like this:
1213 #{This is all a symbol name}#
1215 So here, CHR is expected to be `{'. */
1216 int saw_brace
= 0, finished
= 0;
1218 SCM buf
= scm_i_make_string (1024, NULL
);
1220 buf
= scm_i_string_start_writing (buf
);
1222 while ((chr
= scm_getc (port
)) != EOF
)
1234 scm_i_string_set_x (buf
, len
++, '}');
1235 scm_i_string_set_x (buf
, len
++, chr
);
1238 else if (chr
== '}')
1241 scm_i_string_set_x (buf
, len
++, chr
);
1243 if (len
>= scm_i_string_length (buf
) - 2)
1247 scm_i_string_stop_writing ();
1248 addy
= scm_i_make_string (1024, NULL
);
1249 buf
= scm_string_append (scm_list_2 (buf
, addy
));
1251 buf
= scm_i_string_start_writing (buf
);
1257 scm_i_string_stop_writing ();
1259 return (scm_string_to_symbol (scm_c_substring (buf
, 0, len
)));
1264 /* Top-level token readers, i.e., dispatchers. */
1267 scm_read_sharp_extension (int chr
, SCM port
)
1271 proc
= scm_get_hash_procedure (chr
);
1272 if (scm_is_true (scm_procedure_p (proc
)))
1274 long line
= SCM_LINUM (port
);
1275 int column
= SCM_COL (port
) - 2;
1278 got
= scm_call_2 (proc
, SCM_MAKE_CHAR (chr
), port
);
1279 if (!scm_is_eq (got
, SCM_UNSPECIFIED
))
1281 if (SCM_RECORD_POSITIONS_P
)
1282 return (recsexpr (got
, line
, column
,
1283 SCM_FILENAME (port
)));
1289 return SCM_UNSPECIFIED
;
1292 /* The reader for the sharp `#' character. It basically dispatches reads
1293 among the above token readers. */
1295 scm_read_sharp (scm_t_wchar chr
, SCM port
)
1296 #define FUNC_NAME "scm_lreadr"
1300 chr
= scm_getc (port
);
1302 result
= scm_read_sharp_extension (chr
, port
);
1303 if (!scm_is_eq (result
, SCM_UNSPECIFIED
))
1309 return (scm_read_character (chr
, port
));
1311 return (scm_read_vector (chr
, port
));
1315 /* This one may return either a boolean or an SRFI-4 vector. */
1316 return (scm_read_srfi4_vector (chr
, port
));
1318 return (scm_read_bytevector (chr
, port
));
1320 return (scm_read_guile_bit_vector (chr
, port
));
1324 /* This one may return either a boolean or an SRFI-4 vector. */
1325 return (scm_read_boolean (chr
, port
));
1327 return (scm_read_keyword (chr
, port
));
1328 case '0': case '1': case '2': case '3': case '4':
1329 case '5': case '6': case '7': case '8': case '9':
1331 #if SCM_ENABLE_DEPRECATED
1332 /* See below for 'i' and 'e'. */
1339 return (scm_i_read_array (port
, chr
));
1343 #if SCM_ENABLE_DEPRECATED
1345 /* When next char is '(', it really is an old-style
1347 scm_t_wchar next_c
= scm_getc (port
);
1349 scm_ungetc (next_c
, port
);
1351 return scm_i_read_array (port
, chr
);
1365 return (scm_read_number_and_radix (chr
, port
));
1367 return (scm_read_extended_symbol (chr
, port
));
1369 return (scm_read_shebang (chr
, port
));
1371 return (scm_read_commented_expression (chr
, port
));
1375 return (scm_read_syntax (chr
, port
));
1377 return (scm_read_nil (chr
, port
));
1379 result
= scm_read_sharp_extension (chr
, port
);
1380 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1382 /* To remain compatible with 1.8 and earlier, the following
1383 characters have lower precedence than `read-hash-extend'
1388 return scm_read_r6rs_block_comment (chr
, port
);
1390 scm_i_input_error (FUNC_NAME
, port
, "Unknown # object: ~S",
1391 scm_list_1 (SCM_MAKE_CHAR (chr
)));
1398 return SCM_UNSPECIFIED
;
1403 scm_read_expression (SCM port
)
1404 #define FUNC_NAME "scm_read_expression"
1408 register scm_t_wchar chr
;
1410 chr
= scm_getc (port
);
1414 case SCM_WHITE_SPACES
:
1415 case SCM_LINE_INCREMENTORS
:
1418 (void) scm_read_semicolon_comment (chr
, port
);
1421 if (!SCM_SQUARE_BRACKETS_P
)
1422 return (scm_read_mixed_case_symbol (chr
, port
));
1423 /* otherwise fall through */
1425 return (scm_read_sexp (chr
, port
));
1427 return (scm_read_string (chr
, port
));
1431 return (scm_read_quote (chr
, port
));
1435 result
= scm_read_sharp (chr
, port
);
1436 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1437 /* We read a comment or some such. */
1443 scm_i_input_error (FUNC_NAME
, port
, "unexpected \")\"", SCM_EOL
);
1446 if (SCM_SQUARE_BRACKETS_P
)
1447 scm_i_input_error (FUNC_NAME
, port
, "unexpected \"]\"", SCM_EOL
);
1448 /* otherwise fall through */
1452 if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE
), scm_keyword_prefix
))
1453 return scm_symbol_to_keyword (scm_read_expression (port
));
1458 if (((chr
>= '0') && (chr
<= '9'))
1459 || (strchr ("+-.", chr
)))
1460 return (scm_read_number (chr
, port
));
1462 return (scm_read_mixed_case_symbol (chr
, port
));
1470 /* Actual reader. */
1472 SCM_DEFINE (scm_read
, "read", 0, 1, 0,
1474 "Read an s-expression from the input port @var{port}, or from\n"
1475 "the current input port if @var{port} is not specified.\n"
1476 "Any whitespace before the next token is discarded.")
1477 #define FUNC_NAME s_scm_read
1481 if (SCM_UNBNDP (port
))
1482 port
= scm_current_input_port ();
1483 SCM_VALIDATE_OPINPORT (1, port
);
1485 c
= flush_ws (port
, (char *) NULL
);
1488 scm_ungetc (c
, port
);
1490 return (scm_read_expression (port
));
1497 /* Used when recording expressions constructed by `scm_read_sharp ()'. */
1499 recsexpr (SCM obj
, long line
, int column
, SCM filename
)
1501 if (!scm_is_pair(obj
)) {
1504 SCM tmp
= obj
, copy
;
1505 /* If this sexpr is visible in the read:sharp source, we want to
1506 keep that information, so only record non-constant cons cells
1507 which haven't previously been read by the reader. */
1508 if (scm_is_false (scm_whash_lookup (scm_source_whash
, obj
)))
1510 if (SCM_COPY_SOURCE_P
)
1512 copy
= scm_cons (recsexpr (SCM_CAR (obj
), line
, column
, filename
),
1514 while ((tmp
= SCM_CDR (tmp
)) && scm_is_pair (tmp
))
1516 SCM_SETCDR (copy
, scm_cons (recsexpr (SCM_CAR (tmp
),
1521 copy
= SCM_CDR (copy
);
1523 SCM_SETCDR (copy
, tmp
);
1527 recsexpr (SCM_CAR (obj
), line
, column
, filename
);
1528 while ((tmp
= SCM_CDR (tmp
)) && scm_is_pair (tmp
))
1529 recsexpr (SCM_CAR (tmp
), line
, column
, filename
);
1530 copy
= SCM_UNDEFINED
;
1532 scm_whash_insert (scm_source_whash
,
1534 scm_make_srcprops (line
,
1544 /* Manipulate the read-hash-procedures alist. This could be written in
1545 Scheme, but maybe it will also be used by C code during initialisation. */
1546 SCM_DEFINE (scm_read_hash_extend
, "read-hash-extend", 2, 0, 0,
1547 (SCM chr
, SCM proc
),
1548 "Install the procedure @var{proc} for reading expressions\n"
1549 "starting with the character sequence @code{#} and @var{chr}.\n"
1550 "@var{proc} will be called with two arguments: the character\n"
1551 "@var{chr} and the port to read further data from. The object\n"
1552 "returned will be the return value of @code{read}. \n"
1553 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1555 #define FUNC_NAME s_scm_read_hash_extend
1560 SCM_VALIDATE_CHAR (1, chr
);
1561 SCM_ASSERT (scm_is_false (proc
)
1562 || scm_is_eq (scm_procedure_p (proc
), SCM_BOOL_T
),
1563 proc
, SCM_ARG2
, FUNC_NAME
);
1565 /* Check if chr is already in the alist. */
1566 this = scm_i_read_hash_procedures_ref ();
1570 if (scm_is_null (this))
1572 /* not found, so add it to the beginning. */
1573 if (scm_is_true (proc
))
1575 SCM
new = scm_cons (scm_cons (chr
, proc
),
1576 scm_i_read_hash_procedures_ref ());
1577 scm_i_read_hash_procedures_set_x (new);
1581 if (scm_is_eq (chr
, SCM_CAAR (this)))
1583 /* already in the alist. */
1584 if (scm_is_false (proc
))
1587 if (scm_is_false (prev
))
1589 SCM rest
= SCM_CDR (scm_i_read_hash_procedures_ref ());
1590 scm_i_read_hash_procedures_set_x (rest
);
1593 scm_set_cdr_x (prev
, SCM_CDR (this));
1598 scm_set_cdr_x (SCM_CAR (this), proc
);
1603 this = SCM_CDR (this);
1606 return SCM_UNSPECIFIED
;
1610 /* Recover the read-hash procedure corresponding to char c. */
1612 scm_get_hash_procedure (int c
)
1614 SCM rest
= scm_i_read_hash_procedures_ref ();
1618 if (scm_is_null (rest
))
1621 if (SCM_CHAR (SCM_CAAR (rest
)) == c
)
1622 return SCM_CDAR (rest
);
1624 rest
= SCM_CDR (rest
);
1628 #define SCM_ENCODING_SEARCH_SIZE (500)
1630 /* Search the first few hundred characters of a file for an Emacs-like coding
1631 declaration. Returns either NULL or a string whose storage has been
1632 allocated with `scm_gc_malloc ()'. */
1634 scm_i_scan_for_encoding (SCM port
)
1636 char header
[SCM_ENCODING_SEARCH_SIZE
+1];
1637 size_t bytes_read
, encoding_length
, i
;
1638 char *encoding
= NULL
;
1640 char *pos
, *encoding_start
;
1643 if (SCM_FPORTP (port
) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port
)))
1644 /* PORT is a non-seekable file port (e.g., as created by Bash when using
1645 "guile <(echo '(display "hello")')") so bail out. */
1648 bytes_read
= scm_c_read (port
, header
, SCM_ENCODING_SEARCH_SIZE
);
1649 header
[bytes_read
] = '\0';
1651 scm_seek (port
, scm_from_int (0), scm_from_int (SEEK_SET
));
1654 && header
[0] == '\xef' && header
[1] == '\xbb' && header
[2] == '\xbf')
1657 /* search past "coding[:=]" */
1661 if ((pos
= strstr(pos
, "coding")) == NULL
)
1664 pos
+= strlen("coding");
1665 if (pos
- header
>= SCM_ENCODING_SEARCH_SIZE
||
1666 (*pos
== ':' || *pos
== '='))
1674 while (pos
- header
<= SCM_ENCODING_SEARCH_SIZE
&&
1675 (*pos
== ' ' || *pos
== '\t'))
1678 /* grab the next token */
1679 encoding_start
= pos
;
1681 while (encoding_start
+ i
- header
<= SCM_ENCODING_SEARCH_SIZE
1682 && encoding_start
+ i
- header
< bytes_read
1683 && (isalnum ((int) encoding_start
[i
])
1684 || strchr ("_-.:/,+=()", encoding_start
[i
]) != NULL
))
1687 encoding_length
= i
;
1688 if (encoding_length
== 0)
1691 encoding
= scm_gc_strndup (encoding_start
, encoding_length
, "encoding");
1692 for (i
= 0; i
< encoding_length
; i
++)
1693 encoding
[i
] = toupper ((int) encoding
[i
]);
1695 /* push backwards to make sure we were in a comment */
1697 pos
= encoding_start
;
1698 while (pos
>= header
)
1702 /* This wasn't in a semicolon comment. Check for a
1703 hash-bang comment. */
1704 char *beg
= strstr (header
, "#!");
1705 char *end
= strstr (header
, "!#");
1706 if (beg
< encoding_start
&& encoding_start
+ encoding_length
< end
)
1718 /* This wasn't in a comment */
1721 if (utf8_bom
&& strcmp(encoding
, "UTF-8"))
1722 scm_misc_error (NULL
,
1723 "the port input declares the encoding ~s but is encoded as UTF-8",
1724 scm_list_1 (scm_from_locale_string (encoding
)));
1729 SCM_DEFINE (scm_file_encoding
, "file-encoding", 1, 0, 0,
1731 "Scans the port for an Emacs-like character coding declaration\n"
1732 "near the top of the contents of a port with random-acessible contents.\n"
1733 "The coding declaration is of the form\n"
1734 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1736 "Returns a string containing the character encoding of the file\n"
1737 "if a declaration was found, or @code{#f} otherwise.\n")
1738 #define FUNC_NAME s_scm_file_encoding
1743 enc
= scm_i_scan_for_encoding (port
);
1748 s_enc
= scm_from_locale_string (enc
);
1759 SCM read_hash_procs
;
1761 read_hash_procs
= scm_make_fluid ();
1762 scm_fluid_set_x (read_hash_procs
, SCM_EOL
);
1764 scm_i_read_hash_procedures
=
1765 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs
));
1767 scm_init_opts (scm_read_options
, scm_read_opts
);
1768 #include "libguile/read.x"