1 /* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
34 #include "libguile/_scm.h"
35 #include "libguile/bytevectors.h"
36 #include "libguile/chars.h"
37 #include "libguile/eval.h"
38 #include "libguile/arrays.h"
39 #include "libguile/bitvectors.h"
40 #include "libguile/keywords.h"
41 #include "libguile/alist.h"
42 #include "libguile/srcprop.h"
43 #include "libguile/hashtab.h"
44 #include "libguile/hash.h"
45 #include "libguile/ports.h"
46 #include "libguile/fports.h"
47 #include "libguile/root.h"
48 #include "libguile/strings.h"
49 #include "libguile/strports.h"
50 #include "libguile/vectors.h"
51 #include "libguile/validate.h"
52 #include "libguile/srfi-4.h"
53 #include "libguile/srfi-13.h"
55 #include "libguile/read.h"
56 #include "libguile/private-options.h"
61 SCM_GLOBAL_SYMBOL (scm_sym_dot
, ".");
62 SCM_SYMBOL (scm_keyword_prefix
, "prefix");
63 SCM_SYMBOL (scm_keyword_postfix
, "postfix");
64 SCM_SYMBOL (sym_nil
, "nil");
66 scm_t_option scm_read_opts
[] = {
67 { SCM_OPTION_BOOLEAN
, "copy", 0,
68 "Copy source code expressions." },
69 { SCM_OPTION_BOOLEAN
, "positions", 1,
70 "Record positions of source code expressions." },
71 { SCM_OPTION_BOOLEAN
, "case-insensitive", 0,
72 "Convert symbols to lower case."},
73 { SCM_OPTION_SCM
, "keywords", (scm_t_bits
) SCM_BOOL_F
,
74 "Style of keyword recognition: #f, 'prefix or 'postfix."},
75 { SCM_OPTION_BOOLEAN
, "r6rs-hex-escapes", 0,
76 "Use R6RS variable-length character and string hex escapes."},
77 { SCM_OPTION_BOOLEAN
, "square-brackets", 1,
78 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
79 { SCM_OPTION_BOOLEAN
, "hungry-eol-escapes", 0,
80 "In strings, consume leading whitespace after an escaped end-of-line."},
85 Give meaningful error messages for errors
89 FILE:LINE:COL: MESSAGE
92 This is not standard GNU format, but the test-suite likes the real
93 message to be in front.
99 scm_i_input_error (char const *function
,
100 SCM port
, const char *message
, SCM arg
)
102 SCM fn
= (scm_is_string (SCM_FILENAME(port
))
104 : scm_from_locale_string ("#<unknown port>"));
106 SCM string_port
= scm_open_output_string ();
107 SCM string
= SCM_EOL
;
108 scm_simple_format (string_port
,
109 scm_from_locale_string ("~A:~S:~S: ~A"),
111 scm_from_long (SCM_LINUM (port
) + 1),
112 scm_from_int (SCM_COL (port
) + 1),
113 scm_from_locale_string (message
)));
115 string
= scm_get_output_string (string_port
);
116 scm_close_output_port (string_port
);
117 scm_error_scm (scm_from_latin1_symbol ("read-error"),
118 function
? scm_from_locale_string (function
) : SCM_BOOL_F
,
125 SCM_DEFINE (scm_read_options
, "read-options-interface", 0, 1, 0,
127 "Option interface for the read options. Instead of using\n"
128 "this procedure directly, use the procedures @code{read-enable},\n"
129 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
130 #define FUNC_NAME s_scm_read_options
132 SCM ans
= scm_options (setting
,
135 if (SCM_COPY_SOURCE_P
)
136 SCM_RECORD_POSITIONS_P
= 1;
141 /* A fluid referring to an association list mapping extra hash
142 characters to procedures. */
143 static SCM
*scm_i_read_hash_procedures
;
146 scm_i_read_hash_procedures_ref (void)
148 return scm_fluid_ref (*scm_i_read_hash_procedures
);
152 scm_i_read_hash_procedures_set_x (SCM value
)
154 scm_fluid_set_x (*scm_i_read_hash_procedures
, value
);
161 /* Size of the C buffer used to read symbols and numbers. */
162 #define READER_BUFFER_SIZE 128
164 /* Size of the C buffer used to read strings. */
165 #define READER_STRING_BUFFER_SIZE 512
167 /* The maximum size of Scheme character names. */
168 #define READER_CHAR_NAME_MAX_SIZE 50
171 /* `isblank' is only in C99. */
172 #define CHAR_IS_BLANK_(_chr) \
173 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
174 || ((_chr) == '\f') || ((_chr) == '\r'))
177 # define CHAR_IS_BLANK(_chr) \
178 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
180 # define CHAR_IS_BLANK CHAR_IS_BLANK_
184 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
186 #define CHAR_IS_R5RS_DELIMITER(c) \
188 || (c == ')') || (c == '(') || (c == ';') || (c == '"') \
189 || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
191 #define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
193 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
195 #define CHAR_IS_EXPONENT_MARKER(_chr) \
196 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
197 || ((_chr) == 'd') || ((_chr) == 'l'))
199 /* Read an SCSH block comment. */
200 static inline SCM
scm_read_scsh_block_comment (scm_t_wchar
, SCM
);
201 static SCM
scm_read_r6rs_block_comment (scm_t_wchar
, SCM
);
202 static SCM
scm_read_commented_expression (scm_t_wchar
, SCM
);
203 static SCM
scm_read_shebang (scm_t_wchar
, SCM
);
204 static SCM
scm_get_hash_procedure (int);
206 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
207 result in the pre-allocated buffer BUF. Return zero if the whole token has
208 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
209 bytes actually read. */
211 read_token (SCM port
, char *buf
, const size_t buf_size
, size_t *read
)
215 while (*read
< buf_size
)
219 chr
= scm_get_byte_or_eof (port
);
223 else if (CHAR_IS_DELIMITER (chr
))
225 scm_unget_byte (chr
, port
);
238 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
239 result in the pre-allocated buffer BUFFER, if the whole token has fewer than
240 BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
241 caller. Return zero if the token fits in BUFFER, non-zero otherwise. READ
242 will be set the number of bytes actually read. */
244 read_complete_token (SCM port
, char *buffer
, const size_t buffer_size
,
245 char **overflow_buffer
, size_t *read
)
248 size_t bytes_read
, overflow_size
;
250 *overflow_buffer
= NULL
;
255 overflow
= read_token (port
, buffer
, buffer_size
, &bytes_read
);
258 if (overflow
|| overflow_size
!= 0)
260 if (overflow_size
== 0)
262 *overflow_buffer
= scm_malloc (bytes_read
);
263 memcpy (*overflow_buffer
, buffer
, bytes_read
);
264 overflow_size
= bytes_read
;
268 *overflow_buffer
= scm_realloc (*overflow_buffer
, overflow_size
+ bytes_read
);
269 memcpy (*overflow_buffer
+ overflow_size
, buffer
, bytes_read
);
270 overflow_size
+= bytes_read
;
277 *read
= overflow_size
;
281 return (overflow_size
!= 0);
284 /* Skip whitespace from PORT and return the first non-whitespace character
285 read. Raise an error on end-of-file. */
287 flush_ws (SCM port
, const char *eoferr
)
289 register scm_t_wchar c
;
291 switch (c
= scm_getc (port
))
297 scm_i_input_error (eoferr
,
306 switch (c
= scm_getc (port
))
312 case SCM_LINE_INCREMENTORS
:
318 switch (c
= scm_getc (port
))
321 eoferr
= "read_sharp";
324 scm_read_shebang (c
, port
);
327 scm_read_commented_expression (c
, port
);
330 if (scm_is_false (scm_get_hash_procedure (c
)))
332 scm_read_r6rs_block_comment (c
, port
);
337 scm_ungetc (c
, port
);
342 case SCM_LINE_INCREMENTORS
:
343 case SCM_SINGLE_SPACES
:
358 static SCM
scm_read_expression (SCM port
);
359 static SCM
scm_read_sharp (int chr
, SCM port
);
360 static SCM
recsexpr (SCM obj
, long line
, int column
, SCM filename
);
364 scm_read_sexp (scm_t_wchar chr
, SCM port
)
365 #define FUNC_NAME "scm_i_lreadparen"
369 register SCM tl
, ans
= SCM_EOL
;
370 SCM tl2
= SCM_EOL
, ans2
= SCM_EOL
, copy
= SCM_BOOL_F
;
371 const int terminating_char
= ((chr
== '[') ? ']' : ')');
373 /* Need to capture line and column numbers here. */
374 long line
= SCM_LINUM (port
);
375 int column
= SCM_COL (port
) - 1;
378 c
= flush_ws (port
, FUNC_NAME
);
379 if (terminating_char
== c
)
382 scm_ungetc (c
, port
);
383 if (scm_is_eq (scm_sym_dot
,
384 (tmp
= scm_read_expression (port
))))
386 ans
= scm_read_expression (port
);
387 if (terminating_char
!= (c
= flush_ws (port
, FUNC_NAME
)))
388 scm_i_input_error (FUNC_NAME
, port
, "missing close paren",
393 /* Build the head of the list structure. */
394 ans
= tl
= scm_cons (tmp
, SCM_EOL
);
396 if (SCM_COPY_SOURCE_P
)
397 ans2
= tl2
= scm_cons (scm_is_pair (tmp
)
402 while (terminating_char
!= (c
= flush_ws (port
, FUNC_NAME
)))
406 if (c
== ')' || (SCM_SQUARE_BRACKETS_P
&& c
== ']'))
407 scm_i_input_error (FUNC_NAME
, port
,
408 "in pair: mismatched close paren: ~A",
409 scm_list_1 (SCM_MAKE_CHAR (c
)));
411 scm_ungetc (c
, port
);
412 tmp
= scm_read_expression (port
);
414 if (scm_is_eq (scm_sym_dot
, tmp
))
416 SCM_SETCDR (tl
, tmp
= scm_read_expression (port
));
418 if (SCM_COPY_SOURCE_P
)
419 SCM_SETCDR (tl2
, scm_cons (scm_is_pair (tmp
) ? copy
: tmp
,
422 c
= flush_ws (port
, FUNC_NAME
);
423 if (terminating_char
!= c
)
424 scm_i_input_error (FUNC_NAME
, port
,
425 "in pair: missing close paren", SCM_EOL
);
429 new_tail
= scm_cons (tmp
, SCM_EOL
);
430 SCM_SETCDR (tl
, new_tail
);
433 if (SCM_COPY_SOURCE_P
)
435 SCM new_tail2
= scm_cons (scm_is_pair (tmp
)
438 SCM_SETCDR (tl2
, new_tail2
);
444 if (SCM_RECORD_POSITIONS_P
)
445 scm_whash_insert (scm_source_whash
,
447 scm_make_srcprops (line
, column
,
458 /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
459 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
461 #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
467 while (i < ndigits) \
469 a = scm_getc (port); \
473 && (a == (scm_t_wchar) terminator) \
476 if ('0' <= a && a <= '9') \
478 else if ('A' <= a && a <= 'F') \
480 else if ('a' <= a && a <= 'f') \
493 skip_intraline_whitespace (SCM port
)
503 while (c
== '\t' || uc_is_general_category (c
, UC_SPACE_SEPARATOR
));
505 scm_ungetc (c
, port
);
509 scm_read_string (int chr
, SCM port
)
510 #define FUNC_NAME "scm_lreadr"
512 /* For strings smaller than C_STR, this function creates only one Scheme
513 object (the string returned). */
515 SCM str
= SCM_BOOL_F
;
516 unsigned c_str_len
= 0;
519 str
= scm_i_make_string (READER_STRING_BUFFER_SIZE
, NULL
, 0);
520 while ('"' != (c
= scm_getc (port
)))
525 scm_i_input_error (FUNC_NAME
, port
,
526 "end of file in string constant", SCM_EOL
);
529 if (c_str_len
+ 1 >= scm_i_string_length (str
))
531 SCM addy
= scm_i_make_string (READER_STRING_BUFFER_SIZE
, NULL
, 0);
533 str
= scm_string_append (scm_list_2 (str
, addy
));
538 switch (c
= scm_getc (port
))
546 if (SCM_HUNGRY_EOL_ESCAPES_P
)
547 skip_intraline_whitespace (port
);
574 if (SCM_R6RS_ESCAPES_P
)
575 SCM_READ_HEX_ESCAPE (10, ';');
577 SCM_READ_HEX_ESCAPE (2, '\0');
580 if (!SCM_R6RS_ESCAPES_P
)
582 SCM_READ_HEX_ESCAPE (4, '\0');
586 if (!SCM_R6RS_ESCAPES_P
)
588 SCM_READ_HEX_ESCAPE (6, '\0');
593 scm_i_input_error (FUNC_NAME
, port
,
594 "illegal character in escape sequence: ~S",
595 scm_list_1 (SCM_MAKE_CHAR (c
)));
598 str
= scm_i_string_start_writing (str
);
599 scm_i_string_set_x (str
, c_str_len
++, c
);
600 scm_i_string_stop_writing ();
605 return scm_i_substring_copy (str
, 0, c_str_len
);
614 scm_read_number (scm_t_wchar chr
, SCM port
)
616 SCM result
, str
= SCM_EOL
;
617 char buffer
[READER_BUFFER_SIZE
];
618 char *overflow_buffer
= NULL
;
621 scm_t_port
*pt
= SCM_PTAB_ENTRY (port
);
623 scm_ungetc (chr
, port
);
624 overflow
= read_complete_token (port
, buffer
, sizeof (buffer
),
625 &overflow_buffer
, &bytes_read
);
628 str
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
, pt
->ilseq_handler
);
630 str
= scm_from_stringn (overflow_buffer
, bytes_read
, pt
->encoding
,
633 result
= scm_string_to_number (str
, SCM_UNDEFINED
);
634 if (!scm_is_true (result
))
636 /* Return a symbol instead of a number */
637 if (SCM_CASE_INSENSITIVE_P
)
638 str
= scm_string_downcase_x (str
);
639 result
= scm_string_to_symbol (str
);
643 free (overflow_buffer
);
644 SCM_COL (port
) += scm_i_string_length (str
);
649 scm_read_mixed_case_symbol (scm_t_wchar chr
, SCM port
)
652 int ends_with_colon
= 0;
654 int postfix
= scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE
), scm_keyword_postfix
);
656 char buffer
[READER_BUFFER_SIZE
], *overflow_buffer
;
657 scm_t_port
*pt
= SCM_PTAB_ENTRY (port
);
660 scm_ungetc (chr
, port
);
661 overflow
= read_complete_token (port
, buffer
, READER_BUFFER_SIZE
,
662 &overflow_buffer
, &bytes_read
);
666 ends_with_colon
= buffer
[bytes_read
- 1] == ':';
668 ends_with_colon
= overflow_buffer
[bytes_read
- 1] == ':';
671 if (postfix
&& ends_with_colon
&& (bytes_read
> 1))
674 str
= scm_from_stringn (buffer
, bytes_read
- 1, pt
->encoding
, pt
->ilseq_handler
);
676 str
= scm_from_stringn (overflow_buffer
, bytes_read
- 1, pt
->encoding
,
679 if (SCM_CASE_INSENSITIVE_P
)
680 str
= scm_string_downcase_x (str
);
681 result
= scm_symbol_to_keyword (scm_string_to_symbol (str
));
686 str
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
, pt
->ilseq_handler
);
688 str
= scm_from_stringn (overflow_buffer
, bytes_read
, pt
->encoding
,
691 if (SCM_CASE_INSENSITIVE_P
)
692 str
= scm_string_downcase_x (str
);
693 result
= scm_string_to_symbol (str
);
697 free (overflow_buffer
);
698 SCM_COL (port
) += scm_i_string_length (str
);
703 scm_read_number_and_radix (scm_t_wchar chr
, SCM port
)
704 #define FUNC_NAME "scm_lreadr"
708 char buffer
[READER_BUFFER_SIZE
], *overflow_buffer
;
737 scm_ungetc (chr
, port
);
738 scm_ungetc ('#', port
);
742 overflow
= read_complete_token (port
, buffer
, sizeof (buffer
),
743 &overflow_buffer
, &read
);
745 pt
= SCM_PTAB_ENTRY (port
);
747 str
= scm_from_stringn (buffer
, read
, pt
->encoding
, pt
->ilseq_handler
);
749 str
= scm_from_stringn (overflow_buffer
, read
, pt
->encoding
,
752 result
= scm_string_to_number (str
, scm_from_uint (radix
));
755 free (overflow_buffer
);
757 SCM_COL (port
) += scm_i_string_length (str
);
759 if (scm_is_true (result
))
762 scm_i_input_error (FUNC_NAME
, port
, "unknown # object", SCM_EOL
);
769 scm_read_quote (int chr
, SCM port
)
772 long line
= SCM_LINUM (port
);
773 int column
= SCM_COL (port
) - 1;
778 p
= scm_sym_quasiquote
;
791 p
= scm_sym_uq_splicing
;
794 scm_ungetc (c
, port
);
801 fprintf (stderr
, "%s: unhandled quote character (%i)\n",
802 "scm_read_quote", chr
);
806 p
= scm_cons2 (p
, scm_read_expression (port
), SCM_EOL
);
807 if (SCM_RECORD_POSITIONS_P
)
808 scm_whash_insert (scm_source_whash
, p
,
809 scm_make_srcprops (line
, column
,
812 ? (scm_cons2 (SCM_CAR (p
),
813 SCM_CAR (SCM_CDR (p
)),
822 SCM_SYMBOL (sym_syntax
, "syntax");
823 SCM_SYMBOL (sym_quasisyntax
, "quasisyntax");
824 SCM_SYMBOL (sym_unsyntax
, "unsyntax");
825 SCM_SYMBOL (sym_unsyntax_splicing
, "unsyntax-splicing");
828 scm_read_syntax (int chr
, SCM port
)
831 long line
= SCM_LINUM (port
);
832 int column
= SCM_COL (port
) - 1;
850 p
= sym_unsyntax_splicing
;
853 scm_ungetc (c
, port
);
860 fprintf (stderr
, "%s: unhandled syntax character (%i)\n",
861 "scm_read_syntax", chr
);
865 p
= scm_cons2 (p
, scm_read_expression (port
), SCM_EOL
);
866 if (SCM_RECORD_POSITIONS_P
)
867 scm_whash_insert (scm_source_whash
, p
,
868 scm_make_srcprops (line
, column
,
871 ? (scm_cons2 (SCM_CAR (p
),
872 SCM_CAR (SCM_CDR (p
)),
882 scm_read_nil (int chr
, SCM port
)
884 SCM id
= scm_read_mixed_case_symbol (chr
, port
);
886 if (!scm_is_eq (id
, sym_nil
))
887 scm_i_input_error ("scm_read_nil", port
,
888 "unexpected input while reading #nil: ~a",
891 return SCM_ELISP_NIL
;
895 scm_read_semicolon_comment (int chr
, SCM port
)
899 /* We use the get_byte here because there is no need to get the
900 locale correct with comment input. This presumes that newline
901 always represents itself no matter what the encoding is. */
902 for (c
= scm_get_byte_or_eof (port
);
903 (c
!= EOF
) && (c
!= '\n');
904 c
= scm_get_byte_or_eof (port
));
906 return SCM_UNSPECIFIED
;
910 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
913 scm_read_boolean (int chr
, SCM port
)
926 return SCM_UNSPECIFIED
;
930 scm_read_character (scm_t_wchar chr
, SCM port
)
931 #define FUNC_NAME "scm_lreadr"
933 char buffer
[READER_CHAR_NAME_MAX_SIZE
];
935 size_t charname_len
, bytes_read
;
940 overflow
= read_token (port
, buffer
, READER_CHAR_NAME_MAX_SIZE
, &bytes_read
);
942 scm_i_input_error (FUNC_NAME
, port
, "character name too long", SCM_EOL
);
946 chr
= scm_getc (port
);
948 scm_i_input_error (FUNC_NAME
, port
, "unexpected end of file "
949 "while reading character", SCM_EOL
);
951 /* CHR must be a token delimiter, like a whitespace. */
952 return (SCM_MAKE_CHAR (chr
));
955 pt
= SCM_PTAB_ENTRY (port
);
957 /* Simple ASCII characters can be processed immediately. Also, simple
958 ISO-8859-1 characters can be processed immediately if the encoding for this
959 port is ISO-8859-1. */
960 if (bytes_read
== 1 && ((unsigned char) buffer
[0] <= 127 || pt
->encoding
== NULL
))
963 return SCM_MAKE_CHAR (buffer
[0]);
966 /* Otherwise, convert the buffer into a proper scheme string for
968 charname
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
,
970 charname_len
= scm_i_string_length (charname
);
971 SCM_COL (port
) += charname_len
;
972 cp
= scm_i_string_ref (charname
, 0);
973 if (charname_len
== 1)
974 return SCM_MAKE_CHAR (cp
);
976 /* Ignore dotted circles, which may be used to keep combining characters from
977 combining with the backslash in #\charname. */
978 if (cp
== SCM_CODEPOINT_DOTTED_CIRCLE
&& charname_len
== 2)
979 return SCM_MAKE_CHAR (scm_i_string_ref (charname
, 1));
981 if (cp
>= '0' && cp
< '8')
983 /* Dirk:FIXME:: This type of character syntax is not R5RS
984 * compliant. Further, it should be verified that the constant
985 * does only consist of octal digits. */
986 SCM p
= scm_string_to_number (charname
, scm_from_uint (8));
989 scm_t_wchar c
= scm_to_uint32 (p
);
990 if (SCM_IS_UNICODE_CHAR (c
))
991 return SCM_MAKE_CHAR (c
);
993 scm_i_input_error (FUNC_NAME
, port
,
994 "out-of-range octal character escape: ~a",
995 scm_list_1 (charname
));
999 if (cp
== 'x' && (charname_len
> 1))
1003 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1004 p
= scm_string_to_number (scm_c_substring (charname
, 1, charname_len
),
1005 scm_from_uint (16));
1006 if (SCM_I_INUMP (p
))
1008 scm_t_wchar c
= scm_to_uint32 (p
);
1009 if (SCM_IS_UNICODE_CHAR (c
))
1010 return SCM_MAKE_CHAR (c
);
1012 scm_i_input_error (FUNC_NAME
, port
,
1013 "out-of-range hex character escape: ~a",
1014 scm_list_1 (charname
));
1018 /* The names of characters should never have non-Latin1
1020 if (scm_i_is_narrow_string (charname
)
1021 || scm_i_try_narrow_string (charname
))
1022 { SCM ch
= scm_i_charname_to_char (scm_i_string_chars (charname
),
1024 if (scm_is_true (ch
))
1028 scm_i_input_error (FUNC_NAME
, port
, "unknown character name ~a",
1029 scm_list_1 (charname
));
1031 return SCM_UNSPECIFIED
;
1036 scm_read_keyword (int chr
, SCM port
)
1040 /* Read the symbol that comprises the keyword. Doing this instead of
1041 invoking a specific symbol reader function allows `scm_read_keyword ()'
1042 to adapt to the delimiters currently valid of symbols.
1044 XXX: This implementation allows sloppy syntaxes like `#: key'. */
1045 symbol
= scm_read_expression (port
);
1046 if (!scm_is_symbol (symbol
))
1047 scm_i_input_error ("scm_read_keyword", port
,
1048 "keyword prefix `~a' not followed by a symbol: ~s",
1049 scm_list_2 (SCM_MAKE_CHAR (chr
), symbol
));
1051 return (scm_symbol_to_keyword (symbol
));
1055 scm_read_vector (int chr
, SCM port
)
1057 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1058 guarantee that it's going to do what we want. After all, this is an
1059 implementation detail of `scm_read_vector ()', not a desirable
1061 return (scm_vector (scm_read_sexp (chr
, port
)));
1065 scm_read_srfi4_vector (int chr
, SCM port
)
1067 return scm_i_read_array (port
, chr
);
1071 scm_read_bytevector (scm_t_wchar chr
, SCM port
)
1073 chr
= scm_getc (port
);
1077 chr
= scm_getc (port
);
1081 chr
= scm_getc (port
);
1085 return scm_u8_list_to_bytevector (scm_read_sexp (chr
, port
));
1088 scm_i_input_error ("read_bytevector", port
,
1089 "invalid bytevector prefix",
1090 SCM_MAKE_CHAR (chr
));
1091 return SCM_UNSPECIFIED
;
1095 scm_read_guile_bit_vector (scm_t_wchar chr
, SCM port
)
1097 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1098 terribly inefficient but who cares? */
1099 SCM s_bits
= SCM_EOL
;
1101 for (chr
= scm_getc (port
);
1102 (chr
!= EOF
) && ((chr
== '0') || (chr
== '1'));
1103 chr
= scm_getc (port
))
1105 s_bits
= scm_cons ((chr
== '0') ? SCM_BOOL_F
: SCM_BOOL_T
, s_bits
);
1109 scm_ungetc (chr
, port
);
1111 return scm_bitvector (scm_reverse_x (s_bits
, SCM_EOL
));
1115 scm_read_scsh_block_comment (scm_t_wchar chr
, SCM port
)
1121 int c
= scm_getc (port
);
1124 scm_i_input_error ("skip_block_comment", port
,
1125 "unterminated `#! ... !#' comment", SCM_EOL
);
1129 else if (c
== '#' && bang_seen
)
1135 return SCM_UNSPECIFIED
;
1139 scm_read_shebang (scm_t_wchar chr
, SCM port
)
1142 if ((c
= scm_get_byte_or_eof (port
)) != 'r')
1144 scm_ungetc (c
, port
);
1145 return scm_read_scsh_block_comment (chr
, port
);
1147 if ((c
= scm_get_byte_or_eof (port
)) != '6')
1149 scm_ungetc (c
, port
);
1150 scm_ungetc ('r', port
);
1151 return scm_read_scsh_block_comment (chr
, port
);
1153 if ((c
= scm_get_byte_or_eof (port
)) != 'r')
1155 scm_ungetc (c
, port
);
1156 scm_ungetc ('6', port
);
1157 scm_ungetc ('r', port
);
1158 return scm_read_scsh_block_comment (chr
, port
);
1160 if ((c
= scm_get_byte_or_eof (port
)) != 's')
1162 scm_ungetc (c
, port
);
1163 scm_ungetc ('r', port
);
1164 scm_ungetc ('6', port
);
1165 scm_ungetc ('r', port
);
1166 return scm_read_scsh_block_comment (chr
, port
);
1169 return SCM_UNSPECIFIED
;
1173 scm_read_r6rs_block_comment (scm_t_wchar chr
, SCM port
)
1175 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1176 nested. So care must be taken. */
1177 int nesting_level
= 1;
1178 int opening_seen
= 0, closing_seen
= 0;
1180 while (nesting_level
> 0)
1182 int c
= scm_getc (port
);
1185 scm_i_input_error ("scm_read_r6rs_block_comment", port
,
1186 "unterminated `#| ... |#' comment", SCM_EOL
);
1194 else if (closing_seen
)
1205 opening_seen
= closing_seen
= 0;
1208 return SCM_UNSPECIFIED
;
1212 scm_read_commented_expression (scm_t_wchar chr
, SCM port
)
1216 c
= flush_ws (port
, (char *) NULL
);
1218 scm_i_input_error ("read_commented_expression", port
,
1219 "no expression after #; comment", SCM_EOL
);
1220 scm_ungetc (c
, port
);
1221 scm_read_expression (port
);
1222 return SCM_UNSPECIFIED
;
1226 scm_read_extended_symbol (scm_t_wchar chr
, SCM port
)
1228 /* Guile's extended symbol read syntax looks like this:
1230 #{This is all a symbol name}#
1232 So here, CHR is expected to be `{'. */
1233 int saw_brace
= 0, finished
= 0;
1235 SCM buf
= scm_i_make_string (1024, NULL
, 0);
1237 buf
= scm_i_string_start_writing (buf
);
1239 while ((chr
= scm_getc (port
)) != EOF
)
1251 scm_i_string_set_x (buf
, len
++, '}');
1252 scm_i_string_set_x (buf
, len
++, chr
);
1255 else if (chr
== '}')
1258 scm_i_string_set_x (buf
, len
++, chr
);
1260 if (len
>= scm_i_string_length (buf
) - 2)
1264 scm_i_string_stop_writing ();
1265 addy
= scm_i_make_string (1024, NULL
, 0);
1266 buf
= scm_string_append (scm_list_2 (buf
, addy
));
1268 buf
= scm_i_string_start_writing (buf
);
1274 scm_i_string_stop_writing ();
1276 return (scm_string_to_symbol (scm_c_substring (buf
, 0, len
)));
1281 /* Top-level token readers, i.e., dispatchers. */
1284 scm_read_sharp_extension (int chr
, SCM port
)
1288 proc
= scm_get_hash_procedure (chr
);
1289 if (scm_is_true (scm_procedure_p (proc
)))
1291 long line
= SCM_LINUM (port
);
1292 int column
= SCM_COL (port
) - 2;
1295 got
= scm_call_2 (proc
, SCM_MAKE_CHAR (chr
), port
);
1296 if (!scm_is_eq (got
, SCM_UNSPECIFIED
))
1298 if (SCM_RECORD_POSITIONS_P
)
1299 return (recsexpr (got
, line
, column
,
1300 SCM_FILENAME (port
)));
1306 return SCM_UNSPECIFIED
;
1309 /* The reader for the sharp `#' character. It basically dispatches reads
1310 among the above token readers. */
1312 scm_read_sharp (scm_t_wchar chr
, SCM port
)
1313 #define FUNC_NAME "scm_lreadr"
1317 chr
= scm_getc (port
);
1319 result
= scm_read_sharp_extension (chr
, port
);
1320 if (!scm_is_eq (result
, SCM_UNSPECIFIED
))
1326 return (scm_read_character (chr
, port
));
1328 return (scm_read_vector (chr
, port
));
1332 /* This one may return either a boolean or an SRFI-4 vector. */
1333 return (scm_read_srfi4_vector (chr
, port
));
1335 return (scm_read_bytevector (chr
, port
));
1337 return (scm_read_guile_bit_vector (chr
, port
));
1341 /* This one may return either a boolean or an SRFI-4 vector. */
1342 return (scm_read_boolean (chr
, port
));
1344 return (scm_read_keyword (chr
, port
));
1345 case '0': case '1': case '2': case '3': case '4':
1346 case '5': case '6': case '7': case '8': case '9':
1348 #if SCM_ENABLE_DEPRECATED
1349 /* See below for 'i' and 'e'. */
1356 return (scm_i_read_array (port
, chr
));
1360 #if SCM_ENABLE_DEPRECATED
1362 /* When next char is '(', it really is an old-style
1364 scm_t_wchar next_c
= scm_getc (port
);
1366 scm_ungetc (next_c
, port
);
1368 return scm_i_read_array (port
, chr
);
1382 return (scm_read_number_and_radix (chr
, port
));
1384 return (scm_read_extended_symbol (chr
, port
));
1386 return (scm_read_shebang (chr
, port
));
1388 return (scm_read_commented_expression (chr
, port
));
1392 return (scm_read_syntax (chr
, port
));
1394 return (scm_read_nil (chr
, port
));
1396 result
= scm_read_sharp_extension (chr
, port
);
1397 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1399 /* To remain compatible with 1.8 and earlier, the following
1400 characters have lower precedence than `read-hash-extend'
1405 return scm_read_r6rs_block_comment (chr
, port
);
1407 scm_i_input_error (FUNC_NAME
, port
, "Unknown # object: ~S",
1408 scm_list_1 (SCM_MAKE_CHAR (chr
)));
1415 return SCM_UNSPECIFIED
;
1420 scm_read_expression (SCM port
)
1421 #define FUNC_NAME "scm_read_expression"
1425 register scm_t_wchar chr
;
1427 chr
= scm_getc (port
);
1431 case SCM_WHITE_SPACES
:
1432 case SCM_LINE_INCREMENTORS
:
1435 (void) scm_read_semicolon_comment (chr
, port
);
1438 if (!SCM_SQUARE_BRACKETS_P
)
1439 return (scm_read_mixed_case_symbol (chr
, port
));
1440 /* otherwise fall through */
1442 return (scm_read_sexp (chr
, port
));
1444 return (scm_read_string (chr
, port
));
1448 return (scm_read_quote (chr
, port
));
1452 result
= scm_read_sharp (chr
, port
);
1453 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1454 /* We read a comment or some such. */
1460 scm_i_input_error (FUNC_NAME
, port
, "unexpected \")\"", SCM_EOL
);
1463 if (SCM_SQUARE_BRACKETS_P
)
1464 scm_i_input_error (FUNC_NAME
, port
, "unexpected \"]\"", SCM_EOL
);
1465 /* otherwise fall through */
1469 if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE
), scm_keyword_prefix
))
1470 return scm_symbol_to_keyword (scm_read_expression (port
));
1475 if (((chr
>= '0') && (chr
<= '9'))
1476 || (strchr ("+-.", chr
)))
1477 return (scm_read_number (chr
, port
));
1479 return (scm_read_mixed_case_symbol (chr
, port
));
1487 /* Actual reader. */
1489 SCM_DEFINE (scm_read
, "read", 0, 1, 0,
1491 "Read an s-expression from the input port @var{port}, or from\n"
1492 "the current input port if @var{port} is not specified.\n"
1493 "Any whitespace before the next token is discarded.")
1494 #define FUNC_NAME s_scm_read
1498 if (SCM_UNBNDP (port
))
1499 port
= scm_current_input_port ();
1500 SCM_VALIDATE_OPINPORT (1, port
);
1502 c
= flush_ws (port
, (char *) NULL
);
1505 scm_ungetc (c
, port
);
1507 return (scm_read_expression (port
));
1514 /* Used when recording expressions constructed by `scm_read_sharp ()'. */
1516 recsexpr (SCM obj
, long line
, int column
, SCM filename
)
1518 if (!scm_is_pair(obj
)) {
1521 SCM tmp
= obj
, copy
;
1522 /* If this sexpr is visible in the read:sharp source, we want to
1523 keep that information, so only record non-constant cons cells
1524 which haven't previously been read by the reader. */
1525 if (scm_is_false (scm_whash_lookup (scm_source_whash
, obj
)))
1527 if (SCM_COPY_SOURCE_P
)
1529 copy
= scm_cons (recsexpr (SCM_CAR (obj
), line
, column
, filename
),
1531 while ((tmp
= SCM_CDR (tmp
)) && scm_is_pair (tmp
))
1533 SCM_SETCDR (copy
, scm_cons (recsexpr (SCM_CAR (tmp
),
1538 copy
= SCM_CDR (copy
);
1540 SCM_SETCDR (copy
, tmp
);
1544 recsexpr (SCM_CAR (obj
), line
, column
, filename
);
1545 while ((tmp
= SCM_CDR (tmp
)) && scm_is_pair (tmp
))
1546 recsexpr (SCM_CAR (tmp
), line
, column
, filename
);
1547 copy
= SCM_UNDEFINED
;
1549 scm_whash_insert (scm_source_whash
,
1551 scm_make_srcprops (line
,
1561 /* Manipulate the read-hash-procedures alist. This could be written in
1562 Scheme, but maybe it will also be used by C code during initialisation. */
1563 SCM_DEFINE (scm_read_hash_extend
, "read-hash-extend", 2, 0, 0,
1564 (SCM chr
, SCM proc
),
1565 "Install the procedure @var{proc} for reading expressions\n"
1566 "starting with the character sequence @code{#} and @var{chr}.\n"
1567 "@var{proc} will be called with two arguments: the character\n"
1568 "@var{chr} and the port to read further data from. The object\n"
1569 "returned will be the return value of @code{read}. \n"
1570 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1572 #define FUNC_NAME s_scm_read_hash_extend
1577 SCM_VALIDATE_CHAR (1, chr
);
1578 SCM_ASSERT (scm_is_false (proc
)
1579 || scm_is_eq (scm_procedure_p (proc
), SCM_BOOL_T
),
1580 proc
, SCM_ARG2
, FUNC_NAME
);
1582 /* Check if chr is already in the alist. */
1583 this = scm_i_read_hash_procedures_ref ();
1587 if (scm_is_null (this))
1589 /* not found, so add it to the beginning. */
1590 if (scm_is_true (proc
))
1592 SCM
new = scm_cons (scm_cons (chr
, proc
),
1593 scm_i_read_hash_procedures_ref ());
1594 scm_i_read_hash_procedures_set_x (new);
1598 if (scm_is_eq (chr
, SCM_CAAR (this)))
1600 /* already in the alist. */
1601 if (scm_is_false (proc
))
1604 if (scm_is_false (prev
))
1606 SCM rest
= SCM_CDR (scm_i_read_hash_procedures_ref ());
1607 scm_i_read_hash_procedures_set_x (rest
);
1610 scm_set_cdr_x (prev
, SCM_CDR (this));
1615 scm_set_cdr_x (SCM_CAR (this), proc
);
1620 this = SCM_CDR (this);
1623 return SCM_UNSPECIFIED
;
1627 /* Recover the read-hash procedure corresponding to char c. */
1629 scm_get_hash_procedure (int c
)
1631 SCM rest
= scm_i_read_hash_procedures_ref ();
1635 if (scm_is_null (rest
))
1638 if (SCM_CHAR (SCM_CAAR (rest
)) == c
)
1639 return SCM_CDAR (rest
);
1641 rest
= SCM_CDR (rest
);
1645 #define SCM_ENCODING_SEARCH_SIZE (500)
1647 /* Search the first few hundred characters of a file for an Emacs-like coding
1648 declaration. Returns either NULL or a string whose storage has been
1649 allocated with `scm_gc_malloc ()'. */
1651 scm_i_scan_for_encoding (SCM port
)
1654 char header
[SCM_ENCODING_SEARCH_SIZE
+1];
1655 size_t bytes_read
, encoding_length
, i
;
1656 char *encoding
= NULL
;
1658 char *pos
, *encoding_start
;
1661 pt
= SCM_PTAB_ENTRY (port
);
1663 if (pt
->rw_active
== SCM_PORT_WRITE
)
1667 pt
->rw_active
= SCM_PORT_READ
;
1669 if (pt
->read_pos
== pt
->read_end
)
1671 /* We can use the read buffer, and thus avoid a seek. */
1672 if (scm_fill_input (port
) == EOF
)
1675 bytes_read
= pt
->read_end
- pt
->read_pos
;
1676 if (bytes_read
> SCM_ENCODING_SEARCH_SIZE
)
1677 bytes_read
= SCM_ENCODING_SEARCH_SIZE
;
1679 if (bytes_read
<= 1)
1680 /* An unbuffered port -- don't scan. */
1683 memcpy (header
, pt
->read_pos
, bytes_read
);
1684 header
[bytes_read
] = '\0';
1688 /* Try to read some bytes and then seek back. Not all ports
1689 support seeking back; and indeed some file ports (like
1690 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
1691 check performed by SCM_FPORT_FDES---but fail to seek
1692 backwards. Hence this block comes second. We prefer to use
1693 the read buffer in-place. */
1694 if (SCM_FPORTP (port
) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port
)))
1697 bytes_read
= scm_c_read (port
, header
, SCM_ENCODING_SEARCH_SIZE
);
1698 header
[bytes_read
] = '\0';
1699 scm_seek (port
, scm_from_int (0), scm_from_int (SEEK_SET
));
1703 && header
[0] == '\xef' && header
[1] == '\xbb' && header
[2] == '\xbf')
1706 /* search past "coding[:=]" */
1710 if ((pos
= strstr(pos
, "coding")) == NULL
)
1713 pos
+= strlen("coding");
1714 if (pos
- header
>= SCM_ENCODING_SEARCH_SIZE
||
1715 (*pos
== ':' || *pos
== '='))
1723 while (pos
- header
<= SCM_ENCODING_SEARCH_SIZE
&&
1724 (*pos
== ' ' || *pos
== '\t'))
1727 /* grab the next token */
1728 encoding_start
= pos
;
1730 while (encoding_start
+ i
- header
<= SCM_ENCODING_SEARCH_SIZE
1731 && encoding_start
+ i
- header
< bytes_read
1732 && (isalnum ((int) encoding_start
[i
])
1733 || strchr ("_-.:/,+=()", encoding_start
[i
]) != NULL
))
1736 encoding_length
= i
;
1737 if (encoding_length
== 0)
1740 encoding
= scm_gc_strndup (encoding_start
, encoding_length
, "encoding");
1741 for (i
= 0; i
< encoding_length
; i
++)
1742 encoding
[i
] = toupper ((int) encoding
[i
]);
1744 /* push backwards to make sure we were in a comment */
1746 pos
= encoding_start
;
1747 while (pos
>= header
)
1751 /* This wasn't in a semicolon comment. Check for a
1752 hash-bang comment. */
1753 char *beg
= strstr (header
, "#!");
1754 char *end
= strstr (header
, "!#");
1755 if (beg
< encoding_start
&& encoding_start
+ encoding_length
< end
)
1767 /* This wasn't in a comment */
1770 if (utf8_bom
&& strcmp(encoding
, "UTF-8"))
1771 scm_misc_error (NULL
,
1772 "the port input declares the encoding ~s but is encoded as UTF-8",
1773 scm_list_1 (scm_from_locale_string (encoding
)));
1778 SCM_DEFINE (scm_file_encoding
, "file-encoding", 1, 0, 0,
1780 "Scans the port for an Emacs-like character coding declaration\n"
1781 "near the top of the contents of a port with random-accessible contents.\n"
1782 "The coding declaration is of the form\n"
1783 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1785 "Returns a string containing the character encoding of the file\n"
1786 "if a declaration was found, or @code{#f} otherwise.\n")
1787 #define FUNC_NAME s_scm_file_encoding
1792 SCM_VALIDATE_OPINPORT (SCM_ARG1
, port
);
1794 enc
= scm_i_scan_for_encoding (port
);
1799 s_enc
= scm_from_locale_string (enc
);
1810 SCM read_hash_procs
;
1812 read_hash_procs
= scm_make_fluid ();
1813 scm_fluid_set_x (read_hash_procs
, SCM_EOL
);
1815 scm_i_read_hash_procedures
=
1816 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs
));
1818 scm_init_opts (scm_read_options
, scm_read_opts
);
1819 #include "libguile/read.x"