1 /* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
34 #include "libguile/_scm.h"
35 #include "libguile/bytevectors.h"
36 #include "libguile/chars.h"
37 #include "libguile/eval.h"
38 #include "libguile/arrays.h"
39 #include "libguile/bitvectors.h"
40 #include "libguile/keywords.h"
41 #include "libguile/alist.h"
42 #include "libguile/srcprop.h"
43 #include "libguile/hashtab.h"
44 #include "libguile/hash.h"
45 #include "libguile/ports.h"
46 #include "libguile/fports.h"
47 #include "libguile/root.h"
48 #include "libguile/strings.h"
49 #include "libguile/strports.h"
50 #include "libguile/vectors.h"
51 #include "libguile/validate.h"
52 #include "libguile/srfi-4.h"
53 #include "libguile/srfi-13.h"
55 #include "libguile/read.h"
56 #include "libguile/private-options.h"
61 SCM_GLOBAL_SYMBOL (scm_sym_dot
, ".");
62 SCM_SYMBOL (scm_keyword_prefix
, "prefix");
63 SCM_SYMBOL (scm_keyword_postfix
, "postfix");
64 SCM_SYMBOL (sym_nil
, "nil");
66 scm_t_option scm_read_opts
[] = {
67 { SCM_OPTION_BOOLEAN
, "copy", 0,
68 "Copy source code expressions." },
69 { SCM_OPTION_BOOLEAN
, "positions", 1,
70 "Record positions of source code expressions." },
71 { SCM_OPTION_BOOLEAN
, "case-insensitive", 0,
72 "Convert symbols to lower case."},
73 { SCM_OPTION_SCM
, "keywords", (scm_t_bits
) SCM_BOOL_F_BITS
,
74 "Style of keyword recognition: #f, 'prefix or 'postfix."},
75 { SCM_OPTION_BOOLEAN
, "r6rs-hex-escapes", 0,
76 "Use R6RS variable-length character and string hex escapes."},
77 { SCM_OPTION_BOOLEAN
, "square-brackets", 1,
78 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
79 { SCM_OPTION_BOOLEAN
, "hungry-eol-escapes", 0,
80 "In strings, consume leading whitespace after an escaped end-of-line."},
85 Give meaningful error messages for errors
89 FILE:LINE:COL: MESSAGE
92 This is not standard GNU format, but the test-suite likes the real
93 message to be in front.
99 scm_i_input_error (char const *function
,
100 SCM port
, const char *message
, SCM arg
)
102 SCM fn
= (scm_is_string (SCM_FILENAME(port
))
104 : scm_from_locale_string ("#<unknown port>"));
106 SCM string_port
= scm_open_output_string ();
107 SCM string
= SCM_EOL
;
108 scm_simple_format (string_port
,
109 scm_from_locale_string ("~A:~S:~S: ~A"),
111 scm_from_long (SCM_LINUM (port
) + 1),
112 scm_from_int (SCM_COL (port
) + 1),
113 scm_from_locale_string (message
)));
115 string
= scm_get_output_string (string_port
);
116 scm_close_output_port (string_port
);
117 scm_error_scm (scm_from_latin1_symbol ("read-error"),
118 function
? scm_from_locale_string (function
) : SCM_BOOL_F
,
125 SCM_DEFINE (scm_read_options
, "read-options-interface", 0, 1, 0,
127 "Option interface for the read options. Instead of using\n"
128 "this procedure directly, use the procedures @code{read-enable},\n"
129 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
130 #define FUNC_NAME s_scm_read_options
132 SCM ans
= scm_options (setting
,
135 if (SCM_COPY_SOURCE_P
)
136 SCM_RECORD_POSITIONS_P
= 1;
141 /* A fluid referring to an association list mapping extra hash
142 characters to procedures. */
143 static SCM
*scm_i_read_hash_procedures
;
146 scm_i_read_hash_procedures_ref (void)
148 return scm_fluid_ref (*scm_i_read_hash_procedures
);
152 scm_i_read_hash_procedures_set_x (SCM value
)
154 scm_fluid_set_x (*scm_i_read_hash_procedures
, value
);
161 /* Size of the C buffer used to read symbols and numbers. */
162 #define READER_BUFFER_SIZE 128
164 /* Size of the C buffer used to read strings. */
165 #define READER_STRING_BUFFER_SIZE 512
167 /* The maximum size of Scheme character names. */
168 #define READER_CHAR_NAME_MAX_SIZE 50
171 /* `isblank' is only in C99. */
172 #define CHAR_IS_BLANK_(_chr) \
173 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
174 || ((_chr) == '\f') || ((_chr) == '\r'))
177 # define CHAR_IS_BLANK(_chr) \
178 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
180 # define CHAR_IS_BLANK CHAR_IS_BLANK_
184 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
186 #define CHAR_IS_R5RS_DELIMITER(c) \
188 || (c == ')') || (c == '(') || (c == ';') || (c == '"') \
189 || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
191 #define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
193 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
195 #define CHAR_IS_EXPONENT_MARKER(_chr) \
196 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
197 || ((_chr) == 'd') || ((_chr) == 'l'))
199 /* Read an SCSH block comment. */
200 static inline SCM
scm_read_scsh_block_comment (scm_t_wchar
, SCM
);
201 static SCM
scm_read_r6rs_block_comment (scm_t_wchar
, SCM
);
202 static SCM
scm_read_commented_expression (scm_t_wchar
, SCM
);
203 static SCM
scm_read_shebang (scm_t_wchar
, SCM
);
204 static SCM
scm_get_hash_procedure (int);
206 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
207 result in the pre-allocated buffer BUF. Return zero if the whole token has
208 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
209 bytes actually read. */
211 read_token (SCM port
, char *buf
, const size_t buf_size
, size_t *read
)
215 while (*read
< buf_size
)
219 chr
= scm_get_byte_or_eof (port
);
223 else if (CHAR_IS_DELIMITER (chr
))
225 scm_unget_byte (chr
, port
);
238 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
239 result in the pre-allocated buffer BUFFER, if the whole token has fewer than
240 BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
241 caller. Return zero if the token fits in BUFFER, non-zero otherwise. READ
242 will be set the number of bytes actually read. */
244 read_complete_token (SCM port
, char *buffer
, const size_t buffer_size
,
245 char **overflow_buffer
, size_t *read
)
248 size_t bytes_read
, overflow_size
;
250 *overflow_buffer
= NULL
;
255 overflow
= read_token (port
, buffer
, buffer_size
, &bytes_read
);
258 if (overflow
|| overflow_size
!= 0)
260 if (overflow_size
== 0)
262 *overflow_buffer
= scm_malloc (bytes_read
);
263 memcpy (*overflow_buffer
, buffer
, bytes_read
);
264 overflow_size
= bytes_read
;
268 *overflow_buffer
= scm_realloc (*overflow_buffer
, overflow_size
+ bytes_read
);
269 memcpy (*overflow_buffer
+ overflow_size
, buffer
, bytes_read
);
270 overflow_size
+= bytes_read
;
277 *read
= overflow_size
;
281 return (overflow_size
!= 0);
284 /* Skip whitespace from PORT and return the first non-whitespace character
285 read. Raise an error on end-of-file. */
287 flush_ws (SCM port
, const char *eoferr
)
289 register scm_t_wchar c
;
291 switch (c
= scm_getc (port
))
297 scm_i_input_error (eoferr
,
306 switch (c
= scm_getc (port
))
312 case SCM_LINE_INCREMENTORS
:
318 switch (c
= scm_getc (port
))
321 eoferr
= "read_sharp";
324 scm_read_shebang (c
, port
);
327 scm_read_commented_expression (c
, port
);
330 if (scm_is_false (scm_get_hash_procedure (c
)))
332 scm_read_r6rs_block_comment (c
, port
);
337 scm_ungetc (c
, port
);
342 case SCM_LINE_INCREMENTORS
:
343 case SCM_SINGLE_SPACES
:
358 static SCM
scm_read_expression (SCM port
);
359 static SCM
scm_read_sharp (int chr
, SCM port
);
360 static SCM
recsexpr (SCM obj
, long line
, int column
, SCM filename
);
364 scm_read_sexp (scm_t_wchar chr
, SCM port
)
365 #define FUNC_NAME "scm_i_lreadparen"
369 register SCM tl
, ans
= SCM_EOL
;
370 SCM tl2
= SCM_EOL
, ans2
= SCM_EOL
, copy
= SCM_BOOL_F
;
371 const int terminating_char
= ((chr
== '[') ? ']' : ')');
373 /* Need to capture line and column numbers here. */
374 long line
= SCM_LINUM (port
);
375 int column
= SCM_COL (port
) - 1;
378 c
= flush_ws (port
, FUNC_NAME
);
379 if (terminating_char
== c
)
382 scm_ungetc (c
, port
);
383 if (scm_is_eq (scm_sym_dot
,
384 (tmp
= scm_read_expression (port
))))
386 ans
= scm_read_expression (port
);
387 if (terminating_char
!= (c
= flush_ws (port
, FUNC_NAME
)))
388 scm_i_input_error (FUNC_NAME
, port
, "missing close paren",
393 /* Build the head of the list structure. */
394 ans
= tl
= scm_cons (tmp
, SCM_EOL
);
396 if (SCM_COPY_SOURCE_P
)
397 ans2
= tl2
= scm_cons (scm_is_pair (tmp
)
402 while (terminating_char
!= (c
= flush_ws (port
, FUNC_NAME
)))
406 if (c
== ')' || (SCM_SQUARE_BRACKETS_P
&& c
== ']'))
407 scm_i_input_error (FUNC_NAME
, port
,
408 "in pair: mismatched close paren: ~A",
409 scm_list_1 (SCM_MAKE_CHAR (c
)));
411 scm_ungetc (c
, port
);
412 tmp
= scm_read_expression (port
);
414 if (scm_is_eq (scm_sym_dot
, tmp
))
416 SCM_SETCDR (tl
, tmp
= scm_read_expression (port
));
418 if (SCM_COPY_SOURCE_P
)
419 SCM_SETCDR (tl2
, scm_cons (scm_is_pair (tmp
) ? copy
: tmp
,
422 c
= flush_ws (port
, FUNC_NAME
);
423 if (terminating_char
!= c
)
424 scm_i_input_error (FUNC_NAME
, port
,
425 "in pair: missing close paren", SCM_EOL
);
429 new_tail
= scm_cons (tmp
, SCM_EOL
);
430 SCM_SETCDR (tl
, new_tail
);
433 if (SCM_COPY_SOURCE_P
)
435 SCM new_tail2
= scm_cons (scm_is_pair (tmp
)
438 SCM_SETCDR (tl2
, new_tail2
);
444 if (SCM_RECORD_POSITIONS_P
)
445 scm_hashq_set_x (scm_source_whash
,
447 scm_make_srcprops (line
, column
,
458 /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
459 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
461 #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
467 while (i < ndigits) \
469 a = scm_getc (port); \
473 && (a == (scm_t_wchar) terminator) \
476 if ('0' <= a && a <= '9') \
478 else if ('A' <= a && a <= 'F') \
480 else if ('a' <= a && a <= 'f') \
493 skip_intraline_whitespace (SCM port
)
503 while (c
== '\t' || uc_is_general_category (c
, UC_SPACE_SEPARATOR
));
505 scm_ungetc (c
, port
);
509 scm_read_string (int chr
, SCM port
)
510 #define FUNC_NAME "scm_lreadr"
512 /* For strings smaller than C_STR, this function creates only one Scheme
513 object (the string returned). */
515 SCM str
= SCM_BOOL_F
;
516 unsigned c_str_len
= 0;
519 str
= scm_i_make_string (READER_STRING_BUFFER_SIZE
, NULL
, 0);
520 while ('"' != (c
= scm_getc (port
)))
525 scm_i_input_error (FUNC_NAME
, port
,
526 "end of file in string constant", SCM_EOL
);
529 if (c_str_len
+ 1 >= scm_i_string_length (str
))
531 SCM addy
= scm_i_make_string (READER_STRING_BUFFER_SIZE
, NULL
, 0);
533 str
= scm_string_append (scm_list_2 (str
, addy
));
538 switch (c
= scm_getc (port
))
546 if (SCM_HUNGRY_EOL_ESCAPES_P
)
547 skip_intraline_whitespace (port
);
574 if (SCM_R6RS_ESCAPES_P
)
575 SCM_READ_HEX_ESCAPE (10, ';');
577 SCM_READ_HEX_ESCAPE (2, '\0');
580 if (!SCM_R6RS_ESCAPES_P
)
582 SCM_READ_HEX_ESCAPE (4, '\0');
586 if (!SCM_R6RS_ESCAPES_P
)
588 SCM_READ_HEX_ESCAPE (6, '\0');
593 scm_i_input_error (FUNC_NAME
, port
,
594 "illegal character in escape sequence: ~S",
595 scm_list_1 (SCM_MAKE_CHAR (c
)));
598 str
= scm_i_string_start_writing (str
);
599 scm_i_string_set_x (str
, c_str_len
++, c
);
600 scm_i_string_stop_writing ();
605 return scm_i_substring_copy (str
, 0, c_str_len
);
614 scm_read_number (scm_t_wchar chr
, SCM port
)
616 SCM result
, str
= SCM_EOL
;
617 char buffer
[READER_BUFFER_SIZE
];
618 char *overflow_buffer
= NULL
;
621 scm_t_port
*pt
= SCM_PTAB_ENTRY (port
);
623 scm_ungetc (chr
, port
);
624 overflow
= read_complete_token (port
, buffer
, sizeof (buffer
),
625 &overflow_buffer
, &bytes_read
);
628 str
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
, pt
->ilseq_handler
);
630 str
= scm_from_stringn (overflow_buffer
, bytes_read
, pt
->encoding
,
633 result
= scm_string_to_number (str
, SCM_UNDEFINED
);
634 if (!scm_is_true (result
))
636 /* Return a symbol instead of a number */
637 if (SCM_CASE_INSENSITIVE_P
)
638 str
= scm_string_downcase_x (str
);
639 result
= scm_string_to_symbol (str
);
643 free (overflow_buffer
);
644 SCM_COL (port
) += scm_i_string_length (str
);
649 scm_read_mixed_case_symbol (scm_t_wchar chr
, SCM port
)
652 int ends_with_colon
= 0;
654 int postfix
= scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE
), scm_keyword_postfix
);
656 char buffer
[READER_BUFFER_SIZE
], *overflow_buffer
;
657 scm_t_port
*pt
= SCM_PTAB_ENTRY (port
);
660 scm_ungetc (chr
, port
);
661 overflow
= read_complete_token (port
, buffer
, READER_BUFFER_SIZE
,
662 &overflow_buffer
, &bytes_read
);
666 ends_with_colon
= buffer
[bytes_read
- 1] == ':';
668 ends_with_colon
= overflow_buffer
[bytes_read
- 1] == ':';
671 if (postfix
&& ends_with_colon
&& (bytes_read
> 1))
674 str
= scm_from_stringn (buffer
, bytes_read
- 1, pt
->encoding
, pt
->ilseq_handler
);
676 str
= scm_from_stringn (overflow_buffer
, bytes_read
- 1, pt
->encoding
,
679 if (SCM_CASE_INSENSITIVE_P
)
680 str
= scm_string_downcase_x (str
);
681 result
= scm_symbol_to_keyword (scm_string_to_symbol (str
));
686 str
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
, pt
->ilseq_handler
);
688 str
= scm_from_stringn (overflow_buffer
, bytes_read
, pt
->encoding
,
691 if (SCM_CASE_INSENSITIVE_P
)
692 str
= scm_string_downcase_x (str
);
693 result
= scm_string_to_symbol (str
);
697 free (overflow_buffer
);
698 SCM_COL (port
) += scm_i_string_length (str
);
703 scm_read_number_and_radix (scm_t_wchar chr
, SCM port
)
704 #define FUNC_NAME "scm_lreadr"
708 char buffer
[READER_BUFFER_SIZE
], *overflow_buffer
;
737 scm_ungetc (chr
, port
);
738 scm_ungetc ('#', port
);
742 overflow
= read_complete_token (port
, buffer
, sizeof (buffer
),
743 &overflow_buffer
, &read
);
745 pt
= SCM_PTAB_ENTRY (port
);
747 str
= scm_from_stringn (buffer
, read
, pt
->encoding
, pt
->ilseq_handler
);
749 str
= scm_from_stringn (overflow_buffer
, read
, pt
->encoding
,
752 result
= scm_string_to_number (str
, scm_from_uint (radix
));
755 free (overflow_buffer
);
757 SCM_COL (port
) += scm_i_string_length (str
);
759 if (scm_is_true (result
))
762 scm_i_input_error (FUNC_NAME
, port
, "unknown # object", SCM_EOL
);
769 scm_read_quote (int chr
, SCM port
)
772 long line
= SCM_LINUM (port
);
773 int column
= SCM_COL (port
) - 1;
778 p
= scm_sym_quasiquote
;
791 p
= scm_sym_uq_splicing
;
794 scm_ungetc (c
, port
);
801 fprintf (stderr
, "%s: unhandled quote character (%i)\n",
802 "scm_read_quote", chr
);
806 p
= scm_cons2 (p
, scm_read_expression (port
), SCM_EOL
);
807 if (SCM_RECORD_POSITIONS_P
)
808 scm_hashq_set_x (scm_source_whash
, p
,
809 scm_make_srcprops (line
, column
,
812 ? (scm_cons2 (SCM_CAR (p
),
813 SCM_CAR (SCM_CDR (p
)),
822 SCM_SYMBOL (sym_syntax
, "syntax");
823 SCM_SYMBOL (sym_quasisyntax
, "quasisyntax");
824 SCM_SYMBOL (sym_unsyntax
, "unsyntax");
825 SCM_SYMBOL (sym_unsyntax_splicing
, "unsyntax-splicing");
828 scm_read_syntax (int chr
, SCM port
)
831 long line
= SCM_LINUM (port
);
832 int column
= SCM_COL (port
) - 1;
850 p
= sym_unsyntax_splicing
;
853 scm_ungetc (c
, port
);
860 fprintf (stderr
, "%s: unhandled syntax character (%i)\n",
861 "scm_read_syntax", chr
);
865 p
= scm_cons2 (p
, scm_read_expression (port
), SCM_EOL
);
866 if (SCM_RECORD_POSITIONS_P
)
867 scm_hashq_set_x (scm_source_whash
, p
,
868 scm_make_srcprops (line
, column
,
871 ? (scm_cons2 (SCM_CAR (p
),
872 SCM_CAR (SCM_CDR (p
)),
882 scm_read_nil (int chr
, SCM port
)
884 SCM id
= scm_read_mixed_case_symbol (chr
, port
);
886 if (!scm_is_eq (id
, sym_nil
))
887 scm_i_input_error ("scm_read_nil", port
,
888 "unexpected input while reading #nil: ~a",
891 return SCM_ELISP_NIL
;
895 scm_read_semicolon_comment (int chr
, SCM port
)
899 /* We use the get_byte here because there is no need to get the
900 locale correct with comment input. This presumes that newline
901 always represents itself no matter what the encoding is. */
902 for (c
= scm_get_byte_or_eof (port
);
903 (c
!= EOF
) && (c
!= '\n');
904 c
= scm_get_byte_or_eof (port
));
906 return SCM_UNSPECIFIED
;
910 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
913 scm_read_boolean (int chr
, SCM port
)
926 return SCM_UNSPECIFIED
;
930 scm_read_character (scm_t_wchar chr
, SCM port
)
931 #define FUNC_NAME "scm_lreadr"
933 char buffer
[READER_CHAR_NAME_MAX_SIZE
];
935 size_t charname_len
, bytes_read
;
940 overflow
= read_token (port
, buffer
, READER_CHAR_NAME_MAX_SIZE
, &bytes_read
);
942 scm_i_input_error (FUNC_NAME
, port
, "character name too long", SCM_EOL
);
946 chr
= scm_getc (port
);
948 scm_i_input_error (FUNC_NAME
, port
, "unexpected end of file "
949 "while reading character", SCM_EOL
);
951 /* CHR must be a token delimiter, like a whitespace. */
952 return (SCM_MAKE_CHAR (chr
));
955 pt
= SCM_PTAB_ENTRY (port
);
957 /* Simple ASCII characters can be processed immediately. Also, simple
958 ISO-8859-1 characters can be processed immediately if the encoding for this
959 port is ISO-8859-1. */
960 if (bytes_read
== 1 && ((unsigned char) buffer
[0] <= 127 || pt
->encoding
== NULL
))
963 return SCM_MAKE_CHAR (buffer
[0]);
966 /* Otherwise, convert the buffer into a proper scheme string for
968 charname
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
,
970 charname_len
= scm_i_string_length (charname
);
971 SCM_COL (port
) += charname_len
;
972 cp
= scm_i_string_ref (charname
, 0);
973 if (charname_len
== 1)
974 return SCM_MAKE_CHAR (cp
);
976 /* Ignore dotted circles, which may be used to keep combining characters from
977 combining with the backslash in #\charname. */
978 if (cp
== SCM_CODEPOINT_DOTTED_CIRCLE
&& charname_len
== 2)
979 return SCM_MAKE_CHAR (scm_i_string_ref (charname
, 1));
981 if (cp
>= '0' && cp
< '8')
983 /* Dirk:FIXME:: This type of character syntax is not R5RS
984 * compliant. Further, it should be verified that the constant
985 * does only consist of octal digits. */
986 SCM p
= scm_string_to_number (charname
, scm_from_uint (8));
989 scm_t_wchar c
= scm_to_uint32 (p
);
990 if (SCM_IS_UNICODE_CHAR (c
))
991 return SCM_MAKE_CHAR (c
);
993 scm_i_input_error (FUNC_NAME
, port
,
994 "out-of-range octal character escape: ~a",
995 scm_list_1 (charname
));
999 if (cp
== 'x' && (charname_len
> 1))
1003 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1004 p
= scm_string_to_number (scm_c_substring (charname
, 1, charname_len
),
1005 scm_from_uint (16));
1006 if (SCM_I_INUMP (p
))
1008 scm_t_wchar c
= scm_to_uint32 (p
);
1009 if (SCM_IS_UNICODE_CHAR (c
))
1010 return SCM_MAKE_CHAR (c
);
1012 scm_i_input_error (FUNC_NAME
, port
,
1013 "out-of-range hex character escape: ~a",
1014 scm_list_1 (charname
));
1018 /* The names of characters should never have non-Latin1
1020 if (scm_i_is_narrow_string (charname
)
1021 || scm_i_try_narrow_string (charname
))
1022 { SCM ch
= scm_i_charname_to_char (scm_i_string_chars (charname
),
1024 if (scm_is_true (ch
))
1028 scm_i_input_error (FUNC_NAME
, port
, "unknown character name ~a",
1029 scm_list_1 (charname
));
1031 return SCM_UNSPECIFIED
;
1036 scm_read_keyword (int chr
, SCM port
)
1040 /* Read the symbol that comprises the keyword. Doing this instead of
1041 invoking a specific symbol reader function allows `scm_read_keyword ()'
1042 to adapt to the delimiters currently valid of symbols.
1044 XXX: This implementation allows sloppy syntaxes like `#: key'. */
1045 symbol
= scm_read_expression (port
);
1046 if (!scm_is_symbol (symbol
))
1047 scm_i_input_error ("scm_read_keyword", port
,
1048 "keyword prefix `~a' not followed by a symbol: ~s",
1049 scm_list_2 (SCM_MAKE_CHAR (chr
), symbol
));
1051 return (scm_symbol_to_keyword (symbol
));
1055 scm_read_vector (int chr
, SCM port
)
1057 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1058 guarantee that it's going to do what we want. After all, this is an
1059 implementation detail of `scm_read_vector ()', not a desirable
1061 return (scm_vector (scm_read_sexp (chr
, port
)));
1065 scm_read_srfi4_vector (int chr
, SCM port
)
1067 return scm_i_read_array (port
, chr
);
1071 scm_read_bytevector (scm_t_wchar chr
, SCM port
)
1073 chr
= scm_getc (port
);
1077 chr
= scm_getc (port
);
1081 chr
= scm_getc (port
);
1085 return scm_u8_list_to_bytevector (scm_read_sexp (chr
, port
));
1088 scm_i_input_error ("read_bytevector", port
,
1089 "invalid bytevector prefix",
1090 SCM_MAKE_CHAR (chr
));
1091 return SCM_UNSPECIFIED
;
1095 scm_read_guile_bit_vector (scm_t_wchar chr
, SCM port
)
1097 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1098 terribly inefficient but who cares? */
1099 SCM s_bits
= SCM_EOL
;
1101 for (chr
= scm_getc (port
);
1102 (chr
!= EOF
) && ((chr
== '0') || (chr
== '1'));
1103 chr
= scm_getc (port
))
1105 s_bits
= scm_cons ((chr
== '0') ? SCM_BOOL_F
: SCM_BOOL_T
, s_bits
);
1109 scm_ungetc (chr
, port
);
1111 return scm_bitvector (scm_reverse_x (s_bits
, SCM_EOL
));
1115 scm_read_scsh_block_comment (scm_t_wchar chr
, SCM port
)
1121 int c
= scm_getc (port
);
1124 scm_i_input_error ("skip_block_comment", port
,
1125 "unterminated `#! ... !#' comment", SCM_EOL
);
1129 else if (c
== '#' && bang_seen
)
1135 return SCM_UNSPECIFIED
;
1139 scm_read_shebang (scm_t_wchar chr
, SCM port
)
1142 if ((c
= scm_get_byte_or_eof (port
)) != 'r')
1144 scm_ungetc (c
, port
);
1145 return scm_read_scsh_block_comment (chr
, port
);
1147 if ((c
= scm_get_byte_or_eof (port
)) != '6')
1149 scm_ungetc (c
, port
);
1150 scm_ungetc ('r', port
);
1151 return scm_read_scsh_block_comment (chr
, port
);
1153 if ((c
= scm_get_byte_or_eof (port
)) != 'r')
1155 scm_ungetc (c
, port
);
1156 scm_ungetc ('6', port
);
1157 scm_ungetc ('r', port
);
1158 return scm_read_scsh_block_comment (chr
, port
);
1160 if ((c
= scm_get_byte_or_eof (port
)) != 's')
1162 scm_ungetc (c
, port
);
1163 scm_ungetc ('r', port
);
1164 scm_ungetc ('6', port
);
1165 scm_ungetc ('r', port
);
1166 return scm_read_scsh_block_comment (chr
, port
);
1169 return SCM_UNSPECIFIED
;
1173 scm_read_r6rs_block_comment (scm_t_wchar chr
, SCM port
)
1175 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1176 nested. So care must be taken. */
1177 int nesting_level
= 1;
1178 int opening_seen
= 0, closing_seen
= 0;
1180 while (nesting_level
> 0)
1182 int c
= scm_getc (port
);
1185 scm_i_input_error ("scm_read_r6rs_block_comment", port
,
1186 "unterminated `#| ... |#' comment", SCM_EOL
);
1194 else if (closing_seen
)
1205 opening_seen
= closing_seen
= 0;
1208 return SCM_UNSPECIFIED
;
1212 scm_read_commented_expression (scm_t_wchar chr
, SCM port
)
1216 c
= flush_ws (port
, (char *) NULL
);
1218 scm_i_input_error ("read_commented_expression", port
,
1219 "no expression after #; comment", SCM_EOL
);
1220 scm_ungetc (c
, port
);
1221 scm_read_expression (port
);
1222 return SCM_UNSPECIFIED
;
1226 scm_read_extended_symbol (scm_t_wchar chr
, SCM port
)
1228 /* Guile's extended symbol read syntax looks like this:
1230 #{This is all a symbol name}#
1232 So here, CHR is expected to be `{'. */
1235 SCM buf
= scm_i_make_string (1024, NULL
, 0);
1237 buf
= scm_i_string_start_writing (buf
);
1239 while ((chr
= scm_getc (port
)) != EOF
)
1250 scm_i_string_set_x (buf
, len
++, '}');
1256 else if (chr
== '\\')
1258 /* It used to be that print.c would print extended-read-syntax
1259 symbols with backslashes before "non-standard" chars, but
1260 this routine wouldn't do anything with those escapes.
1261 Bummer. What we've done is to change print.c to output
1262 R6RS hex escapes for those characters, relying on the fact
1263 that the extended read syntax would never put a `\' before
1264 an `x'. For now, we just ignore other instances of
1265 backslash in the string. */
1266 switch ((chr
= scm_getc (port
)))
1274 SCM_READ_HEX_ESCAPE (10, ';');
1275 scm_i_string_set_x (buf
, len
++, c
);
1283 scm_i_string_stop_writing ();
1284 scm_i_input_error ("scm_read_extended_symbol", port
,
1285 "illegal character in escape sequence: ~S",
1286 scm_list_1 (SCM_MAKE_CHAR (c
)));
1290 scm_i_string_set_x (buf
, len
++, chr
);
1295 scm_i_string_set_x (buf
, len
++, chr
);
1297 if (len
>= scm_i_string_length (buf
) - 2)
1301 scm_i_string_stop_writing ();
1302 addy
= scm_i_make_string (1024, NULL
, 0);
1303 buf
= scm_string_append (scm_list_2 (buf
, addy
));
1305 buf
= scm_i_string_start_writing (buf
);
1310 scm_i_string_stop_writing ();
1312 scm_i_input_error ("scm_read_extended_symbol", port
,
1313 "end of file while reading symbol", SCM_EOL
);
1315 return (scm_string_to_symbol (scm_c_substring (buf
, 0, len
)));
1320 /* Top-level token readers, i.e., dispatchers. */
1323 scm_read_sharp_extension (int chr
, SCM port
)
1327 proc
= scm_get_hash_procedure (chr
);
1328 if (scm_is_true (scm_procedure_p (proc
)))
1330 long line
= SCM_LINUM (port
);
1331 int column
= SCM_COL (port
) - 2;
1334 got
= scm_call_2 (proc
, SCM_MAKE_CHAR (chr
), port
);
1335 if (!scm_is_eq (got
, SCM_UNSPECIFIED
))
1337 if (SCM_RECORD_POSITIONS_P
)
1338 return (recsexpr (got
, line
, column
,
1339 SCM_FILENAME (port
)));
1345 return SCM_UNSPECIFIED
;
1348 /* The reader for the sharp `#' character. It basically dispatches reads
1349 among the above token readers. */
1351 scm_read_sharp (scm_t_wchar chr
, SCM port
)
1352 #define FUNC_NAME "scm_lreadr"
1356 chr
= scm_getc (port
);
1358 result
= scm_read_sharp_extension (chr
, port
);
1359 if (!scm_is_eq (result
, SCM_UNSPECIFIED
))
1365 return (scm_read_character (chr
, port
));
1367 return (scm_read_vector (chr
, port
));
1372 /* This one may return either a boolean or an SRFI-4 vector. */
1373 return (scm_read_srfi4_vector (chr
, port
));
1375 return (scm_read_bytevector (chr
, port
));
1377 return (scm_read_guile_bit_vector (chr
, port
));
1381 /* This one may return either a boolean or an SRFI-4 vector. */
1382 return (scm_read_boolean (chr
, port
));
1384 return (scm_read_keyword (chr
, port
));
1385 case '0': case '1': case '2': case '3': case '4':
1386 case '5': case '6': case '7': case '8': case '9':
1388 return (scm_i_read_array (port
, chr
));
1402 return (scm_read_number_and_radix (chr
, port
));
1404 return (scm_read_extended_symbol (chr
, port
));
1406 return (scm_read_shebang (chr
, port
));
1408 return (scm_read_commented_expression (chr
, port
));
1412 return (scm_read_syntax (chr
, port
));
1414 return (scm_read_nil (chr
, port
));
1416 result
= scm_read_sharp_extension (chr
, port
);
1417 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1419 /* To remain compatible with 1.8 and earlier, the following
1420 characters have lower precedence than `read-hash-extend'
1425 return scm_read_r6rs_block_comment (chr
, port
);
1427 scm_i_input_error (FUNC_NAME
, port
, "Unknown # object: ~S",
1428 scm_list_1 (SCM_MAKE_CHAR (chr
)));
1435 return SCM_UNSPECIFIED
;
1440 scm_read_expression (SCM port
)
1441 #define FUNC_NAME "scm_read_expression"
1445 register scm_t_wchar chr
;
1447 chr
= scm_getc (port
);
1451 case SCM_WHITE_SPACES
:
1452 case SCM_LINE_INCREMENTORS
:
1455 (void) scm_read_semicolon_comment (chr
, port
);
1458 if (!SCM_SQUARE_BRACKETS_P
)
1459 return (scm_read_mixed_case_symbol (chr
, port
));
1460 /* otherwise fall through */
1462 return (scm_read_sexp (chr
, port
));
1464 return (scm_read_string (chr
, port
));
1468 return (scm_read_quote (chr
, port
));
1472 result
= scm_read_sharp (chr
, port
);
1473 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1474 /* We read a comment or some such. */
1480 scm_i_input_error (FUNC_NAME
, port
, "unexpected \")\"", SCM_EOL
);
1483 if (SCM_SQUARE_BRACKETS_P
)
1484 scm_i_input_error (FUNC_NAME
, port
, "unexpected \"]\"", SCM_EOL
);
1485 /* otherwise fall through */
1489 if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE
), scm_keyword_prefix
))
1490 return scm_symbol_to_keyword (scm_read_expression (port
));
1495 if (((chr
>= '0') && (chr
<= '9'))
1496 || (strchr ("+-.", chr
)))
1497 return (scm_read_number (chr
, port
));
1499 return (scm_read_mixed_case_symbol (chr
, port
));
1507 /* Actual reader. */
1509 SCM_DEFINE (scm_read
, "read", 0, 1, 0,
1511 "Read an s-expression from the input port @var{port}, or from\n"
1512 "the current input port if @var{port} is not specified.\n"
1513 "Any whitespace before the next token is discarded.")
1514 #define FUNC_NAME s_scm_read
1518 if (SCM_UNBNDP (port
))
1519 port
= scm_current_input_port ();
1520 SCM_VALIDATE_OPINPORT (1, port
);
1522 c
= flush_ws (port
, (char *) NULL
);
1525 scm_ungetc (c
, port
);
1527 return (scm_read_expression (port
));
1534 /* Used when recording expressions constructed by `scm_read_sharp ()'. */
1536 recsexpr (SCM obj
, long line
, int column
, SCM filename
)
1538 if (!scm_is_pair(obj
)) {
1542 /* If this sexpr is visible in the read:sharp source, we want to
1543 keep that information, so only record non-constant cons cells
1544 which haven't previously been read by the reader. */
1545 if (scm_is_false (scm_hashq_ref (scm_source_whash
, obj
, SCM_BOOL_F
)))
1547 if (SCM_COPY_SOURCE_P
)
1549 copy
= scm_cons (recsexpr (SCM_CAR (obj
), line
, column
, filename
),
1551 for (tmp
= obj
; scm_is_pair (tmp
); tmp
= SCM_CDR (tmp
))
1553 SCM_SETCDR (copy
, scm_cons (recsexpr (SCM_CAR (tmp
),
1558 copy
= SCM_CDR (copy
);
1560 SCM_SETCDR (copy
, tmp
);
1564 recsexpr (SCM_CAR (obj
), line
, column
, filename
);
1565 for (tmp
= obj
; scm_is_pair (tmp
); tmp
= SCM_CDR (tmp
))
1566 recsexpr (SCM_CAR (tmp
), line
, column
, filename
);
1567 copy
= SCM_UNDEFINED
;
1569 scm_hashq_set_x (scm_source_whash
,
1571 scm_make_srcprops (line
,
1581 /* Manipulate the read-hash-procedures alist. This could be written in
1582 Scheme, but maybe it will also be used by C code during initialisation. */
1583 SCM_DEFINE (scm_read_hash_extend
, "read-hash-extend", 2, 0, 0,
1584 (SCM chr
, SCM proc
),
1585 "Install the procedure @var{proc} for reading expressions\n"
1586 "starting with the character sequence @code{#} and @var{chr}.\n"
1587 "@var{proc} will be called with two arguments: the character\n"
1588 "@var{chr} and the port to read further data from. The object\n"
1589 "returned will be the return value of @code{read}. \n"
1590 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1592 #define FUNC_NAME s_scm_read_hash_extend
1597 SCM_VALIDATE_CHAR (1, chr
);
1598 SCM_ASSERT (scm_is_false (proc
)
1599 || scm_is_eq (scm_procedure_p (proc
), SCM_BOOL_T
),
1600 proc
, SCM_ARG2
, FUNC_NAME
);
1602 /* Check if chr is already in the alist. */
1603 this = scm_i_read_hash_procedures_ref ();
1607 if (scm_is_null (this))
1609 /* not found, so add it to the beginning. */
1610 if (scm_is_true (proc
))
1612 SCM
new = scm_cons (scm_cons (chr
, proc
),
1613 scm_i_read_hash_procedures_ref ());
1614 scm_i_read_hash_procedures_set_x (new);
1618 if (scm_is_eq (chr
, SCM_CAAR (this)))
1620 /* already in the alist. */
1621 if (scm_is_false (proc
))
1624 if (scm_is_false (prev
))
1626 SCM rest
= SCM_CDR (scm_i_read_hash_procedures_ref ());
1627 scm_i_read_hash_procedures_set_x (rest
);
1630 scm_set_cdr_x (prev
, SCM_CDR (this));
1635 scm_set_cdr_x (SCM_CAR (this), proc
);
1640 this = SCM_CDR (this);
1643 return SCM_UNSPECIFIED
;
1647 /* Recover the read-hash procedure corresponding to char c. */
1649 scm_get_hash_procedure (int c
)
1651 SCM rest
= scm_i_read_hash_procedures_ref ();
1655 if (scm_is_null (rest
))
1658 if (SCM_CHAR (SCM_CAAR (rest
)) == c
)
1659 return SCM_CDAR (rest
);
1661 rest
= SCM_CDR (rest
);
1665 #define SCM_ENCODING_SEARCH_SIZE (500)
1667 /* Search the first few hundred characters of a file for an Emacs-like coding
1668 declaration. Returns either NULL or a string whose storage has been
1669 allocated with `scm_gc_malloc ()'. */
1671 scm_i_scan_for_encoding (SCM port
)
1674 char header
[SCM_ENCODING_SEARCH_SIZE
+1];
1675 size_t bytes_read
, encoding_length
, i
;
1676 char *encoding
= NULL
;
1678 char *pos
, *encoding_start
;
1681 pt
= SCM_PTAB_ENTRY (port
);
1683 if (pt
->rw_active
== SCM_PORT_WRITE
)
1687 pt
->rw_active
= SCM_PORT_READ
;
1689 if (pt
->read_pos
== pt
->read_end
)
1691 /* We can use the read buffer, and thus avoid a seek. */
1692 if (scm_fill_input (port
) == EOF
)
1695 bytes_read
= pt
->read_end
- pt
->read_pos
;
1696 if (bytes_read
> SCM_ENCODING_SEARCH_SIZE
)
1697 bytes_read
= SCM_ENCODING_SEARCH_SIZE
;
1699 if (bytes_read
<= 1)
1700 /* An unbuffered port -- don't scan. */
1703 memcpy (header
, pt
->read_pos
, bytes_read
);
1704 header
[bytes_read
] = '\0';
1708 /* Try to read some bytes and then seek back. Not all ports
1709 support seeking back; and indeed some file ports (like
1710 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
1711 check performed by SCM_FPORT_FDES---but fail to seek
1712 backwards. Hence this block comes second. We prefer to use
1713 the read buffer in-place. */
1714 if (SCM_FPORTP (port
) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port
)))
1717 bytes_read
= scm_c_read (port
, header
, SCM_ENCODING_SEARCH_SIZE
);
1718 header
[bytes_read
] = '\0';
1719 scm_seek (port
, scm_from_int (0), scm_from_int (SEEK_SET
));
1723 && header
[0] == '\xef' && header
[1] == '\xbb' && header
[2] == '\xbf')
1726 /* search past "coding[:=]" */
1730 if ((pos
= strstr(pos
, "coding")) == NULL
)
1733 pos
+= strlen("coding");
1734 if (pos
- header
>= SCM_ENCODING_SEARCH_SIZE
||
1735 (*pos
== ':' || *pos
== '='))
1743 while (pos
- header
<= SCM_ENCODING_SEARCH_SIZE
&&
1744 (*pos
== ' ' || *pos
== '\t'))
1747 /* grab the next token */
1748 encoding_start
= pos
;
1750 while (encoding_start
+ i
- header
<= SCM_ENCODING_SEARCH_SIZE
1751 && encoding_start
+ i
- header
< bytes_read
1752 && (isalnum ((int) encoding_start
[i
])
1753 || strchr ("_-.:/,+=()", encoding_start
[i
]) != NULL
))
1756 encoding_length
= i
;
1757 if (encoding_length
== 0)
1760 encoding
= scm_gc_strndup (encoding_start
, encoding_length
, "encoding");
1761 for (i
= 0; i
< encoding_length
; i
++)
1762 encoding
[i
] = toupper ((int) encoding
[i
]);
1764 /* push backwards to make sure we were in a comment */
1766 pos
= encoding_start
;
1767 while (pos
>= header
)
1774 else if (*pos
== '\n' || pos
== header
)
1776 /* This wasn't in a semicolon comment. Check for a
1777 hash-bang comment. */
1778 char *beg
= strstr (header
, "#!");
1779 char *end
= strstr (header
, "!#");
1780 if (beg
< encoding_start
&& encoding_start
+ encoding_length
<= end
)
1791 /* This wasn't in a comment */
1794 if (utf8_bom
&& strcmp(encoding
, "UTF-8"))
1795 scm_misc_error (NULL
,
1796 "the port input declares the encoding ~s but is encoded as UTF-8",
1797 scm_list_1 (scm_from_locale_string (encoding
)));
1802 SCM_DEFINE (scm_file_encoding
, "file-encoding", 1, 0, 0,
1804 "Scans the port for an Emacs-like character coding declaration\n"
1805 "near the top of the contents of a port with random-accessible contents.\n"
1806 "The coding declaration is of the form\n"
1807 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1809 "Returns a string containing the character encoding of the file\n"
1810 "if a declaration was found, or @code{#f} otherwise.\n")
1811 #define FUNC_NAME s_scm_file_encoding
1816 SCM_VALIDATE_OPINPORT (SCM_ARG1
, port
);
1818 enc
= scm_i_scan_for_encoding (port
);
1823 s_enc
= scm_from_locale_string (enc
);
1834 SCM read_hash_procs
;
1836 read_hash_procs
= scm_make_fluid ();
1837 scm_fluid_set_x (read_hash_procs
, SCM_EOL
);
1839 scm_i_read_hash_procedures
=
1840 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs
));
1842 scm_init_opts (scm_read_options
, scm_read_opts
);
1843 #include "libguile/read.x"