1 /* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009, 2010 Free Software
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
33 #include "libguile/_scm.h"
34 #include "libguile/bytevectors.h"
35 #include "libguile/chars.h"
36 #include "libguile/eval.h"
37 #include "libguile/arrays.h"
38 #include "libguile/bitvectors.h"
39 #include "libguile/keywords.h"
40 #include "libguile/alist.h"
41 #include "libguile/srcprop.h"
42 #include "libguile/hashtab.h"
43 #include "libguile/hash.h"
44 #include "libguile/ports.h"
45 #include "libguile/fports.h"
46 #include "libguile/root.h"
47 #include "libguile/strings.h"
48 #include "libguile/strports.h"
49 #include "libguile/vectors.h"
50 #include "libguile/validate.h"
51 #include "libguile/srfi-4.h"
52 #include "libguile/srfi-13.h"
54 #include "libguile/read.h"
55 #include "libguile/private-options.h"
60 SCM_GLOBAL_SYMBOL (scm_sym_dot
, ".");
61 SCM_SYMBOL (scm_keyword_prefix
, "prefix");
62 SCM_SYMBOL (scm_keyword_postfix
, "postfix");
63 SCM_SYMBOL (sym_nil
, "nil");
65 scm_t_option scm_read_opts
[] = {
66 { SCM_OPTION_BOOLEAN
, "copy", 0,
67 "Copy source code expressions." },
68 { SCM_OPTION_BOOLEAN
, "positions", 0,
69 "Record positions of source code expressions." },
70 { SCM_OPTION_BOOLEAN
, "case-insensitive", 0,
71 "Convert symbols to lower case."},
72 { SCM_OPTION_SCM
, "keywords", (unsigned long) SCM_BOOL_F
,
73 "Style of keyword recognition: #f, 'prefix or 'postfix."},
74 { SCM_OPTION_BOOLEAN
, "r6rs-hex-escapes", 0,
75 "Use R6RS variable-length character and string hex escapes."},
76 { SCM_OPTION_BOOLEAN
, "square-brackets", 1,
77 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
82 Give meaningful error messages for errors
86 FILE:LINE:COL: MESSAGE
89 This is not standard GNU format, but the test-suite likes the real
90 message to be in front.
96 scm_i_input_error (char const *function
,
97 SCM port
, const char *message
, SCM arg
)
99 SCM fn
= (scm_is_string (SCM_FILENAME(port
))
101 : scm_from_locale_string ("#<unknown port>"));
103 SCM string_port
= scm_open_output_string ();
104 SCM string
= SCM_EOL
;
105 scm_simple_format (string_port
,
106 scm_from_locale_string ("~A:~S:~S: ~A"),
108 scm_from_long (SCM_LINUM (port
) + 1),
109 scm_from_int (SCM_COL (port
) + 1),
110 scm_from_locale_string (message
)));
112 string
= scm_get_output_string (string_port
);
113 scm_close_output_port (string_port
);
114 scm_error_scm (scm_from_locale_symbol ("read-error"),
115 function
? scm_from_locale_string (function
) : SCM_BOOL_F
,
122 SCM_DEFINE (scm_read_options
, "read-options-interface", 0, 1, 0,
124 "Option interface for the read options. Instead of using\n"
125 "this procedure directly, use the procedures @code{read-enable},\n"
126 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
127 #define FUNC_NAME s_scm_read_options
129 SCM ans
= scm_options (setting
,
132 if (SCM_COPY_SOURCE_P
)
133 SCM_RECORD_POSITIONS_P
= 1;
138 /* An association list mapping extra hash characters to procedures. */
139 static SCM
*scm_read_hash_procedures
;
146 /* Size of the C buffer used to read symbols and numbers. */
147 #define READER_BUFFER_SIZE 128
149 /* Size of the C buffer used to read strings. */
150 #define READER_STRING_BUFFER_SIZE 512
152 /* The maximum size of Scheme character names. */
153 #define READER_CHAR_NAME_MAX_SIZE 50
156 /* `isblank' is only in C99. */
157 #define CHAR_IS_BLANK_(_chr) \
158 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
159 || ((_chr) == '\f') || ((_chr) == '\r'))
162 # define CHAR_IS_BLANK(_chr) \
163 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
165 # define CHAR_IS_BLANK CHAR_IS_BLANK_
169 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
171 #define CHAR_IS_R5RS_DELIMITER(c) \
173 || (c == ')') || (c == '(') || (c == ';') || (c == '"') \
174 || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
176 #define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
178 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
180 #define CHAR_IS_EXPONENT_MARKER(_chr) \
181 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
182 || ((_chr) == 'd') || ((_chr) == 'l'))
184 /* Read an SCSH block comment. */
185 static inline SCM
scm_read_scsh_block_comment (scm_t_wchar
, SCM
);
186 static SCM
scm_read_r6rs_block_comment (scm_t_wchar
, SCM
);
187 static SCM
scm_read_commented_expression (scm_t_wchar
, SCM
);
188 static SCM
scm_read_shebang (scm_t_wchar
, SCM
);
189 static SCM
scm_get_hash_procedure (int);
191 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
192 result in the pre-allocated buffer BUF. Return zero if the whole token has
193 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
194 bytes actually read. */
196 read_token (SCM port
, char *buf
, const size_t buf_size
, size_t *read
)
200 while (*read
< buf_size
)
204 chr
= scm_get_byte_or_eof (port
);
208 else if (CHAR_IS_DELIMITER (chr
))
210 scm_unget_byte (chr
, port
);
223 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
224 result in the pre-allocated buffer BUFFER, if the whole token has fewer than
225 BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
226 caller. Return zero if the token fits in BUFFER, non-zero otherwise. READ
227 will be set the number of bytes actually read. */
229 read_complete_token (SCM port
, char *buffer
, const size_t buffer_size
,
230 char **overflow_buffer
, size_t *read
)
233 size_t bytes_read
, overflow_size
;
235 *overflow_buffer
= NULL
;
240 overflow
= read_token (port
, buffer
, buffer_size
, &bytes_read
);
243 if (overflow
|| overflow_size
!= 0)
245 if (overflow_size
== 0)
247 *overflow_buffer
= scm_malloc (bytes_read
);
248 memcpy (*overflow_buffer
, buffer
, bytes_read
);
249 overflow_size
= bytes_read
;
253 *overflow_buffer
= scm_realloc (*overflow_buffer
, overflow_size
+ bytes_read
);
254 memcpy (*overflow_buffer
+ overflow_size
, buffer
, bytes_read
);
255 overflow_size
+= bytes_read
;
262 *read
= overflow_size
;
266 return (overflow_size
!= 0);
269 /* Skip whitespace from PORT and return the first non-whitespace character
270 read. Raise an error on end-of-file. */
272 flush_ws (SCM port
, const char *eoferr
)
274 register scm_t_wchar c
;
276 switch (c
= scm_getc (port
))
282 scm_i_input_error (eoferr
,
291 switch (c
= scm_getc (port
))
297 case SCM_LINE_INCREMENTORS
:
303 switch (c
= scm_getc (port
))
306 eoferr
= "read_sharp";
309 scm_read_shebang (c
, port
);
312 scm_read_commented_expression (c
, port
);
315 if (scm_is_false (scm_get_hash_procedure (c
)))
317 scm_read_r6rs_block_comment (c
, port
);
322 scm_ungetc (c
, port
);
327 case SCM_LINE_INCREMENTORS
:
328 case SCM_SINGLE_SPACES
:
343 static SCM
scm_read_expression (SCM port
);
344 static SCM
scm_read_sharp (int chr
, SCM port
);
345 static SCM
recsexpr (SCM obj
, long line
, int column
, SCM filename
);
349 scm_read_sexp (scm_t_wchar chr
, SCM port
)
350 #define FUNC_NAME "scm_i_lreadparen"
354 register SCM tl
, ans
= SCM_EOL
;
355 SCM tl2
= SCM_EOL
, ans2
= SCM_EOL
, copy
= SCM_BOOL_F
;
356 const int terminating_char
= ((chr
== '[') ? ']' : ')');
358 /* Need to capture line and column numbers here. */
359 long line
= SCM_LINUM (port
);
360 int column
= SCM_COL (port
) - 1;
363 c
= flush_ws (port
, FUNC_NAME
);
364 if (terminating_char
== c
)
367 scm_ungetc (c
, port
);
368 if (scm_is_eq (scm_sym_dot
,
369 (tmp
= scm_read_expression (port
))))
371 ans
= scm_read_expression (port
);
372 if (terminating_char
!= (c
= flush_ws (port
, FUNC_NAME
)))
373 scm_i_input_error (FUNC_NAME
, port
, "missing close paren",
378 /* Build the head of the list structure. */
379 ans
= tl
= scm_cons (tmp
, SCM_EOL
);
381 if (SCM_COPY_SOURCE_P
)
382 ans2
= tl2
= scm_cons (scm_is_pair (tmp
)
387 while (terminating_char
!= (c
= flush_ws (port
, FUNC_NAME
)))
391 if (c
== ')' || (SCM_SQUARE_BRACKETS_P
&& c
== ']'))
392 scm_i_input_error (FUNC_NAME
, port
,
393 "in pair: mismatched close paren: ~A",
394 scm_list_1 (SCM_MAKE_CHAR (c
)));
396 scm_ungetc (c
, port
);
397 tmp
= scm_read_expression (port
);
399 if (scm_is_eq (scm_sym_dot
, tmp
))
401 SCM_SETCDR (tl
, tmp
= scm_read_expression (port
));
403 if (SCM_COPY_SOURCE_P
)
404 SCM_SETCDR (tl2
, scm_cons (scm_is_pair (tmp
) ? copy
: tmp
,
407 c
= flush_ws (port
, FUNC_NAME
);
408 if (terminating_char
!= c
)
409 scm_i_input_error (FUNC_NAME
, port
,
410 "in pair: missing close paren", SCM_EOL
);
414 new_tail
= scm_cons (tmp
, SCM_EOL
);
415 SCM_SETCDR (tl
, new_tail
);
418 if (SCM_COPY_SOURCE_P
)
420 SCM new_tail2
= scm_cons (scm_is_pair (tmp
)
423 SCM_SETCDR (tl2
, new_tail2
);
429 if (SCM_RECORD_POSITIONS_P
)
430 scm_whash_insert (scm_source_whash
,
432 scm_make_srcprops (line
, column
,
443 /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
444 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
446 #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
452 while (i < ndigits) \
454 a = scm_getc (port); \
458 && (a == (scm_t_wchar) terminator) \
461 if ('0' <= a && a <= '9') \
463 else if ('A' <= a && a <= 'F') \
465 else if ('a' <= a && a <= 'f') \
478 scm_read_string (int chr
, SCM port
)
479 #define FUNC_NAME "scm_lreadr"
481 /* For strings smaller than C_STR, this function creates only one Scheme
482 object (the string returned). */
484 SCM str
= SCM_BOOL_F
;
485 unsigned c_str_len
= 0;
488 str
= scm_i_make_string (READER_STRING_BUFFER_SIZE
, NULL
);
489 while ('"' != (c
= scm_getc (port
)))
494 scm_i_input_error (FUNC_NAME
, port
,
495 "end of file in string constant", SCM_EOL
);
498 if (c_str_len
+ 1 >= scm_i_string_length (str
))
500 SCM addy
= scm_i_make_string (READER_STRING_BUFFER_SIZE
, NULL
);
502 str
= scm_string_append (scm_list_2 (str
, addy
));
507 switch (c
= scm_getc (port
))
541 if (SCM_R6RS_ESCAPES_P
)
542 SCM_READ_HEX_ESCAPE (10, ';');
544 SCM_READ_HEX_ESCAPE (2, '\0');
547 if (!SCM_R6RS_ESCAPES_P
)
549 SCM_READ_HEX_ESCAPE (4, '\0');
553 if (!SCM_R6RS_ESCAPES_P
)
555 SCM_READ_HEX_ESCAPE (6, '\0');
560 scm_i_input_error (FUNC_NAME
, port
,
561 "illegal character in escape sequence: ~S",
562 scm_list_1 (SCM_MAKE_CHAR (c
)));
565 str
= scm_i_string_start_writing (str
);
566 scm_i_string_set_x (str
, c_str_len
++, c
);
567 scm_i_string_stop_writing ();
572 return scm_i_substring_copy (str
, 0, c_str_len
);
581 scm_read_number (scm_t_wchar chr
, SCM port
)
583 SCM result
, str
= SCM_EOL
;
584 char buffer
[READER_BUFFER_SIZE
];
585 char *overflow_buffer
= NULL
;
588 scm_t_port
*pt
= SCM_PTAB_ENTRY (port
);
590 scm_ungetc (chr
, port
);
591 overflow
= read_complete_token (port
, buffer
, sizeof (buffer
),
592 &overflow_buffer
, &bytes_read
);
595 str
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
, pt
->ilseq_handler
);
597 str
= scm_from_stringn (overflow_buffer
, bytes_read
, pt
->encoding
,
600 result
= scm_string_to_number (str
, SCM_UNDEFINED
);
601 if (!scm_is_true (result
))
603 /* Return a symbol instead of a number */
604 if (SCM_CASE_INSENSITIVE_P
)
605 str
= scm_string_downcase_x (str
);
606 result
= scm_string_to_symbol (str
);
610 free (overflow_buffer
);
611 SCM_COL (port
) += scm_i_string_length (str
);
616 scm_read_mixed_case_symbol (scm_t_wchar chr
, SCM port
)
619 int ends_with_colon
= 0;
621 int postfix
= scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE
), scm_keyword_postfix
);
623 char buffer
[READER_BUFFER_SIZE
], *overflow_buffer
;
624 scm_t_port
*pt
= SCM_PTAB_ENTRY (port
);
627 scm_ungetc (chr
, port
);
628 overflow
= read_complete_token (port
, buffer
, READER_BUFFER_SIZE
,
629 &overflow_buffer
, &bytes_read
);
633 ends_with_colon
= buffer
[bytes_read
- 1] == ':';
635 ends_with_colon
= overflow_buffer
[bytes_read
- 1] == ':';
638 if (postfix
&& ends_with_colon
&& (bytes_read
> 1))
641 str
= scm_from_stringn (buffer
, bytes_read
- 1, pt
->encoding
, pt
->ilseq_handler
);
643 str
= scm_from_stringn (overflow_buffer
, bytes_read
- 1, pt
->encoding
,
646 if (SCM_CASE_INSENSITIVE_P
)
647 str
= scm_string_downcase_x (str
);
648 result
= scm_symbol_to_keyword (scm_string_to_symbol (str
));
653 str
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
, pt
->ilseq_handler
);
655 str
= scm_from_stringn (overflow_buffer
, bytes_read
, pt
->encoding
,
658 if (SCM_CASE_INSENSITIVE_P
)
659 str
= scm_string_downcase_x (str
);
660 result
= scm_string_to_symbol (str
);
664 free (overflow_buffer
);
665 SCM_COL (port
) += scm_i_string_length (str
);
670 scm_read_number_and_radix (scm_t_wchar chr
, SCM port
)
671 #define FUNC_NAME "scm_lreadr"
675 char buffer
[READER_BUFFER_SIZE
], *overflow_buffer
;
704 scm_ungetc (chr
, port
);
705 scm_ungetc ('#', port
);
709 overflow
= read_complete_token (port
, buffer
, sizeof (buffer
),
710 &overflow_buffer
, &read
);
712 pt
= SCM_PTAB_ENTRY (port
);
714 str
= scm_from_stringn (buffer
, read
, pt
->encoding
, pt
->ilseq_handler
);
716 str
= scm_from_stringn (overflow_buffer
, read
, pt
->encoding
,
719 result
= scm_string_to_number (str
, scm_from_uint (radix
));
722 free (overflow_buffer
);
724 SCM_COL (port
) += scm_i_string_length (str
);
726 if (scm_is_true (result
))
729 scm_i_input_error (FUNC_NAME
, port
, "unknown # object", SCM_EOL
);
736 scm_read_quote (int chr
, SCM port
)
739 long line
= SCM_LINUM (port
);
740 int column
= SCM_COL (port
) - 1;
745 p
= scm_sym_quasiquote
;
758 p
= scm_sym_uq_splicing
;
761 scm_ungetc (c
, port
);
768 fprintf (stderr
, "%s: unhandled quote character (%i)\n",
769 "scm_read_quote", chr
);
773 p
= scm_cons2 (p
, scm_read_expression (port
), SCM_EOL
);
774 if (SCM_RECORD_POSITIONS_P
)
775 scm_whash_insert (scm_source_whash
, p
,
776 scm_make_srcprops (line
, column
,
779 ? (scm_cons2 (SCM_CAR (p
),
780 SCM_CAR (SCM_CDR (p
)),
789 SCM_SYMBOL (sym_syntax
, "syntax");
790 SCM_SYMBOL (sym_quasisyntax
, "quasisyntax");
791 SCM_SYMBOL (sym_unsyntax
, "unsyntax");
792 SCM_SYMBOL (sym_unsyntax_splicing
, "unsyntax-splicing");
795 scm_read_syntax (int chr
, SCM port
)
798 long line
= SCM_LINUM (port
);
799 int column
= SCM_COL (port
) - 1;
817 p
= sym_unsyntax_splicing
;
820 scm_ungetc (c
, port
);
827 fprintf (stderr
, "%s: unhandled syntax character (%i)\n",
828 "scm_read_syntax", chr
);
832 p
= scm_cons2 (p
, scm_read_expression (port
), SCM_EOL
);
833 if (SCM_RECORD_POSITIONS_P
)
834 scm_whash_insert (scm_source_whash
, p
,
835 scm_make_srcprops (line
, column
,
838 ? (scm_cons2 (SCM_CAR (p
),
839 SCM_CAR (SCM_CDR (p
)),
849 scm_read_nil (int chr
, SCM port
)
851 SCM id
= scm_read_mixed_case_symbol (chr
, port
);
853 if (!scm_is_eq (id
, sym_nil
))
854 scm_i_input_error ("scm_read_nil", port
,
855 "unexpected input while reading #nil: ~a",
858 return SCM_ELISP_NIL
;
862 scm_read_semicolon_comment (int chr
, SCM port
)
866 /* We use the get_byte here because there is no need to get the
867 locale correct with comment input. This presumes that newline
868 always represents itself no matter what the encoding is. */
869 for (c
= scm_get_byte_or_eof (port
);
870 (c
!= EOF
) && (c
!= '\n');
871 c
= scm_get_byte_or_eof (port
));
873 return SCM_UNSPECIFIED
;
877 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
880 scm_read_boolean (int chr
, SCM port
)
893 return SCM_UNSPECIFIED
;
897 scm_read_character (scm_t_wchar chr
, SCM port
)
898 #define FUNC_NAME "scm_lreadr"
900 char buffer
[READER_CHAR_NAME_MAX_SIZE
];
902 size_t charname_len
, bytes_read
;
907 overflow
= read_token (port
, buffer
, READER_CHAR_NAME_MAX_SIZE
, &bytes_read
);
913 chr
= scm_getc (port
);
915 scm_i_input_error (FUNC_NAME
, port
, "unexpected end of file "
916 "while reading character", SCM_EOL
);
918 /* CHR must be a token delimiter, like a whitespace. */
919 return (SCM_MAKE_CHAR (chr
));
922 pt
= SCM_PTAB_ENTRY (port
);
924 /* Simple ASCII characters can be processed immediately. Also, simple
925 ISO-8859-1 characters can be processed immediately if the encoding for this
926 port is ISO-8859-1. */
927 if (bytes_read
== 1 && ((unsigned char) buffer
[0] <= 127 || pt
->encoding
== NULL
))
930 return SCM_MAKE_CHAR (buffer
[0]);
933 /* Otherwise, convert the buffer into a proper scheme string for
935 charname
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
,
937 charname_len
= scm_i_string_length (charname
);
938 SCM_COL (port
) += charname_len
;
939 cp
= scm_i_string_ref (charname
, 0);
940 if (charname_len
== 1)
941 return SCM_MAKE_CHAR (cp
);
943 /* Ignore dotted circles, which may be used to keep combining characters from
944 combining with the backslash in #\charname. */
945 if (cp
== SCM_CODEPOINT_DOTTED_CIRCLE
&& charname_len
== 2)
946 return SCM_MAKE_CHAR (scm_i_string_ref (charname
, 1));
948 if (cp
>= '0' && cp
< '8')
950 /* Dirk:FIXME:: This type of character syntax is not R5RS
951 * compliant. Further, it should be verified that the constant
952 * does only consist of octal digits. */
953 SCM p
= scm_string_to_number (charname
, scm_from_uint (8));
956 scm_t_wchar c
= SCM_I_INUM (p
);
957 if (SCM_IS_UNICODE_CHAR (c
))
958 return SCM_MAKE_CHAR (c
);
960 scm_i_input_error (FUNC_NAME
, port
,
961 "out-of-range octal character escape: ~a",
962 scm_list_1 (charname
));
966 if (cp
== 'x' && (charname_len
> 1))
970 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
971 p
= scm_string_to_number (scm_c_substring (charname
, 1, charname_len
),
975 scm_t_wchar c
= SCM_I_INUM (p
);
976 if (SCM_IS_UNICODE_CHAR (c
))
977 return SCM_MAKE_CHAR (c
);
979 scm_i_input_error (FUNC_NAME
, port
,
980 "out-of-range hex character escape: ~a",
981 scm_list_1 (charname
));
985 /* The names of characters should never have non-Latin1
987 if (scm_i_is_narrow_string (charname
)
988 || scm_i_try_narrow_string (charname
))
989 { SCM ch
= scm_i_charname_to_char (scm_i_string_chars (charname
),
991 if (scm_is_true (ch
))
996 scm_i_input_error (FUNC_NAME
, port
, "unknown character name ~a",
997 scm_list_1 (charname
));
999 return SCM_UNSPECIFIED
;
1004 scm_read_keyword (int chr
, SCM port
)
1008 /* Read the symbol that comprises the keyword. Doing this instead of
1009 invoking a specific symbol reader function allows `scm_read_keyword ()'
1010 to adapt to the delimiters currently valid of symbols.
1012 XXX: This implementation allows sloppy syntaxes like `#: key'. */
1013 symbol
= scm_read_expression (port
);
1014 if (!scm_is_symbol (symbol
))
1015 scm_i_input_error ("scm_read_keyword", port
,
1016 "keyword prefix `~a' not followed by a symbol: ~s",
1017 scm_list_2 (SCM_MAKE_CHAR (chr
), symbol
));
1019 return (scm_symbol_to_keyword (symbol
));
1023 scm_read_vector (int chr
, SCM port
)
1025 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1026 guarantee that it's going to do what we want. After all, this is an
1027 implementation detail of `scm_read_vector ()', not a desirable
1029 return (scm_vector (scm_read_sexp (chr
, port
)));
1033 scm_read_srfi4_vector (int chr
, SCM port
)
1035 return scm_i_read_array (port
, chr
);
1039 scm_read_bytevector (scm_t_wchar chr
, SCM port
)
1041 chr
= scm_getc (port
);
1045 chr
= scm_getc (port
);
1049 chr
= scm_getc (port
);
1053 return scm_u8_list_to_bytevector (scm_read_sexp (chr
, port
));
1056 scm_i_input_error ("read_bytevector", port
,
1057 "invalid bytevector prefix",
1058 SCM_MAKE_CHAR (chr
));
1059 return SCM_UNSPECIFIED
;
1063 scm_read_guile_bit_vector (scm_t_wchar chr
, SCM port
)
1065 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1066 terribly inefficient but who cares? */
1067 SCM s_bits
= SCM_EOL
;
1069 for (chr
= scm_getc (port
);
1070 (chr
!= EOF
) && ((chr
== '0') || (chr
== '1'));
1071 chr
= scm_getc (port
))
1073 s_bits
= scm_cons ((chr
== '0') ? SCM_BOOL_F
: SCM_BOOL_T
, s_bits
);
1077 scm_ungetc (chr
, port
);
1079 return scm_bitvector (scm_reverse_x (s_bits
, SCM_EOL
));
1083 scm_read_scsh_block_comment (scm_t_wchar chr
, SCM port
)
1087 /* We can use the get_byte here because there is no need to get the
1088 locale correct when reading comments. This presumes that
1089 hash and exclamation points always represent themselves no
1090 matter what the source encoding is.*/
1093 int c
= scm_get_byte_or_eof (port
);
1096 scm_i_input_error ("skip_block_comment", port
,
1097 "unterminated `#! ... !#' comment", SCM_EOL
);
1101 else if (c
== '#' && bang_seen
)
1107 return SCM_UNSPECIFIED
;
1111 scm_read_shebang (scm_t_wchar chr
, SCM port
)
1114 if ((c
= scm_get_byte_or_eof (port
)) != 'r')
1116 scm_ungetc (c
, port
);
1117 return scm_read_scsh_block_comment (chr
, port
);
1119 if ((c
= scm_get_byte_or_eof (port
)) != '6')
1121 scm_ungetc (c
, port
);
1122 scm_ungetc ('r', port
);
1123 return scm_read_scsh_block_comment (chr
, port
);
1125 if ((c
= scm_get_byte_or_eof (port
)) != 'r')
1127 scm_ungetc (c
, port
);
1128 scm_ungetc ('6', port
);
1129 scm_ungetc ('r', port
);
1130 return scm_read_scsh_block_comment (chr
, port
);
1132 if ((c
= scm_get_byte_or_eof (port
)) != 's')
1134 scm_ungetc (c
, port
);
1135 scm_ungetc ('r', port
);
1136 scm_ungetc ('6', port
);
1137 scm_ungetc ('r', port
);
1138 return scm_read_scsh_block_comment (chr
, port
);
1141 return SCM_UNSPECIFIED
;
1145 scm_read_r6rs_block_comment (scm_t_wchar chr
, SCM port
)
1147 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1148 nested. So care must be taken. */
1149 int nesting_level
= 1;
1150 int opening_seen
= 0, closing_seen
= 0;
1152 while (nesting_level
> 0)
1154 int c
= scm_getc (port
);
1157 scm_i_input_error ("scm_read_r6rs_block_comment", port
,
1158 "unterminated `#| ... |#' comment", SCM_EOL
);
1166 else if (closing_seen
)
1177 opening_seen
= closing_seen
= 0;
1180 return SCM_UNSPECIFIED
;
1184 scm_read_commented_expression (scm_t_wchar chr
, SCM port
)
1188 c
= flush_ws (port
, (char *) NULL
);
1190 scm_i_input_error ("read_commented_expression", port
,
1191 "no expression after #; comment", SCM_EOL
);
1192 scm_ungetc (c
, port
);
1193 scm_read_expression (port
);
1194 return SCM_UNSPECIFIED
;
1198 scm_read_extended_symbol (scm_t_wchar chr
, SCM port
)
1200 /* Guile's extended symbol read syntax looks like this:
1202 #{This is all a symbol name}#
1204 So here, CHR is expected to be `{'. */
1205 int saw_brace
= 0, finished
= 0;
1207 SCM buf
= scm_i_make_string (1024, NULL
);
1209 buf
= scm_i_string_start_writing (buf
);
1211 while ((chr
= scm_getc (port
)) != EOF
)
1223 scm_i_string_set_x (buf
, len
++, '}');
1224 scm_i_string_set_x (buf
, len
++, chr
);
1227 else if (chr
== '}')
1230 scm_i_string_set_x (buf
, len
++, chr
);
1232 if (len
>= scm_i_string_length (buf
) - 2)
1236 scm_i_string_stop_writing ();
1237 addy
= scm_i_make_string (1024, NULL
);
1238 buf
= scm_string_append (scm_list_2 (buf
, addy
));
1240 buf
= scm_i_string_start_writing (buf
);
1246 scm_i_string_stop_writing ();
1248 return (scm_string_to_symbol (scm_c_substring (buf
, 0, len
)));
1253 /* Top-level token readers, i.e., dispatchers. */
1256 scm_read_sharp_extension (int chr
, SCM port
)
1260 proc
= scm_get_hash_procedure (chr
);
1261 if (scm_is_true (scm_procedure_p (proc
)))
1263 long line
= SCM_LINUM (port
);
1264 int column
= SCM_COL (port
) - 2;
1267 got
= scm_call_2 (proc
, SCM_MAKE_CHAR (chr
), port
);
1268 if (!scm_is_eq (got
, SCM_UNSPECIFIED
))
1270 if (SCM_RECORD_POSITIONS_P
)
1271 return (recsexpr (got
, line
, column
,
1272 SCM_FILENAME (port
)));
1278 return SCM_UNSPECIFIED
;
1281 /* The reader for the sharp `#' character. It basically dispatches reads
1282 among the above token readers. */
1284 scm_read_sharp (scm_t_wchar chr
, SCM port
)
1285 #define FUNC_NAME "scm_lreadr"
1289 chr
= scm_getc (port
);
1291 result
= scm_read_sharp_extension (chr
, port
);
1292 if (!scm_is_eq (result
, SCM_UNSPECIFIED
))
1298 return (scm_read_character (chr
, port
));
1300 return (scm_read_vector (chr
, port
));
1304 /* This one may return either a boolean or an SRFI-4 vector. */
1305 return (scm_read_srfi4_vector (chr
, port
));
1307 return (scm_read_bytevector (chr
, port
));
1309 return (scm_read_guile_bit_vector (chr
, port
));
1313 /* This one may return either a boolean or an SRFI-4 vector. */
1314 return (scm_read_boolean (chr
, port
));
1316 return (scm_read_keyword (chr
, port
));
1317 case '0': case '1': case '2': case '3': case '4':
1318 case '5': case '6': case '7': case '8': case '9':
1320 #if SCM_ENABLE_DEPRECATED
1321 /* See below for 'i' and 'e'. */
1328 return (scm_i_read_array (port
, chr
));
1332 #if SCM_ENABLE_DEPRECATED
1334 /* When next char is '(', it really is an old-style
1336 scm_t_wchar next_c
= scm_getc (port
);
1338 scm_ungetc (next_c
, port
);
1340 return scm_i_read_array (port
, chr
);
1354 return (scm_read_number_and_radix (chr
, port
));
1356 return (scm_read_extended_symbol (chr
, port
));
1358 return (scm_read_shebang (chr
, port
));
1360 return (scm_read_commented_expression (chr
, port
));
1364 return (scm_read_syntax (chr
, port
));
1366 return (scm_read_nil (chr
, port
));
1368 result
= scm_read_sharp_extension (chr
, port
);
1369 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1371 /* To remain compatible with 1.8 and earlier, the following
1372 characters have lower precedence than `read-hash-extend'
1377 return scm_read_r6rs_block_comment (chr
, port
);
1379 scm_i_input_error (FUNC_NAME
, port
, "Unknown # object: ~S",
1380 scm_list_1 (SCM_MAKE_CHAR (chr
)));
1387 return SCM_UNSPECIFIED
;
1392 scm_read_expression (SCM port
)
1393 #define FUNC_NAME "scm_read_expression"
1397 register scm_t_wchar chr
;
1399 chr
= scm_getc (port
);
1403 case SCM_WHITE_SPACES
:
1404 case SCM_LINE_INCREMENTORS
:
1407 (void) scm_read_semicolon_comment (chr
, port
);
1410 if (!SCM_SQUARE_BRACKETS_P
)
1411 return (scm_read_mixed_case_symbol (chr
, port
));
1412 /* otherwise fall through */
1414 return (scm_read_sexp (chr
, port
));
1416 return (scm_read_string (chr
, port
));
1420 return (scm_read_quote (chr
, port
));
1424 result
= scm_read_sharp (chr
, port
);
1425 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1426 /* We read a comment or some such. */
1432 scm_i_input_error (FUNC_NAME
, port
, "unexpected \")\"", SCM_EOL
);
1437 if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE
), scm_keyword_prefix
))
1438 return scm_symbol_to_keyword (scm_read_expression (port
));
1443 if (((chr
>= '0') && (chr
<= '9'))
1444 || (strchr ("+-.", chr
)))
1445 return (scm_read_number (chr
, port
));
1447 return (scm_read_mixed_case_symbol (chr
, port
));
1455 /* Actual reader. */
1457 SCM_DEFINE (scm_read
, "read", 0, 1, 0,
1459 "Read an s-expression from the input port @var{port}, or from\n"
1460 "the current input port if @var{port} is not specified.\n"
1461 "Any whitespace before the next token is discarded.")
1462 #define FUNC_NAME s_scm_read
1466 if (SCM_UNBNDP (port
))
1467 port
= scm_current_input_port ();
1468 SCM_VALIDATE_OPINPORT (1, port
);
1470 c
= flush_ws (port
, (char *) NULL
);
1473 scm_ungetc (c
, port
);
1475 return (scm_read_expression (port
));
1482 /* Used when recording expressions constructed by `scm_read_sharp ()'. */
1484 recsexpr (SCM obj
, long line
, int column
, SCM filename
)
1486 if (!scm_is_pair(obj
)) {
1489 SCM tmp
= obj
, copy
;
1490 /* If this sexpr is visible in the read:sharp source, we want to
1491 keep that information, so only record non-constant cons cells
1492 which haven't previously been read by the reader. */
1493 if (scm_is_false (scm_whash_lookup (scm_source_whash
, obj
)))
1495 if (SCM_COPY_SOURCE_P
)
1497 copy
= scm_cons (recsexpr (SCM_CAR (obj
), line
, column
, filename
),
1499 while ((tmp
= SCM_CDR (tmp
)) && scm_is_pair (tmp
))
1501 SCM_SETCDR (copy
, scm_cons (recsexpr (SCM_CAR (tmp
),
1506 copy
= SCM_CDR (copy
);
1508 SCM_SETCDR (copy
, tmp
);
1512 recsexpr (SCM_CAR (obj
), line
, column
, filename
);
1513 while ((tmp
= SCM_CDR (tmp
)) && scm_is_pair (tmp
))
1514 recsexpr (SCM_CAR (tmp
), line
, column
, filename
);
1515 copy
= SCM_UNDEFINED
;
1517 scm_whash_insert (scm_source_whash
,
1519 scm_make_srcprops (line
,
1529 /* Manipulate the read-hash-procedures alist. This could be written in
1530 Scheme, but maybe it will also be used by C code during initialisation. */
1531 SCM_DEFINE (scm_read_hash_extend
, "read-hash-extend", 2, 0, 0,
1532 (SCM chr
, SCM proc
),
1533 "Install the procedure @var{proc} for reading expressions\n"
1534 "starting with the character sequence @code{#} and @var{chr}.\n"
1535 "@var{proc} will be called with two arguments: the character\n"
1536 "@var{chr} and the port to read further data from. The object\n"
1537 "returned will be the return value of @code{read}. \n"
1538 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1540 #define FUNC_NAME s_scm_read_hash_extend
1545 SCM_VALIDATE_CHAR (1, chr
);
1546 SCM_ASSERT (scm_is_false (proc
)
1547 || scm_is_eq (scm_procedure_p (proc
), SCM_BOOL_T
),
1548 proc
, SCM_ARG2
, FUNC_NAME
);
1550 /* Check if chr is already in the alist. */
1551 this = *scm_read_hash_procedures
;
1555 if (scm_is_null (this))
1557 /* not found, so add it to the beginning. */
1558 if (scm_is_true (proc
))
1560 *scm_read_hash_procedures
=
1561 scm_cons (scm_cons (chr
, proc
), *scm_read_hash_procedures
);
1565 if (scm_is_eq (chr
, SCM_CAAR (this)))
1567 /* already in the alist. */
1568 if (scm_is_false (proc
))
1571 if (scm_is_false (prev
))
1573 *scm_read_hash_procedures
=
1574 SCM_CDR (*scm_read_hash_procedures
);
1577 scm_set_cdr_x (prev
, SCM_CDR (this));
1582 scm_set_cdr_x (SCM_CAR (this), proc
);
1587 this = SCM_CDR (this);
1590 return SCM_UNSPECIFIED
;
1594 /* Recover the read-hash procedure corresponding to char c. */
1596 scm_get_hash_procedure (int c
)
1598 SCM rest
= *scm_read_hash_procedures
;
1602 if (scm_is_null (rest
))
1605 if (SCM_CHAR (SCM_CAAR (rest
)) == c
)
1606 return SCM_CDAR (rest
);
1608 rest
= SCM_CDR (rest
);
1612 #define SCM_ENCODING_SEARCH_SIZE (500)
1614 /* Search the first few hundred characters of a file for an Emacs-like coding
1615 declaration. Returns either NULL or a string whose storage has been
1616 allocated with `scm_gc_malloc ()'. */
1618 scm_i_scan_for_encoding (SCM port
)
1620 char header
[SCM_ENCODING_SEARCH_SIZE
+1];
1621 size_t bytes_read
, encoding_length
, i
;
1622 char *encoding
= NULL
;
1624 char *pos
, *encoding_start
;
1627 if (SCM_FPORTP (port
) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port
)))
1628 /* PORT is a non-seekable file port (e.g., as created by Bash when using
1629 "guile <(echo '(display "hello")')") so bail out. */
1632 bytes_read
= scm_c_read (port
, header
, SCM_ENCODING_SEARCH_SIZE
);
1634 scm_seek (port
, scm_from_int (0), scm_from_int (SEEK_SET
));
1637 && header
[0] == '\xef' && header
[1] == '\xbb' && header
[2] == '\xbf')
1640 /* search past "coding[:=]" */
1644 if ((pos
= strstr(pos
, "coding")) == NULL
)
1647 pos
+= strlen("coding");
1648 if (pos
- header
>= SCM_ENCODING_SEARCH_SIZE
||
1649 (*pos
== ':' || *pos
== '='))
1657 while (pos
- header
<= SCM_ENCODING_SEARCH_SIZE
&&
1658 (*pos
== ' ' || *pos
== '\t'))
1661 /* grab the next token */
1662 encoding_start
= pos
;
1664 while (encoding_start
+ i
- header
<= SCM_ENCODING_SEARCH_SIZE
1665 && encoding_start
+ i
- header
< bytes_read
1666 && (isalnum ((int) encoding_start
[i
])
1667 || strchr ("_-.:/,+=()", encoding_start
[i
]) != NULL
))
1670 encoding_length
= i
;
1671 if (encoding_length
== 0)
1674 encoding
= scm_gc_strndup (encoding_start
, encoding_length
, "encoding");
1675 for (i
= 0; i
< encoding_length
; i
++)
1676 encoding
[i
] = toupper ((int) encoding
[i
]);
1678 /* push backwards to make sure we were in a comment */
1680 pos
= encoding_start
;
1681 while (pos
>= header
)
1685 /* This wasn't in a semicolon comment. Check for a
1686 hash-bang comment. */
1687 char *beg
= strstr (header
, "#!");
1688 char *end
= strstr (header
, "!#");
1689 if (beg
< encoding_start
&& encoding_start
+ encoding_length
< end
)
1701 /* This wasn't in a comment */
1704 if (utf8_bom
&& strcmp(encoding
, "UTF-8"))
1705 scm_misc_error (NULL
,
1706 "the port input declares the encoding ~s but is encoded as UTF-8",
1707 scm_list_1 (scm_from_locale_string (encoding
)));
1712 SCM_DEFINE (scm_file_encoding
, "file-encoding", 1, 0, 0,
1714 "Scans the port for an Emacs-like character coding declaration\n"
1715 "near the top of the contents of a port with random-acessible contents.\n"
1716 "The coding declaration is of the form\n"
1717 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1719 "Returns a string containing the character encoding of the file\n"
1720 "if a declaration was found, or @code{#f} otherwise.\n")
1721 #define FUNC_NAME s_scm_file_encoding
1726 enc
= scm_i_scan_for_encoding (port
);
1731 s_enc
= scm_from_locale_string (enc
);
1742 scm_read_hash_procedures
=
1743 SCM_VARIABLE_LOC (scm_c_define ("read-hash-procedures", SCM_EOL
));
1745 scm_init_opts (scm_read_options
, scm_read_opts
);
1746 #include "libguile/read.x"