1 /* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009, 2010 Free Software
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
33 #include "libguile/_scm.h"
34 #include "libguile/bytevectors.h"
35 #include "libguile/chars.h"
36 #include "libguile/eval.h"
37 #include "libguile/arrays.h"
38 #include "libguile/bitvectors.h"
39 #include "libguile/keywords.h"
40 #include "libguile/alist.h"
41 #include "libguile/srcprop.h"
42 #include "libguile/hashtab.h"
43 #include "libguile/hash.h"
44 #include "libguile/ports.h"
45 #include "libguile/fports.h"
46 #include "libguile/root.h"
47 #include "libguile/strings.h"
48 #include "libguile/strports.h"
49 #include "libguile/vectors.h"
50 #include "libguile/validate.h"
51 #include "libguile/srfi-4.h"
52 #include "libguile/srfi-13.h"
54 #include "libguile/read.h"
55 #include "libguile/private-options.h"
60 SCM_GLOBAL_SYMBOL (scm_sym_dot
, ".");
61 SCM_SYMBOL (scm_keyword_prefix
, "prefix");
62 SCM_SYMBOL (scm_keyword_postfix
, "postfix");
63 SCM_SYMBOL (sym_nil
, "nil");
65 scm_t_option scm_read_opts
[] = {
66 { SCM_OPTION_BOOLEAN
, "copy", 0,
67 "Copy source code expressions." },
68 { SCM_OPTION_BOOLEAN
, "positions", 0,
69 "Record positions of source code expressions." },
70 { SCM_OPTION_BOOLEAN
, "case-insensitive", 0,
71 "Convert symbols to lower case."},
72 { SCM_OPTION_SCM
, "keywords", (unsigned long) SCM_BOOL_F
,
73 "Style of keyword recognition: #f, 'prefix or 'postfix."},
74 { SCM_OPTION_BOOLEAN
, "elisp-vectors", 0,
75 "Support Elisp vector syntax, namely `[...]'."},
76 { SCM_OPTION_BOOLEAN
, "elisp-strings", 0,
77 "Support `\\(' and `\\)' in strings."},
78 { SCM_OPTION_BOOLEAN
, "r6rs-hex-escapes", 0,
79 "Use R6RS variable-length character and string hex escapes."},
80 { SCM_OPTION_BOOLEAN
, "square-brackets", 1,
81 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
86 Give meaningful error messages for errors
90 FILE:LINE:COL: MESSAGE
93 This is not standard GNU format, but the test-suite likes the real
94 message to be in front.
100 scm_i_input_error (char const *function
,
101 SCM port
, const char *message
, SCM arg
)
103 SCM fn
= (scm_is_string (SCM_FILENAME(port
))
105 : scm_from_locale_string ("#<unknown port>"));
107 SCM string_port
= scm_open_output_string ();
108 SCM string
= SCM_EOL
;
109 scm_simple_format (string_port
,
110 scm_from_locale_string ("~A:~S:~S: ~A"),
112 scm_from_long (SCM_LINUM (port
) + 1),
113 scm_from_int (SCM_COL (port
) + 1),
114 scm_from_locale_string (message
)));
116 string
= scm_get_output_string (string_port
);
117 scm_close_output_port (string_port
);
118 scm_error_scm (scm_from_locale_symbol ("read-error"),
119 function
? scm_from_locale_string (function
) : SCM_BOOL_F
,
126 SCM_DEFINE (scm_read_options
, "read-options-interface", 0, 1, 0,
128 "Option interface for the read options. Instead of using\n"
129 "this procedure directly, use the procedures @code{read-enable},\n"
130 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
131 #define FUNC_NAME s_scm_read_options
133 SCM ans
= scm_options (setting
,
136 if (SCM_COPY_SOURCE_P
)
137 SCM_RECORD_POSITIONS_P
= 1;
142 /* An association list mapping extra hash characters to procedures. */
143 static SCM
*scm_read_hash_procedures
;
150 /* Size of the C buffer used to read symbols and numbers. */
151 #define READER_BUFFER_SIZE 128
153 /* Size of the C buffer used to read strings. */
154 #define READER_STRING_BUFFER_SIZE 512
156 /* The maximum size of Scheme character names. */
157 #define READER_CHAR_NAME_MAX_SIZE 50
160 /* `isblank' is only in C99. */
161 #define CHAR_IS_BLANK_(_chr) \
162 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
163 || ((_chr) == '\f') || ((_chr) == '\r'))
166 # define CHAR_IS_BLANK(_chr) \
167 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
169 # define CHAR_IS_BLANK CHAR_IS_BLANK_
173 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
175 #define CHAR_IS_R5RS_DELIMITER(c) \
177 || (c == ')') || (c == '(') || (c == ';') || (c == '"') \
178 || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
180 #define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
182 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
184 #define CHAR_IS_EXPONENT_MARKER(_chr) \
185 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
186 || ((_chr) == 'd') || ((_chr) == 'l'))
188 /* Read an SCSH block comment. */
189 static inline SCM
scm_read_scsh_block_comment (scm_t_wchar
, SCM
);
190 static SCM
scm_read_r6rs_block_comment (scm_t_wchar
, SCM
);
191 static SCM
scm_read_commented_expression (scm_t_wchar
, SCM
);
192 static SCM
scm_get_hash_procedure (int);
194 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
195 result in the pre-allocated buffer BUF. Return zero if the whole token has
196 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
197 bytes actually read. */
199 read_token (SCM port
, char *buf
, const size_t buf_size
, size_t *read
)
203 while (*read
< buf_size
)
207 chr
= scm_get_byte_or_eof (port
);
211 else if (CHAR_IS_DELIMITER (chr
))
213 scm_unget_byte (chr
, port
);
226 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
227 result in the pre-allocated buffer BUFFER, if the whole token has fewer than
228 BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
229 caller. Return zero if the token fits in BUFFER, non-zero otherwise. READ
230 will be set the number of bytes actually read. */
232 read_complete_token (SCM port
, char *buffer
, const size_t buffer_size
,
233 char **overflow_buffer
, size_t *read
)
236 size_t bytes_read
, overflow_size
;
238 *overflow_buffer
= NULL
;
243 overflow
= read_token (port
, buffer
, buffer_size
, &bytes_read
);
246 if (overflow
|| overflow_size
!= 0)
248 if (overflow_size
== 0)
250 *overflow_buffer
= scm_malloc (bytes_read
);
251 memcpy (*overflow_buffer
, buffer
, bytes_read
);
252 overflow_size
= bytes_read
;
256 *overflow_buffer
= scm_realloc (*overflow_buffer
, overflow_size
+ bytes_read
);
257 memcpy (*overflow_buffer
+ overflow_size
, buffer
, bytes_read
);
258 overflow_size
+= bytes_read
;
265 *read
= overflow_size
;
269 return (overflow_size
!= 0);
272 /* Skip whitespace from PORT and return the first non-whitespace character
273 read. Raise an error on end-of-file. */
275 flush_ws (SCM port
, const char *eoferr
)
277 register scm_t_wchar c
;
279 switch (c
= scm_getc (port
))
285 scm_i_input_error (eoferr
,
294 switch (c
= scm_getc (port
))
300 case SCM_LINE_INCREMENTORS
:
306 switch (c
= scm_getc (port
))
309 eoferr
= "read_sharp";
312 scm_read_scsh_block_comment (c
, port
);
315 scm_read_commented_expression (c
, port
);
318 if (scm_is_false (scm_get_hash_procedure (c
)))
320 scm_read_r6rs_block_comment (c
, port
);
325 scm_ungetc (c
, port
);
330 case SCM_LINE_INCREMENTORS
:
331 case SCM_SINGLE_SPACES
:
346 static SCM
scm_read_expression (SCM port
);
347 static SCM
scm_read_sharp (int chr
, SCM port
);
348 static SCM
recsexpr (SCM obj
, long line
, int column
, SCM filename
);
352 scm_read_sexp (scm_t_wchar chr
, SCM port
)
353 #define FUNC_NAME "scm_i_lreadparen"
357 register SCM tl
, ans
= SCM_EOL
;
358 SCM tl2
= SCM_EOL
, ans2
= SCM_EOL
, copy
= SCM_BOOL_F
;
359 const int terminating_char
= ((chr
== '[') ? ']' : ')');
361 /* Need to capture line and column numbers here. */
362 long line
= SCM_LINUM (port
);
363 int column
= SCM_COL (port
) - 1;
366 c
= flush_ws (port
, FUNC_NAME
);
367 if (terminating_char
== c
)
370 scm_ungetc (c
, port
);
371 if (scm_is_eq (scm_sym_dot
,
372 (tmp
= scm_read_expression (port
))))
374 ans
= scm_read_expression (port
);
375 if (terminating_char
!= (c
= flush_ws (port
, FUNC_NAME
)))
376 scm_i_input_error (FUNC_NAME
, port
, "missing close paren",
381 /* Build the head of the list structure. */
382 ans
= tl
= scm_cons (tmp
, SCM_EOL
);
384 if (SCM_COPY_SOURCE_P
)
385 ans2
= tl2
= scm_cons (scm_is_pair (tmp
)
390 while (terminating_char
!= (c
= flush_ws (port
, FUNC_NAME
)))
394 scm_ungetc (c
, port
);
395 if (scm_is_eq (scm_sym_dot
,
396 (tmp
= scm_read_expression (port
))))
398 SCM_SETCDR (tl
, tmp
= scm_read_expression (port
));
400 if (SCM_COPY_SOURCE_P
)
401 SCM_SETCDR (tl2
, scm_cons (scm_is_pair (tmp
) ? copy
: tmp
,
404 c
= flush_ws (port
, FUNC_NAME
);
405 if (terminating_char
!= c
)
406 scm_i_input_error (FUNC_NAME
, port
,
407 "in pair: missing close paren", SCM_EOL
);
411 new_tail
= scm_cons (tmp
, SCM_EOL
);
412 SCM_SETCDR (tl
, new_tail
);
415 if (SCM_COPY_SOURCE_P
)
417 SCM new_tail2
= scm_cons (scm_is_pair (tmp
)
420 SCM_SETCDR (tl2
, new_tail2
);
426 if (SCM_RECORD_POSITIONS_P
)
427 scm_whash_insert (scm_source_whash
,
429 scm_make_srcprops (line
, column
,
440 /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
441 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
443 #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
449 while (i < ndigits) \
451 a = scm_getc (port); \
455 && (a == (scm_t_wchar) terminator) \
458 if ('0' <= a && a <= '9') \
460 else if ('A' <= a && a <= 'F') \
462 else if ('a' <= a && a <= 'f') \
475 scm_read_string (int chr
, SCM port
)
476 #define FUNC_NAME "scm_lreadr"
478 /* For strings smaller than C_STR, this function creates only one Scheme
479 object (the string returned). */
481 SCM str
= SCM_BOOL_F
;
482 unsigned c_str_len
= 0;
485 str
= scm_i_make_string (READER_STRING_BUFFER_SIZE
, NULL
);
486 while ('"' != (c
= scm_getc (port
)))
491 scm_i_input_error (FUNC_NAME
, port
,
492 "end of file in string constant", SCM_EOL
);
495 if (c_str_len
+ 1 >= scm_i_string_length (str
))
497 SCM addy
= scm_i_make_string (READER_STRING_BUFFER_SIZE
, NULL
);
499 str
= scm_string_append (scm_list_2 (str
, addy
));
504 switch (c
= scm_getc (port
))
513 if (SCM_ESCAPED_PARENS_P
)
543 if (SCM_R6RS_ESCAPES_P
)
544 SCM_READ_HEX_ESCAPE (10, ';');
546 SCM_READ_HEX_ESCAPE (2, '\0');
549 if (!SCM_R6RS_ESCAPES_P
)
551 SCM_READ_HEX_ESCAPE (4, '\0');
555 if (!SCM_R6RS_ESCAPES_P
)
557 SCM_READ_HEX_ESCAPE (6, '\0');
562 scm_i_input_error (FUNC_NAME
, port
,
563 "illegal character in escape sequence: ~S",
564 scm_list_1 (SCM_MAKE_CHAR (c
)));
567 str
= scm_i_string_start_writing (str
);
568 scm_i_string_set_x (str
, c_str_len
++, c
);
569 scm_i_string_stop_writing ();
574 return scm_i_substring_copy (str
, 0, c_str_len
);
583 scm_read_number (scm_t_wchar chr
, SCM port
)
585 SCM result
, str
= SCM_EOL
;
586 char buffer
[READER_BUFFER_SIZE
];
587 char *overflow_buffer
= NULL
;
590 scm_t_port
*pt
= SCM_PTAB_ENTRY (port
);
592 scm_ungetc (chr
, port
);
593 overflow
= read_complete_token (port
, buffer
, sizeof (buffer
),
594 &overflow_buffer
, &bytes_read
);
597 str
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
, pt
->ilseq_handler
);
599 str
= scm_from_stringn (overflow_buffer
, bytes_read
, pt
->encoding
,
602 result
= scm_string_to_number (str
, SCM_UNDEFINED
);
603 if (!scm_is_true (result
))
605 /* Return a symbol instead of a number */
606 if (SCM_CASE_INSENSITIVE_P
)
607 str
= scm_string_downcase_x (str
);
608 result
= scm_string_to_symbol (str
);
612 free (overflow_buffer
);
613 SCM_COL (port
) += scm_i_string_length (str
);
618 scm_read_mixed_case_symbol (scm_t_wchar chr
, SCM port
)
621 int ends_with_colon
= 0;
623 int postfix
= scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE
), scm_keyword_postfix
);
625 char buffer
[READER_BUFFER_SIZE
], *overflow_buffer
;
626 scm_t_port
*pt
= SCM_PTAB_ENTRY (port
);
629 scm_ungetc (chr
, port
);
630 overflow
= read_complete_token (port
, buffer
, READER_BUFFER_SIZE
,
631 &overflow_buffer
, &bytes_read
);
635 ends_with_colon
= buffer
[bytes_read
- 1] == ':';
637 ends_with_colon
= overflow_buffer
[bytes_read
- 1] == ':';
640 if (postfix
&& ends_with_colon
&& (bytes_read
> 1))
643 str
= scm_from_stringn (buffer
, bytes_read
- 1, pt
->encoding
, pt
->ilseq_handler
);
645 str
= scm_from_stringn (overflow_buffer
, bytes_read
- 1, pt
->encoding
,
648 if (SCM_CASE_INSENSITIVE_P
)
649 str
= scm_string_downcase_x (str
);
650 result
= scm_symbol_to_keyword (scm_string_to_symbol (str
));
655 str
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
, pt
->ilseq_handler
);
657 str
= scm_from_stringn (overflow_buffer
, bytes_read
, pt
->encoding
,
660 if (SCM_CASE_INSENSITIVE_P
)
661 str
= scm_string_downcase_x (str
);
662 result
= scm_string_to_symbol (str
);
666 free (overflow_buffer
);
667 SCM_COL (port
) += scm_i_string_length (str
);
672 scm_read_number_and_radix (scm_t_wchar chr
, SCM port
)
673 #define FUNC_NAME "scm_lreadr"
677 char buffer
[READER_BUFFER_SIZE
], *overflow_buffer
;
706 scm_ungetc (chr
, port
);
707 scm_ungetc ('#', port
);
711 overflow
= read_complete_token (port
, buffer
, sizeof (buffer
),
712 &overflow_buffer
, &read
);
714 pt
= SCM_PTAB_ENTRY (port
);
716 str
= scm_from_stringn (buffer
, read
, pt
->encoding
, pt
->ilseq_handler
);
718 str
= scm_from_stringn (overflow_buffer
, read
, pt
->encoding
,
721 result
= scm_string_to_number (str
, scm_from_uint (radix
));
724 free (overflow_buffer
);
726 SCM_COL (port
) += scm_i_string_length (str
);
728 if (scm_is_true (result
))
731 scm_i_input_error (FUNC_NAME
, port
, "unknown # object", SCM_EOL
);
738 scm_read_quote (int chr
, SCM port
)
741 long line
= SCM_LINUM (port
);
742 int column
= SCM_COL (port
) - 1;
747 p
= scm_sym_quasiquote
;
760 p
= scm_sym_uq_splicing
;
763 scm_ungetc (c
, port
);
770 fprintf (stderr
, "%s: unhandled quote character (%i)\n",
771 "scm_read_quote", chr
);
775 p
= scm_cons2 (p
, scm_read_expression (port
), SCM_EOL
);
776 if (SCM_RECORD_POSITIONS_P
)
777 scm_whash_insert (scm_source_whash
, p
,
778 scm_make_srcprops (line
, column
,
781 ? (scm_cons2 (SCM_CAR (p
),
782 SCM_CAR (SCM_CDR (p
)),
791 SCM_SYMBOL (sym_syntax
, "syntax");
792 SCM_SYMBOL (sym_quasisyntax
, "quasisyntax");
793 SCM_SYMBOL (sym_unsyntax
, "unsyntax");
794 SCM_SYMBOL (sym_unsyntax_splicing
, "unsyntax-splicing");
797 scm_read_syntax (int chr
, SCM port
)
800 long line
= SCM_LINUM (port
);
801 int column
= SCM_COL (port
) - 1;
819 p
= sym_unsyntax_splicing
;
822 scm_ungetc (c
, port
);
829 fprintf (stderr
, "%s: unhandled syntax character (%i)\n",
830 "scm_read_syntax", chr
);
834 p
= scm_cons2 (p
, scm_read_expression (port
), SCM_EOL
);
835 if (SCM_RECORD_POSITIONS_P
)
836 scm_whash_insert (scm_source_whash
, p
,
837 scm_make_srcprops (line
, column
,
840 ? (scm_cons2 (SCM_CAR (p
),
841 SCM_CAR (SCM_CDR (p
)),
851 scm_read_nil (int chr
, SCM port
)
853 SCM id
= scm_read_mixed_case_symbol (chr
, port
);
855 if (!scm_is_eq (id
, sym_nil
))
856 scm_i_input_error ("scm_read_nil", port
,
857 "unexpected input while reading #nil: ~a",
860 return SCM_ELISP_NIL
;
864 scm_read_semicolon_comment (int chr
, SCM port
)
868 /* We use the get_byte here because there is no need to get the
869 locale correct with comment input. This presumes that newline
870 always represents itself no matter what the encoding is. */
871 for (c
= scm_get_byte_or_eof (port
);
872 (c
!= EOF
) && (c
!= '\n');
873 c
= scm_get_byte_or_eof (port
));
875 return SCM_UNSPECIFIED
;
879 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
882 scm_read_boolean (int chr
, SCM port
)
895 return SCM_UNSPECIFIED
;
899 scm_read_character (scm_t_wchar chr
, SCM port
)
900 #define FUNC_NAME "scm_lreadr"
902 char buffer
[READER_CHAR_NAME_MAX_SIZE
];
904 size_t charname_len
, bytes_read
;
909 overflow
= read_token (port
, buffer
, READER_CHAR_NAME_MAX_SIZE
, &bytes_read
);
915 chr
= scm_getc (port
);
917 scm_i_input_error (FUNC_NAME
, port
, "unexpected end of file "
918 "while reading character", SCM_EOL
);
920 /* CHR must be a token delimiter, like a whitespace. */
921 return (SCM_MAKE_CHAR (chr
));
924 pt
= SCM_PTAB_ENTRY (port
);
926 /* Simple ASCII characters can be processed immediately. Also, simple
927 ISO-8859-1 characters can be processed immediately if the encoding for this
928 port is ISO-8859-1. */
929 if (bytes_read
== 1 && ((unsigned char) buffer
[0] <= 127 || pt
->encoding
== NULL
))
932 return SCM_MAKE_CHAR (buffer
[0]);
935 /* Otherwise, convert the buffer into a proper scheme string for
937 charname
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
,
939 charname_len
= scm_i_string_length (charname
);
940 SCM_COL (port
) += charname_len
;
941 cp
= scm_i_string_ref (charname
, 0);
942 if (charname_len
== 1)
943 return SCM_MAKE_CHAR (cp
);
945 /* Ignore dotted circles, which may be used to keep combining characters from
946 combining with the backslash in #\charname. */
947 if (cp
== SCM_CODEPOINT_DOTTED_CIRCLE
&& charname_len
== 2)
948 return SCM_MAKE_CHAR (scm_i_string_ref (charname
, 1));
950 if (cp
>= '0' && cp
< '8')
952 /* Dirk:FIXME:: This type of character syntax is not R5RS
953 * compliant. Further, it should be verified that the constant
954 * does only consist of octal digits. */
955 SCM p
= scm_string_to_number (charname
, scm_from_uint (8));
958 scm_t_wchar c
= SCM_I_INUM (p
);
959 if (SCM_IS_UNICODE_CHAR (c
))
960 return SCM_MAKE_CHAR (c
);
962 scm_i_input_error (FUNC_NAME
, port
,
963 "out-of-range octal character escape: ~a",
964 scm_list_1 (charname
));
968 if (cp
== 'x' && (charname_len
> 1) && SCM_R6RS_ESCAPES_P
)
972 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
973 p
= scm_string_to_number (scm_c_substring (charname
, 1, charname_len
),
977 scm_t_wchar c
= SCM_I_INUM (p
);
978 if (SCM_IS_UNICODE_CHAR (c
))
979 return SCM_MAKE_CHAR (c
);
981 scm_i_input_error (FUNC_NAME
, port
,
982 "out-of-range hex character escape: ~a",
983 scm_list_1 (charname
));
987 /* The names of characters should never have non-Latin1
989 if (scm_i_is_narrow_string (charname
)
990 || scm_i_try_narrow_string (charname
))
991 { SCM ch
= scm_i_charname_to_char (scm_i_string_chars (charname
),
993 if (scm_is_true (ch
))
998 scm_i_input_error (FUNC_NAME
, port
, "unknown character name ~a",
999 scm_list_1 (charname
));
1001 return SCM_UNSPECIFIED
;
1006 scm_read_keyword (int chr
, SCM port
)
1010 /* Read the symbol that comprises the keyword. Doing this instead of
1011 invoking a specific symbol reader function allows `scm_read_keyword ()'
1012 to adapt to the delimiters currently valid of symbols.
1014 XXX: This implementation allows sloppy syntaxes like `#: key'. */
1015 symbol
= scm_read_expression (port
);
1016 if (!scm_is_symbol (symbol
))
1017 scm_i_input_error ("scm_read_keyword", port
,
1018 "keyword prefix `~a' not followed by a symbol: ~s",
1019 scm_list_2 (SCM_MAKE_CHAR (chr
), symbol
));
1021 return (scm_symbol_to_keyword (symbol
));
1025 scm_read_vector (int chr
, SCM port
)
1027 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1028 guarantee that it's going to do what we want. After all, this is an
1029 implementation detail of `scm_read_vector ()', not a desirable
1031 return (scm_vector (scm_read_sexp (chr
, port
)));
1035 scm_read_srfi4_vector (int chr
, SCM port
)
1037 return scm_i_read_array (port
, chr
);
1041 scm_read_bytevector (scm_t_wchar chr
, SCM port
)
1043 chr
= scm_getc (port
);
1047 chr
= scm_getc (port
);
1051 chr
= scm_getc (port
);
1055 return scm_u8_list_to_bytevector (scm_read_sexp (chr
, port
));
1058 scm_i_input_error ("read_bytevector", port
,
1059 "invalid bytevector prefix",
1060 SCM_MAKE_CHAR (chr
));
1061 return SCM_UNSPECIFIED
;
1065 scm_read_guile_bit_vector (scm_t_wchar chr
, SCM port
)
1067 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1068 terribly inefficient but who cares? */
1069 SCM s_bits
= SCM_EOL
;
1071 for (chr
= scm_getc (port
);
1072 (chr
!= EOF
) && ((chr
== '0') || (chr
== '1'));
1073 chr
= scm_getc (port
))
1075 s_bits
= scm_cons ((chr
== '0') ? SCM_BOOL_F
: SCM_BOOL_T
, s_bits
);
1079 scm_ungetc (chr
, port
);
1081 return scm_bitvector (scm_reverse_x (s_bits
, SCM_EOL
));
1085 scm_read_scsh_block_comment (scm_t_wchar chr
, SCM port
)
1089 /* We can use the get_byte here because there is no need to get the
1090 locale correct when reading comments. This presumes that
1091 hash and exclamation points always represent themselves no
1092 matter what the source encoding is.*/
1095 int c
= scm_get_byte_or_eof (port
);
1098 scm_i_input_error ("skip_block_comment", port
,
1099 "unterminated `#! ... !#' comment", SCM_EOL
);
1103 else if (c
== '#' && bang_seen
)
1109 return SCM_UNSPECIFIED
;
1113 scm_read_r6rs_block_comment (scm_t_wchar chr
, SCM port
)
1115 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1116 nested. So care must be taken. */
1117 int nesting_level
= 1;
1118 int opening_seen
= 0, closing_seen
= 0;
1120 while (nesting_level
> 0)
1122 int c
= scm_getc (port
);
1125 scm_i_input_error ("scm_read_r6rs_block_comment", port
,
1126 "unterminated `#| ... |#' comment", SCM_EOL
);
1134 else if (closing_seen
)
1145 opening_seen
= closing_seen
= 0;
1148 return SCM_UNSPECIFIED
;
1152 scm_read_commented_expression (scm_t_wchar chr
, SCM port
)
1156 c
= flush_ws (port
, (char *) NULL
);
1158 scm_i_input_error ("read_commented_expression", port
,
1159 "no expression after #; comment", SCM_EOL
);
1160 scm_ungetc (c
, port
);
1161 scm_read_expression (port
);
1162 return SCM_UNSPECIFIED
;
1166 scm_read_extended_symbol (scm_t_wchar chr
, SCM port
)
1168 /* Guile's extended symbol read syntax looks like this:
1170 #{This is all a symbol name}#
1172 So here, CHR is expected to be `{'. */
1173 int saw_brace
= 0, finished
= 0;
1175 SCM buf
= scm_i_make_string (1024, NULL
);
1177 buf
= scm_i_string_start_writing (buf
);
1179 while ((chr
= scm_getc (port
)) != EOF
)
1191 scm_i_string_set_x (buf
, len
++, '}');
1192 scm_i_string_set_x (buf
, len
++, chr
);
1195 else if (chr
== '}')
1198 scm_i_string_set_x (buf
, len
++, chr
);
1200 if (len
>= scm_i_string_length (buf
) - 2)
1204 scm_i_string_stop_writing ();
1205 addy
= scm_i_make_string (1024, NULL
);
1206 buf
= scm_string_append (scm_list_2 (buf
, addy
));
1208 buf
= scm_i_string_start_writing (buf
);
1214 scm_i_string_stop_writing ();
1216 return (scm_string_to_symbol (scm_c_substring (buf
, 0, len
)));
1221 /* Top-level token readers, i.e., dispatchers. */
1224 scm_read_sharp_extension (int chr
, SCM port
)
1228 proc
= scm_get_hash_procedure (chr
);
1229 if (scm_is_true (scm_procedure_p (proc
)))
1231 long line
= SCM_LINUM (port
);
1232 int column
= SCM_COL (port
) - 2;
1235 got
= scm_call_2 (proc
, SCM_MAKE_CHAR (chr
), port
);
1236 if (!scm_is_eq (got
, SCM_UNSPECIFIED
))
1238 if (SCM_RECORD_POSITIONS_P
)
1239 return (recsexpr (got
, line
, column
,
1240 SCM_FILENAME (port
)));
1246 return SCM_UNSPECIFIED
;
1249 /* The reader for the sharp `#' character. It basically dispatches reads
1250 among the above token readers. */
1252 scm_read_sharp (scm_t_wchar chr
, SCM port
)
1253 #define FUNC_NAME "scm_lreadr"
1257 chr
= scm_getc (port
);
1259 result
= scm_read_sharp_extension (chr
, port
);
1260 if (!scm_is_eq (result
, SCM_UNSPECIFIED
))
1266 return (scm_read_character (chr
, port
));
1268 return (scm_read_vector (chr
, port
));
1272 /* This one may return either a boolean or an SRFI-4 vector. */
1273 return (scm_read_srfi4_vector (chr
, port
));
1275 return (scm_read_bytevector (chr
, port
));
1277 return (scm_read_guile_bit_vector (chr
, port
));
1281 /* This one may return either a boolean or an SRFI-4 vector. */
1282 return (scm_read_boolean (chr
, port
));
1284 return (scm_read_keyword (chr
, port
));
1285 case '0': case '1': case '2': case '3': case '4':
1286 case '5': case '6': case '7': case '8': case '9':
1288 #if SCM_ENABLE_DEPRECATED
1289 /* See below for 'i' and 'e'. */
1296 return (scm_i_read_array (port
, chr
));
1300 #if SCM_ENABLE_DEPRECATED
1302 /* When next char is '(', it really is an old-style
1304 scm_t_wchar next_c
= scm_getc (port
);
1306 scm_ungetc (next_c
, port
);
1308 return scm_i_read_array (port
, chr
);
1322 return (scm_read_number_and_radix (chr
, port
));
1324 return (scm_read_extended_symbol (chr
, port
));
1326 return (scm_read_scsh_block_comment (chr
, port
));
1328 return (scm_read_commented_expression (chr
, port
));
1332 return (scm_read_syntax (chr
, port
));
1334 return (scm_read_nil (chr
, port
));
1336 result
= scm_read_sharp_extension (chr
, port
);
1337 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1339 /* To remain compatible with 1.8 and earlier, the following
1340 characters have lower precedence than `read-hash-extend'
1345 return scm_read_r6rs_block_comment (chr
, port
);
1347 scm_i_input_error (FUNC_NAME
, port
, "Unknown # object: ~S",
1348 scm_list_1 (SCM_MAKE_CHAR (chr
)));
1355 return SCM_UNSPECIFIED
;
1360 scm_read_expression (SCM port
)
1361 #define FUNC_NAME "scm_read_expression"
1365 register scm_t_wchar chr
;
1367 chr
= scm_getc (port
);
1371 case SCM_WHITE_SPACES
:
1372 case SCM_LINE_INCREMENTORS
:
1375 (void) scm_read_semicolon_comment (chr
, port
);
1378 if (!SCM_SQUARE_BRACKETS_P
)
1379 return (scm_read_mixed_case_symbol (chr
, port
));
1380 /* otherwise fall through */
1382 return (scm_read_sexp (chr
, port
));
1384 return (scm_read_string (chr
, port
));
1388 return (scm_read_quote (chr
, port
));
1392 result
= scm_read_sharp (chr
, port
);
1393 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1394 /* We read a comment or some such. */
1400 scm_i_input_error (FUNC_NAME
, port
, "unexpected \")\"", SCM_EOL
);
1405 if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE
), scm_keyword_prefix
))
1406 return scm_symbol_to_keyword (scm_read_expression (port
));
1411 if (((chr
>= '0') && (chr
<= '9'))
1412 || (strchr ("+-.", chr
)))
1413 return (scm_read_number (chr
, port
));
1415 return (scm_read_mixed_case_symbol (chr
, port
));
1423 /* Actual reader. */
1425 SCM_DEFINE (scm_read
, "read", 0, 1, 0,
1427 "Read an s-expression from the input port @var{port}, or from\n"
1428 "the current input port if @var{port} is not specified.\n"
1429 "Any whitespace before the next token is discarded.")
1430 #define FUNC_NAME s_scm_read
1434 if (SCM_UNBNDP (port
))
1435 port
= scm_current_input_port ();
1436 SCM_VALIDATE_OPINPORT (1, port
);
1438 c
= flush_ws (port
, (char *) NULL
);
1441 scm_ungetc (c
, port
);
1443 return (scm_read_expression (port
));
1450 /* Used when recording expressions constructed by `scm_read_sharp ()'. */
1452 recsexpr (SCM obj
, long line
, int column
, SCM filename
)
1454 if (!scm_is_pair(obj
)) {
1457 SCM tmp
= obj
, copy
;
1458 /* If this sexpr is visible in the read:sharp source, we want to
1459 keep that information, so only record non-constant cons cells
1460 which haven't previously been read by the reader. */
1461 if (scm_is_false (scm_whash_lookup (scm_source_whash
, obj
)))
1463 if (SCM_COPY_SOURCE_P
)
1465 copy
= scm_cons (recsexpr (SCM_CAR (obj
), line
, column
, filename
),
1467 while ((tmp
= SCM_CDR (tmp
)) && scm_is_pair (tmp
))
1469 SCM_SETCDR (copy
, scm_cons (recsexpr (SCM_CAR (tmp
),
1474 copy
= SCM_CDR (copy
);
1476 SCM_SETCDR (copy
, tmp
);
1480 recsexpr (SCM_CAR (obj
), line
, column
, filename
);
1481 while ((tmp
= SCM_CDR (tmp
)) && scm_is_pair (tmp
))
1482 recsexpr (SCM_CAR (tmp
), line
, column
, filename
);
1483 copy
= SCM_UNDEFINED
;
1485 scm_whash_insert (scm_source_whash
,
1487 scm_make_srcprops (line
,
1497 /* Manipulate the read-hash-procedures alist. This could be written in
1498 Scheme, but maybe it will also be used by C code during initialisation. */
1499 SCM_DEFINE (scm_read_hash_extend
, "read-hash-extend", 2, 0, 0,
1500 (SCM chr
, SCM proc
),
1501 "Install the procedure @var{proc} for reading expressions\n"
1502 "starting with the character sequence @code{#} and @var{chr}.\n"
1503 "@var{proc} will be called with two arguments: the character\n"
1504 "@var{chr} and the port to read further data from. The object\n"
1505 "returned will be the return value of @code{read}. \n"
1506 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1508 #define FUNC_NAME s_scm_read_hash_extend
1513 SCM_VALIDATE_CHAR (1, chr
);
1514 SCM_ASSERT (scm_is_false (proc
)
1515 || scm_is_eq (scm_procedure_p (proc
), SCM_BOOL_T
),
1516 proc
, SCM_ARG2
, FUNC_NAME
);
1518 /* Check if chr is already in the alist. */
1519 this = *scm_read_hash_procedures
;
1523 if (scm_is_null (this))
1525 /* not found, so add it to the beginning. */
1526 if (scm_is_true (proc
))
1528 *scm_read_hash_procedures
=
1529 scm_cons (scm_cons (chr
, proc
), *scm_read_hash_procedures
);
1533 if (scm_is_eq (chr
, SCM_CAAR (this)))
1535 /* already in the alist. */
1536 if (scm_is_false (proc
))
1539 if (scm_is_false (prev
))
1541 *scm_read_hash_procedures
=
1542 SCM_CDR (*scm_read_hash_procedures
);
1545 scm_set_cdr_x (prev
, SCM_CDR (this));
1550 scm_set_cdr_x (SCM_CAR (this), proc
);
1555 this = SCM_CDR (this);
1558 return SCM_UNSPECIFIED
;
1562 /* Recover the read-hash procedure corresponding to char c. */
1564 scm_get_hash_procedure (int c
)
1566 SCM rest
= *scm_read_hash_procedures
;
1570 if (scm_is_null (rest
))
1573 if (SCM_CHAR (SCM_CAAR (rest
)) == c
)
1574 return SCM_CDAR (rest
);
1576 rest
= SCM_CDR (rest
);
1580 #define SCM_ENCODING_SEARCH_SIZE (500)
1582 /* Search the first few hundred characters of a file for an Emacs-like coding
1583 declaration. Returns either NULL or a string whose storage has been
1584 allocated with `scm_gc_malloc ()'. */
1586 scm_i_scan_for_encoding (SCM port
)
1588 char header
[SCM_ENCODING_SEARCH_SIZE
+1];
1590 char *encoding
= NULL
;
1596 if (SCM_FPORTP (port
) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port
)))
1597 /* PORT is a non-seekable file port (e.g., as created by Bash when using
1598 "guile <(echo '(display "hello")')") so bail out. */
1601 bytes_read
= scm_c_read (port
, header
, SCM_ENCODING_SEARCH_SIZE
);
1603 scm_seek (port
, scm_from_int (0), scm_from_int (SEEK_SET
));
1606 && header
[0] == '\xef' && header
[1] == '\xbb' && header
[2] == '\xbf')
1609 /* search past "coding[:=]" */
1613 if ((pos
= strstr(pos
, "coding")) == NULL
)
1616 pos
+= strlen("coding");
1617 if (pos
- header
>= SCM_ENCODING_SEARCH_SIZE
||
1618 (*pos
== ':' || *pos
== '='))
1626 while (pos
- header
<= SCM_ENCODING_SEARCH_SIZE
&&
1627 (*pos
== ' ' || *pos
== '\t'))
1630 /* grab the next token */
1632 while (pos
+ i
- header
<= SCM_ENCODING_SEARCH_SIZE
1633 && pos
+ i
- header
< bytes_read
1634 && (isalnum ((int) pos
[i
]) || strchr ("_-.:/,+=()", pos
[i
]) != NULL
))
1640 encoding
= scm_gc_strndup (pos
, i
, "encoding");
1641 for (i
= 0; i
< strlen (encoding
); i
++)
1642 encoding
[i
] = toupper ((int) encoding
[i
]);
1644 /* push backwards to make sure we were in a comment */
1646 while (pos
- i
- header
> 0)
1648 if (*(pos
- i
) == '\n')
1650 /* This wasn't in a semicolon comment. Check for a
1651 hash-bang comment. */
1652 char *beg
= strstr (header
, "#!");
1653 char *end
= strstr (header
, "!#");
1654 if (beg
< pos
&& pos
< end
)
1658 if (*(pos
- i
) == ';')
1666 /* This wasn't in a comment */
1669 if (utf8_bom
&& strcmp(encoding
, "UTF-8"))
1670 scm_misc_error (NULL
,
1671 "the port input declares the encoding ~s but is encoded as UTF-8",
1672 scm_list_1 (scm_from_locale_string (encoding
)));
1677 SCM_DEFINE (scm_file_encoding
, "file-encoding", 1, 0, 0,
1679 "Scans the port for an Emacs-like character coding declaration\n"
1680 "near the top of the contents of a port with random-acessible contents.\n"
1681 "The coding declaration is of the form\n"
1682 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1684 "Returns a string containing the character encoding of the file\n"
1685 "if a declaration was found, or @code{#f} otherwise.\n")
1686 #define FUNC_NAME s_scm_file_encoding
1691 enc
= scm_i_scan_for_encoding (port
);
1696 s_enc
= scm_from_locale_string (enc
);
1707 scm_read_hash_procedures
=
1708 SCM_VARIABLE_LOC (scm_c_define ("read-hash-procedures", SCM_EOL
));
1710 scm_init_opts (scm_read_options
, scm_read_opts
);
1711 #include "libguile/read.x"