1 /* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009, 2010 Free Software
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
33 #include "libguile/_scm.h"
34 #include "libguile/bytevectors.h"
35 #include "libguile/chars.h"
36 #include "libguile/eval.h"
37 #include "libguile/arrays.h"
38 #include "libguile/bitvectors.h"
39 #include "libguile/keywords.h"
40 #include "libguile/alist.h"
41 #include "libguile/srcprop.h"
42 #include "libguile/hashtab.h"
43 #include "libguile/hash.h"
44 #include "libguile/ports.h"
45 #include "libguile/fports.h"
46 #include "libguile/root.h"
47 #include "libguile/strings.h"
48 #include "libguile/strports.h"
49 #include "libguile/vectors.h"
50 #include "libguile/validate.h"
51 #include "libguile/srfi-4.h"
52 #include "libguile/srfi-13.h"
54 #include "libguile/read.h"
55 #include "libguile/private-options.h"
60 SCM_GLOBAL_SYMBOL (scm_sym_dot
, ".");
61 SCM_SYMBOL (scm_keyword_prefix
, "prefix");
62 SCM_SYMBOL (scm_keyword_postfix
, "postfix");
64 scm_t_option scm_read_opts
[] = {
65 { SCM_OPTION_BOOLEAN
, "copy", 0,
66 "Copy source code expressions." },
67 { SCM_OPTION_BOOLEAN
, "positions", 0,
68 "Record positions of source code expressions." },
69 { SCM_OPTION_BOOLEAN
, "case-insensitive", 0,
70 "Convert symbols to lower case."},
71 { SCM_OPTION_SCM
, "keywords", (unsigned long) SCM_BOOL_F
,
72 "Style of keyword recognition: #f, 'prefix or 'postfix."},
73 { SCM_OPTION_BOOLEAN
, "elisp-vectors", 0,
74 "Support Elisp vector syntax, namely `[...]'."},
75 { SCM_OPTION_BOOLEAN
, "elisp-strings", 0,
76 "Support `\\(' and `\\)' in strings."},
77 { SCM_OPTION_BOOLEAN
, "r6rs-hex-escapes", 0,
78 "Use R6RS variable-length character and string hex escapes."},
79 { SCM_OPTION_BOOLEAN
, "square-brackets", 1,
80 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
85 Give meaningful error messages for errors
89 FILE:LINE:COL: MESSAGE
92 This is not standard GNU format, but the test-suite likes the real
93 message to be in front.
99 scm_i_input_error (char const *function
,
100 SCM port
, const char *message
, SCM arg
)
102 SCM fn
= (scm_is_string (SCM_FILENAME(port
))
104 : scm_from_locale_string ("#<unknown port>"));
106 SCM string_port
= scm_open_output_string ();
107 SCM string
= SCM_EOL
;
108 scm_simple_format (string_port
,
109 scm_from_locale_string ("~A:~S:~S: ~A"),
111 scm_from_long (SCM_LINUM (port
) + 1),
112 scm_from_int (SCM_COL (port
) + 1),
113 scm_from_locale_string (message
)));
115 string
= scm_get_output_string (string_port
);
116 scm_close_output_port (string_port
);
117 scm_error_scm (scm_from_locale_symbol ("read-error"),
118 function
? scm_from_locale_string (function
) : SCM_BOOL_F
,
125 SCM_DEFINE (scm_read_options
, "read-options-interface", 0, 1, 0,
127 "Option interface for the read options. Instead of using\n"
128 "this procedure directly, use the procedures @code{read-enable},\n"
129 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
130 #define FUNC_NAME s_scm_read_options
132 SCM ans
= scm_options (setting
,
135 if (SCM_COPY_SOURCE_P
)
136 SCM_RECORD_POSITIONS_P
= 1;
141 /* An association list mapping extra hash characters to procedures. */
142 static SCM
*scm_read_hash_procedures
;
149 /* Size of the C buffer used to read symbols and numbers. */
150 #define READER_BUFFER_SIZE 128
152 /* Size of the C buffer used to read strings. */
153 #define READER_STRING_BUFFER_SIZE 512
155 /* The maximum size of Scheme character names. */
156 #define READER_CHAR_NAME_MAX_SIZE 50
159 /* `isblank' is only in C99. */
160 #define CHAR_IS_BLANK_(_chr) \
161 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
162 || ((_chr) == '\f') || ((_chr) == '\r'))
165 # define CHAR_IS_BLANK(_chr) \
166 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
168 # define CHAR_IS_BLANK CHAR_IS_BLANK_
172 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
174 #define CHAR_IS_R5RS_DELIMITER(c) \
176 || (c == ')') || (c == '(') || (c == ';') || (c == '"') \
177 || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
179 #define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
181 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
183 #define CHAR_IS_EXPONENT_MARKER(_chr) \
184 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
185 || ((_chr) == 'd') || ((_chr) == 'l'))
187 /* Read an SCSH block comment. */
188 static inline SCM
scm_read_scsh_block_comment (scm_t_wchar
, SCM
);
189 static SCM
scm_read_r6rs_block_comment (scm_t_wchar
, SCM
);
190 static SCM
scm_read_commented_expression (scm_t_wchar
, SCM
);
191 static SCM
scm_get_hash_procedure (int);
193 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
194 result in the pre-allocated buffer BUF. Return zero if the whole token has
195 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
196 bytes actually read. */
198 read_token (SCM port
, char *buf
, const size_t buf_size
, size_t *read
)
202 while (*read
< buf_size
)
206 chr
= scm_get_byte_or_eof (port
);
210 else if (CHAR_IS_DELIMITER (chr
))
212 scm_unget_byte (chr
, port
);
225 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
226 result in the pre-allocated buffer BUFFER, if the whole token has fewer than
227 BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
228 caller. Return zero if the token fits in BUFFER, non-zero otherwise. READ
229 will be set the number of bytes actually read. */
231 read_complete_token (SCM port
, char *buffer
, const size_t buffer_size
,
232 char **overflow_buffer
, size_t *read
)
235 size_t bytes_read
, overflow_size
;
237 *overflow_buffer
= NULL
;
242 overflow
= read_token (port
, buffer
, buffer_size
, &bytes_read
);
245 if (overflow
|| overflow_size
!= 0)
247 if (overflow_size
== 0)
249 *overflow_buffer
= scm_malloc (bytes_read
);
250 memcpy (*overflow_buffer
, buffer
, bytes_read
);
251 overflow_size
= bytes_read
;
255 *overflow_buffer
= scm_realloc (*overflow_buffer
, overflow_size
+ bytes_read
);
256 memcpy (*overflow_buffer
+ overflow_size
, buffer
, bytes_read
);
257 overflow_size
+= bytes_read
;
264 *read
= overflow_size
;
268 return (overflow_size
!= 0);
271 /* Skip whitespace from PORT and return the first non-whitespace character
272 read. Raise an error on end-of-file. */
274 flush_ws (SCM port
, const char *eoferr
)
276 register scm_t_wchar c
;
278 switch (c
= scm_getc (port
))
284 scm_i_input_error (eoferr
,
293 switch (c
= scm_getc (port
))
299 case SCM_LINE_INCREMENTORS
:
305 switch (c
= scm_getc (port
))
308 eoferr
= "read_sharp";
311 scm_read_scsh_block_comment (c
, port
);
314 scm_read_commented_expression (c
, port
);
317 if (scm_is_false (scm_get_hash_procedure (c
)))
319 scm_read_r6rs_block_comment (c
, port
);
324 scm_ungetc (c
, port
);
329 case SCM_LINE_INCREMENTORS
:
330 case SCM_SINGLE_SPACES
:
345 static SCM
scm_read_expression (SCM port
);
346 static SCM
scm_read_sharp (int chr
, SCM port
);
347 static SCM
recsexpr (SCM obj
, long line
, int column
, SCM filename
);
351 scm_read_sexp (scm_t_wchar chr
, SCM port
)
352 #define FUNC_NAME "scm_i_lreadparen"
356 register SCM tl
, ans
= SCM_EOL
;
357 SCM tl2
= SCM_EOL
, ans2
= SCM_EOL
, copy
= SCM_BOOL_F
;
358 const int terminating_char
= ((chr
== '[') ? ']' : ')');
360 /* Need to capture line and column numbers here. */
361 long line
= SCM_LINUM (port
);
362 int column
= SCM_COL (port
) - 1;
365 c
= flush_ws (port
, FUNC_NAME
);
366 if (terminating_char
== c
)
369 scm_ungetc (c
, port
);
370 if (scm_is_eq (scm_sym_dot
,
371 (tmp
= scm_read_expression (port
))))
373 ans
= scm_read_expression (port
);
374 if (terminating_char
!= (c
= flush_ws (port
, FUNC_NAME
)))
375 scm_i_input_error (FUNC_NAME
, port
, "missing close paren",
380 /* Build the head of the list structure. */
381 ans
= tl
= scm_cons (tmp
, SCM_EOL
);
383 if (SCM_COPY_SOURCE_P
)
384 ans2
= tl2
= scm_cons (scm_is_pair (tmp
)
389 while (terminating_char
!= (c
= flush_ws (port
, FUNC_NAME
)))
393 scm_ungetc (c
, port
);
394 if (scm_is_eq (scm_sym_dot
,
395 (tmp
= scm_read_expression (port
))))
397 SCM_SETCDR (tl
, tmp
= scm_read_expression (port
));
399 if (SCM_COPY_SOURCE_P
)
400 SCM_SETCDR (tl2
, scm_cons (scm_is_pair (tmp
) ? copy
: tmp
,
403 c
= flush_ws (port
, FUNC_NAME
);
404 if (terminating_char
!= c
)
405 scm_i_input_error (FUNC_NAME
, port
,
406 "in pair: missing close paren", SCM_EOL
);
410 new_tail
= scm_cons (tmp
, SCM_EOL
);
411 SCM_SETCDR (tl
, new_tail
);
414 if (SCM_COPY_SOURCE_P
)
416 SCM new_tail2
= scm_cons (scm_is_pair (tmp
)
419 SCM_SETCDR (tl2
, new_tail2
);
425 if (SCM_RECORD_POSITIONS_P
)
426 scm_whash_insert (scm_source_whash
,
428 scm_make_srcprops (line
, column
,
439 /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
440 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
442 #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
448 while (i < ndigits) \
450 a = scm_getc (port); \
454 && (a == (scm_t_wchar) terminator) \
457 if ('0' <= a && a <= '9') \
459 else if ('A' <= a && a <= 'F') \
461 else if ('a' <= a && a <= 'f') \
474 scm_read_string (int chr
, SCM port
)
475 #define FUNC_NAME "scm_lreadr"
477 /* For strings smaller than C_STR, this function creates only one Scheme
478 object (the string returned). */
480 SCM str
= SCM_BOOL_F
;
481 unsigned c_str_len
= 0;
484 str
= scm_i_make_string (READER_STRING_BUFFER_SIZE
, NULL
);
485 while ('"' != (c
= scm_getc (port
)))
490 scm_i_input_error (FUNC_NAME
, port
,
491 "end of file in string constant", SCM_EOL
);
494 if (c_str_len
+ 1 >= scm_i_string_length (str
))
496 SCM addy
= scm_i_make_string (READER_STRING_BUFFER_SIZE
, NULL
);
498 str
= scm_string_append (scm_list_2 (str
, addy
));
503 switch (c
= scm_getc (port
))
512 if (SCM_ESCAPED_PARENS_P
)
542 if (SCM_R6RS_ESCAPES_P
)
543 SCM_READ_HEX_ESCAPE (10, ';');
545 SCM_READ_HEX_ESCAPE (2, '\0');
548 if (!SCM_R6RS_ESCAPES_P
)
550 SCM_READ_HEX_ESCAPE (4, '\0');
554 if (!SCM_R6RS_ESCAPES_P
)
556 SCM_READ_HEX_ESCAPE (6, '\0');
561 scm_i_input_error (FUNC_NAME
, port
,
562 "illegal character in escape sequence: ~S",
563 scm_list_1 (SCM_MAKE_CHAR (c
)));
566 str
= scm_i_string_start_writing (str
);
567 scm_i_string_set_x (str
, c_str_len
++, c
);
568 scm_i_string_stop_writing ();
573 return scm_i_substring_copy (str
, 0, c_str_len
);
582 scm_read_number (scm_t_wchar chr
, SCM port
)
584 SCM result
, str
= SCM_EOL
;
585 char buffer
[READER_BUFFER_SIZE
];
586 char *overflow_buffer
= NULL
;
589 scm_t_port
*pt
= SCM_PTAB_ENTRY (port
);
591 scm_ungetc (chr
, port
);
592 overflow
= read_complete_token (port
, buffer
, sizeof (buffer
),
593 &overflow_buffer
, &bytes_read
);
596 str
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
, pt
->ilseq_handler
);
598 str
= scm_from_stringn (overflow_buffer
, bytes_read
, pt
->encoding
,
601 result
= scm_string_to_number (str
, SCM_UNDEFINED
);
602 if (!scm_is_true (result
))
604 /* Return a symbol instead of a number */
605 if (SCM_CASE_INSENSITIVE_P
)
606 str
= scm_string_downcase_x (str
);
607 result
= scm_string_to_symbol (str
);
611 free (overflow_buffer
);
612 SCM_COL (port
) += scm_i_string_length (str
);
617 scm_read_mixed_case_symbol (scm_t_wchar chr
, SCM port
)
620 int ends_with_colon
= 0;
622 int postfix
= scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE
), scm_keyword_postfix
);
624 char buffer
[READER_BUFFER_SIZE
], *overflow_buffer
;
625 scm_t_port
*pt
= SCM_PTAB_ENTRY (port
);
628 scm_ungetc (chr
, port
);
629 overflow
= read_complete_token (port
, buffer
, READER_BUFFER_SIZE
,
630 &overflow_buffer
, &bytes_read
);
634 ends_with_colon
= buffer
[bytes_read
- 1] == ':';
636 ends_with_colon
= overflow_buffer
[bytes_read
- 1] == ':';
639 if (postfix
&& ends_with_colon
&& (bytes_read
> 1))
642 str
= scm_from_stringn (buffer
, bytes_read
- 1, pt
->encoding
, pt
->ilseq_handler
);
644 str
= scm_from_stringn (overflow_buffer
, bytes_read
- 1, pt
->encoding
,
647 if (SCM_CASE_INSENSITIVE_P
)
648 str
= scm_string_downcase_x (str
);
649 result
= scm_symbol_to_keyword (scm_string_to_symbol (str
));
654 str
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
, pt
->ilseq_handler
);
656 str
= scm_from_stringn (overflow_buffer
, bytes_read
, pt
->encoding
,
659 if (SCM_CASE_INSENSITIVE_P
)
660 str
= scm_string_downcase_x (str
);
661 result
= scm_string_to_symbol (str
);
665 free (overflow_buffer
);
666 SCM_COL (port
) += scm_i_string_length (str
);
671 scm_read_number_and_radix (scm_t_wchar chr
, SCM port
)
672 #define FUNC_NAME "scm_lreadr"
676 char buffer
[READER_BUFFER_SIZE
], *overflow_buffer
;
705 scm_ungetc (chr
, port
);
706 scm_ungetc ('#', port
);
710 overflow
= read_complete_token (port
, buffer
, sizeof (buffer
),
711 &overflow_buffer
, &read
);
713 pt
= SCM_PTAB_ENTRY (port
);
715 str
= scm_from_stringn (buffer
, read
, pt
->encoding
, pt
->ilseq_handler
);
717 str
= scm_from_stringn (overflow_buffer
, read
, pt
->encoding
,
720 result
= scm_string_to_number (str
, scm_from_uint (radix
));
723 free (overflow_buffer
);
725 SCM_COL (port
) += scm_i_string_length (str
);
727 if (scm_is_true (result
))
730 scm_i_input_error (FUNC_NAME
, port
, "unknown # object", SCM_EOL
);
737 scm_read_quote (int chr
, SCM port
)
740 long line
= SCM_LINUM (port
);
741 int column
= SCM_COL (port
) - 1;
746 p
= scm_sym_quasiquote
;
759 p
= scm_sym_uq_splicing
;
762 scm_ungetc (c
, port
);
769 fprintf (stderr
, "%s: unhandled quote character (%i)\n",
770 "scm_read_quote", chr
);
774 p
= scm_cons2 (p
, scm_read_expression (port
), SCM_EOL
);
775 if (SCM_RECORD_POSITIONS_P
)
776 scm_whash_insert (scm_source_whash
, p
,
777 scm_make_srcprops (line
, column
,
780 ? (scm_cons2 (SCM_CAR (p
),
781 SCM_CAR (SCM_CDR (p
)),
790 SCM_SYMBOL (sym_syntax
, "syntax");
791 SCM_SYMBOL (sym_quasisyntax
, "quasisyntax");
792 SCM_SYMBOL (sym_unsyntax
, "unsyntax");
793 SCM_SYMBOL (sym_unsyntax_splicing
, "unsyntax-splicing");
796 scm_read_syntax (int chr
, SCM port
)
799 long line
= SCM_LINUM (port
);
800 int column
= SCM_COL (port
) - 1;
818 p
= sym_unsyntax_splicing
;
821 scm_ungetc (c
, port
);
828 fprintf (stderr
, "%s: unhandled syntax character (%i)\n",
829 "scm_read_syntax", chr
);
833 p
= scm_cons2 (p
, scm_read_expression (port
), SCM_EOL
);
834 if (SCM_RECORD_POSITIONS_P
)
835 scm_whash_insert (scm_source_whash
, p
,
836 scm_make_srcprops (line
, column
,
839 ? (scm_cons2 (SCM_CAR (p
),
840 SCM_CAR (SCM_CDR (p
)),
850 scm_read_semicolon_comment (int chr
, SCM port
)
854 /* We use the get_byte here because there is no need to get the
855 locale correct with comment input. This presumes that newline
856 always represents itself no matter what the encoding is. */
857 for (c
= scm_get_byte_or_eof (port
);
858 (c
!= EOF
) && (c
!= '\n');
859 c
= scm_get_byte_or_eof (port
));
861 return SCM_UNSPECIFIED
;
865 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
868 scm_read_boolean (int chr
, SCM port
)
881 return SCM_UNSPECIFIED
;
885 scm_read_character (scm_t_wchar chr
, SCM port
)
886 #define FUNC_NAME "scm_lreadr"
888 char buffer
[READER_CHAR_NAME_MAX_SIZE
];
890 size_t charname_len
, bytes_read
;
895 overflow
= read_token (port
, buffer
, READER_CHAR_NAME_MAX_SIZE
, &bytes_read
);
901 chr
= scm_getc (port
);
903 scm_i_input_error (FUNC_NAME
, port
, "unexpected end of file "
904 "while reading character", SCM_EOL
);
906 /* CHR must be a token delimiter, like a whitespace. */
907 return (SCM_MAKE_CHAR (chr
));
910 pt
= SCM_PTAB_ENTRY (port
);
912 /* Simple ASCII characters can be processed immediately. Also, simple
913 ISO-8859-1 characters can be processed immediately if the encoding for this
914 port is ISO-8859-1. */
915 if (bytes_read
== 1 && ((unsigned char) buffer
[0] <= 127 || pt
->encoding
== NULL
))
918 return SCM_MAKE_CHAR (buffer
[0]);
921 /* Otherwise, convert the buffer into a proper scheme string for
923 charname
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
,
925 charname_len
= scm_i_string_length (charname
);
926 SCM_COL (port
) += charname_len
;
927 cp
= scm_i_string_ref (charname
, 0);
928 if (charname_len
== 1)
929 return SCM_MAKE_CHAR (cp
);
931 /* Ignore dotted circles, which may be used to keep combining characters from
932 combining with the backslash in #\charname. */
933 if (cp
== SCM_CODEPOINT_DOTTED_CIRCLE
&& charname_len
== 2)
934 return SCM_MAKE_CHAR (scm_i_string_ref (charname
, 1));
936 if (cp
>= '0' && cp
< '8')
938 /* Dirk:FIXME:: This type of character syntax is not R5RS
939 * compliant. Further, it should be verified that the constant
940 * does only consist of octal digits. */
941 SCM p
= scm_string_to_number (charname
, scm_from_uint (8));
944 scm_t_wchar c
= SCM_I_INUM (p
);
945 if (SCM_IS_UNICODE_CHAR (c
))
946 return SCM_MAKE_CHAR (c
);
948 scm_i_input_error (FUNC_NAME
, port
,
949 "out-of-range octal character escape: ~a",
950 scm_list_1 (charname
));
954 if (cp
== 'x' && (charname_len
> 1) && SCM_R6RS_ESCAPES_P
)
958 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
959 p
= scm_string_to_number (scm_c_substring (charname
, 1, charname_len
),
963 scm_t_wchar c
= SCM_I_INUM (p
);
964 if (SCM_IS_UNICODE_CHAR (c
))
965 return SCM_MAKE_CHAR (c
);
967 scm_i_input_error (FUNC_NAME
, port
,
968 "out-of-range hex character escape: ~a",
969 scm_list_1 (charname
));
973 /* The names of characters should never have non-Latin1
975 if (scm_i_is_narrow_string (charname
)
976 || scm_i_try_narrow_string (charname
))
977 { SCM ch
= scm_i_charname_to_char (scm_i_string_chars (charname
),
979 if (scm_is_true (ch
))
984 scm_i_input_error (FUNC_NAME
, port
, "unknown character name ~a",
985 scm_list_1 (charname
));
987 return SCM_UNSPECIFIED
;
992 scm_read_keyword (int chr
, SCM port
)
996 /* Read the symbol that comprises the keyword. Doing this instead of
997 invoking a specific symbol reader function allows `scm_read_keyword ()'
998 to adapt to the delimiters currently valid of symbols.
1000 XXX: This implementation allows sloppy syntaxes like `#: key'. */
1001 symbol
= scm_read_expression (port
);
1002 if (!scm_is_symbol (symbol
))
1003 scm_i_input_error ("scm_read_keyword", port
,
1004 "keyword prefix `~a' not followed by a symbol: ~s",
1005 scm_list_2 (SCM_MAKE_CHAR (chr
), symbol
));
1007 return (scm_symbol_to_keyword (symbol
));
1011 scm_read_vector (int chr
, SCM port
)
1013 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1014 guarantee that it's going to do what we want. After all, this is an
1015 implementation detail of `scm_read_vector ()', not a desirable
1017 return (scm_vector (scm_read_sexp (chr
, port
)));
1021 scm_read_srfi4_vector (int chr
, SCM port
)
1023 return scm_i_read_array (port
, chr
);
1027 scm_read_bytevector (scm_t_wchar chr
, SCM port
)
1029 chr
= scm_getc (port
);
1033 chr
= scm_getc (port
);
1037 chr
= scm_getc (port
);
1041 return scm_u8_list_to_bytevector (scm_read_sexp (chr
, port
));
1044 scm_i_input_error ("read_bytevector", port
,
1045 "invalid bytevector prefix",
1046 SCM_MAKE_CHAR (chr
));
1047 return SCM_UNSPECIFIED
;
1051 scm_read_guile_bit_vector (scm_t_wchar chr
, SCM port
)
1053 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1054 terribly inefficient but who cares? */
1055 SCM s_bits
= SCM_EOL
;
1057 for (chr
= scm_getc (port
);
1058 (chr
!= EOF
) && ((chr
== '0') || (chr
== '1'));
1059 chr
= scm_getc (port
))
1061 s_bits
= scm_cons ((chr
== '0') ? SCM_BOOL_F
: SCM_BOOL_T
, s_bits
);
1065 scm_ungetc (chr
, port
);
1067 return scm_bitvector (scm_reverse_x (s_bits
, SCM_EOL
));
1071 scm_read_scsh_block_comment (scm_t_wchar chr
, SCM port
)
1075 /* We can use the get_byte here because there is no need to get the
1076 locale correct when reading comments. This presumes that
1077 hash and exclamation points always represent themselves no
1078 matter what the source encoding is.*/
1081 int c
= scm_get_byte_or_eof (port
);
1084 scm_i_input_error ("skip_block_comment", port
,
1085 "unterminated `#! ... !#' comment", SCM_EOL
);
1089 else if (c
== '#' && bang_seen
)
1095 return SCM_UNSPECIFIED
;
1099 scm_read_r6rs_block_comment (scm_t_wchar chr
, SCM port
)
1101 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1102 nested. So care must be taken. */
1103 int nesting_level
= 1;
1104 int opening_seen
= 0, closing_seen
= 0;
1106 while (nesting_level
> 0)
1108 int c
= scm_getc (port
);
1111 scm_i_input_error ("scm_read_r6rs_block_comment", port
,
1112 "unterminated `#| ... |#' comment", SCM_EOL
);
1120 else if (closing_seen
)
1131 opening_seen
= closing_seen
= 0;
1134 return SCM_UNSPECIFIED
;
1138 scm_read_commented_expression (scm_t_wchar chr
, SCM port
)
1142 c
= flush_ws (port
, (char *) NULL
);
1144 scm_i_input_error ("read_commented_expression", port
,
1145 "no expression after #; comment", SCM_EOL
);
1146 scm_ungetc (c
, port
);
1147 scm_read_expression (port
);
1148 return SCM_UNSPECIFIED
;
1152 scm_read_extended_symbol (scm_t_wchar chr
, SCM port
)
1154 /* Guile's extended symbol read syntax looks like this:
1156 #{This is all a symbol name}#
1158 So here, CHR is expected to be `{'. */
1159 int saw_brace
= 0, finished
= 0;
1161 SCM buf
= scm_i_make_string (1024, NULL
);
1163 buf
= scm_i_string_start_writing (buf
);
1165 while ((chr
= scm_getc (port
)) != EOF
)
1177 scm_i_string_set_x (buf
, len
++, '}');
1178 scm_i_string_set_x (buf
, len
++, chr
);
1181 else if (chr
== '}')
1184 scm_i_string_set_x (buf
, len
++, chr
);
1186 if (len
>= scm_i_string_length (buf
) - 2)
1190 scm_i_string_stop_writing ();
1191 addy
= scm_i_make_string (1024, NULL
);
1192 buf
= scm_string_append (scm_list_2 (buf
, addy
));
1194 buf
= scm_i_string_start_writing (buf
);
1200 scm_i_string_stop_writing ();
1202 return (scm_string_to_symbol (scm_c_substring (buf
, 0, len
)));
1207 /* Top-level token readers, i.e., dispatchers. */
1210 scm_read_sharp_extension (int chr
, SCM port
)
1214 proc
= scm_get_hash_procedure (chr
);
1215 if (scm_is_true (scm_procedure_p (proc
)))
1217 long line
= SCM_LINUM (port
);
1218 int column
= SCM_COL (port
) - 2;
1221 got
= scm_call_2 (proc
, SCM_MAKE_CHAR (chr
), port
);
1222 if (!scm_is_eq (got
, SCM_UNSPECIFIED
))
1224 if (SCM_RECORD_POSITIONS_P
)
1225 return (recsexpr (got
, line
, column
,
1226 SCM_FILENAME (port
)));
1232 return SCM_UNSPECIFIED
;
1235 /* The reader for the sharp `#' character. It basically dispatches reads
1236 among the above token readers. */
1238 scm_read_sharp (scm_t_wchar chr
, SCM port
)
1239 #define FUNC_NAME "scm_lreadr"
1243 chr
= scm_getc (port
);
1245 result
= scm_read_sharp_extension (chr
, port
);
1246 if (!scm_is_eq (result
, SCM_UNSPECIFIED
))
1252 return (scm_read_character (chr
, port
));
1254 return (scm_read_vector (chr
, port
));
1258 /* This one may return either a boolean or an SRFI-4 vector. */
1259 return (scm_read_srfi4_vector (chr
, port
));
1261 return (scm_read_bytevector (chr
, port
));
1263 return (scm_read_guile_bit_vector (chr
, port
));
1267 /* This one may return either a boolean or an SRFI-4 vector. */
1268 return (scm_read_boolean (chr
, port
));
1270 return (scm_read_keyword (chr
, port
));
1271 case '0': case '1': case '2': case '3': case '4':
1272 case '5': case '6': case '7': case '8': case '9':
1274 #if SCM_ENABLE_DEPRECATED
1275 /* See below for 'i' and 'e'. */
1282 return (scm_i_read_array (port
, chr
));
1286 #if SCM_ENABLE_DEPRECATED
1288 /* When next char is '(', it really is an old-style
1290 scm_t_wchar next_c
= scm_getc (port
);
1292 scm_ungetc (next_c
, port
);
1294 return scm_i_read_array (port
, chr
);
1308 return (scm_read_number_and_radix (chr
, port
));
1310 return (scm_read_extended_symbol (chr
, port
));
1312 return (scm_read_scsh_block_comment (chr
, port
));
1314 return (scm_read_commented_expression (chr
, port
));
1318 return (scm_read_syntax (chr
, port
));
1320 result
= scm_read_sharp_extension (chr
, port
);
1321 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1323 /* To remain compatible with 1.8 and earlier, the following
1324 characters have lower precedence than `read-hash-extend'
1329 return scm_read_r6rs_block_comment (chr
, port
);
1331 scm_i_input_error (FUNC_NAME
, port
, "Unknown # object: ~S",
1332 scm_list_1 (SCM_MAKE_CHAR (chr
)));
1339 return SCM_UNSPECIFIED
;
1344 scm_read_expression (SCM port
)
1345 #define FUNC_NAME "scm_read_expression"
1349 register scm_t_wchar chr
;
1351 chr
= scm_getc (port
);
1355 case SCM_WHITE_SPACES
:
1356 case SCM_LINE_INCREMENTORS
:
1359 (void) scm_read_semicolon_comment (chr
, port
);
1362 if (!SCM_SQUARE_BRACKETS_P
)
1363 return (scm_read_mixed_case_symbol (chr
, port
));
1364 /* otherwise fall through */
1366 return (scm_read_sexp (chr
, port
));
1368 return (scm_read_string (chr
, port
));
1372 return (scm_read_quote (chr
, port
));
1376 result
= scm_read_sharp (chr
, port
);
1377 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1378 /* We read a comment or some such. */
1384 scm_i_input_error (FUNC_NAME
, port
, "unexpected \")\"", SCM_EOL
);
1389 if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE
), scm_keyword_prefix
))
1390 return scm_symbol_to_keyword (scm_read_expression (port
));
1395 if (((chr
>= '0') && (chr
<= '9'))
1396 || (strchr ("+-.", chr
)))
1397 return (scm_read_number (chr
, port
));
1399 return (scm_read_mixed_case_symbol (chr
, port
));
1407 /* Actual reader. */
1409 SCM_DEFINE (scm_read
, "read", 0, 1, 0,
1411 "Read an s-expression from the input port @var{port}, or from\n"
1412 "the current input port if @var{port} is not specified.\n"
1413 "Any whitespace before the next token is discarded.")
1414 #define FUNC_NAME s_scm_read
1418 if (SCM_UNBNDP (port
))
1419 port
= scm_current_input_port ();
1420 SCM_VALIDATE_OPINPORT (1, port
);
1422 c
= flush_ws (port
, (char *) NULL
);
1425 scm_ungetc (c
, port
);
1427 return (scm_read_expression (port
));
1434 /* Used when recording expressions constructed by `scm_read_sharp ()'. */
1436 recsexpr (SCM obj
, long line
, int column
, SCM filename
)
1438 if (!scm_is_pair(obj
)) {
1441 SCM tmp
= obj
, copy
;
1442 /* If this sexpr is visible in the read:sharp source, we want to
1443 keep that information, so only record non-constant cons cells
1444 which haven't previously been read by the reader. */
1445 if (scm_is_false (scm_whash_lookup (scm_source_whash
, obj
)))
1447 if (SCM_COPY_SOURCE_P
)
1449 copy
= scm_cons (recsexpr (SCM_CAR (obj
), line
, column
, filename
),
1451 while ((tmp
= SCM_CDR (tmp
)) && scm_is_pair (tmp
))
1453 SCM_SETCDR (copy
, scm_cons (recsexpr (SCM_CAR (tmp
),
1458 copy
= SCM_CDR (copy
);
1460 SCM_SETCDR (copy
, tmp
);
1464 recsexpr (SCM_CAR (obj
), line
, column
, filename
);
1465 while ((tmp
= SCM_CDR (tmp
)) && scm_is_pair (tmp
))
1466 recsexpr (SCM_CAR (tmp
), line
, column
, filename
);
1467 copy
= SCM_UNDEFINED
;
1469 scm_whash_insert (scm_source_whash
,
1471 scm_make_srcprops (line
,
1481 /* Manipulate the read-hash-procedures alist. This could be written in
1482 Scheme, but maybe it will also be used by C code during initialisation. */
1483 SCM_DEFINE (scm_read_hash_extend
, "read-hash-extend", 2, 0, 0,
1484 (SCM chr
, SCM proc
),
1485 "Install the procedure @var{proc} for reading expressions\n"
1486 "starting with the character sequence @code{#} and @var{chr}.\n"
1487 "@var{proc} will be called with two arguments: the character\n"
1488 "@var{chr} and the port to read further data from. The object\n"
1489 "returned will be the return value of @code{read}. \n"
1490 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1492 #define FUNC_NAME s_scm_read_hash_extend
1497 SCM_VALIDATE_CHAR (1, chr
);
1498 SCM_ASSERT (scm_is_false (proc
)
1499 || scm_is_eq (scm_procedure_p (proc
), SCM_BOOL_T
),
1500 proc
, SCM_ARG2
, FUNC_NAME
);
1502 /* Check if chr is already in the alist. */
1503 this = *scm_read_hash_procedures
;
1507 if (scm_is_null (this))
1509 /* not found, so add it to the beginning. */
1510 if (scm_is_true (proc
))
1512 *scm_read_hash_procedures
=
1513 scm_cons (scm_cons (chr
, proc
), *scm_read_hash_procedures
);
1517 if (scm_is_eq (chr
, SCM_CAAR (this)))
1519 /* already in the alist. */
1520 if (scm_is_false (proc
))
1523 if (scm_is_false (prev
))
1525 *scm_read_hash_procedures
=
1526 SCM_CDR (*scm_read_hash_procedures
);
1529 scm_set_cdr_x (prev
, SCM_CDR (this));
1534 scm_set_cdr_x (SCM_CAR (this), proc
);
1539 this = SCM_CDR (this);
1542 return SCM_UNSPECIFIED
;
1546 /* Recover the read-hash procedure corresponding to char c. */
1548 scm_get_hash_procedure (int c
)
1550 SCM rest
= *scm_read_hash_procedures
;
1554 if (scm_is_null (rest
))
1557 if (SCM_CHAR (SCM_CAAR (rest
)) == c
)
1558 return SCM_CDAR (rest
);
1560 rest
= SCM_CDR (rest
);
1564 #define SCM_ENCODING_SEARCH_SIZE (500)
1566 /* Search the first few hundred characters of a file for an Emacs-like coding
1567 declaration. Returns either NULL or a string whose storage has been
1568 allocated with `scm_gc_malloc ()'. */
1570 scm_i_scan_for_encoding (SCM port
)
1572 char header
[SCM_ENCODING_SEARCH_SIZE
+1];
1574 char *encoding
= NULL
;
1580 if (SCM_FPORTP (port
) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port
)))
1581 /* PORT is a non-seekable file port (e.g., as created by Bash when using
1582 "guile <(echo '(display "hello")')") so bail out. */
1585 bytes_read
= scm_c_read (port
, header
, SCM_ENCODING_SEARCH_SIZE
);
1587 scm_seek (port
, scm_from_int (0), scm_from_int (SEEK_SET
));
1590 && header
[0] == '\xef' && header
[1] == '\xbb' && header
[2] == '\xbf')
1593 /* search past "coding[:=]" */
1597 if ((pos
= strstr(pos
, "coding")) == NULL
)
1600 pos
+= strlen("coding");
1601 if (pos
- header
>= SCM_ENCODING_SEARCH_SIZE
||
1602 (*pos
== ':' || *pos
== '='))
1610 while (pos
- header
<= SCM_ENCODING_SEARCH_SIZE
&&
1611 (*pos
== ' ' || *pos
== '\t'))
1614 /* grab the next token */
1616 while (pos
+ i
- header
<= SCM_ENCODING_SEARCH_SIZE
1617 && pos
+ i
- header
< bytes_read
1618 && (isalnum ((int) pos
[i
]) || strchr ("_-.:/,+=()", pos
[i
]) != NULL
))
1624 encoding
= scm_gc_strndup (pos
, i
, "encoding");
1625 for (i
= 0; i
< strlen (encoding
); i
++)
1626 encoding
[i
] = toupper ((int) encoding
[i
]);
1628 /* push backwards to make sure we were in a comment */
1630 while (pos
- i
- header
> 0)
1632 if (*(pos
- i
) == '\n')
1634 /* This wasn't in a semicolon comment. Check for a
1635 hash-bang comment. */
1636 char *beg
= strstr (header
, "#!");
1637 char *end
= strstr (header
, "!#");
1638 if (beg
< pos
&& pos
< end
)
1642 if (*(pos
- i
) == ';')
1650 /* This wasn't in a comment */
1653 if (utf8_bom
&& strcmp(encoding
, "UTF-8"))
1654 scm_misc_error (NULL
,
1655 "the port input declares the encoding ~s but is encoded as UTF-8",
1656 scm_list_1 (scm_from_locale_string (encoding
)));
1661 SCM_DEFINE (scm_file_encoding
, "file-encoding", 1, 0, 0,
1663 "Scans the port for an Emacs-like character coding declaration\n"
1664 "near the top of the contents of a port with random-acessible contents.\n"
1665 "The coding declaration is of the form\n"
1666 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1668 "Returns a string containing the character encoding of the file\n"
1669 "if a declaration was found, or @code{#f} otherwise.\n")
1670 #define FUNC_NAME s_scm_file_encoding
1675 enc
= scm_i_scan_for_encoding (port
);
1680 s_enc
= scm_from_locale_string (enc
);
1691 scm_read_hash_procedures
=
1692 SCM_VARIABLE_LOC (scm_c_define ("read-hash-procedures", SCM_EOL
));
1694 scm_init_opts (scm_read_options
, scm_read_opts
);
1695 #include "libguile/read.x"