1 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
2 * 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
34 #include "libguile/_scm.h"
35 #include "libguile/bytevectors.h"
36 #include "libguile/chars.h"
37 #include "libguile/eval.h"
38 #include "libguile/arrays.h"
39 #include "libguile/bitvectors.h"
40 #include "libguile/keywords.h"
41 #include "libguile/alist.h"
42 #include "libguile/srcprop.h"
43 #include "libguile/hashtab.h"
44 #include "libguile/hash.h"
45 #include "libguile/ports.h"
46 #include "libguile/fports.h"
47 #include "libguile/root.h"
48 #include "libguile/strings.h"
49 #include "libguile/strports.h"
50 #include "libguile/vectors.h"
51 #include "libguile/validate.h"
52 #include "libguile/srfi-4.h"
53 #include "libguile/srfi-13.h"
55 #include "libguile/read.h"
56 #include "libguile/private-options.h"
61 SCM_GLOBAL_SYMBOL (scm_sym_dot
, ".");
62 SCM_SYMBOL (scm_keyword_prefix
, "prefix");
63 SCM_SYMBOL (scm_keyword_postfix
, "postfix");
64 SCM_SYMBOL (sym_nil
, "nil");
66 scm_t_option scm_read_opts
[] = {
67 { SCM_OPTION_BOOLEAN
, "copy", 0,
68 "Copy source code expressions." },
69 { SCM_OPTION_BOOLEAN
, "positions", 1,
70 "Record positions of source code expressions." },
71 { SCM_OPTION_BOOLEAN
, "case-insensitive", 0,
72 "Convert symbols to lower case."},
73 { SCM_OPTION_SCM
, "keywords", (scm_t_bits
) SCM_BOOL_F_BITS
,
74 "Style of keyword recognition: #f, 'prefix or 'postfix."},
75 { SCM_OPTION_BOOLEAN
, "r6rs-hex-escapes", 0,
76 "Use R6RS variable-length character and string hex escapes."},
77 { SCM_OPTION_BOOLEAN
, "square-brackets", 1,
78 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
79 { SCM_OPTION_BOOLEAN
, "hungry-eol-escapes", 0,
80 "In strings, consume leading whitespace after an escaped end-of-line."},
85 Give meaningful error messages for errors
89 FILE:LINE:COL: MESSAGE
92 This is not standard GNU format, but the test-suite likes the real
93 message to be in front.
99 scm_i_input_error (char const *function
,
100 SCM port
, const char *message
, SCM arg
)
102 SCM fn
= (scm_is_string (SCM_FILENAME(port
))
104 : scm_from_locale_string ("#<unknown port>"));
106 SCM string_port
= scm_open_output_string ();
107 SCM string
= SCM_EOL
;
108 scm_simple_format (string_port
,
109 scm_from_locale_string ("~A:~S:~S: ~A"),
111 scm_from_long (SCM_LINUM (port
) + 1),
112 scm_from_int (SCM_COL (port
) + 1),
113 scm_from_locale_string (message
)));
115 string
= scm_get_output_string (string_port
);
116 scm_close_output_port (string_port
);
117 scm_error_scm (scm_from_latin1_symbol ("read-error"),
118 function
? scm_from_locale_string (function
) : SCM_BOOL_F
,
125 SCM_DEFINE (scm_read_options
, "read-options-interface", 0, 1, 0,
127 "Option interface for the read options. Instead of using\n"
128 "this procedure directly, use the procedures @code{read-enable},\n"
129 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
130 #define FUNC_NAME s_scm_read_options
132 SCM ans
= scm_options (setting
,
135 if (SCM_COPY_SOURCE_P
)
136 SCM_RECORD_POSITIONS_P
= 1;
141 /* A fluid referring to an association list mapping extra hash
142 characters to procedures. */
143 static SCM
*scm_i_read_hash_procedures
;
146 scm_i_read_hash_procedures_ref (void)
148 return scm_fluid_ref (*scm_i_read_hash_procedures
);
152 scm_i_read_hash_procedures_set_x (SCM value
)
154 scm_fluid_set_x (*scm_i_read_hash_procedures
, value
);
161 /* Size of the C buffer used to read symbols and numbers. */
162 #define READER_BUFFER_SIZE 128
164 /* Size of the C buffer used to read strings. */
165 #define READER_STRING_BUFFER_SIZE 512
167 /* The maximum size of Scheme character names. */
168 #define READER_CHAR_NAME_MAX_SIZE 50
171 /* `isblank' is only in C99. */
172 #define CHAR_IS_BLANK_(_chr) \
173 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
174 || ((_chr) == '\f') || ((_chr) == '\r'))
177 # define CHAR_IS_BLANK(_chr) \
178 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
180 # define CHAR_IS_BLANK CHAR_IS_BLANK_
184 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
186 #define CHAR_IS_R5RS_DELIMITER(c) \
188 || (c == ')') || (c == '(') || (c == ';') || (c == '"') \
189 || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
191 #define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
193 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
195 #define CHAR_IS_EXPONENT_MARKER(_chr) \
196 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
197 || ((_chr) == 'd') || ((_chr) == 'l'))
199 /* Read an SCSH block comment. */
200 static SCM
scm_read_scsh_block_comment (scm_t_wchar
, SCM
);
201 static SCM
scm_read_r6rs_block_comment (scm_t_wchar
, SCM
);
202 static SCM
scm_read_commented_expression (scm_t_wchar
, SCM
);
203 static SCM
scm_read_shebang (scm_t_wchar
, SCM
);
204 static SCM
scm_get_hash_procedure (int);
206 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
207 result in the pre-allocated buffer BUF. Return zero if the whole token has
208 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
209 bytes actually read. */
211 read_token (SCM port
, char *buf
, const size_t buf_size
, size_t *read
)
215 while (*read
< buf_size
)
219 chr
= scm_get_byte_or_eof (port
);
223 else if (CHAR_IS_DELIMITER (chr
))
225 scm_unget_byte (chr
, port
);
238 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
239 result in the pre-allocated buffer BUFFER, if the whole token has fewer than
240 BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
241 caller. Return zero if the token fits in BUFFER, non-zero otherwise. READ
242 will be set the number of bytes actually read. */
244 read_complete_token (SCM port
, char *buffer
, const size_t buffer_size
,
245 char **overflow_buffer
, size_t *read
)
248 size_t bytes_read
, overflow_size
;
250 *overflow_buffer
= NULL
;
255 overflow
= read_token (port
, buffer
, buffer_size
, &bytes_read
);
258 if (overflow
|| overflow_size
!= 0)
260 if (overflow_size
== 0)
262 *overflow_buffer
= scm_malloc (bytes_read
);
263 memcpy (*overflow_buffer
, buffer
, bytes_read
);
264 overflow_size
= bytes_read
;
268 *overflow_buffer
= scm_realloc (*overflow_buffer
, overflow_size
+ bytes_read
);
269 memcpy (*overflow_buffer
+ overflow_size
, buffer
, bytes_read
);
270 overflow_size
+= bytes_read
;
277 *read
= overflow_size
;
281 return (overflow_size
!= 0);
284 /* Skip whitespace from PORT and return the first non-whitespace character
285 read. Raise an error on end-of-file. */
287 flush_ws (SCM port
, const char *eoferr
)
291 switch (c
= scm_getc (port
))
297 scm_i_input_error (eoferr
,
306 switch (c
= scm_getc (port
))
312 case SCM_LINE_INCREMENTORS
:
318 switch (c
= scm_getc (port
))
321 eoferr
= "read_sharp";
324 scm_read_shebang (c
, port
);
327 scm_read_commented_expression (c
, port
);
330 if (scm_is_false (scm_get_hash_procedure (c
)))
332 scm_read_r6rs_block_comment (c
, port
);
337 scm_ungetc (c
, port
);
342 case SCM_LINE_INCREMENTORS
:
343 case SCM_SINGLE_SPACES
:
358 static SCM
scm_read_expression (SCM port
);
359 static SCM
scm_read_sharp (int chr
, SCM port
);
363 maybe_annotate_source (SCM x
, SCM port
, long line
, int column
)
365 if (SCM_RECORD_POSITIONS_P
)
366 scm_i_set_source_properties_x (x
, line
, column
, SCM_FILENAME (port
));
371 scm_read_sexp (scm_t_wchar chr
, SCM port
)
372 #define FUNC_NAME "scm_i_lreadparen"
375 SCM tmp
, tl
, ans
= SCM_EOL
;
376 const int terminating_char
= ((chr
== '[') ? ']' : ')');
378 /* Need to capture line and column numbers here. */
379 long line
= SCM_LINUM (port
);
380 int column
= SCM_COL (port
) - 1;
382 c
= flush_ws (port
, FUNC_NAME
);
383 if (terminating_char
== c
)
386 scm_ungetc (c
, port
);
387 tmp
= scm_read_expression (port
);
389 /* Note that it is possible for scm_read_expression to return
390 scm_sym_dot, but not as part of a dotted pair: as in #{.}#. So
391 check that it's a real dot by checking `c'. */
392 if (c
== '.' && scm_is_eq (scm_sym_dot
, tmp
))
394 ans
= scm_read_expression (port
);
395 if (terminating_char
!= (c
= flush_ws (port
, FUNC_NAME
)))
396 scm_i_input_error (FUNC_NAME
, port
, "missing close paren",
401 /* Build the head of the list structure. */
402 ans
= tl
= scm_cons (tmp
, SCM_EOL
);
404 while (terminating_char
!= (c
= flush_ws (port
, FUNC_NAME
)))
408 if (c
== ')' || (SCM_SQUARE_BRACKETS_P
&& c
== ']'))
409 scm_i_input_error (FUNC_NAME
, port
,
410 "in pair: mismatched close paren: ~A",
411 scm_list_1 (SCM_MAKE_CHAR (c
)));
413 scm_ungetc (c
, port
);
414 tmp
= scm_read_expression (port
);
416 /* See above note about scm_sym_dot. */
417 if (c
== '.' && scm_is_eq (scm_sym_dot
, tmp
))
419 SCM_SETCDR (tl
, scm_read_expression (port
));
421 c
= flush_ws (port
, FUNC_NAME
);
422 if (terminating_char
!= c
)
423 scm_i_input_error (FUNC_NAME
, port
,
424 "in pair: missing close paren", SCM_EOL
);
428 new_tail
= scm_cons (tmp
, SCM_EOL
);
429 SCM_SETCDR (tl
, new_tail
);
434 return maybe_annotate_source (ans
, port
, line
, column
);
439 /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
440 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
442 #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
448 while (i < ndigits) \
450 a = scm_getc (port); \
454 && (a == (scm_t_wchar) terminator) \
457 if ('0' <= a && a <= '9') \
459 else if ('A' <= a && a <= 'F') \
461 else if ('a' <= a && a <= 'f') \
474 skip_intraline_whitespace (SCM port
)
484 while (c
== '\t' || uc_is_general_category (c
, UC_SPACE_SEPARATOR
));
486 scm_ungetc (c
, port
);
490 scm_read_string (int chr
, SCM port
)
491 #define FUNC_NAME "scm_lreadr"
493 /* For strings smaller than C_STR, this function creates only one Scheme
494 object (the string returned). */
496 SCM str
= SCM_BOOL_F
;
497 unsigned c_str_len
= 0;
500 str
= scm_i_make_string (READER_STRING_BUFFER_SIZE
, NULL
, 0);
501 while ('"' != (c
= scm_getc (port
)))
506 scm_i_input_error (FUNC_NAME
, port
,
507 "end of file in string constant", SCM_EOL
);
510 if (c_str_len
+ 1 >= scm_i_string_length (str
))
512 SCM addy
= scm_i_make_string (READER_STRING_BUFFER_SIZE
, NULL
, 0);
514 str
= scm_string_append (scm_list_2 (str
, addy
));
519 switch (c
= scm_getc (port
))
527 if (SCM_HUNGRY_EOL_ESCAPES_P
)
528 skip_intraline_whitespace (port
);
555 if (SCM_R6RS_ESCAPES_P
)
556 SCM_READ_HEX_ESCAPE (10, ';');
558 SCM_READ_HEX_ESCAPE (2, '\0');
561 if (!SCM_R6RS_ESCAPES_P
)
563 SCM_READ_HEX_ESCAPE (4, '\0');
567 if (!SCM_R6RS_ESCAPES_P
)
569 SCM_READ_HEX_ESCAPE (6, '\0');
574 scm_i_input_error (FUNC_NAME
, port
,
575 "illegal character in escape sequence: ~S",
576 scm_list_1 (SCM_MAKE_CHAR (c
)));
579 str
= scm_i_string_start_writing (str
);
580 scm_i_string_set_x (str
, c_str_len
++, c
);
581 scm_i_string_stop_writing ();
583 return scm_i_substring_copy (str
, 0, c_str_len
);
589 scm_read_number (scm_t_wchar chr
, SCM port
)
591 SCM result
, str
= SCM_EOL
;
592 char buffer
[READER_BUFFER_SIZE
];
593 char *overflow_buffer
= NULL
;
596 scm_t_port
*pt
= SCM_PTAB_ENTRY (port
);
598 scm_ungetc (chr
, port
);
599 overflow
= read_complete_token (port
, buffer
, sizeof (buffer
),
600 &overflow_buffer
, &bytes_read
);
603 str
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
, pt
->ilseq_handler
);
605 str
= scm_from_stringn (overflow_buffer
, bytes_read
, pt
->encoding
,
608 result
= scm_string_to_number (str
, SCM_UNDEFINED
);
609 if (!scm_is_true (result
))
611 /* Return a symbol instead of a number */
612 if (SCM_CASE_INSENSITIVE_P
)
613 str
= scm_string_downcase_x (str
);
614 result
= scm_string_to_symbol (str
);
618 free (overflow_buffer
);
619 SCM_COL (port
) += scm_i_string_length (str
);
624 scm_read_mixed_case_symbol (scm_t_wchar chr
, SCM port
)
627 int ends_with_colon
= 0;
629 int postfix
= scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE
), scm_keyword_postfix
);
631 char buffer
[READER_BUFFER_SIZE
], *overflow_buffer
;
632 scm_t_port
*pt
= SCM_PTAB_ENTRY (port
);
635 scm_ungetc (chr
, port
);
636 overflow
= read_complete_token (port
, buffer
, READER_BUFFER_SIZE
,
637 &overflow_buffer
, &bytes_read
);
641 ends_with_colon
= buffer
[bytes_read
- 1] == ':';
643 ends_with_colon
= overflow_buffer
[bytes_read
- 1] == ':';
646 if (postfix
&& ends_with_colon
&& (bytes_read
> 1))
649 str
= scm_from_stringn (buffer
, bytes_read
- 1, pt
->encoding
, pt
->ilseq_handler
);
651 str
= scm_from_stringn (overflow_buffer
, bytes_read
- 1, pt
->encoding
,
654 if (SCM_CASE_INSENSITIVE_P
)
655 str
= scm_string_downcase_x (str
);
656 result
= scm_symbol_to_keyword (scm_string_to_symbol (str
));
661 str
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
, pt
->ilseq_handler
);
663 str
= scm_from_stringn (overflow_buffer
, bytes_read
, pt
->encoding
,
666 if (SCM_CASE_INSENSITIVE_P
)
667 str
= scm_string_downcase_x (str
);
668 result
= scm_string_to_symbol (str
);
672 free (overflow_buffer
);
673 SCM_COL (port
) += scm_i_string_length (str
);
678 scm_read_number_and_radix (scm_t_wchar chr
, SCM port
)
679 #define FUNC_NAME "scm_lreadr"
683 char buffer
[READER_BUFFER_SIZE
], *overflow_buffer
;
712 scm_ungetc (chr
, port
);
713 scm_ungetc ('#', port
);
717 overflow
= read_complete_token (port
, buffer
, sizeof (buffer
),
718 &overflow_buffer
, &read
);
720 pt
= SCM_PTAB_ENTRY (port
);
722 str
= scm_from_stringn (buffer
, read
, pt
->encoding
, pt
->ilseq_handler
);
724 str
= scm_from_stringn (overflow_buffer
, read
, pt
->encoding
,
727 result
= scm_string_to_number (str
, scm_from_uint (radix
));
730 free (overflow_buffer
);
732 SCM_COL (port
) += scm_i_string_length (str
);
734 if (scm_is_true (result
))
737 scm_i_input_error (FUNC_NAME
, port
, "unknown # object", SCM_EOL
);
744 scm_read_quote (int chr
, SCM port
)
747 long line
= SCM_LINUM (port
);
748 int column
= SCM_COL (port
) - 1;
753 p
= scm_sym_quasiquote
;
766 p
= scm_sym_uq_splicing
;
769 scm_ungetc (c
, port
);
776 fprintf (stderr
, "%s: unhandled quote character (%i)\n",
777 "scm_read_quote", chr
);
781 p
= scm_cons2 (p
, scm_read_expression (port
), SCM_EOL
);
782 return maybe_annotate_source (p
, port
, line
, column
);
785 SCM_SYMBOL (sym_syntax
, "syntax");
786 SCM_SYMBOL (sym_quasisyntax
, "quasisyntax");
787 SCM_SYMBOL (sym_unsyntax
, "unsyntax");
788 SCM_SYMBOL (sym_unsyntax_splicing
, "unsyntax-splicing");
791 scm_read_syntax (int chr
, SCM port
)
794 long line
= SCM_LINUM (port
);
795 int column
= SCM_COL (port
) - 1;
813 p
= sym_unsyntax_splicing
;
816 scm_ungetc (c
, port
);
823 fprintf (stderr
, "%s: unhandled syntax character (%i)\n",
824 "scm_read_syntax", chr
);
828 p
= scm_cons2 (p
, scm_read_expression (port
), SCM_EOL
);
829 return maybe_annotate_source (p
, port
, line
, column
);
833 scm_read_nil (int chr
, SCM port
)
835 SCM id
= scm_read_mixed_case_symbol (chr
, port
);
837 if (!scm_is_eq (id
, sym_nil
))
838 scm_i_input_error ("scm_read_nil", port
,
839 "unexpected input while reading #nil: ~a",
842 return SCM_ELISP_NIL
;
846 scm_read_semicolon_comment (int chr
, SCM port
)
850 /* We use the get_byte here because there is no need to get the
851 locale correct with comment input. This presumes that newline
852 always represents itself no matter what the encoding is. */
853 for (c
= scm_get_byte_or_eof (port
);
854 (c
!= EOF
) && (c
!= '\n');
855 c
= scm_get_byte_or_eof (port
));
857 return SCM_UNSPECIFIED
;
861 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
864 scm_read_boolean (int chr
, SCM port
)
877 return SCM_UNSPECIFIED
;
881 scm_read_character (scm_t_wchar chr
, SCM port
)
882 #define FUNC_NAME "scm_lreadr"
884 char buffer
[READER_CHAR_NAME_MAX_SIZE
];
886 size_t charname_len
, bytes_read
;
891 overflow
= read_token (port
, buffer
, READER_CHAR_NAME_MAX_SIZE
, &bytes_read
);
893 scm_i_input_error (FUNC_NAME
, port
, "character name too long", SCM_EOL
);
897 chr
= scm_getc (port
);
899 scm_i_input_error (FUNC_NAME
, port
, "unexpected end of file "
900 "while reading character", SCM_EOL
);
902 /* CHR must be a token delimiter, like a whitespace. */
903 return (SCM_MAKE_CHAR (chr
));
906 pt
= SCM_PTAB_ENTRY (port
);
908 /* Simple ASCII characters can be processed immediately. Also, simple
909 ISO-8859-1 characters can be processed immediately if the encoding for this
910 port is ISO-8859-1. */
911 if (bytes_read
== 1 && ((unsigned char) buffer
[0] <= 127 || pt
->encoding
== NULL
))
914 return SCM_MAKE_CHAR (buffer
[0]);
917 /* Otherwise, convert the buffer into a proper scheme string for
919 charname
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
,
921 charname_len
= scm_i_string_length (charname
);
922 SCM_COL (port
) += charname_len
;
923 cp
= scm_i_string_ref (charname
, 0);
924 if (charname_len
== 1)
925 return SCM_MAKE_CHAR (cp
);
927 /* Ignore dotted circles, which may be used to keep combining characters from
928 combining with the backslash in #\charname. */
929 if (cp
== SCM_CODEPOINT_DOTTED_CIRCLE
&& charname_len
== 2)
930 return SCM_MAKE_CHAR (scm_i_string_ref (charname
, 1));
932 if (cp
>= '0' && cp
< '8')
934 /* Dirk:FIXME:: This type of character syntax is not R5RS
935 * compliant. Further, it should be verified that the constant
936 * does only consist of octal digits. */
937 SCM p
= scm_string_to_number (charname
, scm_from_uint (8));
940 scm_t_wchar c
= scm_to_uint32 (p
);
941 if (SCM_IS_UNICODE_CHAR (c
))
942 return SCM_MAKE_CHAR (c
);
944 scm_i_input_error (FUNC_NAME
, port
,
945 "out-of-range octal character escape: ~a",
946 scm_list_1 (charname
));
950 if (cp
== 'x' && (charname_len
> 1))
954 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
955 p
= scm_string_to_number (scm_c_substring (charname
, 1, charname_len
),
959 scm_t_wchar c
= scm_to_uint32 (p
);
960 if (SCM_IS_UNICODE_CHAR (c
))
961 return SCM_MAKE_CHAR (c
);
963 scm_i_input_error (FUNC_NAME
, port
,
964 "out-of-range hex character escape: ~a",
965 scm_list_1 (charname
));
969 /* The names of characters should never have non-Latin1
971 if (scm_i_is_narrow_string (charname
)
972 || scm_i_try_narrow_string (charname
))
973 { SCM ch
= scm_i_charname_to_char (scm_i_string_chars (charname
),
975 if (scm_is_true (ch
))
979 scm_i_input_error (FUNC_NAME
, port
, "unknown character name ~a",
980 scm_list_1 (charname
));
982 return SCM_UNSPECIFIED
;
987 scm_read_keyword (int chr
, SCM port
)
991 /* Read the symbol that comprises the keyword. Doing this instead of
992 invoking a specific symbol reader function allows `scm_read_keyword ()'
993 to adapt to the delimiters currently valid of symbols.
995 XXX: This implementation allows sloppy syntaxes like `#: key'. */
996 symbol
= scm_read_expression (port
);
997 if (!scm_is_symbol (symbol
))
998 scm_i_input_error ("scm_read_keyword", port
,
999 "keyword prefix `~a' not followed by a symbol: ~s",
1000 scm_list_2 (SCM_MAKE_CHAR (chr
), symbol
));
1002 return (scm_symbol_to_keyword (symbol
));
1006 scm_read_vector (int chr
, SCM port
)
1008 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1009 guarantee that it's going to do what we want. After all, this is an
1010 implementation detail of `scm_read_vector ()', not a desirable
1012 return (scm_vector (scm_read_sexp (chr
, port
)));
1016 scm_read_srfi4_vector (int chr
, SCM port
)
1018 return scm_i_read_array (port
, chr
);
1022 scm_read_bytevector (scm_t_wchar chr
, SCM port
)
1024 chr
= scm_getc (port
);
1028 chr
= scm_getc (port
);
1032 chr
= scm_getc (port
);
1036 return scm_u8_list_to_bytevector (scm_read_sexp (chr
, port
));
1039 scm_i_input_error ("read_bytevector", port
,
1040 "invalid bytevector prefix",
1041 SCM_MAKE_CHAR (chr
));
1042 return SCM_UNSPECIFIED
;
1046 scm_read_guile_bit_vector (scm_t_wchar chr
, SCM port
)
1048 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1049 terribly inefficient but who cares? */
1050 SCM s_bits
= SCM_EOL
;
1052 for (chr
= scm_getc (port
);
1053 (chr
!= EOF
) && ((chr
== '0') || (chr
== '1'));
1054 chr
= scm_getc (port
))
1056 s_bits
= scm_cons ((chr
== '0') ? SCM_BOOL_F
: SCM_BOOL_T
, s_bits
);
1060 scm_ungetc (chr
, port
);
1062 return scm_bitvector (scm_reverse_x (s_bits
, SCM_EOL
));
1066 scm_read_scsh_block_comment (scm_t_wchar chr
, SCM port
)
1072 int c
= scm_getc (port
);
1075 scm_i_input_error ("skip_block_comment", port
,
1076 "unterminated `#! ... !#' comment", SCM_EOL
);
1080 else if (c
== '#' && bang_seen
)
1086 return SCM_UNSPECIFIED
;
1090 scm_read_shebang (scm_t_wchar chr
, SCM port
)
1093 if ((c
= scm_get_byte_or_eof (port
)) != 'r')
1095 scm_ungetc (c
, port
);
1096 return scm_read_scsh_block_comment (chr
, port
);
1098 if ((c
= scm_get_byte_or_eof (port
)) != '6')
1100 scm_ungetc (c
, port
);
1101 scm_ungetc ('r', port
);
1102 return scm_read_scsh_block_comment (chr
, port
);
1104 if ((c
= scm_get_byte_or_eof (port
)) != 'r')
1106 scm_ungetc (c
, port
);
1107 scm_ungetc ('6', port
);
1108 scm_ungetc ('r', port
);
1109 return scm_read_scsh_block_comment (chr
, port
);
1111 if ((c
= scm_get_byte_or_eof (port
)) != 's')
1113 scm_ungetc (c
, port
);
1114 scm_ungetc ('r', port
);
1115 scm_ungetc ('6', port
);
1116 scm_ungetc ('r', port
);
1117 return scm_read_scsh_block_comment (chr
, port
);
1120 return SCM_UNSPECIFIED
;
1124 scm_read_r6rs_block_comment (scm_t_wchar chr
, SCM port
)
1126 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1127 nested. So care must be taken. */
1128 int nesting_level
= 1;
1130 int a
= scm_getc (port
);
1133 scm_i_input_error ("scm_read_r6rs_block_comment", port
,
1134 "unterminated `#| ... |#' comment", SCM_EOL
);
1136 while (nesting_level
> 0)
1138 int b
= scm_getc (port
);
1141 scm_i_input_error ("scm_read_r6rs_block_comment", port
,
1142 "unterminated `#| ... |#' comment", SCM_EOL
);
1144 if (a
== '|' && b
== '#')
1149 else if (a
== '#' && b
== '|')
1158 return SCM_UNSPECIFIED
;
1162 scm_read_commented_expression (scm_t_wchar chr
, SCM port
)
1166 c
= flush_ws (port
, (char *) NULL
);
1168 scm_i_input_error ("read_commented_expression", port
,
1169 "no expression after #; comment", SCM_EOL
);
1170 scm_ungetc (c
, port
);
1171 scm_read_expression (port
);
1172 return SCM_UNSPECIFIED
;
1176 scm_read_extended_symbol (scm_t_wchar chr
, SCM port
)
1178 /* Guile's extended symbol read syntax looks like this:
1180 #{This is all a symbol name}#
1182 So here, CHR is expected to be `{'. */
1185 SCM buf
= scm_i_make_string (1024, NULL
, 0);
1187 buf
= scm_i_string_start_writing (buf
);
1189 while ((chr
= scm_getc (port
)) != EOF
)
1200 scm_i_string_set_x (buf
, len
++, '}');
1206 else if (chr
== '\\')
1208 /* It used to be that print.c would print extended-read-syntax
1209 symbols with backslashes before "non-standard" chars, but
1210 this routine wouldn't do anything with those escapes.
1211 Bummer. What we've done is to change print.c to output
1212 R6RS hex escapes for those characters, relying on the fact
1213 that the extended read syntax would never put a `\' before
1214 an `x'. For now, we just ignore other instances of
1215 backslash in the string. */
1216 switch ((chr
= scm_getc (port
)))
1224 SCM_READ_HEX_ESCAPE (10, ';');
1225 scm_i_string_set_x (buf
, len
++, c
);
1233 scm_i_string_stop_writing ();
1234 scm_i_input_error ("scm_read_extended_symbol", port
,
1235 "illegal character in escape sequence: ~S",
1236 scm_list_1 (SCM_MAKE_CHAR (c
)));
1240 scm_i_string_set_x (buf
, len
++, chr
);
1245 scm_i_string_set_x (buf
, len
++, chr
);
1247 if (len
>= scm_i_string_length (buf
) - 2)
1251 scm_i_string_stop_writing ();
1252 addy
= scm_i_make_string (1024, NULL
, 0);
1253 buf
= scm_string_append (scm_list_2 (buf
, addy
));
1255 buf
= scm_i_string_start_writing (buf
);
1260 scm_i_string_stop_writing ();
1262 scm_i_input_error ("scm_read_extended_symbol", port
,
1263 "end of file while reading symbol", SCM_EOL
);
1265 return (scm_string_to_symbol (scm_c_substring (buf
, 0, len
)));
1270 /* Top-level token readers, i.e., dispatchers. */
1273 scm_read_sharp_extension (int chr
, SCM port
)
1277 proc
= scm_get_hash_procedure (chr
);
1278 if (scm_is_true (scm_procedure_p (proc
)))
1280 long line
= SCM_LINUM (port
);
1281 int column
= SCM_COL (port
) - 2;
1284 got
= scm_call_2 (proc
, SCM_MAKE_CHAR (chr
), port
);
1286 if (scm_is_pair (got
) && !scm_i_has_source_properties (got
))
1287 scm_i_set_source_properties_x (got
, line
, column
, SCM_FILENAME (port
));
1292 return SCM_UNSPECIFIED
;
1295 /* The reader for the sharp `#' character. It basically dispatches reads
1296 among the above token readers. */
1298 scm_read_sharp (scm_t_wchar chr
, SCM port
)
1299 #define FUNC_NAME "scm_lreadr"
1303 chr
= scm_getc (port
);
1305 result
= scm_read_sharp_extension (chr
, port
);
1306 if (!scm_is_eq (result
, SCM_UNSPECIFIED
))
1312 return (scm_read_character (chr
, port
));
1314 return (scm_read_vector (chr
, port
));
1319 /* This one may return either a boolean or an SRFI-4 vector. */
1320 return (scm_read_srfi4_vector (chr
, port
));
1322 return (scm_read_bytevector (chr
, port
));
1324 return (scm_read_guile_bit_vector (chr
, port
));
1328 return (scm_read_boolean (chr
, port
));
1330 return (scm_read_keyword (chr
, port
));
1331 case '0': case '1': case '2': case '3': case '4':
1332 case '5': case '6': case '7': case '8': case '9':
1334 #if SCM_ENABLE_DEPRECATED
1335 /* See below for 'i' and 'e'. */
1341 return (scm_i_read_array (port
, chr
));
1345 #if SCM_ENABLE_DEPRECATED
1347 /* When next char is '(', it really is an old-style
1349 scm_t_wchar next_c
= scm_getc (port
);
1351 scm_ungetc (next_c
, port
);
1353 return scm_i_read_array (port
, chr
);
1367 return (scm_read_number_and_radix (chr
, port
));
1369 return (scm_read_extended_symbol (chr
, port
));
1371 return (scm_read_shebang (chr
, port
));
1373 return (scm_read_commented_expression (chr
, port
));
1377 return (scm_read_syntax (chr
, port
));
1379 return (scm_read_nil (chr
, port
));
1381 result
= scm_read_sharp_extension (chr
, port
);
1382 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1384 /* To remain compatible with 1.8 and earlier, the following
1385 characters have lower precedence than `read-hash-extend'
1390 return scm_read_r6rs_block_comment (chr
, port
);
1392 scm_i_input_error (FUNC_NAME
, port
, "Unknown # object: ~S",
1393 scm_list_1 (SCM_MAKE_CHAR (chr
)));
1400 return SCM_UNSPECIFIED
;
1405 scm_read_expression (SCM port
)
1406 #define FUNC_NAME "scm_read_expression"
1412 chr
= scm_getc (port
);
1416 case SCM_WHITE_SPACES
:
1417 case SCM_LINE_INCREMENTORS
:
1420 (void) scm_read_semicolon_comment (chr
, port
);
1423 if (!SCM_SQUARE_BRACKETS_P
)
1424 return (scm_read_mixed_case_symbol (chr
, port
));
1425 /* otherwise fall through */
1427 return (scm_read_sexp (chr
, port
));
1429 return (scm_read_string (chr
, port
));
1433 return (scm_read_quote (chr
, port
));
1437 result
= scm_read_sharp (chr
, port
);
1438 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1439 /* We read a comment or some such. */
1445 scm_i_input_error (FUNC_NAME
, port
, "unexpected \")\"", SCM_EOL
);
1448 if (SCM_SQUARE_BRACKETS_P
)
1449 scm_i_input_error (FUNC_NAME
, port
, "unexpected \"]\"", SCM_EOL
);
1450 /* otherwise fall through */
1454 if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE
), scm_keyword_prefix
))
1455 return scm_symbol_to_keyword (scm_read_expression (port
));
1460 if (((chr
>= '0') && (chr
<= '9'))
1461 || (strchr ("+-.", chr
)))
1462 return (scm_read_number (chr
, port
));
1464 return (scm_read_mixed_case_symbol (chr
, port
));
1472 /* Actual reader. */
1474 SCM_DEFINE (scm_read
, "read", 0, 1, 0,
1476 "Read an s-expression from the input port @var{port}, or from\n"
1477 "the current input port if @var{port} is not specified.\n"
1478 "Any whitespace before the next token is discarded.")
1479 #define FUNC_NAME s_scm_read
1483 if (SCM_UNBNDP (port
))
1484 port
= scm_current_input_port ();
1485 SCM_VALIDATE_OPINPORT (1, port
);
1487 c
= flush_ws (port
, (char *) NULL
);
1490 scm_ungetc (c
, port
);
1492 return (scm_read_expression (port
));
1499 /* Manipulate the read-hash-procedures alist. This could be written in
1500 Scheme, but maybe it will also be used by C code during initialisation. */
1501 SCM_DEFINE (scm_read_hash_extend
, "read-hash-extend", 2, 0, 0,
1502 (SCM chr
, SCM proc
),
1503 "Install the procedure @var{proc} for reading expressions\n"
1504 "starting with the character sequence @code{#} and @var{chr}.\n"
1505 "@var{proc} will be called with two arguments: the character\n"
1506 "@var{chr} and the port to read further data from. The object\n"
1507 "returned will be the return value of @code{read}. \n"
1508 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1510 #define FUNC_NAME s_scm_read_hash_extend
1515 SCM_VALIDATE_CHAR (1, chr
);
1516 SCM_ASSERT (scm_is_false (proc
)
1517 || scm_is_eq (scm_procedure_p (proc
), SCM_BOOL_T
),
1518 proc
, SCM_ARG2
, FUNC_NAME
);
1520 /* Check if chr is already in the alist. */
1521 this = scm_i_read_hash_procedures_ref ();
1525 if (scm_is_null (this))
1527 /* not found, so add it to the beginning. */
1528 if (scm_is_true (proc
))
1530 SCM
new = scm_cons (scm_cons (chr
, proc
),
1531 scm_i_read_hash_procedures_ref ());
1532 scm_i_read_hash_procedures_set_x (new);
1536 if (scm_is_eq (chr
, SCM_CAAR (this)))
1538 /* already in the alist. */
1539 if (scm_is_false (proc
))
1542 if (scm_is_false (prev
))
1544 SCM rest
= SCM_CDR (scm_i_read_hash_procedures_ref ());
1545 scm_i_read_hash_procedures_set_x (rest
);
1548 scm_set_cdr_x (prev
, SCM_CDR (this));
1553 scm_set_cdr_x (SCM_CAR (this), proc
);
1558 this = SCM_CDR (this);
1561 return SCM_UNSPECIFIED
;
1565 /* Recover the read-hash procedure corresponding to char c. */
1567 scm_get_hash_procedure (int c
)
1569 SCM rest
= scm_i_read_hash_procedures_ref ();
1573 if (scm_is_null (rest
))
1576 if (SCM_CHAR (SCM_CAAR (rest
)) == c
)
1577 return SCM_CDAR (rest
);
1579 rest
= SCM_CDR (rest
);
1583 #define SCM_ENCODING_SEARCH_SIZE (500)
1585 /* Search the first few hundred characters of a file for an Emacs-like coding
1586 declaration. Returns either NULL or a string whose storage has been
1587 allocated with `scm_gc_malloc ()'. */
1589 scm_i_scan_for_encoding (SCM port
)
1592 char header
[SCM_ENCODING_SEARCH_SIZE
+1];
1593 size_t bytes_read
, encoding_length
, i
;
1594 char *encoding
= NULL
;
1596 char *pos
, *encoding_start
;
1599 pt
= SCM_PTAB_ENTRY (port
);
1601 if (pt
->rw_active
== SCM_PORT_WRITE
)
1605 pt
->rw_active
= SCM_PORT_READ
;
1607 if (pt
->read_pos
== pt
->read_end
)
1609 /* We can use the read buffer, and thus avoid a seek. */
1610 if (scm_fill_input (port
) == EOF
)
1613 bytes_read
= pt
->read_end
- pt
->read_pos
;
1614 if (bytes_read
> SCM_ENCODING_SEARCH_SIZE
)
1615 bytes_read
= SCM_ENCODING_SEARCH_SIZE
;
1617 if (bytes_read
<= 1)
1618 /* An unbuffered port -- don't scan. */
1621 memcpy (header
, pt
->read_pos
, bytes_read
);
1622 header
[bytes_read
] = '\0';
1626 /* Try to read some bytes and then seek back. Not all ports
1627 support seeking back; and indeed some file ports (like
1628 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
1629 check performed by SCM_FPORT_FDES---but fail to seek
1630 backwards. Hence this block comes second. We prefer to use
1631 the read buffer in-place. */
1632 if (SCM_FPORTP (port
) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port
)))
1635 bytes_read
= scm_c_read (port
, header
, SCM_ENCODING_SEARCH_SIZE
);
1636 header
[bytes_read
] = '\0';
1637 scm_seek (port
, scm_from_int (0), scm_from_int (SEEK_SET
));
1641 && header
[0] == '\xef' && header
[1] == '\xbb' && header
[2] == '\xbf')
1644 /* search past "coding[:=]" */
1648 if ((pos
= strstr(pos
, "coding")) == NULL
)
1651 pos
+= strlen("coding");
1652 if (pos
- header
>= SCM_ENCODING_SEARCH_SIZE
||
1653 (*pos
== ':' || *pos
== '='))
1661 while (pos
- header
<= SCM_ENCODING_SEARCH_SIZE
&&
1662 (*pos
== ' ' || *pos
== '\t'))
1665 /* grab the next token */
1666 encoding_start
= pos
;
1668 while (encoding_start
+ i
- header
<= SCM_ENCODING_SEARCH_SIZE
1669 && encoding_start
+ i
- header
< bytes_read
1670 && (isalnum ((int) encoding_start
[i
])
1671 || strchr ("_-.:/,+=()", encoding_start
[i
]) != NULL
))
1674 encoding_length
= i
;
1675 if (encoding_length
== 0)
1678 encoding
= scm_gc_strndup (encoding_start
, encoding_length
, "encoding");
1679 for (i
= 0; i
< encoding_length
; i
++)
1680 encoding
[i
] = toupper ((int) encoding
[i
]);
1682 /* push backwards to make sure we were in a comment */
1684 pos
= encoding_start
;
1685 while (pos
>= header
)
1692 else if (*pos
== '\n' || pos
== header
)
1694 /* This wasn't in a semicolon comment. Check for a
1695 hash-bang comment. */
1696 char *beg
= strstr (header
, "#!");
1697 char *end
= strstr (header
, "!#");
1698 if (beg
< encoding_start
&& encoding_start
+ encoding_length
<= end
)
1709 /* This wasn't in a comment */
1712 if (utf8_bom
&& strcmp(encoding
, "UTF-8"))
1713 scm_misc_error (NULL
,
1714 "the port input declares the encoding ~s but is encoded as UTF-8",
1715 scm_list_1 (scm_from_locale_string (encoding
)));
1720 SCM_DEFINE (scm_file_encoding
, "file-encoding", 1, 0, 0,
1722 "Scans the port for an Emacs-like character coding declaration\n"
1723 "near the top of the contents of a port with random-accessible contents.\n"
1724 "The coding declaration is of the form\n"
1725 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1727 "Returns a string containing the character encoding of the file\n"
1728 "if a declaration was found, or @code{#f} otherwise.\n")
1729 #define FUNC_NAME s_scm_file_encoding
1734 SCM_VALIDATE_OPINPORT (SCM_ARG1
, port
);
1736 enc
= scm_i_scan_for_encoding (port
);
1741 s_enc
= scm_from_locale_string (enc
);
1752 SCM read_hash_procs
;
1754 read_hash_procs
= scm_make_fluid_with_default (SCM_EOL
);
1756 scm_i_read_hash_procedures
=
1757 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs
));
1759 scm_init_opts (scm_read_options
, scm_read_opts
);
1760 #include "libguile/read.x"