1 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
2 * 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
34 #include "libguile/_scm.h"
35 #include "libguile/bytevectors.h"
36 #include "libguile/chars.h"
37 #include "libguile/eval.h"
38 #include "libguile/arrays.h"
39 #include "libguile/bitvectors.h"
40 #include "libguile/keywords.h"
41 #include "libguile/alist.h"
42 #include "libguile/srcprop.h"
43 #include "libguile/hashtab.h"
44 #include "libguile/hash.h"
45 #include "libguile/ports.h"
46 #include "libguile/fports.h"
47 #include "libguile/root.h"
48 #include "libguile/strings.h"
49 #include "libguile/strports.h"
50 #include "libguile/vectors.h"
51 #include "libguile/validate.h"
52 #include "libguile/srfi-4.h"
53 #include "libguile/srfi-13.h"
55 #include "libguile/read.h"
56 #include "libguile/private-options.h"
61 SCM_GLOBAL_SYMBOL (scm_sym_dot
, ".");
62 SCM_SYMBOL (scm_keyword_prefix
, "prefix");
63 SCM_SYMBOL (scm_keyword_postfix
, "postfix");
64 SCM_SYMBOL (sym_nil
, "nil");
66 scm_t_option scm_read_opts
[] = {
67 { SCM_OPTION_BOOLEAN
, "copy", 0,
68 "Copy source code expressions." },
69 { SCM_OPTION_BOOLEAN
, "positions", 1,
70 "Record positions of source code expressions." },
71 { SCM_OPTION_BOOLEAN
, "case-insensitive", 0,
72 "Convert symbols to lower case."},
73 { SCM_OPTION_SCM
, "keywords", (scm_t_bits
) SCM_BOOL_F_BITS
,
74 "Style of keyword recognition: #f, 'prefix or 'postfix."},
75 { SCM_OPTION_BOOLEAN
, "r6rs-hex-escapes", 0,
76 "Use R6RS variable-length character and string hex escapes."},
77 { SCM_OPTION_BOOLEAN
, "square-brackets", 1,
78 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
79 { SCM_OPTION_BOOLEAN
, "hungry-eol-escapes", 0,
80 "In strings, consume leading whitespace after an escaped end-of-line."},
85 Give meaningful error messages for errors
89 FILE:LINE:COL: MESSAGE
92 This is not standard GNU format, but the test-suite likes the real
93 message to be in front.
99 scm_i_input_error (char const *function
,
100 SCM port
, const char *message
, SCM arg
)
102 SCM fn
= (scm_is_string (SCM_FILENAME(port
))
104 : scm_from_locale_string ("#<unknown port>"));
106 SCM string_port
= scm_open_output_string ();
107 SCM string
= SCM_EOL
;
108 scm_simple_format (string_port
,
109 scm_from_locale_string ("~A:~S:~S: ~A"),
111 scm_from_long (SCM_LINUM (port
) + 1),
112 scm_from_int (SCM_COL (port
) + 1),
113 scm_from_locale_string (message
)));
115 string
= scm_get_output_string (string_port
);
116 scm_close_output_port (string_port
);
117 scm_error_scm (scm_from_latin1_symbol ("read-error"),
118 function
? scm_from_locale_string (function
) : SCM_BOOL_F
,
125 SCM_DEFINE (scm_read_options
, "read-options-interface", 0, 1, 0,
127 "Option interface for the read options. Instead of using\n"
128 "this procedure directly, use the procedures @code{read-enable},\n"
129 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
130 #define FUNC_NAME s_scm_read_options
132 SCM ans
= scm_options (setting
,
135 if (SCM_COPY_SOURCE_P
)
136 SCM_RECORD_POSITIONS_P
= 1;
141 /* A fluid referring to an association list mapping extra hash
142 characters to procedures. */
143 static SCM
*scm_i_read_hash_procedures
;
146 scm_i_read_hash_procedures_ref (void)
148 return scm_fluid_ref (*scm_i_read_hash_procedures
);
152 scm_i_read_hash_procedures_set_x (SCM value
)
154 scm_fluid_set_x (*scm_i_read_hash_procedures
, value
);
161 /* Size of the C buffer used to read symbols and numbers. */
162 #define READER_BUFFER_SIZE 128
164 /* Size of the C buffer used to read strings. */
165 #define READER_STRING_BUFFER_SIZE 512
167 /* The maximum size of Scheme character names. */
168 #define READER_CHAR_NAME_MAX_SIZE 50
171 /* `isblank' is only in C99. */
172 #define CHAR_IS_BLANK_(_chr) \
173 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
174 || ((_chr) == '\f') || ((_chr) == '\r'))
177 # define CHAR_IS_BLANK(_chr) \
178 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
180 # define CHAR_IS_BLANK CHAR_IS_BLANK_
184 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
186 #define CHAR_IS_R5RS_DELIMITER(c) \
188 || (c == ')') || (c == '(') || (c == ';') || (c == '"') \
189 || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
191 #define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
193 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
195 #define CHAR_IS_EXPONENT_MARKER(_chr) \
196 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
197 || ((_chr) == 'd') || ((_chr) == 'l'))
199 /* Read an SCSH block comment. */
200 static SCM
scm_read_scsh_block_comment (scm_t_wchar
, SCM
);
201 static SCM
scm_read_r6rs_block_comment (scm_t_wchar
, SCM
);
202 static SCM
scm_read_commented_expression (scm_t_wchar
, SCM
);
203 static SCM
scm_read_shebang (scm_t_wchar
, SCM
);
204 static SCM
scm_get_hash_procedure (int);
206 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
207 result in the pre-allocated buffer BUF. Return zero if the whole token has
208 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
209 bytes actually read. */
211 read_token (SCM port
, char *buf
, const size_t buf_size
, size_t *read
)
215 while (*read
< buf_size
)
219 chr
= scm_get_byte_or_eof (port
);
223 else if (CHAR_IS_DELIMITER (chr
))
225 scm_unget_byte (chr
, port
);
238 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
239 result in the pre-allocated buffer BUFFER, if the whole token has fewer than
240 BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
241 caller. Return zero if the token fits in BUFFER, non-zero otherwise. READ
242 will be set the number of bytes actually read. */
244 read_complete_token (SCM port
, char *buffer
, const size_t buffer_size
,
245 char **overflow_buffer
, size_t *read
)
248 size_t bytes_read
, overflow_size
;
250 *overflow_buffer
= NULL
;
255 overflow
= read_token (port
, buffer
, buffer_size
, &bytes_read
);
258 if (overflow
|| overflow_size
!= 0)
260 if (overflow_size
== 0)
262 *overflow_buffer
= scm_malloc (bytes_read
);
263 memcpy (*overflow_buffer
, buffer
, bytes_read
);
264 overflow_size
= bytes_read
;
268 *overflow_buffer
= scm_realloc (*overflow_buffer
, overflow_size
+ bytes_read
);
269 memcpy (*overflow_buffer
+ overflow_size
, buffer
, bytes_read
);
270 overflow_size
+= bytes_read
;
277 *read
= overflow_size
;
281 return (overflow_size
!= 0);
284 /* Skip whitespace from PORT and return the first non-whitespace character
285 read. Raise an error on end-of-file. */
287 flush_ws (SCM port
, const char *eoferr
)
291 switch (c
= scm_getc (port
))
297 scm_i_input_error (eoferr
,
306 switch (c
= scm_getc (port
))
312 case SCM_LINE_INCREMENTORS
:
318 switch (c
= scm_getc (port
))
321 eoferr
= "read_sharp";
324 scm_read_shebang (c
, port
);
327 scm_read_commented_expression (c
, port
);
330 if (scm_is_false (scm_get_hash_procedure (c
)))
332 scm_read_r6rs_block_comment (c
, port
);
337 scm_ungetc (c
, port
);
342 case SCM_LINE_INCREMENTORS
:
343 case SCM_SINGLE_SPACES
:
358 static SCM
scm_read_expression (SCM port
);
359 static SCM
scm_read_sharp (int chr
, SCM port
, long line
, int column
);
363 maybe_annotate_source (SCM x
, SCM port
, long line
, int column
)
365 if (SCM_RECORD_POSITIONS_P
)
366 scm_i_set_source_properties_x (x
, line
, column
, SCM_FILENAME (port
));
371 scm_read_sexp (scm_t_wchar chr
, SCM port
)
372 #define FUNC_NAME "scm_i_lreadparen"
375 SCM tmp
, tl
, ans
= SCM_EOL
;
376 const int terminating_char
= ((chr
== '[') ? ']' : ')');
378 /* Need to capture line and column numbers here. */
379 long line
= SCM_LINUM (port
);
380 int column
= SCM_COL (port
) - 1;
382 c
= flush_ws (port
, FUNC_NAME
);
383 if (terminating_char
== c
)
386 scm_ungetc (c
, port
);
387 tmp
= scm_read_expression (port
);
389 /* Note that it is possible for scm_read_expression to return
390 scm_sym_dot, but not as part of a dotted pair: as in #{.}#. So
391 check that it's a real dot by checking `c'. */
392 if (c
== '.' && scm_is_eq (scm_sym_dot
, tmp
))
394 ans
= scm_read_expression (port
);
395 if (terminating_char
!= (c
= flush_ws (port
, FUNC_NAME
)))
396 scm_i_input_error (FUNC_NAME
, port
, "missing close paren",
401 /* Build the head of the list structure. */
402 ans
= tl
= scm_cons (tmp
, SCM_EOL
);
404 while (terminating_char
!= (c
= flush_ws (port
, FUNC_NAME
)))
408 if (c
== ')' || (SCM_SQUARE_BRACKETS_P
&& c
== ']'))
409 scm_i_input_error (FUNC_NAME
, port
,
410 "in pair: mismatched close paren: ~A",
411 scm_list_1 (SCM_MAKE_CHAR (c
)));
413 scm_ungetc (c
, port
);
414 tmp
= scm_read_expression (port
);
416 /* See above note about scm_sym_dot. */
417 if (c
== '.' && scm_is_eq (scm_sym_dot
, tmp
))
419 SCM_SETCDR (tl
, scm_read_expression (port
));
421 c
= flush_ws (port
, FUNC_NAME
);
422 if (terminating_char
!= c
)
423 scm_i_input_error (FUNC_NAME
, port
,
424 "in pair: missing close paren", SCM_EOL
);
428 new_tail
= scm_cons (tmp
, SCM_EOL
);
429 SCM_SETCDR (tl
, new_tail
);
434 return maybe_annotate_source (ans
, port
, line
, column
);
439 /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
440 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
442 #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
448 while (i < ndigits) \
450 a = scm_getc (port); \
454 && (a == (scm_t_wchar) terminator) \
457 if ('0' <= a && a <= '9') \
459 else if ('A' <= a && a <= 'F') \
461 else if ('a' <= a && a <= 'f') \
474 skip_intraline_whitespace (SCM port
)
484 while (c
== '\t' || uc_is_general_category (c
, UC_SPACE_SEPARATOR
));
486 scm_ungetc (c
, port
);
490 scm_read_string (int chr
, SCM port
)
491 #define FUNC_NAME "scm_lreadr"
493 /* For strings smaller than C_STR, this function creates only one Scheme
494 object (the string returned). */
496 SCM str
= SCM_BOOL_F
;
497 unsigned c_str_len
= 0;
500 /* Need to capture line and column numbers here. */
501 long line
= SCM_LINUM (port
);
502 int column
= SCM_COL (port
) - 1;
504 str
= scm_i_make_string (READER_STRING_BUFFER_SIZE
, NULL
, 0);
505 while ('"' != (c
= scm_getc (port
)))
510 scm_i_input_error (FUNC_NAME
, port
,
511 "end of file in string constant", SCM_EOL
);
514 if (c_str_len
+ 1 >= scm_i_string_length (str
))
516 SCM addy
= scm_i_make_string (READER_STRING_BUFFER_SIZE
, NULL
, 0);
518 str
= scm_string_append (scm_list_2 (str
, addy
));
523 switch (c
= scm_getc (port
))
531 if (SCM_HUNGRY_EOL_ESCAPES_P
)
532 skip_intraline_whitespace (port
);
559 if (SCM_R6RS_ESCAPES_P
)
560 SCM_READ_HEX_ESCAPE (10, ';');
562 SCM_READ_HEX_ESCAPE (2, '\0');
565 if (!SCM_R6RS_ESCAPES_P
)
567 SCM_READ_HEX_ESCAPE (4, '\0');
571 if (!SCM_R6RS_ESCAPES_P
)
573 SCM_READ_HEX_ESCAPE (6, '\0');
578 scm_i_input_error (FUNC_NAME
, port
,
579 "illegal character in escape sequence: ~S",
580 scm_list_1 (SCM_MAKE_CHAR (c
)));
583 str
= scm_i_string_start_writing (str
);
584 scm_i_string_set_x (str
, c_str_len
++, c
);
585 scm_i_string_stop_writing ();
587 return maybe_annotate_source (scm_i_substring_copy (str
, 0, c_str_len
),
594 scm_read_number (scm_t_wchar chr
, SCM port
)
596 SCM result
, str
= SCM_EOL
;
597 char buffer
[READER_BUFFER_SIZE
];
598 char *overflow_buffer
= NULL
;
601 scm_t_port
*pt
= SCM_PTAB_ENTRY (port
);
603 /* Need to capture line and column numbers here. */
604 long line
= SCM_LINUM (port
);
605 int column
= SCM_COL (port
) - 1;
607 scm_ungetc (chr
, port
);
608 overflow
= read_complete_token (port
, buffer
, sizeof (buffer
),
609 &overflow_buffer
, &bytes_read
);
612 str
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
, pt
->ilseq_handler
);
614 str
= scm_from_stringn (overflow_buffer
, bytes_read
, pt
->encoding
,
617 result
= scm_string_to_number (str
, SCM_UNDEFINED
);
618 if (scm_is_false (result
))
620 /* Return a symbol instead of a number */
621 if (SCM_CASE_INSENSITIVE_P
)
622 str
= scm_string_downcase_x (str
);
623 result
= scm_string_to_symbol (str
);
625 else if (SCM_NIMP (result
))
626 result
= maybe_annotate_source (result
, port
, line
, column
);
629 free (overflow_buffer
);
630 SCM_COL (port
) += scm_i_string_length (str
);
635 scm_read_mixed_case_symbol (scm_t_wchar chr
, SCM port
)
638 int ends_with_colon
= 0;
640 int postfix
= scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE
), scm_keyword_postfix
);
642 char buffer
[READER_BUFFER_SIZE
], *overflow_buffer
;
643 scm_t_port
*pt
= SCM_PTAB_ENTRY (port
);
646 scm_ungetc (chr
, port
);
647 overflow
= read_complete_token (port
, buffer
, READER_BUFFER_SIZE
,
648 &overflow_buffer
, &bytes_read
);
652 ends_with_colon
= buffer
[bytes_read
- 1] == ':';
654 ends_with_colon
= overflow_buffer
[bytes_read
- 1] == ':';
657 if (postfix
&& ends_with_colon
&& (bytes_read
> 1))
660 str
= scm_from_stringn (buffer
, bytes_read
- 1, pt
->encoding
, pt
->ilseq_handler
);
662 str
= scm_from_stringn (overflow_buffer
, bytes_read
- 1, pt
->encoding
,
665 if (SCM_CASE_INSENSITIVE_P
)
666 str
= scm_string_downcase_x (str
);
667 result
= scm_symbol_to_keyword (scm_string_to_symbol (str
));
672 str
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
, pt
->ilseq_handler
);
674 str
= scm_from_stringn (overflow_buffer
, bytes_read
, pt
->encoding
,
677 if (SCM_CASE_INSENSITIVE_P
)
678 str
= scm_string_downcase_x (str
);
679 result
= scm_string_to_symbol (str
);
683 free (overflow_buffer
);
684 SCM_COL (port
) += scm_i_string_length (str
);
689 scm_read_number_and_radix (scm_t_wchar chr
, SCM port
)
690 #define FUNC_NAME "scm_lreadr"
694 char buffer
[READER_BUFFER_SIZE
], *overflow_buffer
;
723 scm_ungetc (chr
, port
);
724 scm_ungetc ('#', port
);
728 overflow
= read_complete_token (port
, buffer
, sizeof (buffer
),
729 &overflow_buffer
, &read
);
731 pt
= SCM_PTAB_ENTRY (port
);
733 str
= scm_from_stringn (buffer
, read
, pt
->encoding
, pt
->ilseq_handler
);
735 str
= scm_from_stringn (overflow_buffer
, read
, pt
->encoding
,
738 result
= scm_string_to_number (str
, scm_from_uint (radix
));
741 free (overflow_buffer
);
743 SCM_COL (port
) += scm_i_string_length (str
);
745 if (scm_is_true (result
))
748 scm_i_input_error (FUNC_NAME
, port
, "unknown # object", SCM_EOL
);
755 scm_read_quote (int chr
, SCM port
)
758 long line
= SCM_LINUM (port
);
759 int column
= SCM_COL (port
) - 1;
764 p
= scm_sym_quasiquote
;
777 p
= scm_sym_uq_splicing
;
780 scm_ungetc (c
, port
);
787 fprintf (stderr
, "%s: unhandled quote character (%i)\n",
788 "scm_read_quote", chr
);
792 p
= scm_cons2 (p
, scm_read_expression (port
), SCM_EOL
);
793 return maybe_annotate_source (p
, port
, line
, column
);
796 SCM_SYMBOL (sym_syntax
, "syntax");
797 SCM_SYMBOL (sym_quasisyntax
, "quasisyntax");
798 SCM_SYMBOL (sym_unsyntax
, "unsyntax");
799 SCM_SYMBOL (sym_unsyntax_splicing
, "unsyntax-splicing");
802 scm_read_syntax (int chr
, SCM port
)
805 long line
= SCM_LINUM (port
);
806 int column
= SCM_COL (port
) - 1;
824 p
= sym_unsyntax_splicing
;
827 scm_ungetc (c
, port
);
834 fprintf (stderr
, "%s: unhandled syntax character (%i)\n",
835 "scm_read_syntax", chr
);
839 p
= scm_cons2 (p
, scm_read_expression (port
), SCM_EOL
);
840 return maybe_annotate_source (p
, port
, line
, column
);
844 scm_read_nil (int chr
, SCM port
)
846 SCM id
= scm_read_mixed_case_symbol (chr
, port
);
848 if (!scm_is_eq (id
, sym_nil
))
849 scm_i_input_error ("scm_read_nil", port
,
850 "unexpected input while reading #nil: ~a",
853 return SCM_ELISP_NIL
;
857 scm_read_semicolon_comment (int chr
, SCM port
)
861 /* We use the get_byte here because there is no need to get the
862 locale correct with comment input. This presumes that newline
863 always represents itself no matter what the encoding is. */
864 for (c
= scm_get_byte_or_eof (port
);
865 (c
!= EOF
) && (c
!= '\n');
866 c
= scm_get_byte_or_eof (port
));
868 return SCM_UNSPECIFIED
;
872 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
875 scm_read_boolean (int chr
, SCM port
)
888 return SCM_UNSPECIFIED
;
892 scm_read_character (scm_t_wchar chr
, SCM port
)
893 #define FUNC_NAME "scm_lreadr"
895 char buffer
[READER_CHAR_NAME_MAX_SIZE
];
897 size_t charname_len
, bytes_read
;
902 overflow
= read_token (port
, buffer
, READER_CHAR_NAME_MAX_SIZE
, &bytes_read
);
904 scm_i_input_error (FUNC_NAME
, port
, "character name too long", SCM_EOL
);
908 chr
= scm_getc (port
);
910 scm_i_input_error (FUNC_NAME
, port
, "unexpected end of file "
911 "while reading character", SCM_EOL
);
913 /* CHR must be a token delimiter, like a whitespace. */
914 return (SCM_MAKE_CHAR (chr
));
917 pt
= SCM_PTAB_ENTRY (port
);
919 /* Simple ASCII characters can be processed immediately. Also, simple
920 ISO-8859-1 characters can be processed immediately if the encoding for this
921 port is ISO-8859-1. */
922 if (bytes_read
== 1 && ((unsigned char) buffer
[0] <= 127 || pt
->encoding
== NULL
))
925 return SCM_MAKE_CHAR (buffer
[0]);
928 /* Otherwise, convert the buffer into a proper scheme string for
930 charname
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
,
932 charname_len
= scm_i_string_length (charname
);
933 SCM_COL (port
) += charname_len
;
934 cp
= scm_i_string_ref (charname
, 0);
935 if (charname_len
== 1)
936 return SCM_MAKE_CHAR (cp
);
938 /* Ignore dotted circles, which may be used to keep combining characters from
939 combining with the backslash in #\charname. */
940 if (cp
== SCM_CODEPOINT_DOTTED_CIRCLE
&& charname_len
== 2)
941 return SCM_MAKE_CHAR (scm_i_string_ref (charname
, 1));
943 if (cp
>= '0' && cp
< '8')
945 /* Dirk:FIXME:: This type of character syntax is not R5RS
946 * compliant. Further, it should be verified that the constant
947 * does only consist of octal digits. */
948 SCM p
= scm_string_to_number (charname
, scm_from_uint (8));
951 scm_t_wchar c
= scm_to_uint32 (p
);
952 if (SCM_IS_UNICODE_CHAR (c
))
953 return SCM_MAKE_CHAR (c
);
955 scm_i_input_error (FUNC_NAME
, port
,
956 "out-of-range octal character escape: ~a",
957 scm_list_1 (charname
));
961 if (cp
== 'x' && (charname_len
> 1))
965 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
966 p
= scm_string_to_number (scm_c_substring (charname
, 1, charname_len
),
970 scm_t_wchar c
= scm_to_uint32 (p
);
971 if (SCM_IS_UNICODE_CHAR (c
))
972 return SCM_MAKE_CHAR (c
);
974 scm_i_input_error (FUNC_NAME
, port
,
975 "out-of-range hex character escape: ~a",
976 scm_list_1 (charname
));
980 /* The names of characters should never have non-Latin1
982 if (scm_i_is_narrow_string (charname
)
983 || scm_i_try_narrow_string (charname
))
984 { SCM ch
= scm_i_charname_to_char (scm_i_string_chars (charname
),
986 if (scm_is_true (ch
))
990 scm_i_input_error (FUNC_NAME
, port
, "unknown character name ~a",
991 scm_list_1 (charname
));
993 return SCM_UNSPECIFIED
;
998 scm_read_keyword (int chr
, SCM port
)
1002 /* Read the symbol that comprises the keyword. Doing this instead of
1003 invoking a specific symbol reader function allows `scm_read_keyword ()'
1004 to adapt to the delimiters currently valid of symbols.
1006 XXX: This implementation allows sloppy syntaxes like `#: key'. */
1007 symbol
= scm_read_expression (port
);
1008 if (!scm_is_symbol (symbol
))
1009 scm_i_input_error ("scm_read_keyword", port
,
1010 "keyword prefix `~a' not followed by a symbol: ~s",
1011 scm_list_2 (SCM_MAKE_CHAR (chr
), symbol
));
1013 return (scm_symbol_to_keyword (symbol
));
1017 scm_read_vector (int chr
, SCM port
, long line
, int column
)
1019 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1020 guarantee that it's going to do what we want. After all, this is an
1021 implementation detail of `scm_read_vector ()', not a desirable
1023 return maybe_annotate_source (scm_vector (scm_read_sexp (chr
, port
)),
1024 port
, line
, column
);
1028 scm_read_array (int chr
, SCM port
, long line
, int column
)
1030 SCM result
= scm_i_read_array (port
, chr
);
1031 if (scm_is_false (result
))
1034 return maybe_annotate_source (result
, port
, line
, column
);
1038 scm_read_srfi4_vector (int chr
, SCM port
, long line
, int column
)
1040 return scm_read_array (chr
, port
, line
, column
);
1044 scm_read_bytevector (scm_t_wchar chr
, SCM port
, long line
, int column
)
1046 chr
= scm_getc (port
);
1050 chr
= scm_getc (port
);
1054 chr
= scm_getc (port
);
1058 return maybe_annotate_source
1059 (scm_u8_list_to_bytevector (scm_read_sexp (chr
, port
)),
1060 port
, line
, column
);
1063 scm_i_input_error ("read_bytevector", port
,
1064 "invalid bytevector prefix",
1065 SCM_MAKE_CHAR (chr
));
1066 return SCM_UNSPECIFIED
;
1070 scm_read_guile_bit_vector (scm_t_wchar chr
, SCM port
, long line
, int column
)
1072 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1073 terribly inefficient but who cares? */
1074 SCM s_bits
= SCM_EOL
;
1076 for (chr
= scm_getc (port
);
1077 (chr
!= EOF
) && ((chr
== '0') || (chr
== '1'));
1078 chr
= scm_getc (port
))
1080 s_bits
= scm_cons ((chr
== '0') ? SCM_BOOL_F
: SCM_BOOL_T
, s_bits
);
1084 scm_ungetc (chr
, port
);
1086 return maybe_annotate_source
1087 (scm_bitvector (scm_reverse_x (s_bits
, SCM_EOL
)),
1088 port
, line
, column
);
1092 scm_read_scsh_block_comment (scm_t_wchar chr
, SCM port
)
1098 int c
= scm_getc (port
);
1101 scm_i_input_error ("skip_block_comment", port
,
1102 "unterminated `#! ... !#' comment", SCM_EOL
);
1106 else if (c
== '#' && bang_seen
)
1112 return SCM_UNSPECIFIED
;
1116 scm_read_shebang (scm_t_wchar chr
, SCM port
)
1119 if ((c
= scm_get_byte_or_eof (port
)) != 'r')
1121 scm_ungetc (c
, port
);
1122 return scm_read_scsh_block_comment (chr
, port
);
1124 if ((c
= scm_get_byte_or_eof (port
)) != '6')
1126 scm_ungetc (c
, port
);
1127 scm_ungetc ('r', port
);
1128 return scm_read_scsh_block_comment (chr
, port
);
1130 if ((c
= scm_get_byte_or_eof (port
)) != 'r')
1132 scm_ungetc (c
, port
);
1133 scm_ungetc ('6', port
);
1134 scm_ungetc ('r', port
);
1135 return scm_read_scsh_block_comment (chr
, port
);
1137 if ((c
= scm_get_byte_or_eof (port
)) != 's')
1139 scm_ungetc (c
, port
);
1140 scm_ungetc ('r', port
);
1141 scm_ungetc ('6', port
);
1142 scm_ungetc ('r', port
);
1143 return scm_read_scsh_block_comment (chr
, port
);
1146 return SCM_UNSPECIFIED
;
1150 scm_read_r6rs_block_comment (scm_t_wchar chr
, SCM port
)
1152 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1153 nested. So care must be taken. */
1154 int nesting_level
= 1;
1156 int a
= scm_getc (port
);
1159 scm_i_input_error ("scm_read_r6rs_block_comment", port
,
1160 "unterminated `#| ... |#' comment", SCM_EOL
);
1162 while (nesting_level
> 0)
1164 int b
= scm_getc (port
);
1167 scm_i_input_error ("scm_read_r6rs_block_comment", port
,
1168 "unterminated `#| ... |#' comment", SCM_EOL
);
1170 if (a
== '|' && b
== '#')
1175 else if (a
== '#' && b
== '|')
1184 return SCM_UNSPECIFIED
;
1188 scm_read_commented_expression (scm_t_wchar chr
, SCM port
)
1192 c
= flush_ws (port
, (char *) NULL
);
1194 scm_i_input_error ("read_commented_expression", port
,
1195 "no expression after #; comment", SCM_EOL
);
1196 scm_ungetc (c
, port
);
1197 scm_read_expression (port
);
1198 return SCM_UNSPECIFIED
;
1202 scm_read_extended_symbol (scm_t_wchar chr
, SCM port
)
1204 /* Guile's extended symbol read syntax looks like this:
1206 #{This is all a symbol name}#
1208 So here, CHR is expected to be `{'. */
1211 SCM buf
= scm_i_make_string (1024, NULL
, 0);
1213 buf
= scm_i_string_start_writing (buf
);
1215 while ((chr
= scm_getc (port
)) != EOF
)
1226 scm_i_string_set_x (buf
, len
++, '}');
1232 else if (chr
== '\\')
1234 /* It used to be that print.c would print extended-read-syntax
1235 symbols with backslashes before "non-standard" chars, but
1236 this routine wouldn't do anything with those escapes.
1237 Bummer. What we've done is to change print.c to output
1238 R6RS hex escapes for those characters, relying on the fact
1239 that the extended read syntax would never put a `\' before
1240 an `x'. For now, we just ignore other instances of
1241 backslash in the string. */
1242 switch ((chr
= scm_getc (port
)))
1250 SCM_READ_HEX_ESCAPE (10, ';');
1251 scm_i_string_set_x (buf
, len
++, c
);
1259 scm_i_string_stop_writing ();
1260 scm_i_input_error ("scm_read_extended_symbol", port
,
1261 "illegal character in escape sequence: ~S",
1262 scm_list_1 (SCM_MAKE_CHAR (c
)));
1266 scm_i_string_set_x (buf
, len
++, chr
);
1271 scm_i_string_set_x (buf
, len
++, chr
);
1273 if (len
>= scm_i_string_length (buf
) - 2)
1277 scm_i_string_stop_writing ();
1278 addy
= scm_i_make_string (1024, NULL
, 0);
1279 buf
= scm_string_append (scm_list_2 (buf
, addy
));
1281 buf
= scm_i_string_start_writing (buf
);
1286 scm_i_string_stop_writing ();
1288 scm_i_input_error ("scm_read_extended_symbol", port
,
1289 "end of file while reading symbol", SCM_EOL
);
1291 return (scm_string_to_symbol (scm_c_substring (buf
, 0, len
)));
1296 /* Top-level token readers, i.e., dispatchers. */
1299 scm_read_sharp_extension (int chr
, SCM port
)
1303 proc
= scm_get_hash_procedure (chr
);
1304 if (scm_is_true (scm_procedure_p (proc
)))
1306 long line
= SCM_LINUM (port
);
1307 int column
= SCM_COL (port
) - 2;
1310 got
= scm_call_2 (proc
, SCM_MAKE_CHAR (chr
), port
);
1312 if (scm_is_pair (got
) && !scm_i_has_source_properties (got
))
1313 scm_i_set_source_properties_x (got
, line
, column
, SCM_FILENAME (port
));
1318 return SCM_UNSPECIFIED
;
1321 /* The reader for the sharp `#' character. It basically dispatches reads
1322 among the above token readers. */
1324 scm_read_sharp (scm_t_wchar chr
, SCM port
, long line
, int column
)
1325 #define FUNC_NAME "scm_lreadr"
1329 chr
= scm_getc (port
);
1331 result
= scm_read_sharp_extension (chr
, port
);
1332 if (!scm_is_eq (result
, SCM_UNSPECIFIED
))
1338 return (scm_read_character (chr
, port
));
1340 return (scm_read_vector (chr
, port
, line
, column
));
1345 /* This one may return either a boolean or an SRFI-4 vector. */
1346 return (scm_read_srfi4_vector (chr
, port
, line
, column
));
1348 return (scm_read_bytevector (chr
, port
, line
, column
));
1350 return (scm_read_guile_bit_vector (chr
, port
, line
, column
));
1354 return (scm_read_boolean (chr
, port
));
1356 return (scm_read_keyword (chr
, port
));
1357 case '0': case '1': case '2': case '3': case '4':
1358 case '5': case '6': case '7': case '8': case '9':
1360 #if SCM_ENABLE_DEPRECATED
1361 /* See below for 'i' and 'e'. */
1367 return (scm_read_array (chr
, port
, line
, column
));
1371 #if SCM_ENABLE_DEPRECATED
1373 /* When next char is '(', it really is an old-style
1375 scm_t_wchar next_c
= scm_getc (port
);
1377 scm_ungetc (next_c
, port
);
1379 return scm_read_array (chr
, port
, line
, column
);
1393 return (scm_read_number_and_radix (chr
, port
));
1395 return (scm_read_extended_symbol (chr
, port
));
1397 return (scm_read_shebang (chr
, port
));
1399 return (scm_read_commented_expression (chr
, port
));
1403 return (scm_read_syntax (chr
, port
));
1405 return (scm_read_nil (chr
, port
));
1407 result
= scm_read_sharp_extension (chr
, port
);
1408 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1410 /* To remain compatible with 1.8 and earlier, the following
1411 characters have lower precedence than `read-hash-extend'
1416 return scm_read_r6rs_block_comment (chr
, port
);
1418 scm_i_input_error (FUNC_NAME
, port
, "Unknown # object: ~S",
1419 scm_list_1 (SCM_MAKE_CHAR (chr
)));
1426 return SCM_UNSPECIFIED
;
1431 scm_read_expression (SCM port
)
1432 #define FUNC_NAME "scm_read_expression"
1438 chr
= scm_getc (port
);
1442 case SCM_WHITE_SPACES
:
1443 case SCM_LINE_INCREMENTORS
:
1446 (void) scm_read_semicolon_comment (chr
, port
);
1449 if (!SCM_SQUARE_BRACKETS_P
)
1450 return (scm_read_mixed_case_symbol (chr
, port
));
1451 /* otherwise fall through */
1453 return (scm_read_sexp (chr
, port
));
1455 return (scm_read_string (chr
, port
));
1459 return (scm_read_quote (chr
, port
));
1462 long line
= SCM_LINUM (port
);
1463 int column
= SCM_COL (port
) - 1;
1464 SCM result
= scm_read_sharp (chr
, port
, line
, column
);
1465 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1466 /* We read a comment or some such. */
1472 scm_i_input_error (FUNC_NAME
, port
, "unexpected \")\"", SCM_EOL
);
1475 if (SCM_SQUARE_BRACKETS_P
)
1476 scm_i_input_error (FUNC_NAME
, port
, "unexpected \"]\"", SCM_EOL
);
1477 /* otherwise fall through */
1481 if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE
), scm_keyword_prefix
))
1482 return scm_symbol_to_keyword (scm_read_expression (port
));
1487 if (((chr
>= '0') && (chr
<= '9'))
1488 || (strchr ("+-.", chr
)))
1489 return (scm_read_number (chr
, port
));
1491 return (scm_read_mixed_case_symbol (chr
, port
));
1499 /* Actual reader. */
1501 SCM_DEFINE (scm_read
, "read", 0, 1, 0,
1503 "Read an s-expression from the input port @var{port}, or from\n"
1504 "the current input port if @var{port} is not specified.\n"
1505 "Any whitespace before the next token is discarded.")
1506 #define FUNC_NAME s_scm_read
1510 if (SCM_UNBNDP (port
))
1511 port
= scm_current_input_port ();
1512 SCM_VALIDATE_OPINPORT (1, port
);
1514 c
= flush_ws (port
, (char *) NULL
);
1517 scm_ungetc (c
, port
);
1519 return (scm_read_expression (port
));
1526 /* Manipulate the read-hash-procedures alist. This could be written in
1527 Scheme, but maybe it will also be used by C code during initialisation. */
1528 SCM_DEFINE (scm_read_hash_extend
, "read-hash-extend", 2, 0, 0,
1529 (SCM chr
, SCM proc
),
1530 "Install the procedure @var{proc} for reading expressions\n"
1531 "starting with the character sequence @code{#} and @var{chr}.\n"
1532 "@var{proc} will be called with two arguments: the character\n"
1533 "@var{chr} and the port to read further data from. The object\n"
1534 "returned will be the return value of @code{read}. \n"
1535 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1537 #define FUNC_NAME s_scm_read_hash_extend
1542 SCM_VALIDATE_CHAR (1, chr
);
1543 SCM_ASSERT (scm_is_false (proc
)
1544 || scm_is_eq (scm_procedure_p (proc
), SCM_BOOL_T
),
1545 proc
, SCM_ARG2
, FUNC_NAME
);
1547 /* Check if chr is already in the alist. */
1548 this = scm_i_read_hash_procedures_ref ();
1552 if (scm_is_null (this))
1554 /* not found, so add it to the beginning. */
1555 if (scm_is_true (proc
))
1557 SCM
new = scm_cons (scm_cons (chr
, proc
),
1558 scm_i_read_hash_procedures_ref ());
1559 scm_i_read_hash_procedures_set_x (new);
1563 if (scm_is_eq (chr
, SCM_CAAR (this)))
1565 /* already in the alist. */
1566 if (scm_is_false (proc
))
1569 if (scm_is_false (prev
))
1571 SCM rest
= SCM_CDR (scm_i_read_hash_procedures_ref ());
1572 scm_i_read_hash_procedures_set_x (rest
);
1575 scm_set_cdr_x (prev
, SCM_CDR (this));
1580 scm_set_cdr_x (SCM_CAR (this), proc
);
1585 this = SCM_CDR (this);
1588 return SCM_UNSPECIFIED
;
1592 /* Recover the read-hash procedure corresponding to char c. */
1594 scm_get_hash_procedure (int c
)
1596 SCM rest
= scm_i_read_hash_procedures_ref ();
1600 if (scm_is_null (rest
))
1603 if (SCM_CHAR (SCM_CAAR (rest
)) == c
)
1604 return SCM_CDAR (rest
);
1606 rest
= SCM_CDR (rest
);
1610 #define SCM_ENCODING_SEARCH_SIZE (500)
1612 /* Search the first few hundred characters of a file for an Emacs-like coding
1613 declaration. Returns either NULL or a string whose storage has been
1614 allocated with `scm_gc_malloc ()'. */
1616 scm_i_scan_for_encoding (SCM port
)
1619 char header
[SCM_ENCODING_SEARCH_SIZE
+1];
1620 size_t bytes_read
, encoding_length
, i
;
1621 char *encoding
= NULL
;
1623 char *pos
, *encoding_start
;
1626 pt
= SCM_PTAB_ENTRY (port
);
1628 if (pt
->rw_active
== SCM_PORT_WRITE
)
1632 pt
->rw_active
= SCM_PORT_READ
;
1634 if (pt
->read_pos
== pt
->read_end
)
1636 /* We can use the read buffer, and thus avoid a seek. */
1637 if (scm_fill_input (port
) == EOF
)
1640 bytes_read
= pt
->read_end
- pt
->read_pos
;
1641 if (bytes_read
> SCM_ENCODING_SEARCH_SIZE
)
1642 bytes_read
= SCM_ENCODING_SEARCH_SIZE
;
1644 if (bytes_read
<= 1)
1645 /* An unbuffered port -- don't scan. */
1648 memcpy (header
, pt
->read_pos
, bytes_read
);
1649 header
[bytes_read
] = '\0';
1653 /* Try to read some bytes and then seek back. Not all ports
1654 support seeking back; and indeed some file ports (like
1655 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
1656 check performed by SCM_FPORT_FDES---but fail to seek
1657 backwards. Hence this block comes second. We prefer to use
1658 the read buffer in-place. */
1659 if (SCM_FPORTP (port
) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port
)))
1662 bytes_read
= scm_c_read (port
, header
, SCM_ENCODING_SEARCH_SIZE
);
1663 header
[bytes_read
] = '\0';
1664 scm_seek (port
, scm_from_int (0), scm_from_int (SEEK_SET
));
1668 && header
[0] == '\xef' && header
[1] == '\xbb' && header
[2] == '\xbf')
1671 /* search past "coding[:=]" */
1675 if ((pos
= strstr(pos
, "coding")) == NULL
)
1678 pos
+= strlen("coding");
1679 if (pos
- header
>= SCM_ENCODING_SEARCH_SIZE
||
1680 (*pos
== ':' || *pos
== '='))
1688 while (pos
- header
<= SCM_ENCODING_SEARCH_SIZE
&&
1689 (*pos
== ' ' || *pos
== '\t'))
1692 /* grab the next token */
1693 encoding_start
= pos
;
1695 while (encoding_start
+ i
- header
<= SCM_ENCODING_SEARCH_SIZE
1696 && encoding_start
+ i
- header
< bytes_read
1697 && (isalnum ((int) encoding_start
[i
])
1698 || strchr ("_-.:/,+=()", encoding_start
[i
]) != NULL
))
1701 encoding_length
= i
;
1702 if (encoding_length
== 0)
1705 encoding
= scm_gc_strndup (encoding_start
, encoding_length
, "encoding");
1706 for (i
= 0; i
< encoding_length
; i
++)
1707 encoding
[i
] = toupper ((int) encoding
[i
]);
1709 /* push backwards to make sure we were in a comment */
1711 pos
= encoding_start
;
1712 while (pos
>= header
)
1719 else if (*pos
== '\n' || pos
== header
)
1721 /* This wasn't in a semicolon comment. Check for a
1722 hash-bang comment. */
1723 char *beg
= strstr (header
, "#!");
1724 char *end
= strstr (header
, "!#");
1725 if (beg
< encoding_start
&& encoding_start
+ encoding_length
<= end
)
1736 /* This wasn't in a comment */
1739 if (utf8_bom
&& strcmp(encoding
, "UTF-8"))
1740 scm_misc_error (NULL
,
1741 "the port input declares the encoding ~s but is encoded as UTF-8",
1742 scm_list_1 (scm_from_locale_string (encoding
)));
1747 SCM_DEFINE (scm_file_encoding
, "file-encoding", 1, 0, 0,
1749 "Scans the port for an Emacs-like character coding declaration\n"
1750 "near the top of the contents of a port with random-accessible contents.\n"
1751 "The coding declaration is of the form\n"
1752 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1754 "Returns a string containing the character encoding of the file\n"
1755 "if a declaration was found, or @code{#f} otherwise.\n")
1756 #define FUNC_NAME s_scm_file_encoding
1761 SCM_VALIDATE_OPINPORT (SCM_ARG1
, port
);
1763 enc
= scm_i_scan_for_encoding (port
);
1768 s_enc
= scm_from_locale_string (enc
);
1779 SCM read_hash_procs
;
1781 read_hash_procs
= scm_make_fluid_with_default (SCM_EOL
);
1783 scm_i_read_hash_procedures
=
1784 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs
));
1786 scm_init_opts (scm_read_options
, scm_read_opts
);
1787 #include "libguile/read.x"