1 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
2 * 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
34 #include "libguile/_scm.h"
35 #include "libguile/bytevectors.h"
36 #include "libguile/chars.h"
37 #include "libguile/eval.h"
38 #include "libguile/arrays.h"
39 #include "libguile/bitvectors.h"
40 #include "libguile/keywords.h"
41 #include "libguile/alist.h"
42 #include "libguile/srcprop.h"
43 #include "libguile/hashtab.h"
44 #include "libguile/hash.h"
45 #include "libguile/ports.h"
46 #include "libguile/fports.h"
47 #include "libguile/root.h"
48 #include "libguile/strings.h"
49 #include "libguile/strports.h"
50 #include "libguile/vectors.h"
51 #include "libguile/validate.h"
52 #include "libguile/srfi-4.h"
53 #include "libguile/srfi-13.h"
55 #include "libguile/read.h"
56 #include "libguile/private-options.h"
61 SCM_GLOBAL_SYMBOL (scm_sym_dot
, ".");
62 SCM_SYMBOL (scm_keyword_prefix
, "prefix");
63 SCM_SYMBOL (scm_keyword_postfix
, "postfix");
64 SCM_SYMBOL (sym_nil
, "nil");
66 scm_t_option scm_read_opts
[] = {
67 { SCM_OPTION_BOOLEAN
, "copy", 0,
68 "Copy source code expressions." },
69 { SCM_OPTION_BOOLEAN
, "positions", 1,
70 "Record positions of source code expressions." },
71 { SCM_OPTION_BOOLEAN
, "case-insensitive", 0,
72 "Convert symbols to lower case."},
73 { SCM_OPTION_SCM
, "keywords", (scm_t_bits
) SCM_BOOL_F_BITS
,
74 "Style of keyword recognition: #f, 'prefix or 'postfix."},
75 { SCM_OPTION_BOOLEAN
, "r6rs-hex-escapes", 0,
76 "Use R6RS variable-length character and string hex escapes."},
77 { SCM_OPTION_BOOLEAN
, "square-brackets", 1,
78 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
79 { SCM_OPTION_BOOLEAN
, "hungry-eol-escapes", 0,
80 "In strings, consume leading whitespace after an escaped end-of-line."},
85 Give meaningful error messages for errors
89 FILE:LINE:COL: MESSAGE
92 This is not standard GNU format, but the test-suite likes the real
93 message to be in front.
99 scm_i_input_error (char const *function
,
100 SCM port
, const char *message
, SCM arg
)
102 SCM fn
= (scm_is_string (SCM_FILENAME(port
))
104 : scm_from_locale_string ("#<unknown port>"));
106 SCM string_port
= scm_open_output_string ();
107 SCM string
= SCM_EOL
;
108 scm_simple_format (string_port
,
109 scm_from_locale_string ("~A:~S:~S: ~A"),
111 scm_from_long (SCM_LINUM (port
) + 1),
112 scm_from_int (SCM_COL (port
) + 1),
113 scm_from_locale_string (message
)));
115 string
= scm_get_output_string (string_port
);
116 scm_close_output_port (string_port
);
117 scm_error_scm (scm_from_latin1_symbol ("read-error"),
118 function
? scm_from_locale_string (function
) : SCM_BOOL_F
,
125 SCM_DEFINE (scm_read_options
, "read-options-interface", 0, 1, 0,
127 "Option interface for the read options. Instead of using\n"
128 "this procedure directly, use the procedures @code{read-enable},\n"
129 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
130 #define FUNC_NAME s_scm_read_options
132 SCM ans
= scm_options (setting
,
135 if (SCM_COPY_SOURCE_P
)
136 SCM_RECORD_POSITIONS_P
= 1;
141 /* A fluid referring to an association list mapping extra hash
142 characters to procedures. */
143 static SCM
*scm_i_read_hash_procedures
;
146 scm_i_read_hash_procedures_ref (void)
148 return scm_fluid_ref (*scm_i_read_hash_procedures
);
152 scm_i_read_hash_procedures_set_x (SCM value
)
154 scm_fluid_set_x (*scm_i_read_hash_procedures
, value
);
161 /* Size of the C buffer used to read symbols and numbers. */
162 #define READER_BUFFER_SIZE 128
164 /* Size of the C buffer used to read strings. */
165 #define READER_STRING_BUFFER_SIZE 512
167 /* The maximum size of Scheme character names. */
168 #define READER_CHAR_NAME_MAX_SIZE 50
171 /* `isblank' is only in C99. */
172 #define CHAR_IS_BLANK_(_chr) \
173 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
174 || ((_chr) == '\f') || ((_chr) == '\r'))
177 # define CHAR_IS_BLANK(_chr) \
178 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
180 # define CHAR_IS_BLANK CHAR_IS_BLANK_
184 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
186 #define CHAR_IS_R5RS_DELIMITER(c) \
188 || (c == ')') || (c == '(') || (c == ';') || (c == '"') \
189 || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
191 #define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
193 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
195 #define CHAR_IS_EXPONENT_MARKER(_chr) \
196 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
197 || ((_chr) == 'd') || ((_chr) == 'l'))
199 /* Read an SCSH block comment. */
200 static SCM
scm_read_scsh_block_comment (scm_t_wchar
, SCM
);
201 static SCM
scm_read_r6rs_block_comment (scm_t_wchar
, SCM
);
202 static SCM
scm_read_commented_expression (scm_t_wchar
, SCM
);
203 static SCM
scm_read_shebang (scm_t_wchar
, SCM
);
204 static SCM
scm_get_hash_procedure (int);
206 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
207 result in the pre-allocated buffer BUF. Return zero if the whole token has
208 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
209 bytes actually read. */
211 read_token (SCM port
, char *buf
, const size_t buf_size
, size_t *read
)
215 while (*read
< buf_size
)
219 chr
= scm_get_byte_or_eof (port
);
223 else if (CHAR_IS_DELIMITER (chr
))
225 scm_unget_byte (chr
, port
);
238 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
239 result in the pre-allocated buffer BUFFER, if the whole token has fewer than
240 BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
241 caller. Return zero if the token fits in BUFFER, non-zero otherwise. READ
242 will be set the number of bytes actually read. */
244 read_complete_token (SCM port
, char *buffer
, const size_t buffer_size
,
245 char **overflow_buffer
, size_t *read
)
248 size_t bytes_read
, overflow_size
;
250 *overflow_buffer
= NULL
;
255 overflow
= read_token (port
, buffer
, buffer_size
, &bytes_read
);
258 if (overflow
|| overflow_size
!= 0)
260 if (overflow_size
== 0)
262 *overflow_buffer
= scm_malloc (bytes_read
);
263 memcpy (*overflow_buffer
, buffer
, bytes_read
);
264 overflow_size
= bytes_read
;
268 *overflow_buffer
= scm_realloc (*overflow_buffer
, overflow_size
+ bytes_read
);
269 memcpy (*overflow_buffer
+ overflow_size
, buffer
, bytes_read
);
270 overflow_size
+= bytes_read
;
277 *read
= overflow_size
;
281 return (overflow_size
!= 0);
284 /* Skip whitespace from PORT and return the first non-whitespace character
285 read. Raise an error on end-of-file. */
287 flush_ws (SCM port
, const char *eoferr
)
291 switch (c
= scm_getc (port
))
297 scm_i_input_error (eoferr
,
306 switch (c
= scm_getc (port
))
312 case SCM_LINE_INCREMENTORS
:
318 switch (c
= scm_getc (port
))
321 eoferr
= "read_sharp";
324 scm_read_shebang (c
, port
);
327 scm_read_commented_expression (c
, port
);
330 if (scm_is_false (scm_get_hash_procedure (c
)))
332 scm_read_r6rs_block_comment (c
, port
);
337 scm_ungetc (c
, port
);
342 case SCM_LINE_INCREMENTORS
:
343 case SCM_SINGLE_SPACES
:
358 static SCM
scm_read_expression (SCM port
);
359 static SCM
scm_read_sharp (int chr
, SCM port
, long line
, int column
);
363 maybe_annotate_source (SCM x
, SCM port
, long line
, int column
)
365 if (SCM_RECORD_POSITIONS_P
)
366 scm_i_set_source_properties_x (x
, line
, column
, SCM_FILENAME (port
));
371 scm_read_sexp (scm_t_wchar chr
, SCM port
)
372 #define FUNC_NAME "scm_i_lreadparen"
375 SCM tmp
, tl
, ans
= SCM_EOL
;
376 const int terminating_char
= ((chr
== '[') ? ']' : ')');
378 /* Need to capture line and column numbers here. */
379 long line
= SCM_LINUM (port
);
380 int column
= SCM_COL (port
) - 1;
382 c
= flush_ws (port
, FUNC_NAME
);
383 if (terminating_char
== c
)
386 scm_ungetc (c
, port
);
387 tmp
= scm_read_expression (port
);
389 /* Note that it is possible for scm_read_expression to return
390 scm_sym_dot, but not as part of a dotted pair: as in #{.}#. So
391 check that it's a real dot by checking `c'. */
392 if (c
== '.' && scm_is_eq (scm_sym_dot
, tmp
))
394 ans
= scm_read_expression (port
);
395 if (terminating_char
!= (c
= flush_ws (port
, FUNC_NAME
)))
396 scm_i_input_error (FUNC_NAME
, port
, "missing close paren",
401 /* Build the head of the list structure. */
402 ans
= tl
= scm_cons (tmp
, SCM_EOL
);
404 while (terminating_char
!= (c
= flush_ws (port
, FUNC_NAME
)))
408 if (c
== ')' || (SCM_SQUARE_BRACKETS_P
&& c
== ']'))
409 scm_i_input_error (FUNC_NAME
, port
,
410 "in pair: mismatched close paren: ~A",
411 scm_list_1 (SCM_MAKE_CHAR (c
)));
413 scm_ungetc (c
, port
);
414 tmp
= scm_read_expression (port
);
416 /* See above note about scm_sym_dot. */
417 if (c
== '.' && scm_is_eq (scm_sym_dot
, tmp
))
419 SCM_SETCDR (tl
, scm_read_expression (port
));
421 c
= flush_ws (port
, FUNC_NAME
);
422 if (terminating_char
!= c
)
423 scm_i_input_error (FUNC_NAME
, port
,
424 "in pair: missing close paren", SCM_EOL
);
428 new_tail
= scm_cons (tmp
, SCM_EOL
);
429 SCM_SETCDR (tl
, new_tail
);
434 return maybe_annotate_source (ans
, port
, line
, column
);
439 /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
440 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
442 #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
448 while (i < ndigits) \
450 a = scm_getc (port); \
454 && (a == (scm_t_wchar) terminator) \
457 if ('0' <= a && a <= '9') \
459 else if ('A' <= a && a <= 'F') \
461 else if ('a' <= a && a <= 'f') \
474 skip_intraline_whitespace (SCM port
)
484 while (c
== '\t' || uc_is_general_category (c
, UC_SPACE_SEPARATOR
));
486 scm_ungetc (c
, port
);
490 scm_read_string (int chr
, SCM port
)
491 #define FUNC_NAME "scm_lreadr"
493 /* For strings smaller than C_STR, this function creates only one Scheme
494 object (the string returned). */
496 SCM str
= SCM_BOOL_F
;
497 unsigned c_str_len
= 0;
500 /* Need to capture line and column numbers here. */
501 long line
= SCM_LINUM (port
);
502 int column
= SCM_COL (port
) - 1;
504 str
= scm_i_make_string (READER_STRING_BUFFER_SIZE
, NULL
, 0);
505 while ('"' != (c
= scm_getc (port
)))
510 scm_i_input_error (FUNC_NAME
, port
,
511 "end of file in string constant", SCM_EOL
);
514 if (c_str_len
+ 1 >= scm_i_string_length (str
))
516 SCM addy
= scm_i_make_string (READER_STRING_BUFFER_SIZE
, NULL
, 0);
518 str
= scm_string_append (scm_list_2 (str
, addy
));
523 switch (c
= scm_getc (port
))
531 if (SCM_HUNGRY_EOL_ESCAPES_P
)
532 skip_intraline_whitespace (port
);
559 if (SCM_R6RS_ESCAPES_P
)
560 SCM_READ_HEX_ESCAPE (10, ';');
562 SCM_READ_HEX_ESCAPE (2, '\0');
565 if (!SCM_R6RS_ESCAPES_P
)
567 SCM_READ_HEX_ESCAPE (4, '\0');
571 if (!SCM_R6RS_ESCAPES_P
)
573 SCM_READ_HEX_ESCAPE (6, '\0');
578 scm_i_input_error (FUNC_NAME
, port
,
579 "illegal character in escape sequence: ~S",
580 scm_list_1 (SCM_MAKE_CHAR (c
)));
583 str
= scm_i_string_start_writing (str
);
584 scm_i_string_set_x (str
, c_str_len
++, c
);
585 scm_i_string_stop_writing ();
587 return maybe_annotate_source (scm_i_substring_copy (str
, 0, c_str_len
),
594 scm_read_number (scm_t_wchar chr
, SCM port
)
596 SCM result
, str
= SCM_EOL
;
597 char buffer
[READER_BUFFER_SIZE
];
598 char *overflow_buffer
= NULL
;
601 scm_t_port
*pt
= SCM_PTAB_ENTRY (port
);
603 scm_ungetc (chr
, port
);
604 overflow
= read_complete_token (port
, buffer
, sizeof (buffer
),
605 &overflow_buffer
, &bytes_read
);
608 str
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
, pt
->ilseq_handler
);
610 str
= scm_from_stringn (overflow_buffer
, bytes_read
, pt
->encoding
,
613 result
= scm_string_to_number (str
, SCM_UNDEFINED
);
614 if (!scm_is_true (result
))
616 /* Return a symbol instead of a number */
617 if (SCM_CASE_INSENSITIVE_P
)
618 str
= scm_string_downcase_x (str
);
619 result
= scm_string_to_symbol (str
);
623 free (overflow_buffer
);
624 SCM_COL (port
) += scm_i_string_length (str
);
629 scm_read_mixed_case_symbol (scm_t_wchar chr
, SCM port
)
632 int ends_with_colon
= 0;
634 int postfix
= scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE
), scm_keyword_postfix
);
636 char buffer
[READER_BUFFER_SIZE
], *overflow_buffer
;
637 scm_t_port
*pt
= SCM_PTAB_ENTRY (port
);
640 scm_ungetc (chr
, port
);
641 overflow
= read_complete_token (port
, buffer
, READER_BUFFER_SIZE
,
642 &overflow_buffer
, &bytes_read
);
646 ends_with_colon
= buffer
[bytes_read
- 1] == ':';
648 ends_with_colon
= overflow_buffer
[bytes_read
- 1] == ':';
651 if (postfix
&& ends_with_colon
&& (bytes_read
> 1))
654 str
= scm_from_stringn (buffer
, bytes_read
- 1, pt
->encoding
, pt
->ilseq_handler
);
656 str
= scm_from_stringn (overflow_buffer
, bytes_read
- 1, pt
->encoding
,
659 if (SCM_CASE_INSENSITIVE_P
)
660 str
= scm_string_downcase_x (str
);
661 result
= scm_symbol_to_keyword (scm_string_to_symbol (str
));
666 str
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
, pt
->ilseq_handler
);
668 str
= scm_from_stringn (overflow_buffer
, bytes_read
, pt
->encoding
,
671 if (SCM_CASE_INSENSITIVE_P
)
672 str
= scm_string_downcase_x (str
);
673 result
= scm_string_to_symbol (str
);
677 free (overflow_buffer
);
678 SCM_COL (port
) += scm_i_string_length (str
);
683 scm_read_number_and_radix (scm_t_wchar chr
, SCM port
)
684 #define FUNC_NAME "scm_lreadr"
688 char buffer
[READER_BUFFER_SIZE
], *overflow_buffer
;
717 scm_ungetc (chr
, port
);
718 scm_ungetc ('#', port
);
722 overflow
= read_complete_token (port
, buffer
, sizeof (buffer
),
723 &overflow_buffer
, &read
);
725 pt
= SCM_PTAB_ENTRY (port
);
727 str
= scm_from_stringn (buffer
, read
, pt
->encoding
, pt
->ilseq_handler
);
729 str
= scm_from_stringn (overflow_buffer
, read
, pt
->encoding
,
732 result
= scm_string_to_number (str
, scm_from_uint (radix
));
735 free (overflow_buffer
);
737 SCM_COL (port
) += scm_i_string_length (str
);
739 if (scm_is_true (result
))
742 scm_i_input_error (FUNC_NAME
, port
, "unknown # object", SCM_EOL
);
749 scm_read_quote (int chr
, SCM port
)
752 long line
= SCM_LINUM (port
);
753 int column
= SCM_COL (port
) - 1;
758 p
= scm_sym_quasiquote
;
771 p
= scm_sym_uq_splicing
;
774 scm_ungetc (c
, port
);
781 fprintf (stderr
, "%s: unhandled quote character (%i)\n",
782 "scm_read_quote", chr
);
786 p
= scm_cons2 (p
, scm_read_expression (port
), SCM_EOL
);
787 return maybe_annotate_source (p
, port
, line
, column
);
790 SCM_SYMBOL (sym_syntax
, "syntax");
791 SCM_SYMBOL (sym_quasisyntax
, "quasisyntax");
792 SCM_SYMBOL (sym_unsyntax
, "unsyntax");
793 SCM_SYMBOL (sym_unsyntax_splicing
, "unsyntax-splicing");
796 scm_read_syntax (int chr
, SCM port
)
799 long line
= SCM_LINUM (port
);
800 int column
= SCM_COL (port
) - 1;
818 p
= sym_unsyntax_splicing
;
821 scm_ungetc (c
, port
);
828 fprintf (stderr
, "%s: unhandled syntax character (%i)\n",
829 "scm_read_syntax", chr
);
833 p
= scm_cons2 (p
, scm_read_expression (port
), SCM_EOL
);
834 return maybe_annotate_source (p
, port
, line
, column
);
838 scm_read_nil (int chr
, SCM port
)
840 SCM id
= scm_read_mixed_case_symbol (chr
, port
);
842 if (!scm_is_eq (id
, sym_nil
))
843 scm_i_input_error ("scm_read_nil", port
,
844 "unexpected input while reading #nil: ~a",
847 return SCM_ELISP_NIL
;
851 scm_read_semicolon_comment (int chr
, SCM port
)
855 /* We use the get_byte here because there is no need to get the
856 locale correct with comment input. This presumes that newline
857 always represents itself no matter what the encoding is. */
858 for (c
= scm_get_byte_or_eof (port
);
859 (c
!= EOF
) && (c
!= '\n');
860 c
= scm_get_byte_or_eof (port
));
862 return SCM_UNSPECIFIED
;
866 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
869 scm_read_boolean (int chr
, SCM port
)
882 return SCM_UNSPECIFIED
;
886 scm_read_character (scm_t_wchar chr
, SCM port
)
887 #define FUNC_NAME "scm_lreadr"
889 char buffer
[READER_CHAR_NAME_MAX_SIZE
];
891 size_t charname_len
, bytes_read
;
896 overflow
= read_token (port
, buffer
, READER_CHAR_NAME_MAX_SIZE
, &bytes_read
);
898 scm_i_input_error (FUNC_NAME
, port
, "character name too long", SCM_EOL
);
902 chr
= scm_getc (port
);
904 scm_i_input_error (FUNC_NAME
, port
, "unexpected end of file "
905 "while reading character", SCM_EOL
);
907 /* CHR must be a token delimiter, like a whitespace. */
908 return (SCM_MAKE_CHAR (chr
));
911 pt
= SCM_PTAB_ENTRY (port
);
913 /* Simple ASCII characters can be processed immediately. Also, simple
914 ISO-8859-1 characters can be processed immediately if the encoding for this
915 port is ISO-8859-1. */
916 if (bytes_read
== 1 && ((unsigned char) buffer
[0] <= 127 || pt
->encoding
== NULL
))
919 return SCM_MAKE_CHAR (buffer
[0]);
922 /* Otherwise, convert the buffer into a proper scheme string for
924 charname
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
,
926 charname_len
= scm_i_string_length (charname
);
927 SCM_COL (port
) += charname_len
;
928 cp
= scm_i_string_ref (charname
, 0);
929 if (charname_len
== 1)
930 return SCM_MAKE_CHAR (cp
);
932 /* Ignore dotted circles, which may be used to keep combining characters from
933 combining with the backslash in #\charname. */
934 if (cp
== SCM_CODEPOINT_DOTTED_CIRCLE
&& charname_len
== 2)
935 return SCM_MAKE_CHAR (scm_i_string_ref (charname
, 1));
937 if (cp
>= '0' && cp
< '8')
939 /* Dirk:FIXME:: This type of character syntax is not R5RS
940 * compliant. Further, it should be verified that the constant
941 * does only consist of octal digits. */
942 SCM p
= scm_string_to_number (charname
, scm_from_uint (8));
945 scm_t_wchar c
= scm_to_uint32 (p
);
946 if (SCM_IS_UNICODE_CHAR (c
))
947 return SCM_MAKE_CHAR (c
);
949 scm_i_input_error (FUNC_NAME
, port
,
950 "out-of-range octal character escape: ~a",
951 scm_list_1 (charname
));
955 if (cp
== 'x' && (charname_len
> 1))
959 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
960 p
= scm_string_to_number (scm_c_substring (charname
, 1, charname_len
),
964 scm_t_wchar c
= scm_to_uint32 (p
);
965 if (SCM_IS_UNICODE_CHAR (c
))
966 return SCM_MAKE_CHAR (c
);
968 scm_i_input_error (FUNC_NAME
, port
,
969 "out-of-range hex character escape: ~a",
970 scm_list_1 (charname
));
974 /* The names of characters should never have non-Latin1
976 if (scm_i_is_narrow_string (charname
)
977 || scm_i_try_narrow_string (charname
))
978 { SCM ch
= scm_i_charname_to_char (scm_i_string_chars (charname
),
980 if (scm_is_true (ch
))
984 scm_i_input_error (FUNC_NAME
, port
, "unknown character name ~a",
985 scm_list_1 (charname
));
987 return SCM_UNSPECIFIED
;
992 scm_read_keyword (int chr
, SCM port
)
996 /* Read the symbol that comprises the keyword. Doing this instead of
997 invoking a specific symbol reader function allows `scm_read_keyword ()'
998 to adapt to the delimiters currently valid of symbols.
1000 XXX: This implementation allows sloppy syntaxes like `#: key'. */
1001 symbol
= scm_read_expression (port
);
1002 if (!scm_is_symbol (symbol
))
1003 scm_i_input_error ("scm_read_keyword", port
,
1004 "keyword prefix `~a' not followed by a symbol: ~s",
1005 scm_list_2 (SCM_MAKE_CHAR (chr
), symbol
));
1007 return (scm_symbol_to_keyword (symbol
));
1011 scm_read_vector (int chr
, SCM port
, long line
, int column
)
1013 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1014 guarantee that it's going to do what we want. After all, this is an
1015 implementation detail of `scm_read_vector ()', not a desirable
1017 return maybe_annotate_source (scm_vector (scm_read_sexp (chr
, port
)),
1018 port
, line
, column
);
1022 scm_read_array (int chr
, SCM port
, long line
, int column
)
1024 SCM result
= scm_i_read_array (port
, chr
);
1025 if (scm_is_false (result
))
1028 return maybe_annotate_source (result
, port
, line
, column
);
1032 scm_read_srfi4_vector (int chr
, SCM port
, long line
, int column
)
1034 return scm_read_array (chr
, port
, line
, column
);
1038 scm_read_bytevector (scm_t_wchar chr
, SCM port
, long line
, int column
)
1040 chr
= scm_getc (port
);
1044 chr
= scm_getc (port
);
1048 chr
= scm_getc (port
);
1052 return maybe_annotate_source
1053 (scm_u8_list_to_bytevector (scm_read_sexp (chr
, port
)),
1054 port
, line
, column
);
1057 scm_i_input_error ("read_bytevector", port
,
1058 "invalid bytevector prefix",
1059 SCM_MAKE_CHAR (chr
));
1060 return SCM_UNSPECIFIED
;
1064 scm_read_guile_bit_vector (scm_t_wchar chr
, SCM port
, long line
, int column
)
1066 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1067 terribly inefficient but who cares? */
1068 SCM s_bits
= SCM_EOL
;
1070 for (chr
= scm_getc (port
);
1071 (chr
!= EOF
) && ((chr
== '0') || (chr
== '1'));
1072 chr
= scm_getc (port
))
1074 s_bits
= scm_cons ((chr
== '0') ? SCM_BOOL_F
: SCM_BOOL_T
, s_bits
);
1078 scm_ungetc (chr
, port
);
1080 return maybe_annotate_source
1081 (scm_bitvector (scm_reverse_x (s_bits
, SCM_EOL
)),
1082 port
, line
, column
);
1086 scm_read_scsh_block_comment (scm_t_wchar chr
, SCM port
)
1092 int c
= scm_getc (port
);
1095 scm_i_input_error ("skip_block_comment", port
,
1096 "unterminated `#! ... !#' comment", SCM_EOL
);
1100 else if (c
== '#' && bang_seen
)
1106 return SCM_UNSPECIFIED
;
1110 scm_read_shebang (scm_t_wchar chr
, SCM port
)
1113 if ((c
= scm_get_byte_or_eof (port
)) != 'r')
1115 scm_ungetc (c
, port
);
1116 return scm_read_scsh_block_comment (chr
, port
);
1118 if ((c
= scm_get_byte_or_eof (port
)) != '6')
1120 scm_ungetc (c
, port
);
1121 scm_ungetc ('r', port
);
1122 return scm_read_scsh_block_comment (chr
, port
);
1124 if ((c
= scm_get_byte_or_eof (port
)) != 'r')
1126 scm_ungetc (c
, port
);
1127 scm_ungetc ('6', port
);
1128 scm_ungetc ('r', port
);
1129 return scm_read_scsh_block_comment (chr
, port
);
1131 if ((c
= scm_get_byte_or_eof (port
)) != 's')
1133 scm_ungetc (c
, port
);
1134 scm_ungetc ('r', port
);
1135 scm_ungetc ('6', port
);
1136 scm_ungetc ('r', port
);
1137 return scm_read_scsh_block_comment (chr
, port
);
1140 return SCM_UNSPECIFIED
;
1144 scm_read_r6rs_block_comment (scm_t_wchar chr
, SCM port
)
1146 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1147 nested. So care must be taken. */
1148 int nesting_level
= 1;
1150 int a
= scm_getc (port
);
1153 scm_i_input_error ("scm_read_r6rs_block_comment", port
,
1154 "unterminated `#| ... |#' comment", SCM_EOL
);
1156 while (nesting_level
> 0)
1158 int b
= scm_getc (port
);
1161 scm_i_input_error ("scm_read_r6rs_block_comment", port
,
1162 "unterminated `#| ... |#' comment", SCM_EOL
);
1164 if (a
== '|' && b
== '#')
1169 else if (a
== '#' && b
== '|')
1178 return SCM_UNSPECIFIED
;
1182 scm_read_commented_expression (scm_t_wchar chr
, SCM port
)
1186 c
= flush_ws (port
, (char *) NULL
);
1188 scm_i_input_error ("read_commented_expression", port
,
1189 "no expression after #; comment", SCM_EOL
);
1190 scm_ungetc (c
, port
);
1191 scm_read_expression (port
);
1192 return SCM_UNSPECIFIED
;
1196 scm_read_extended_symbol (scm_t_wchar chr
, SCM port
)
1198 /* Guile's extended symbol read syntax looks like this:
1200 #{This is all a symbol name}#
1202 So here, CHR is expected to be `{'. */
1205 SCM buf
= scm_i_make_string (1024, NULL
, 0);
1207 buf
= scm_i_string_start_writing (buf
);
1209 while ((chr
= scm_getc (port
)) != EOF
)
1220 scm_i_string_set_x (buf
, len
++, '}');
1226 else if (chr
== '\\')
1228 /* It used to be that print.c would print extended-read-syntax
1229 symbols with backslashes before "non-standard" chars, but
1230 this routine wouldn't do anything with those escapes.
1231 Bummer. What we've done is to change print.c to output
1232 R6RS hex escapes for those characters, relying on the fact
1233 that the extended read syntax would never put a `\' before
1234 an `x'. For now, we just ignore other instances of
1235 backslash in the string. */
1236 switch ((chr
= scm_getc (port
)))
1244 SCM_READ_HEX_ESCAPE (10, ';');
1245 scm_i_string_set_x (buf
, len
++, c
);
1253 scm_i_string_stop_writing ();
1254 scm_i_input_error ("scm_read_extended_symbol", port
,
1255 "illegal character in escape sequence: ~S",
1256 scm_list_1 (SCM_MAKE_CHAR (c
)));
1260 scm_i_string_set_x (buf
, len
++, chr
);
1265 scm_i_string_set_x (buf
, len
++, chr
);
1267 if (len
>= scm_i_string_length (buf
) - 2)
1271 scm_i_string_stop_writing ();
1272 addy
= scm_i_make_string (1024, NULL
, 0);
1273 buf
= scm_string_append (scm_list_2 (buf
, addy
));
1275 buf
= scm_i_string_start_writing (buf
);
1280 scm_i_string_stop_writing ();
1282 scm_i_input_error ("scm_read_extended_symbol", port
,
1283 "end of file while reading symbol", SCM_EOL
);
1285 return (scm_string_to_symbol (scm_c_substring (buf
, 0, len
)));
1290 /* Top-level token readers, i.e., dispatchers. */
1293 scm_read_sharp_extension (int chr
, SCM port
)
1297 proc
= scm_get_hash_procedure (chr
);
1298 if (scm_is_true (scm_procedure_p (proc
)))
1300 long line
= SCM_LINUM (port
);
1301 int column
= SCM_COL (port
) - 2;
1304 got
= scm_call_2 (proc
, SCM_MAKE_CHAR (chr
), port
);
1306 if (scm_is_pair (got
) && !scm_i_has_source_properties (got
))
1307 scm_i_set_source_properties_x (got
, line
, column
, SCM_FILENAME (port
));
1312 return SCM_UNSPECIFIED
;
1315 /* The reader for the sharp `#' character. It basically dispatches reads
1316 among the above token readers. */
1318 scm_read_sharp (scm_t_wchar chr
, SCM port
, long line
, int column
)
1319 #define FUNC_NAME "scm_lreadr"
1323 chr
= scm_getc (port
);
1325 result
= scm_read_sharp_extension (chr
, port
);
1326 if (!scm_is_eq (result
, SCM_UNSPECIFIED
))
1332 return (scm_read_character (chr
, port
));
1334 return (scm_read_vector (chr
, port
, line
, column
));
1339 /* This one may return either a boolean or an SRFI-4 vector. */
1340 return (scm_read_srfi4_vector (chr
, port
, line
, column
));
1342 return (scm_read_bytevector (chr
, port
, line
, column
));
1344 return (scm_read_guile_bit_vector (chr
, port
, line
, column
));
1348 return (scm_read_boolean (chr
, port
));
1350 return (scm_read_keyword (chr
, port
));
1351 case '0': case '1': case '2': case '3': case '4':
1352 case '5': case '6': case '7': case '8': case '9':
1354 #if SCM_ENABLE_DEPRECATED
1355 /* See below for 'i' and 'e'. */
1361 return (scm_read_array (chr
, port
, line
, column
));
1365 #if SCM_ENABLE_DEPRECATED
1367 /* When next char is '(', it really is an old-style
1369 scm_t_wchar next_c
= scm_getc (port
);
1371 scm_ungetc (next_c
, port
);
1373 return scm_read_array (chr
, port
, line
, column
);
1387 return (scm_read_number_and_radix (chr
, port
));
1389 return (scm_read_extended_symbol (chr
, port
));
1391 return (scm_read_shebang (chr
, port
));
1393 return (scm_read_commented_expression (chr
, port
));
1397 return (scm_read_syntax (chr
, port
));
1399 return (scm_read_nil (chr
, port
));
1401 result
= scm_read_sharp_extension (chr
, port
);
1402 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1404 /* To remain compatible with 1.8 and earlier, the following
1405 characters have lower precedence than `read-hash-extend'
1410 return scm_read_r6rs_block_comment (chr
, port
);
1412 scm_i_input_error (FUNC_NAME
, port
, "Unknown # object: ~S",
1413 scm_list_1 (SCM_MAKE_CHAR (chr
)));
1420 return SCM_UNSPECIFIED
;
1425 scm_read_expression (SCM port
)
1426 #define FUNC_NAME "scm_read_expression"
1432 chr
= scm_getc (port
);
1436 case SCM_WHITE_SPACES
:
1437 case SCM_LINE_INCREMENTORS
:
1440 (void) scm_read_semicolon_comment (chr
, port
);
1443 if (!SCM_SQUARE_BRACKETS_P
)
1444 return (scm_read_mixed_case_symbol (chr
, port
));
1445 /* otherwise fall through */
1447 return (scm_read_sexp (chr
, port
));
1449 return (scm_read_string (chr
, port
));
1453 return (scm_read_quote (chr
, port
));
1456 long line
= SCM_LINUM (port
);
1457 int column
= SCM_COL (port
) - 1;
1458 SCM result
= scm_read_sharp (chr
, port
, line
, column
);
1459 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1460 /* We read a comment or some such. */
1466 scm_i_input_error (FUNC_NAME
, port
, "unexpected \")\"", SCM_EOL
);
1469 if (SCM_SQUARE_BRACKETS_P
)
1470 scm_i_input_error (FUNC_NAME
, port
, "unexpected \"]\"", SCM_EOL
);
1471 /* otherwise fall through */
1475 if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE
), scm_keyword_prefix
))
1476 return scm_symbol_to_keyword (scm_read_expression (port
));
1481 if (((chr
>= '0') && (chr
<= '9'))
1482 || (strchr ("+-.", chr
)))
1483 return (scm_read_number (chr
, port
));
1485 return (scm_read_mixed_case_symbol (chr
, port
));
1493 /* Actual reader. */
1495 SCM_DEFINE (scm_read
, "read", 0, 1, 0,
1497 "Read an s-expression from the input port @var{port}, or from\n"
1498 "the current input port if @var{port} is not specified.\n"
1499 "Any whitespace before the next token is discarded.")
1500 #define FUNC_NAME s_scm_read
1504 if (SCM_UNBNDP (port
))
1505 port
= scm_current_input_port ();
1506 SCM_VALIDATE_OPINPORT (1, port
);
1508 c
= flush_ws (port
, (char *) NULL
);
1511 scm_ungetc (c
, port
);
1513 return (scm_read_expression (port
));
1520 /* Manipulate the read-hash-procedures alist. This could be written in
1521 Scheme, but maybe it will also be used by C code during initialisation. */
1522 SCM_DEFINE (scm_read_hash_extend
, "read-hash-extend", 2, 0, 0,
1523 (SCM chr
, SCM proc
),
1524 "Install the procedure @var{proc} for reading expressions\n"
1525 "starting with the character sequence @code{#} and @var{chr}.\n"
1526 "@var{proc} will be called with two arguments: the character\n"
1527 "@var{chr} and the port to read further data from. The object\n"
1528 "returned will be the return value of @code{read}. \n"
1529 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1531 #define FUNC_NAME s_scm_read_hash_extend
1536 SCM_VALIDATE_CHAR (1, chr
);
1537 SCM_ASSERT (scm_is_false (proc
)
1538 || scm_is_eq (scm_procedure_p (proc
), SCM_BOOL_T
),
1539 proc
, SCM_ARG2
, FUNC_NAME
);
1541 /* Check if chr is already in the alist. */
1542 this = scm_i_read_hash_procedures_ref ();
1546 if (scm_is_null (this))
1548 /* not found, so add it to the beginning. */
1549 if (scm_is_true (proc
))
1551 SCM
new = scm_cons (scm_cons (chr
, proc
),
1552 scm_i_read_hash_procedures_ref ());
1553 scm_i_read_hash_procedures_set_x (new);
1557 if (scm_is_eq (chr
, SCM_CAAR (this)))
1559 /* already in the alist. */
1560 if (scm_is_false (proc
))
1563 if (scm_is_false (prev
))
1565 SCM rest
= SCM_CDR (scm_i_read_hash_procedures_ref ());
1566 scm_i_read_hash_procedures_set_x (rest
);
1569 scm_set_cdr_x (prev
, SCM_CDR (this));
1574 scm_set_cdr_x (SCM_CAR (this), proc
);
1579 this = SCM_CDR (this);
1582 return SCM_UNSPECIFIED
;
1586 /* Recover the read-hash procedure corresponding to char c. */
1588 scm_get_hash_procedure (int c
)
1590 SCM rest
= scm_i_read_hash_procedures_ref ();
1594 if (scm_is_null (rest
))
1597 if (SCM_CHAR (SCM_CAAR (rest
)) == c
)
1598 return SCM_CDAR (rest
);
1600 rest
= SCM_CDR (rest
);
1604 #define SCM_ENCODING_SEARCH_SIZE (500)
1606 /* Search the first few hundred characters of a file for an Emacs-like coding
1607 declaration. Returns either NULL or a string whose storage has been
1608 allocated with `scm_gc_malloc ()'. */
1610 scm_i_scan_for_encoding (SCM port
)
1613 char header
[SCM_ENCODING_SEARCH_SIZE
+1];
1614 size_t bytes_read
, encoding_length
, i
;
1615 char *encoding
= NULL
;
1617 char *pos
, *encoding_start
;
1620 pt
= SCM_PTAB_ENTRY (port
);
1622 if (pt
->rw_active
== SCM_PORT_WRITE
)
1626 pt
->rw_active
= SCM_PORT_READ
;
1628 if (pt
->read_pos
== pt
->read_end
)
1630 /* We can use the read buffer, and thus avoid a seek. */
1631 if (scm_fill_input (port
) == EOF
)
1634 bytes_read
= pt
->read_end
- pt
->read_pos
;
1635 if (bytes_read
> SCM_ENCODING_SEARCH_SIZE
)
1636 bytes_read
= SCM_ENCODING_SEARCH_SIZE
;
1638 if (bytes_read
<= 1)
1639 /* An unbuffered port -- don't scan. */
1642 memcpy (header
, pt
->read_pos
, bytes_read
);
1643 header
[bytes_read
] = '\0';
1647 /* Try to read some bytes and then seek back. Not all ports
1648 support seeking back; and indeed some file ports (like
1649 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
1650 check performed by SCM_FPORT_FDES---but fail to seek
1651 backwards. Hence this block comes second. We prefer to use
1652 the read buffer in-place. */
1653 if (SCM_FPORTP (port
) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port
)))
1656 bytes_read
= scm_c_read (port
, header
, SCM_ENCODING_SEARCH_SIZE
);
1657 header
[bytes_read
] = '\0';
1658 scm_seek (port
, scm_from_int (0), scm_from_int (SEEK_SET
));
1662 && header
[0] == '\xef' && header
[1] == '\xbb' && header
[2] == '\xbf')
1665 /* search past "coding[:=]" */
1669 if ((pos
= strstr(pos
, "coding")) == NULL
)
1672 pos
+= strlen("coding");
1673 if (pos
- header
>= SCM_ENCODING_SEARCH_SIZE
||
1674 (*pos
== ':' || *pos
== '='))
1682 while (pos
- header
<= SCM_ENCODING_SEARCH_SIZE
&&
1683 (*pos
== ' ' || *pos
== '\t'))
1686 /* grab the next token */
1687 encoding_start
= pos
;
1689 while (encoding_start
+ i
- header
<= SCM_ENCODING_SEARCH_SIZE
1690 && encoding_start
+ i
- header
< bytes_read
1691 && (isalnum ((int) encoding_start
[i
])
1692 || strchr ("_-.:/,+=()", encoding_start
[i
]) != NULL
))
1695 encoding_length
= i
;
1696 if (encoding_length
== 0)
1699 encoding
= scm_gc_strndup (encoding_start
, encoding_length
, "encoding");
1700 for (i
= 0; i
< encoding_length
; i
++)
1701 encoding
[i
] = toupper ((int) encoding
[i
]);
1703 /* push backwards to make sure we were in a comment */
1705 pos
= encoding_start
;
1706 while (pos
>= header
)
1713 else if (*pos
== '\n' || pos
== header
)
1715 /* This wasn't in a semicolon comment. Check for a
1716 hash-bang comment. */
1717 char *beg
= strstr (header
, "#!");
1718 char *end
= strstr (header
, "!#");
1719 if (beg
< encoding_start
&& encoding_start
+ encoding_length
<= end
)
1730 /* This wasn't in a comment */
1733 if (utf8_bom
&& strcmp(encoding
, "UTF-8"))
1734 scm_misc_error (NULL
,
1735 "the port input declares the encoding ~s but is encoded as UTF-8",
1736 scm_list_1 (scm_from_locale_string (encoding
)));
1741 SCM_DEFINE (scm_file_encoding
, "file-encoding", 1, 0, 0,
1743 "Scans the port for an Emacs-like character coding declaration\n"
1744 "near the top of the contents of a port with random-accessible contents.\n"
1745 "The coding declaration is of the form\n"
1746 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1748 "Returns a string containing the character encoding of the file\n"
1749 "if a declaration was found, or @code{#f} otherwise.\n")
1750 #define FUNC_NAME s_scm_file_encoding
1755 SCM_VALIDATE_OPINPORT (SCM_ARG1
, port
);
1757 enc
= scm_i_scan_for_encoding (port
);
1762 s_enc
= scm_from_locale_string (enc
);
1773 SCM read_hash_procs
;
1775 read_hash_procs
= scm_make_fluid_with_default (SCM_EOL
);
1777 scm_i_read_hash_procedures
=
1778 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs
));
1780 scm_init_opts (scm_read_options
, scm_read_opts
);
1781 #include "libguile/read.x"