1 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
2 * 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
34 #include "libguile/_scm.h"
35 #include "libguile/bytevectors.h"
36 #include "libguile/chars.h"
37 #include "libguile/eval.h"
38 #include "libguile/arrays.h"
39 #include "libguile/bitvectors.h"
40 #include "libguile/keywords.h"
41 #include "libguile/alist.h"
42 #include "libguile/srcprop.h"
43 #include "libguile/hashtab.h"
44 #include "libguile/hash.h"
45 #include "libguile/ports.h"
46 #include "libguile/fports.h"
47 #include "libguile/root.h"
48 #include "libguile/strings.h"
49 #include "libguile/strports.h"
50 #include "libguile/vectors.h"
51 #include "libguile/validate.h"
52 #include "libguile/srfi-4.h"
53 #include "libguile/srfi-13.h"
55 #include "libguile/read.h"
56 #include "libguile/private-options.h"
61 SCM_GLOBAL_SYMBOL (scm_sym_dot
, ".");
62 SCM_SYMBOL (scm_keyword_prefix
, "prefix");
63 SCM_SYMBOL (scm_keyword_postfix
, "postfix");
64 SCM_SYMBOL (sym_nil
, "nil");
66 scm_t_option scm_read_opts
[] = {
67 { SCM_OPTION_BOOLEAN
, "copy", 0,
68 "Copy source code expressions." },
69 { SCM_OPTION_BOOLEAN
, "positions", 1,
70 "Record positions of source code expressions." },
71 { SCM_OPTION_BOOLEAN
, "case-insensitive", 0,
72 "Convert symbols to lower case."},
73 { SCM_OPTION_SCM
, "keywords", (scm_t_bits
) SCM_BOOL_F_BITS
,
74 "Style of keyword recognition: #f, 'prefix or 'postfix."},
75 { SCM_OPTION_BOOLEAN
, "r6rs-hex-escapes", 0,
76 "Use R6RS variable-length character and string hex escapes."},
77 { SCM_OPTION_BOOLEAN
, "square-brackets", 1,
78 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
79 { SCM_OPTION_BOOLEAN
, "hungry-eol-escapes", 0,
80 "In strings, consume leading whitespace after an escaped end-of-line."},
85 Give meaningful error messages for errors
89 FILE:LINE:COL: MESSAGE
92 This is not standard GNU format, but the test-suite likes the real
93 message to be in front.
99 scm_i_input_error (char const *function
,
100 SCM port
, const char *message
, SCM arg
)
102 SCM fn
= (scm_is_string (SCM_FILENAME(port
))
104 : scm_from_locale_string ("#<unknown port>"));
106 SCM string_port
= scm_open_output_string ();
107 SCM string
= SCM_EOL
;
108 scm_simple_format (string_port
,
109 scm_from_locale_string ("~A:~S:~S: ~A"),
111 scm_from_long (SCM_LINUM (port
) + 1),
112 scm_from_int (SCM_COL (port
) + 1),
113 scm_from_locale_string (message
)));
115 string
= scm_get_output_string (string_port
);
116 scm_close_output_port (string_port
);
117 scm_error_scm (scm_from_latin1_symbol ("read-error"),
118 function
? scm_from_locale_string (function
) : SCM_BOOL_F
,
125 SCM_DEFINE (scm_read_options
, "read-options-interface", 0, 1, 0,
127 "Option interface for the read options. Instead of using\n"
128 "this procedure directly, use the procedures @code{read-enable},\n"
129 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
130 #define FUNC_NAME s_scm_read_options
132 SCM ans
= scm_options (setting
,
135 if (SCM_COPY_SOURCE_P
)
136 SCM_RECORD_POSITIONS_P
= 1;
141 /* A fluid referring to an association list mapping extra hash
142 characters to procedures. */
143 static SCM
*scm_i_read_hash_procedures
;
146 scm_i_read_hash_procedures_ref (void)
148 return scm_fluid_ref (*scm_i_read_hash_procedures
);
152 scm_i_read_hash_procedures_set_x (SCM value
)
154 scm_fluid_set_x (*scm_i_read_hash_procedures
, value
);
161 /* Size of the C buffer used to read symbols and numbers. */
162 #define READER_BUFFER_SIZE 128
164 /* Size of the C buffer used to read strings. */
165 #define READER_STRING_BUFFER_SIZE 512
167 /* The maximum size of Scheme character names. */
168 #define READER_CHAR_NAME_MAX_SIZE 50
171 /* `isblank' is only in C99. */
172 #define CHAR_IS_BLANK_(_chr) \
173 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
174 || ((_chr) == '\f') || ((_chr) == '\r'))
177 # define CHAR_IS_BLANK(_chr) \
178 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
180 # define CHAR_IS_BLANK CHAR_IS_BLANK_
184 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
186 #define CHAR_IS_R5RS_DELIMITER(c) \
188 || (c == ')') || (c == '(') || (c == ';') || (c == '"') \
189 || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
191 #define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
193 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
195 #define CHAR_IS_EXPONENT_MARKER(_chr) \
196 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
197 || ((_chr) == 'd') || ((_chr) == 'l'))
199 /* Read an SCSH block comment. */
200 static SCM
scm_read_scsh_block_comment (scm_t_wchar
, SCM
);
201 static SCM
scm_read_r6rs_block_comment (scm_t_wchar
, SCM
);
202 static SCM
scm_read_commented_expression (scm_t_wchar
, SCM
);
203 static SCM
scm_read_shebang (scm_t_wchar
, SCM
);
204 static SCM
scm_get_hash_procedure (int);
206 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
207 result in the pre-allocated buffer BUF. Return zero if the whole token has
208 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
209 bytes actually read. */
211 read_token (SCM port
, char *buf
, size_t buf_size
, size_t *read
)
215 while (*read
< buf_size
)
219 chr
= scm_get_byte_or_eof (port
);
223 else if (CHAR_IS_DELIMITER (chr
))
225 scm_unget_byte (chr
, port
);
238 /* Like `read_token', but return either BUFFER, or a GC-allocated buffer
239 if the token doesn't fit in BUFFER_SIZE bytes. */
241 read_complete_token (SCM port
, char *buffer
, size_t buffer_size
,
245 size_t bytes_read
, overflow_size
= 0;
246 char *overflow_buffer
= NULL
;
250 overflow
= read_token (port
, buffer
, buffer_size
, &bytes_read
);
253 if (overflow
|| overflow_size
!= 0)
255 if (overflow_size
== 0)
257 overflow_buffer
= scm_gc_malloc_pointerless (bytes_read
, "read");
258 memcpy (overflow_buffer
, buffer
, bytes_read
);
259 overflow_size
= bytes_read
;
264 scm_gc_malloc_pointerless (overflow_size
+ bytes_read
, "read");
266 memcpy (new_buf
, overflow_buffer
, overflow_size
);
267 memcpy (new_buf
+ overflow_size
, buffer
, bytes_read
);
269 overflow_buffer
= new_buf
;
270 overflow_size
+= bytes_read
;
277 *read
= overflow_size
;
281 return (overflow_size
> 0 ? overflow_buffer
: buffer
);
284 /* Skip whitespace from PORT and return the first non-whitespace character
285 read. Raise an error on end-of-file. */
287 flush_ws (SCM port
, const char *eoferr
)
291 switch (c
= scm_getc (port
))
297 scm_i_input_error (eoferr
,
306 switch (c
= scm_getc (port
))
312 case SCM_LINE_INCREMENTORS
:
318 switch (c
= scm_getc (port
))
321 eoferr
= "read_sharp";
324 scm_read_shebang (c
, port
);
327 scm_read_commented_expression (c
, port
);
330 if (scm_is_false (scm_get_hash_procedure (c
)))
332 scm_read_r6rs_block_comment (c
, port
);
337 scm_ungetc (c
, port
);
342 case SCM_LINE_INCREMENTORS
:
343 case SCM_SINGLE_SPACES
:
358 static SCM
scm_read_expression (SCM port
);
359 static SCM
scm_read_sharp (int chr
, SCM port
, long line
, int column
);
363 maybe_annotate_source (SCM x
, SCM port
, long line
, int column
)
365 if (SCM_RECORD_POSITIONS_P
)
366 scm_i_set_source_properties_x (x
, line
, column
, SCM_FILENAME (port
));
371 scm_read_sexp (scm_t_wchar chr
, SCM port
)
372 #define FUNC_NAME "scm_i_lreadparen"
375 SCM tmp
, tl
, ans
= SCM_EOL
;
376 const int terminating_char
= ((chr
== '[') ? ']' : ')');
378 /* Need to capture line and column numbers here. */
379 long line
= SCM_LINUM (port
);
380 int column
= SCM_COL (port
) - 1;
382 c
= flush_ws (port
, FUNC_NAME
);
383 if (terminating_char
== c
)
386 scm_ungetc (c
, port
);
387 tmp
= scm_read_expression (port
);
389 /* Note that it is possible for scm_read_expression to return
390 scm_sym_dot, but not as part of a dotted pair: as in #{.}#. So
391 check that it's a real dot by checking `c'. */
392 if (c
== '.' && scm_is_eq (scm_sym_dot
, tmp
))
394 ans
= scm_read_expression (port
);
395 if (terminating_char
!= (c
= flush_ws (port
, FUNC_NAME
)))
396 scm_i_input_error (FUNC_NAME
, port
, "missing close paren",
401 /* Build the head of the list structure. */
402 ans
= tl
= scm_cons (tmp
, SCM_EOL
);
404 while (terminating_char
!= (c
= flush_ws (port
, FUNC_NAME
)))
408 if (c
== ')' || (SCM_SQUARE_BRACKETS_P
&& c
== ']'))
409 scm_i_input_error (FUNC_NAME
, port
,
410 "in pair: mismatched close paren: ~A",
411 scm_list_1 (SCM_MAKE_CHAR (c
)));
413 scm_ungetc (c
, port
);
414 tmp
= scm_read_expression (port
);
416 /* See above note about scm_sym_dot. */
417 if (c
== '.' && scm_is_eq (scm_sym_dot
, tmp
))
419 SCM_SETCDR (tl
, scm_read_expression (port
));
421 c
= flush_ws (port
, FUNC_NAME
);
422 if (terminating_char
!= c
)
423 scm_i_input_error (FUNC_NAME
, port
,
424 "in pair: missing close paren", SCM_EOL
);
428 new_tail
= scm_cons (tmp
, SCM_EOL
);
429 SCM_SETCDR (tl
, new_tail
);
434 return maybe_annotate_source (ans
, port
, line
, column
);
439 /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
440 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
442 #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
448 while (i < ndigits) \
450 a = scm_getc (port); \
454 && (a == (scm_t_wchar) terminator) \
457 if ('0' <= a && a <= '9') \
459 else if ('A' <= a && a <= 'F') \
461 else if ('a' <= a && a <= 'f') \
474 skip_intraline_whitespace (SCM port
)
484 while (c
== '\t' || uc_is_general_category (c
, UC_SPACE_SEPARATOR
));
486 scm_ungetc (c
, port
);
490 scm_read_string (int chr
, SCM port
)
491 #define FUNC_NAME "scm_lreadr"
493 /* For strings smaller than C_STR, this function creates only one Scheme
494 object (the string returned). */
496 SCM str
= SCM_BOOL_F
;
497 unsigned c_str_len
= 0;
500 /* Need to capture line and column numbers here. */
501 long line
= SCM_LINUM (port
);
502 int column
= SCM_COL (port
) - 1;
504 str
= scm_i_make_string (READER_STRING_BUFFER_SIZE
, NULL
, 0);
505 while ('"' != (c
= scm_getc (port
)))
510 scm_i_input_error (FUNC_NAME
, port
,
511 "end of file in string constant", SCM_EOL
);
514 if (c_str_len
+ 1 >= scm_i_string_length (str
))
516 SCM addy
= scm_i_make_string (READER_STRING_BUFFER_SIZE
, NULL
, 0);
518 str
= scm_string_append (scm_list_2 (str
, addy
));
523 switch (c
= scm_getc (port
))
531 if (SCM_HUNGRY_EOL_ESCAPES_P
)
532 skip_intraline_whitespace (port
);
559 if (SCM_R6RS_ESCAPES_P
)
560 SCM_READ_HEX_ESCAPE (10, ';');
562 SCM_READ_HEX_ESCAPE (2, '\0');
565 if (!SCM_R6RS_ESCAPES_P
)
567 SCM_READ_HEX_ESCAPE (4, '\0');
571 if (!SCM_R6RS_ESCAPES_P
)
573 SCM_READ_HEX_ESCAPE (6, '\0');
578 scm_i_input_error (FUNC_NAME
, port
,
579 "illegal character in escape sequence: ~S",
580 scm_list_1 (SCM_MAKE_CHAR (c
)));
583 str
= scm_i_string_start_writing (str
);
584 scm_i_string_set_x (str
, c_str_len
++, c
);
585 scm_i_string_stop_writing ();
587 return maybe_annotate_source (scm_i_substring_copy (str
, 0, c_str_len
),
594 scm_read_number (scm_t_wchar chr
, SCM port
)
596 SCM result
, str
= SCM_EOL
;
597 char local_buffer
[READER_BUFFER_SIZE
], *buffer
;
599 scm_t_port
*pt
= SCM_PTAB_ENTRY (port
);
601 /* Need to capture line and column numbers here. */
602 long line
= SCM_LINUM (port
);
603 int column
= SCM_COL (port
) - 1;
605 scm_ungetc (chr
, port
);
606 buffer
= read_complete_token (port
, local_buffer
, sizeof local_buffer
,
609 str
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
, pt
->ilseq_handler
);
611 result
= scm_string_to_number (str
, SCM_UNDEFINED
);
612 if (scm_is_false (result
))
614 /* Return a symbol instead of a number */
615 if (SCM_CASE_INSENSITIVE_P
)
616 str
= scm_string_downcase_x (str
);
617 result
= scm_string_to_symbol (str
);
619 else if (SCM_NIMP (result
))
620 result
= maybe_annotate_source (result
, port
, line
, column
);
622 SCM_COL (port
) += scm_i_string_length (str
);
627 scm_read_mixed_case_symbol (scm_t_wchar chr
, SCM port
)
630 int ends_with_colon
= 0;
632 int postfix
= scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE
), scm_keyword_postfix
);
633 char local_buffer
[READER_BUFFER_SIZE
], *buffer
;
634 scm_t_port
*pt
= SCM_PTAB_ENTRY (port
);
637 scm_ungetc (chr
, port
);
638 buffer
= read_complete_token (port
, local_buffer
, sizeof local_buffer
,
641 ends_with_colon
= buffer
[bytes_read
- 1] == ':';
643 if (postfix
&& ends_with_colon
&& (bytes_read
> 1))
645 str
= scm_from_stringn (buffer
, bytes_read
- 1,
646 pt
->encoding
, pt
->ilseq_handler
);
648 if (SCM_CASE_INSENSITIVE_P
)
649 str
= scm_string_downcase_x (str
);
650 result
= scm_symbol_to_keyword (scm_string_to_symbol (str
));
654 str
= scm_from_stringn (buffer
, bytes_read
,
655 pt
->encoding
, pt
->ilseq_handler
);
657 if (SCM_CASE_INSENSITIVE_P
)
658 str
= scm_string_downcase_x (str
);
659 result
= scm_string_to_symbol (str
);
662 SCM_COL (port
) += scm_i_string_length (str
);
667 scm_read_number_and_radix (scm_t_wchar chr
, SCM port
)
668 #define FUNC_NAME "scm_lreadr"
672 char local_buffer
[READER_BUFFER_SIZE
], *buffer
;
700 scm_ungetc (chr
, port
);
701 scm_ungetc ('#', port
);
705 buffer
= read_complete_token (port
, local_buffer
, sizeof local_buffer
,
708 pt
= SCM_PTAB_ENTRY (port
);
709 str
= scm_from_stringn (buffer
, read
, pt
->encoding
, pt
->ilseq_handler
);
711 result
= scm_string_to_number (str
, scm_from_uint (radix
));
713 SCM_COL (port
) += scm_i_string_length (str
);
715 if (scm_is_true (result
))
718 scm_i_input_error (FUNC_NAME
, port
, "unknown # object", SCM_EOL
);
725 scm_read_quote (int chr
, SCM port
)
728 long line
= SCM_LINUM (port
);
729 int column
= SCM_COL (port
) - 1;
734 p
= scm_sym_quasiquote
;
747 p
= scm_sym_uq_splicing
;
750 scm_ungetc (c
, port
);
757 fprintf (stderr
, "%s: unhandled quote character (%i)\n",
758 "scm_read_quote", chr
);
762 p
= scm_cons2 (p
, scm_read_expression (port
), SCM_EOL
);
763 return maybe_annotate_source (p
, port
, line
, column
);
766 SCM_SYMBOL (sym_syntax
, "syntax");
767 SCM_SYMBOL (sym_quasisyntax
, "quasisyntax");
768 SCM_SYMBOL (sym_unsyntax
, "unsyntax");
769 SCM_SYMBOL (sym_unsyntax_splicing
, "unsyntax-splicing");
772 scm_read_syntax (int chr
, SCM port
)
775 long line
= SCM_LINUM (port
);
776 int column
= SCM_COL (port
) - 1;
794 p
= sym_unsyntax_splicing
;
797 scm_ungetc (c
, port
);
804 fprintf (stderr
, "%s: unhandled syntax character (%i)\n",
805 "scm_read_syntax", chr
);
809 p
= scm_cons2 (p
, scm_read_expression (port
), SCM_EOL
);
810 return maybe_annotate_source (p
, port
, line
, column
);
814 scm_read_nil (int chr
, SCM port
)
816 SCM id
= scm_read_mixed_case_symbol (chr
, port
);
818 if (!scm_is_eq (id
, sym_nil
))
819 scm_i_input_error ("scm_read_nil", port
,
820 "unexpected input while reading #nil: ~a",
823 return SCM_ELISP_NIL
;
827 scm_read_semicolon_comment (int chr
, SCM port
)
831 /* We use the get_byte here because there is no need to get the
832 locale correct with comment input. This presumes that newline
833 always represents itself no matter what the encoding is. */
834 for (c
= scm_get_byte_or_eof (port
);
835 (c
!= EOF
) && (c
!= '\n');
836 c
= scm_get_byte_or_eof (port
));
838 return SCM_UNSPECIFIED
;
842 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
845 scm_read_boolean (int chr
, SCM port
)
858 return SCM_UNSPECIFIED
;
862 scm_read_character (scm_t_wchar chr
, SCM port
)
863 #define FUNC_NAME "scm_lreadr"
865 char buffer
[READER_CHAR_NAME_MAX_SIZE
];
867 size_t charname_len
, bytes_read
;
872 overflow
= read_token (port
, buffer
, READER_CHAR_NAME_MAX_SIZE
, &bytes_read
);
874 scm_i_input_error (FUNC_NAME
, port
, "character name too long", SCM_EOL
);
878 chr
= scm_getc (port
);
880 scm_i_input_error (FUNC_NAME
, port
, "unexpected end of file "
881 "while reading character", SCM_EOL
);
883 /* CHR must be a token delimiter, like a whitespace. */
884 return (SCM_MAKE_CHAR (chr
));
887 pt
= SCM_PTAB_ENTRY (port
);
889 /* Simple ASCII characters can be processed immediately. Also, simple
890 ISO-8859-1 characters can be processed immediately if the encoding for this
891 port is ISO-8859-1. */
892 if (bytes_read
== 1 && ((unsigned char) buffer
[0] <= 127 || pt
->encoding
== NULL
))
895 return SCM_MAKE_CHAR (buffer
[0]);
898 /* Otherwise, convert the buffer into a proper scheme string for
900 charname
= scm_from_stringn (buffer
, bytes_read
, pt
->encoding
,
902 charname_len
= scm_i_string_length (charname
);
903 SCM_COL (port
) += charname_len
;
904 cp
= scm_i_string_ref (charname
, 0);
905 if (charname_len
== 1)
906 return SCM_MAKE_CHAR (cp
);
908 /* Ignore dotted circles, which may be used to keep combining characters from
909 combining with the backslash in #\charname. */
910 if (cp
== SCM_CODEPOINT_DOTTED_CIRCLE
&& charname_len
== 2)
911 return SCM_MAKE_CHAR (scm_i_string_ref (charname
, 1));
913 if (cp
>= '0' && cp
< '8')
915 /* Dirk:FIXME:: This type of character syntax is not R5RS
916 * compliant. Further, it should be verified that the constant
917 * does only consist of octal digits. */
918 SCM p
= scm_string_to_number (charname
, scm_from_uint (8));
921 scm_t_wchar c
= scm_to_uint32 (p
);
922 if (SCM_IS_UNICODE_CHAR (c
))
923 return SCM_MAKE_CHAR (c
);
925 scm_i_input_error (FUNC_NAME
, port
,
926 "out-of-range octal character escape: ~a",
927 scm_list_1 (charname
));
931 if (cp
== 'x' && (charname_len
> 1))
935 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
936 p
= scm_string_to_number (scm_c_substring (charname
, 1, charname_len
),
940 scm_t_wchar c
= scm_to_uint32 (p
);
941 if (SCM_IS_UNICODE_CHAR (c
))
942 return SCM_MAKE_CHAR (c
);
944 scm_i_input_error (FUNC_NAME
, port
,
945 "out-of-range hex character escape: ~a",
946 scm_list_1 (charname
));
950 /* The names of characters should never have non-Latin1
952 if (scm_i_is_narrow_string (charname
)
953 || scm_i_try_narrow_string (charname
))
954 { SCM ch
= scm_i_charname_to_char (scm_i_string_chars (charname
),
956 if (scm_is_true (ch
))
960 scm_i_input_error (FUNC_NAME
, port
, "unknown character name ~a",
961 scm_list_1 (charname
));
963 return SCM_UNSPECIFIED
;
968 scm_read_keyword (int chr
, SCM port
)
972 /* Read the symbol that comprises the keyword. Doing this instead of
973 invoking a specific symbol reader function allows `scm_read_keyword ()'
974 to adapt to the delimiters currently valid of symbols.
976 XXX: This implementation allows sloppy syntaxes like `#: key'. */
977 symbol
= scm_read_expression (port
);
978 if (!scm_is_symbol (symbol
))
979 scm_i_input_error ("scm_read_keyword", port
,
980 "keyword prefix `~a' not followed by a symbol: ~s",
981 scm_list_2 (SCM_MAKE_CHAR (chr
), symbol
));
983 return (scm_symbol_to_keyword (symbol
));
987 scm_read_vector (int chr
, SCM port
, long line
, int column
)
989 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
990 guarantee that it's going to do what we want. After all, this is an
991 implementation detail of `scm_read_vector ()', not a desirable
993 return maybe_annotate_source (scm_vector (scm_read_sexp (chr
, port
)),
998 scm_read_array (int chr
, SCM port
, long line
, int column
)
1000 SCM result
= scm_i_read_array (port
, chr
);
1001 if (scm_is_false (result
))
1004 return maybe_annotate_source (result
, port
, line
, column
);
1008 scm_read_srfi4_vector (int chr
, SCM port
, long line
, int column
)
1010 return scm_read_array (chr
, port
, line
, column
);
1014 scm_read_bytevector (scm_t_wchar chr
, SCM port
, long line
, int column
)
1016 chr
= scm_getc (port
);
1020 chr
= scm_getc (port
);
1024 chr
= scm_getc (port
);
1028 return maybe_annotate_source
1029 (scm_u8_list_to_bytevector (scm_read_sexp (chr
, port
)),
1030 port
, line
, column
);
1033 scm_i_input_error ("read_bytevector", port
,
1034 "invalid bytevector prefix",
1035 SCM_MAKE_CHAR (chr
));
1036 return SCM_UNSPECIFIED
;
1040 scm_read_guile_bit_vector (scm_t_wchar chr
, SCM port
, long line
, int column
)
1042 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1043 terribly inefficient but who cares? */
1044 SCM s_bits
= SCM_EOL
;
1046 for (chr
= scm_getc (port
);
1047 (chr
!= EOF
) && ((chr
== '0') || (chr
== '1'));
1048 chr
= scm_getc (port
))
1050 s_bits
= scm_cons ((chr
== '0') ? SCM_BOOL_F
: SCM_BOOL_T
, s_bits
);
1054 scm_ungetc (chr
, port
);
1056 return maybe_annotate_source
1057 (scm_bitvector (scm_reverse_x (s_bits
, SCM_EOL
)),
1058 port
, line
, column
);
1062 scm_read_scsh_block_comment (scm_t_wchar chr
, SCM port
)
1068 int c
= scm_getc (port
);
1071 scm_i_input_error ("skip_block_comment", port
,
1072 "unterminated `#! ... !#' comment", SCM_EOL
);
1076 else if (c
== '#' && bang_seen
)
1082 return SCM_UNSPECIFIED
;
1086 scm_read_shebang (scm_t_wchar chr
, SCM port
)
1089 if ((c
= scm_get_byte_or_eof (port
)) != 'r')
1091 scm_ungetc (c
, port
);
1092 return scm_read_scsh_block_comment (chr
, port
);
1094 if ((c
= scm_get_byte_or_eof (port
)) != '6')
1096 scm_ungetc (c
, port
);
1097 scm_ungetc ('r', port
);
1098 return scm_read_scsh_block_comment (chr
, port
);
1100 if ((c
= scm_get_byte_or_eof (port
)) != 'r')
1102 scm_ungetc (c
, port
);
1103 scm_ungetc ('6', port
);
1104 scm_ungetc ('r', port
);
1105 return scm_read_scsh_block_comment (chr
, port
);
1107 if ((c
= scm_get_byte_or_eof (port
)) != 's')
1109 scm_ungetc (c
, port
);
1110 scm_ungetc ('r', port
);
1111 scm_ungetc ('6', port
);
1112 scm_ungetc ('r', port
);
1113 return scm_read_scsh_block_comment (chr
, port
);
1116 return SCM_UNSPECIFIED
;
1120 scm_read_r6rs_block_comment (scm_t_wchar chr
, SCM port
)
1122 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1123 nested. So care must be taken. */
1124 int nesting_level
= 1;
1126 int a
= scm_getc (port
);
1129 scm_i_input_error ("scm_read_r6rs_block_comment", port
,
1130 "unterminated `#| ... |#' comment", SCM_EOL
);
1132 while (nesting_level
> 0)
1134 int b
= scm_getc (port
);
1137 scm_i_input_error ("scm_read_r6rs_block_comment", port
,
1138 "unterminated `#| ... |#' comment", SCM_EOL
);
1140 if (a
== '|' && b
== '#')
1145 else if (a
== '#' && b
== '|')
1154 return SCM_UNSPECIFIED
;
1158 scm_read_commented_expression (scm_t_wchar chr
, SCM port
)
1162 c
= flush_ws (port
, (char *) NULL
);
1164 scm_i_input_error ("read_commented_expression", port
,
1165 "no expression after #; comment", SCM_EOL
);
1166 scm_ungetc (c
, port
);
1167 scm_read_expression (port
);
1168 return SCM_UNSPECIFIED
;
1172 scm_read_extended_symbol (scm_t_wchar chr
, SCM port
)
1174 /* Guile's extended symbol read syntax looks like this:
1176 #{This is all a symbol name}#
1178 So here, CHR is expected to be `{'. */
1181 SCM buf
= scm_i_make_string (1024, NULL
, 0);
1183 buf
= scm_i_string_start_writing (buf
);
1185 while ((chr
= scm_getc (port
)) != EOF
)
1196 scm_i_string_set_x (buf
, len
++, '}');
1202 else if (chr
== '\\')
1204 /* It used to be that print.c would print extended-read-syntax
1205 symbols with backslashes before "non-standard" chars, but
1206 this routine wouldn't do anything with those escapes.
1207 Bummer. What we've done is to change print.c to output
1208 R6RS hex escapes for those characters, relying on the fact
1209 that the extended read syntax would never put a `\' before
1210 an `x'. For now, we just ignore other instances of
1211 backslash in the string. */
1212 switch ((chr
= scm_getc (port
)))
1220 SCM_READ_HEX_ESCAPE (10, ';');
1221 scm_i_string_set_x (buf
, len
++, c
);
1229 scm_i_string_stop_writing ();
1230 scm_i_input_error ("scm_read_extended_symbol", port
,
1231 "illegal character in escape sequence: ~S",
1232 scm_list_1 (SCM_MAKE_CHAR (c
)));
1236 scm_i_string_set_x (buf
, len
++, chr
);
1241 scm_i_string_set_x (buf
, len
++, chr
);
1243 if (len
>= scm_i_string_length (buf
) - 2)
1247 scm_i_string_stop_writing ();
1248 addy
= scm_i_make_string (1024, NULL
, 0);
1249 buf
= scm_string_append (scm_list_2 (buf
, addy
));
1251 buf
= scm_i_string_start_writing (buf
);
1256 scm_i_string_stop_writing ();
1258 scm_i_input_error ("scm_read_extended_symbol", port
,
1259 "end of file while reading symbol", SCM_EOL
);
1261 return (scm_string_to_symbol (scm_c_substring (buf
, 0, len
)));
1266 /* Top-level token readers, i.e., dispatchers. */
1269 scm_read_sharp_extension (int chr
, SCM port
)
1273 proc
= scm_get_hash_procedure (chr
);
1274 if (scm_is_true (scm_procedure_p (proc
)))
1276 long line
= SCM_LINUM (port
);
1277 int column
= SCM_COL (port
) - 2;
1280 got
= scm_call_2 (proc
, SCM_MAKE_CHAR (chr
), port
);
1282 if (scm_is_pair (got
) && !scm_i_has_source_properties (got
))
1283 scm_i_set_source_properties_x (got
, line
, column
, SCM_FILENAME (port
));
1288 return SCM_UNSPECIFIED
;
1291 /* The reader for the sharp `#' character. It basically dispatches reads
1292 among the above token readers. */
1294 scm_read_sharp (scm_t_wchar chr
, SCM port
, long line
, int column
)
1295 #define FUNC_NAME "scm_lreadr"
1299 chr
= scm_getc (port
);
1301 result
= scm_read_sharp_extension (chr
, port
);
1302 if (!scm_is_eq (result
, SCM_UNSPECIFIED
))
1308 return (scm_read_character (chr
, port
));
1310 return (scm_read_vector (chr
, port
, line
, column
));
1315 /* This one may return either a boolean or an SRFI-4 vector. */
1316 return (scm_read_srfi4_vector (chr
, port
, line
, column
));
1318 return (scm_read_bytevector (chr
, port
, line
, column
));
1320 return (scm_read_guile_bit_vector (chr
, port
, line
, column
));
1324 return (scm_read_boolean (chr
, port
));
1326 return (scm_read_keyword (chr
, port
));
1327 case '0': case '1': case '2': case '3': case '4':
1328 case '5': case '6': case '7': case '8': case '9':
1330 #if SCM_ENABLE_DEPRECATED
1331 /* See below for 'i' and 'e'. */
1337 return (scm_read_array (chr
, port
, line
, column
));
1341 #if SCM_ENABLE_DEPRECATED
1343 /* When next char is '(', it really is an old-style
1345 scm_t_wchar next_c
= scm_getc (port
);
1347 scm_ungetc (next_c
, port
);
1349 return scm_read_array (chr
, port
, line
, column
);
1363 return (scm_read_number_and_radix (chr
, port
));
1365 return (scm_read_extended_symbol (chr
, port
));
1367 return (scm_read_shebang (chr
, port
));
1369 return (scm_read_commented_expression (chr
, port
));
1373 return (scm_read_syntax (chr
, port
));
1375 return (scm_read_nil (chr
, port
));
1377 result
= scm_read_sharp_extension (chr
, port
);
1378 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1380 /* To remain compatible with 1.8 and earlier, the following
1381 characters have lower precedence than `read-hash-extend'
1386 return scm_read_r6rs_block_comment (chr
, port
);
1388 scm_i_input_error (FUNC_NAME
, port
, "Unknown # object: ~S",
1389 scm_list_1 (SCM_MAKE_CHAR (chr
)));
1396 return SCM_UNSPECIFIED
;
1401 scm_read_expression (SCM port
)
1402 #define FUNC_NAME "scm_read_expression"
1408 chr
= scm_getc (port
);
1412 case SCM_WHITE_SPACES
:
1413 case SCM_LINE_INCREMENTORS
:
1416 (void) scm_read_semicolon_comment (chr
, port
);
1419 if (!SCM_SQUARE_BRACKETS_P
)
1420 return (scm_read_mixed_case_symbol (chr
, port
));
1421 /* otherwise fall through */
1423 return (scm_read_sexp (chr
, port
));
1425 return (scm_read_string (chr
, port
));
1429 return (scm_read_quote (chr
, port
));
1432 long line
= SCM_LINUM (port
);
1433 int column
= SCM_COL (port
) - 1;
1434 SCM result
= scm_read_sharp (chr
, port
, line
, column
);
1435 if (scm_is_eq (result
, SCM_UNSPECIFIED
))
1436 /* We read a comment or some such. */
1442 scm_i_input_error (FUNC_NAME
, port
, "unexpected \")\"", SCM_EOL
);
1445 if (SCM_SQUARE_BRACKETS_P
)
1446 scm_i_input_error (FUNC_NAME
, port
, "unexpected \"]\"", SCM_EOL
);
1447 /* otherwise fall through */
1451 if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE
), scm_keyword_prefix
))
1452 return scm_symbol_to_keyword (scm_read_expression (port
));
1457 if (((chr
>= '0') && (chr
<= '9'))
1458 || (strchr ("+-.", chr
)))
1459 return (scm_read_number (chr
, port
));
1461 return (scm_read_mixed_case_symbol (chr
, port
));
1469 /* Actual reader. */
1471 SCM_DEFINE (scm_read
, "read", 0, 1, 0,
1473 "Read an s-expression from the input port @var{port}, or from\n"
1474 "the current input port if @var{port} is not specified.\n"
1475 "Any whitespace before the next token is discarded.")
1476 #define FUNC_NAME s_scm_read
1480 if (SCM_UNBNDP (port
))
1481 port
= scm_current_input_port ();
1482 SCM_VALIDATE_OPINPORT (1, port
);
1484 c
= flush_ws (port
, (char *) NULL
);
1487 scm_ungetc (c
, port
);
1489 return (scm_read_expression (port
));
1496 /* Manipulate the read-hash-procedures alist. This could be written in
1497 Scheme, but maybe it will also be used by C code during initialisation. */
1498 SCM_DEFINE (scm_read_hash_extend
, "read-hash-extend", 2, 0, 0,
1499 (SCM chr
, SCM proc
),
1500 "Install the procedure @var{proc} for reading expressions\n"
1501 "starting with the character sequence @code{#} and @var{chr}.\n"
1502 "@var{proc} will be called with two arguments: the character\n"
1503 "@var{chr} and the port to read further data from. The object\n"
1504 "returned will be the return value of @code{read}. \n"
1505 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1507 #define FUNC_NAME s_scm_read_hash_extend
1512 SCM_VALIDATE_CHAR (1, chr
);
1513 SCM_ASSERT (scm_is_false (proc
)
1514 || scm_is_eq (scm_procedure_p (proc
), SCM_BOOL_T
),
1515 proc
, SCM_ARG2
, FUNC_NAME
);
1517 /* Check if chr is already in the alist. */
1518 this = scm_i_read_hash_procedures_ref ();
1522 if (scm_is_null (this))
1524 /* not found, so add it to the beginning. */
1525 if (scm_is_true (proc
))
1527 SCM
new = scm_cons (scm_cons (chr
, proc
),
1528 scm_i_read_hash_procedures_ref ());
1529 scm_i_read_hash_procedures_set_x (new);
1533 if (scm_is_eq (chr
, SCM_CAAR (this)))
1535 /* already in the alist. */
1536 if (scm_is_false (proc
))
1539 if (scm_is_false (prev
))
1541 SCM rest
= SCM_CDR (scm_i_read_hash_procedures_ref ());
1542 scm_i_read_hash_procedures_set_x (rest
);
1545 scm_set_cdr_x (prev
, SCM_CDR (this));
1550 scm_set_cdr_x (SCM_CAR (this), proc
);
1555 this = SCM_CDR (this);
1558 return SCM_UNSPECIFIED
;
1562 /* Recover the read-hash procedure corresponding to char c. */
1564 scm_get_hash_procedure (int c
)
1566 SCM rest
= scm_i_read_hash_procedures_ref ();
1570 if (scm_is_null (rest
))
1573 if (SCM_CHAR (SCM_CAAR (rest
)) == c
)
1574 return SCM_CDAR (rest
);
1576 rest
= SCM_CDR (rest
);
1580 #define SCM_ENCODING_SEARCH_SIZE (500)
1582 /* Search the first few hundred characters of a file for an Emacs-like coding
1583 declaration. Returns either NULL or a string whose storage has been
1584 allocated with `scm_gc_malloc ()'. */
1586 scm_i_scan_for_encoding (SCM port
)
1589 char header
[SCM_ENCODING_SEARCH_SIZE
+1];
1590 size_t bytes_read
, encoding_length
, i
;
1591 char *encoding
= NULL
;
1593 char *pos
, *encoding_start
;
1596 pt
= SCM_PTAB_ENTRY (port
);
1598 if (pt
->rw_active
== SCM_PORT_WRITE
)
1602 pt
->rw_active
= SCM_PORT_READ
;
1604 if (pt
->read_pos
== pt
->read_end
)
1606 /* We can use the read buffer, and thus avoid a seek. */
1607 if (scm_fill_input (port
) == EOF
)
1610 bytes_read
= pt
->read_end
- pt
->read_pos
;
1611 if (bytes_read
> SCM_ENCODING_SEARCH_SIZE
)
1612 bytes_read
= SCM_ENCODING_SEARCH_SIZE
;
1614 if (bytes_read
<= 1)
1615 /* An unbuffered port -- don't scan. */
1618 memcpy (header
, pt
->read_pos
, bytes_read
);
1619 header
[bytes_read
] = '\0';
1623 /* Try to read some bytes and then seek back. Not all ports
1624 support seeking back; and indeed some file ports (like
1625 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
1626 check performed by SCM_FPORT_FDES---but fail to seek
1627 backwards. Hence this block comes second. We prefer to use
1628 the read buffer in-place. */
1629 if (SCM_FPORTP (port
) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port
)))
1632 bytes_read
= scm_c_read (port
, header
, SCM_ENCODING_SEARCH_SIZE
);
1633 header
[bytes_read
] = '\0';
1634 scm_seek (port
, scm_from_int (0), scm_from_int (SEEK_SET
));
1638 && header
[0] == '\xef' && header
[1] == '\xbb' && header
[2] == '\xbf')
1641 /* search past "coding[:=]" */
1645 if ((pos
= strstr(pos
, "coding")) == NULL
)
1648 pos
+= strlen("coding");
1649 if (pos
- header
>= SCM_ENCODING_SEARCH_SIZE
||
1650 (*pos
== ':' || *pos
== '='))
1658 while (pos
- header
<= SCM_ENCODING_SEARCH_SIZE
&&
1659 (*pos
== ' ' || *pos
== '\t'))
1662 /* grab the next token */
1663 encoding_start
= pos
;
1665 while (encoding_start
+ i
- header
<= SCM_ENCODING_SEARCH_SIZE
1666 && encoding_start
+ i
- header
< bytes_read
1667 && (isalnum ((int) encoding_start
[i
])
1668 || strchr ("_-.:/,+=()", encoding_start
[i
]) != NULL
))
1671 encoding_length
= i
;
1672 if (encoding_length
== 0)
1675 encoding
= scm_gc_strndup (encoding_start
, encoding_length
, "encoding");
1676 for (i
= 0; i
< encoding_length
; i
++)
1677 encoding
[i
] = toupper ((int) encoding
[i
]);
1679 /* push backwards to make sure we were in a comment */
1681 pos
= encoding_start
;
1682 while (pos
>= header
)
1689 else if (*pos
== '\n' || pos
== header
)
1691 /* This wasn't in a semicolon comment. Check for a
1692 hash-bang comment. */
1693 char *beg
= strstr (header
, "#!");
1694 char *end
= strstr (header
, "!#");
1695 if (beg
< encoding_start
&& encoding_start
+ encoding_length
<= end
)
1706 /* This wasn't in a comment */
1709 if (utf8_bom
&& strcmp(encoding
, "UTF-8"))
1710 scm_misc_error (NULL
,
1711 "the port input declares the encoding ~s but is encoded as UTF-8",
1712 scm_list_1 (scm_from_locale_string (encoding
)));
1717 SCM_DEFINE (scm_file_encoding
, "file-encoding", 1, 0, 0,
1719 "Scans the port for an Emacs-like character coding declaration\n"
1720 "near the top of the contents of a port with random-accessible contents.\n"
1721 "The coding declaration is of the form\n"
1722 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1724 "Returns a string containing the character encoding of the file\n"
1725 "if a declaration was found, or @code{#f} otherwise.\n")
1726 #define FUNC_NAME s_scm_file_encoding
1731 SCM_VALIDATE_OPINPORT (SCM_ARG1
, port
);
1733 enc
= scm_i_scan_for_encoding (port
);
1738 s_enc
= scm_from_locale_string (enc
);
1749 SCM read_hash_procs
;
1751 read_hash_procs
= scm_make_fluid_with_default (SCM_EOL
);
1753 scm_i_read_hash_procedures
=
1754 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs
));
1756 scm_init_opts (scm_read_options
, scm_read_opts
);
1757 #include "libguile/read.x"