Add full Unicode capability to ports and the default reader
[bpt/guile.git] / libguile / read.c
CommitLineData
0ba0b384 1/* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009 Free Software
dd72382c 2 * Foundation, Inc.
0f2d19dd 3 *
73be1d9e 4 * This library is free software; you can redistribute it and/or
53befeb7
NJ
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
0f2d19dd 8 *
53befeb7
NJ
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
73be1d9e
MV
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
0f2d19dd 13 *
73be1d9e
MV
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
53befeb7
NJ
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
73be1d9e 18 */
1bbd0b84 19
1bbd0b84 20
0f2d19dd
JB
21\f
22
7337d56d
LC
23#ifdef HAVE_CONFIG_H
24# include <config.h>
25#endif
26
0f2d19dd 27#include <stdio.h>
7337d56d
LC
28#include <ctype.h>
29#include <string.h>
889975e5
MG
30#include <unistd.h>
31#include <unicase.h>
7337d56d 32
a0599745 33#include "libguile/_scm.h"
0ba0b384 34#include "libguile/bytevectors.h"
a0599745
MD
35#include "libguile/chars.h"
36#include "libguile/eval.h"
37#include "libguile/unif.h"
38#include "libguile/keywords.h"
39#include "libguile/alist.h"
40#include "libguile/srcprop.h"
41#include "libguile/hashtab.h"
42#include "libguile/hash.h"
43#include "libguile/ports.h"
44#include "libguile/root.h"
45#include "libguile/strings.h"
ba1b2226 46#include "libguile/strports.h"
a0599745 47#include "libguile/vectors.h"
a0599745 48#include "libguile/validate.h"
a4022e69 49#include "libguile/srfi-4.h"
7337d56d 50#include "libguile/srfi-13.h"
ba1b2226 51
a0599745 52#include "libguile/read.h"
22fc179a
HWN
53#include "libguile/private-options.h"
54
0f2d19dd
JB
55
56\f
57
5bf6a6f0 58SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
c7733771 59SCM_SYMBOL (scm_keyword_prefix, "prefix");
ef4cbc08 60SCM_SYMBOL (scm_keyword_postfix, "postfix");
c7733771 61
92c2555f 62scm_t_option scm_read_opts[] = {
b7ff98dd
MD
63 { SCM_OPTION_BOOLEAN, "copy", 0,
64 "Copy source code expressions." },
ac74fc22 65 { SCM_OPTION_BOOLEAN, "positions", 0,
deca31e1
GH
66 "Record positions of source code expressions." },
67 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
c7733771 68 "Convert symbols to lower case."},
f1267706 69 { SCM_OPTION_SCM, "keywords", SCM_UNPACK (SCM_BOOL_F),
904fabb6 70 "Style of keyword recognition: #f, 'prefix or 'postfix."},
16353acc 71#if SCM_ENABLE_ELISP
16353acc
NJ
72 { SCM_OPTION_BOOLEAN, "elisp-vectors", 0,
73 "Support Elisp vector syntax, namely `[...]'."},
cd21f5eb 74 { SCM_OPTION_BOOLEAN, "elisp-strings", 0,
62560650 75 "Support `\\(' and `\\)' in strings."},
16353acc 76#endif
62560650 77 { 0, },
a16f6fe7
MD
78};
79
39e8f371
HWN
80/*
81 Give meaningful error messages for errors
82
83 We use the format
84
ba1b2226 85 FILE:LINE:COL: MESSAGE
39e8f371
HWN
86 This happened in ....
87
88 This is not standard GNU format, but the test-suite likes the real
89 message to be in front.
90
39e8f371
HWN
91 */
92
93
a4022e69
MV
94void
95scm_i_input_error (char const *function,
96 SCM port, const char *message, SCM arg)
ba1b2226 97{
29a837fd
MV
98 SCM fn = (scm_is_string (SCM_FILENAME(port))
99 ? SCM_FILENAME(port)
100 : scm_from_locale_string ("#<unknown port>"));
ba1b2226 101
29a837fd 102 SCM string_port = scm_open_output_string ();
ba1b2226
HWN
103 SCM string = SCM_EOL;
104 scm_simple_format (string_port,
272632a6 105 scm_from_locale_string ("~A:~S:~S: ~A"),
29a837fd 106 scm_list_4 (fn,
b3aa4626 107 scm_from_long (SCM_LINUM (port) + 1),
b9bd8526 108 scm_from_int (SCM_COL (port) + 1),
272632a6 109 scm_from_locale_string (message)));
ba1b2226
HWN
110
111 string = scm_get_output_string (string_port);
112 scm_close_output_port (string_port);
272632a6 113 scm_error_scm (scm_from_locale_symbol ("read-error"),
a4022e69 114 function? scm_from_locale_string (function) : SCM_BOOL_F,
ba1b2226 115 string,
dd72382c 116 arg,
ba1b2226
HWN
117 SCM_BOOL_F);
118}
39e8f371
HWN
119
120
a1ec6916 121SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
1bbd0b84 122 (SCM setting),
dc7fa443
MG
123 "Option interface for the read options. Instead of using\n"
124 "this procedure directly, use the procedures @code{read-enable},\n"
3939e9df 125 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
1bbd0b84 126#define FUNC_NAME s_scm_read_options
a16f6fe7 127{
b7ff98dd
MD
128 SCM ans = scm_options (setting,
129 scm_read_opts,
1bbd0b84 130 FUNC_NAME);
b7ff98dd
MD
131 if (SCM_COPY_SOURCE_P)
132 SCM_RECORD_POSITIONS_P = 1;
a16f6fe7
MD
133 return ans;
134}
1bbd0b84 135#undef FUNC_NAME
a16f6fe7 136
14de3b42
GH
137/* An association list mapping extra hash characters to procedures. */
138static SCM *scm_read_hash_procedures;
deca31e1 139
0f2d19dd 140
7337d56d
LC
141\f
142/* Token readers. */
0f2d19dd 143
0f2d19dd 144
7337d56d
LC
145/* Size of the C buffer used to read symbols and numbers. */
146#define READER_BUFFER_SIZE 128
0f2d19dd 147
7337d56d
LC
148/* Size of the C buffer used to read strings. */
149#define READER_STRING_BUFFER_SIZE 512
0f2d19dd 150
7337d56d
LC
151/* The maximum size of Scheme character names. */
152#define READER_CHAR_NAME_MAX_SIZE 50
1cc91f1b 153
94115ae3 154
7337d56d
LC
155/* `isblank' is only in C99. */
156#define CHAR_IS_BLANK_(_chr) \
157 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
d41668fa 158 || ((_chr) == '\f') || ((_chr) == '\r'))
7337d56d
LC
159
160#ifdef MSDOS
161# define CHAR_IS_BLANK(_chr) \
162 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
163#else
164# define CHAR_IS_BLANK CHAR_IS_BLANK_
165#endif
166
167
168/* R5RS one-character delimiters (see section 7.1.1, ``Lexical
169 structure''). */
170#define CHAR_IS_R5RS_DELIMITER(c) \
171 (CHAR_IS_BLANK (c) \
172 || (c == ')') || (c == '(') || (c == ';') || (c == '"'))
173
174#define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
175
176/* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
177 Structure''. */
178#define CHAR_IS_EXPONENT_MARKER(_chr) \
179 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
180 || ((_chr) == 'd') || ((_chr) == 'l'))
181
454866e0
LC
182/* Read an SCSH block comment. */
183static inline SCM scm_read_scsh_block_comment (int chr, SCM port);
34f3d47d 184static SCM scm_read_commented_expression (int chr, SCM port);
454866e0 185
d41668fa
LC
186/* Read from PORT until a delimiter (e.g., a whitespace) is read. Return
187 zero if the whole token fits in BUF, non-zero otherwise. */
7337d56d 188static inline int
889975e5 189read_token (SCM port, SCM buf, size_t *read)
0520c320 190{
889975e5 191 scm_t_wchar chr;
7337d56d 192 *read = 0;
0520c320 193
889975e5
MG
194 buf = scm_i_string_start_writing (buf);
195 while (*read < scm_i_string_length (buf))
0520c320 196 {
7337d56d 197 chr = scm_getc (port);
7337d56d
LC
198
199 if (chr == EOF)
7337d56d 200 {
889975e5 201 scm_i_string_stop_writing ();
7337d56d
LC
202 return 0;
203 }
889975e5
MG
204
205 chr = (SCM_CASE_INSENSITIVE_P ? uc_tolower (chr) : chr);
206
207 if (CHAR_IS_DELIMITER (chr))
7337d56d 208 {
889975e5
MG
209 scm_i_string_stop_writing ();
210 scm_ungetc (chr, port);
211 return 0;
7337d56d 212 }
889975e5
MG
213
214 scm_i_string_set_x (buf, *read, chr);
215 (*read)++;
0520c320 216 }
889975e5 217 scm_i_string_stop_writing ();
7337d56d
LC
218
219 return 1;
0520c320 220}
1cc91f1b 221
889975e5
MG
222static SCM
223read_complete_token (SCM port, size_t *read)
224{
225 SCM buffer, str = SCM_EOL;
226 size_t len;
227 int overflow;
228
229 buffer = scm_i_make_string (READER_BUFFER_SIZE, NULL);
230 overflow = read_token (port, buffer, read);
231 if (!overflow)
232 return scm_i_substring (buffer, 0, *read);
233
234 str = scm_string_copy (buffer);
235 do
236 {
237 overflow = read_token (port, buffer, &len);
238 str = scm_string_append (scm_list_2 (str, buffer));
239 *read += len;
240 }
241 while (overflow);
242
243 return scm_i_substring (str, 0, *read);
244}
7337d56d
LC
245
246/* Skip whitespace from PORT and return the first non-whitespace character
247 read. Raise an error on end-of-file. */
248static int
249flush_ws (SCM port, const char *eoferr)
0f2d19dd 250{
889975e5 251 register scm_t_wchar c;
0f2d19dd 252 while (1)
b7f3516f 253 switch (c = scm_getc (port))
0f2d19dd
JB
254 {
255 case EOF:
256 goteof:
257 if (eoferr)
d156d3b7 258 {
a4022e69
MV
259 scm_i_input_error (eoferr,
260 port,
261 "end of file",
262 SCM_EOL);
d156d3b7 263 }
0f2d19dd 264 return c;
7337d56d 265
0f2d19dd
JB
266 case ';':
267 lp:
b7f3516f 268 switch (c = scm_getc (port))
0f2d19dd
JB
269 {
270 case EOF:
271 goto goteof;
272 default:
273 goto lp;
274 case SCM_LINE_INCREMENTORS:
275 break;
276 }
277 break;
7337d56d 278
454866e0
LC
279 case '#':
280 switch (c = scm_getc (port))
281 {
282 case EOF:
283 eoferr = "read_sharp";
284 goto goteof;
285 case '!':
286 scm_read_scsh_block_comment (c, port);
287 break;
34f3d47d
AW
288 case ';':
289 scm_read_commented_expression (c, port);
290 break;
454866e0
LC
291 default:
292 scm_ungetc (c, port);
293 return '#';
294 }
295 break;
296
0f2d19dd 297 case SCM_LINE_INCREMENTORS:
0f2d19dd 298 case SCM_SINGLE_SPACES:
0f2d19dd 299 case '\t':
0f2d19dd 300 break;
7337d56d 301
0f2d19dd
JB
302 default:
303 return c;
304 }
7337d56d
LC
305
306 return 0;
0f2d19dd
JB
307}
308
309
7337d56d
LC
310\f
311/* Token readers. */
1cc91f1b 312
7337d56d
LC
313static SCM scm_read_expression (SCM port);
314static SCM scm_read_sharp (int chr, SCM port);
315static SCM scm_get_hash_procedure (int c);
316static SCM recsexpr (SCM obj, long line, int column, SCM filename);
0f2d19dd
JB
317
318
09a4f039 319static SCM
889975e5 320scm_read_sexp (scm_t_wchar chr, SCM port)
7337d56d 321#define FUNC_NAME "scm_i_lreadparen"
09a4f039 322{
7337d56d
LC
323 register int c;
324 register SCM tmp;
325 register SCM tl, ans = SCM_EOL;
bd22f1c7 326 SCM tl2 = SCM_EOL, ans2 = SCM_EOL, copy = SCM_BOOL_F;
7337d56d
LC
327 static const int terminating_char = ')';
328
329 /* Need to capture line and column numbers here. */
330 long line = SCM_LINUM (port);
331 int column = SCM_COL (port) - 1;
f9c68a47 332
f9c68a47 333
7337d56d
LC
334 c = flush_ws (port, FUNC_NAME);
335 if (terminating_char == c)
336 return SCM_EOL;
f9c68a47 337
7337d56d
LC
338 scm_ungetc (c, port);
339 if (scm_is_eq (scm_sym_dot,
340 (tmp = scm_read_expression (port))))
341 {
342 ans = scm_read_expression (port);
343 if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
344 scm_i_input_error (FUNC_NAME, port, "missing close paren",
345 SCM_EOL);
346 return ans;
347 }
1cc91f1b 348
7337d56d
LC
349 /* Build the head of the list structure. */
350 ans = tl = scm_cons (tmp, SCM_EOL);
351
352 if (SCM_COPY_SOURCE_P)
353 ans2 = tl2 = scm_cons (scm_is_pair (tmp)
354 ? copy
355 : tmp,
356 SCM_EOL);
357
358 while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
0f2d19dd 359 {
7337d56d 360 SCM new_tail;
0f2d19dd 361
7337d56d
LC
362 scm_ungetc (c, port);
363 if (scm_is_eq (scm_sym_dot,
364 (tmp = scm_read_expression (port))))
0f2d19dd 365 {
7337d56d
LC
366 SCM_SETCDR (tl, tmp = scm_read_expression (port));
367
368 if (SCM_COPY_SOURCE_P)
369 SCM_SETCDR (tl2, scm_cons (scm_is_pair (tmp) ? copy : tmp,
370 SCM_EOL));
371
372 c = flush_ws (port, FUNC_NAME);
373 if (terminating_char != c)
374 scm_i_input_error (FUNC_NAME, port,
375 "in pair: missing close paren", SCM_EOL);
376 goto exit;
0f2d19dd 377 }
b858464a 378
7337d56d
LC
379 new_tail = scm_cons (tmp, SCM_EOL);
380 SCM_SETCDR (tl, new_tail);
381 tl = new_tail;
382
383 if (SCM_COPY_SOURCE_P)
0f2d19dd 384 {
7337d56d
LC
385 SCM new_tail2 = scm_cons (scm_is_pair (tmp)
386 ? copy
387 : tmp, SCM_EOL);
388 SCM_SETCDR (tl2, new_tail2);
389 tl2 = new_tail2;
390 }
391 }
0f2d19dd 392
7337d56d
LC
393 exit:
394 if (SCM_RECORD_POSITIONS_P)
395 scm_whash_insert (scm_source_whash,
396 ans,
397 scm_make_srcprops (line, column,
398 SCM_FILENAME (port),
399 SCM_COPY_SOURCE_P
400 ? ans2
401 : SCM_UNDEFINED,
402 SCM_EOL));
403 return ans;
404}
405#undef FUNC_NAME
a4022e69 406
7337d56d
LC
407static SCM
408scm_read_string (int chr, SCM port)
409#define FUNC_NAME "scm_lreadr"
410{
411 /* For strings smaller than C_STR, this function creates only one Scheme
412 object (the string returned). */
0f2d19dd 413
7337d56d 414 SCM str = SCM_BOOL_F;
7337d56d 415 unsigned c_str_len = 0;
9c44cd45 416 scm_t_wchar c;
eb42ff25 417
9c44cd45 418 str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
7337d56d
LC
419 while ('"' != (c = scm_getc (port)))
420 {
421 if (c == EOF)
9c44cd45
MG
422 {
423 str_eof:
424 scm_i_input_error (FUNC_NAME, port,
425 "end of file in string constant", SCM_EOL);
426 }
0f2d19dd 427
9c44cd45
MG
428 if (c_str_len + 1 >= scm_i_string_length (str))
429 {
430 SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
0f2d19dd 431
9c44cd45
MG
432 str = scm_string_append (scm_list_2 (str, addy));
433 }
0f2d19dd 434
7337d56d 435 if (c == '\\')
9c44cd45
MG
436 {
437 switch (c = scm_getc (port))
438 {
439 case EOF:
440 goto str_eof;
441 case '"':
442 case '\\':
443 break;
16353acc 444#if SCM_ENABLE_ELISP
9c44cd45
MG
445 case '(':
446 case ')':
447 if (SCM_ESCAPED_PARENS_P)
448 break;
449 goto bad_escaped;
16353acc 450#endif
9c44cd45
MG
451 case '\n':
452 continue;
453 case '0':
454 c = '\0';
455 break;
456 case 'f':
457 c = '\f';
458 break;
459 case 'n':
460 c = '\n';
461 break;
462 case 'r':
463 c = '\r';
464 break;
465 case 't':
466 c = '\t';
467 break;
468 case 'a':
469 c = '\007';
470 break;
471 case 'v':
472 c = '\v';
473 break;
474 case 'x':
475 {
476 scm_t_wchar a, b;
477 a = scm_getc (port);
478 if (a == EOF)
479 goto str_eof;
480 b = scm_getc (port);
481 if (b == EOF)
482 goto str_eof;
483 if ('0' <= a && a <= '9')
484 a -= '0';
485 else if ('A' <= a && a <= 'F')
486 a = a - 'A' + 10;
487 else if ('a' <= a && a <= 'f')
488 a = a - 'a' + 10;
489 else
490 {
491 c = a;
492 goto bad_escaped;
493 }
494 if ('0' <= b && b <= '9')
495 b -= '0';
496 else if ('A' <= b && b <= 'F')
497 b = b - 'A' + 10;
498 else if ('a' <= b && b <= 'f')
499 b = b - 'a' + 10;
500 else
501 {
502 c = b;
503 goto bad_escaped;
504 }
505 c = a * 16 + b;
506 break;
507 }
508 case 'u':
509 {
510 scm_t_wchar a;
511 int i;
512 c = 0;
513 for (i = 0; i < 4; i++)
514 {
515 a = scm_getc (port);
516 if (a == EOF)
517 goto str_eof;
518 if ('0' <= a && a <= '9')
519 a -= '0';
520 else if ('A' <= a && a <= 'F')
521 a = a - 'A' + 10;
522 else if ('a' <= a && a <= 'f')
523 a = a - 'a' + 10;
524 else
525 {
526 c = a;
527 goto bad_escaped;
528 }
529 c = c * 16 + a;
530 }
531 break;
532 }
533 case 'U':
534 {
535 scm_t_wchar a;
536 int i;
537 c = 0;
538 for (i = 0; i < 6; i++)
539 {
540 a = scm_getc (port);
541 if (a == EOF)
542 goto str_eof;
543 if ('0' <= a && a <= '9')
544 a -= '0';
545 else if ('A' <= a && a <= 'F')
546 a = a - 'A' + 10;
547 else if ('a' <= a && a <= 'f')
548 a = a - 'a' + 10;
549 else
550 {
551 c = a;
552 goto bad_escaped;
553 }
554 c = c * 16 + a;
555 }
556 break;
557 }
558 default:
559 bad_escaped:
560 scm_i_input_error (FUNC_NAME, port,
561 "illegal character in escape sequence: ~S",
562 scm_list_1 (SCM_MAKE_CHAR (c)));
563 }
564 }
565 str = scm_i_string_start_writing (str);
566 scm_i_string_set_x (str, c_str_len++, c);
567 scm_i_string_stop_writing ();
7337d56d 568 }
f13b4400 569
7337d56d
LC
570 if (c_str_len > 0)
571 {
9c44cd45 572 return scm_i_substring_copy (str, 0, c_str_len);
0f2d19dd 573 }
9c44cd45
MG
574
575 return scm_nullstr;
0f2d19dd 576}
db4b4ca6
DH
577#undef FUNC_NAME
578
0f2d19dd 579
7337d56d 580static SCM
889975e5 581scm_read_number (scm_t_wchar chr, SCM port)
0f2d19dd 582{
889975e5
MG
583 SCM result;
584 SCM buffer;
7337d56d 585 size_t read;
0f2d19dd 586
7337d56d 587 scm_ungetc (chr, port);
889975e5
MG
588 buffer = read_complete_token (port, &read);
589 result = scm_string_to_number (buffer, SCM_UNDEFINED);
590 if (!scm_is_true (result))
591 /* Return a symbol instead of a number. */
592 result = scm_string_to_symbol (buffer);
0f2d19dd 593
7337d56d
LC
594 return result;
595}
0f2d19dd 596
7337d56d 597static SCM
889975e5 598scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
7337d56d 599{
889975e5
MG
600 SCM result;
601 int ends_with_colon = 0;
602 SCM buffer;
7337d56d 603 size_t read = 0;
ef4cbc08 604 int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
7337d56d
LC
605
606 scm_ungetc (chr, port);
889975e5
MG
607 buffer = read_complete_token (port, &read);
608 if (read > 0)
609 ends_with_colon = scm_i_string_ref (buffer, read - 1) == ':';
ef4cbc08 610
889975e5
MG
611 if (postfix && ends_with_colon && (read > 1))
612 result = scm_symbol_to_keyword (scm_string_to_symbol (scm_i_substring (buffer, 0, read - 1)));
7337d56d 613 else
889975e5 614 result = scm_string_to_symbol (buffer);
7337d56d
LC
615
616 return result;
617}
618
619static SCM
889975e5 620scm_read_number_and_radix (scm_t_wchar chr, SCM port)
7337d56d
LC
621#define FUNC_NAME "scm_lreadr"
622{
889975e5 623 SCM result;
7337d56d 624 size_t read;
889975e5 625 SCM buffer = scm_i_make_string (READER_BUFFER_SIZE, NULL);
7337d56d 626 unsigned int radix;
7337d56d
LC
627
628 switch (chr)
629 {
630 case 'B':
631 case 'b':
632 radix = 2;
633 break;
634
635 case 'o':
636 case 'O':
637 radix = 8;
638 break;
639
640 case 'd':
641 case 'D':
642 radix = 10;
643 break;
644
645 case 'x':
646 case 'X':
647 radix = 16;
648 break;
649
650 default:
651 scm_ungetc (chr, port);
652 scm_ungetc ('#', port);
653 radix = 10;
654 }
655
889975e5
MG
656 buffer = read_complete_token (port, &read);
657 result = scm_string_to_number (buffer, scm_from_uint (radix));
7337d56d
LC
658
659 if (scm_is_true (result))
660 return result;
661
662 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
663
664 return SCM_BOOL_F;
665}
666#undef FUNC_NAME
667
668static SCM
669scm_read_quote (int chr, SCM port)
670{
671 SCM p;
492faee1
LC
672 long line = SCM_LINUM (port);
673 int column = SCM_COL (port) - 1;
7337d56d
LC
674
675 switch (chr)
676 {
677 case '`':
678 p = scm_sym_quasiquote;
679 break;
680
681 case '\'':
682 p = scm_sym_quote;
683 break;
684
685 case ',':
686 {
889975e5 687 scm_t_wchar c;
7337d56d
LC
688
689 c = scm_getc (port);
690 if ('@' == c)
691 p = scm_sym_uq_splicing;
692 else
0f2d19dd 693 {
7337d56d
LC
694 scm_ungetc (c, port);
695 p = scm_sym_unquote;
0f2d19dd 696 }
7337d56d
LC
697 break;
698 }
0f2d19dd 699
7337d56d
LC
700 default:
701 fprintf (stderr, "%s: unhandled quote character (%i)\n",
7f74cf9a 702 "scm_read_quote", chr);
7337d56d 703 abort ();
0f2d19dd 704 }
1cc91f1b 705
7337d56d 706 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
492faee1
LC
707 if (SCM_RECORD_POSITIONS_P)
708 scm_whash_insert (scm_source_whash, p,
709 scm_make_srcprops (line, column,
710 SCM_FILENAME (port),
711 SCM_COPY_SOURCE_P
712 ? (scm_cons2 (SCM_CAR (p),
713 SCM_CAR (SCM_CDR (p)),
714 SCM_EOL))
715 : SCM_UNDEFINED,
716 SCM_EOL));
717
0f2d19dd 718
7337d56d
LC
719 return p;
720}
721
34f3d47d
AW
722SCM_SYMBOL (sym_syntax, "syntax");
723SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
724SCM_SYMBOL (sym_unsyntax, "unsyntax");
725SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
726
727static SCM
728scm_read_syntax (int chr, SCM port)
729{
730 SCM p;
731 long line = SCM_LINUM (port);
732 int column = SCM_COL (port) - 1;
733
734 switch (chr)
735 {
736 case '`':
737 p = sym_quasisyntax;
738 break;
739
740 case '\'':
741 p = sym_syntax;
742 break;
743
744 case ',':
745 {
746 int c;
747
748 c = scm_getc (port);
749 if ('@' == c)
750 p = sym_unsyntax_splicing;
751 else
752 {
753 scm_ungetc (c, port);
754 p = sym_unsyntax;
755 }
756 break;
757 }
758
759 default:
760 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
761 "scm_read_syntax", chr);
762 abort ();
763 }
764
765 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
766 if (SCM_RECORD_POSITIONS_P)
767 scm_whash_insert (scm_source_whash, p,
768 scm_make_srcprops (line, column,
769 SCM_FILENAME (port),
770 SCM_COPY_SOURCE_P
771 ? (scm_cons2 (SCM_CAR (p),
772 SCM_CAR (SCM_CDR (p)),
773 SCM_EOL))
774 : SCM_UNDEFINED,
775 SCM_EOL));
776
777
778 return p;
779}
780
7337d56d
LC
781static inline SCM
782scm_read_semicolon_comment (int chr, SCM port)
0f2d19dd 783{
0f2d19dd
JB
784 int c;
785
889975e5
MG
786 /* We use the get_byte here because there is no need to get the
787 locale correct with comment input. This presumes that newline
788 always represents itself no matter what the encoding is. */
789 for (c = scm_get_byte_or_eof (port);
7337d56d
LC
790 (c != EOF) && (c != '\n');
791 c = scm_getc (port));
792
793 return SCM_UNSPECIFIED;
794}
795
796\f
797/* Sharp readers, i.e. readers called after a `#' sign has been read. */
798
799static SCM
800scm_read_boolean (int chr, SCM port)
801{
802 switch (chr)
0f2d19dd 803 {
7337d56d
LC
804 case 't':
805 case 'T':
806 return SCM_BOOL_T;
807
808 case 'f':
809 case 'F':
810 return SCM_BOOL_F;
0f2d19dd 811 }
7337d56d
LC
812
813 return SCM_UNSPECIFIED;
814}
815
816static SCM
889975e5 817scm_read_character (scm_t_wchar chr, SCM port)
7337d56d
LC
818#define FUNC_NAME "scm_lreadr"
819{
77332b21 820 SCM ch;
889975e5 821 SCM charname = scm_i_make_string (READER_CHAR_NAME_MAX_SIZE, NULL);
7337d56d 822 size_t charname_len;
889975e5
MG
823 scm_t_wchar cp;
824 int overflow;
825
826 overflow = read_token (port, charname, &charname_len);
827 charname = scm_c_substring (charname, 0, charname_len);
7337d56d 828
889975e5 829 if (overflow)
7337d56d
LC
830 goto char_error;
831
832 if (charname_len == 0)
0f2d19dd 833 {
7337d56d
LC
834 chr = scm_getc (port);
835 if (chr == EOF)
836 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
837 "while reading character", SCM_EOL);
838
839 /* CHR must be a token delimiter, like a whitespace. */
840 return (SCM_MAKE_CHAR (chr));
0f2d19dd 841 }
7337d56d
LC
842
843 if (charname_len == 1)
889975e5 844 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 0));
7337d56d 845
889975e5
MG
846 cp = scm_i_string_ref (charname, 0);
847 if (cp >= '0' && cp < '8')
7337d56d
LC
848 {
849 /* Dirk:FIXME:: This type of character syntax is not R5RS
850 * compliant. Further, it should be verified that the constant
851 * does only consist of octal digits. Finally, it should be
852 * checked whether the resulting fixnum is in the range of
853 * characters. */
889975e5 854 SCM p = scm_string_to_number (charname, scm_from_uint (8));
7337d56d
LC
855 if (SCM_I_INUMP (p))
856 return SCM_MAKE_CHAR (SCM_I_INUM (p));
857 }
858
889975e5
MG
859 /* The names of characters should never have non-Latin1
860 characters. */
861 if (scm_i_is_narrow_string (charname)
862 || scm_i_try_narrow_string (charname))
863 ch = scm_i_charname_to_char (scm_i_string_chars (charname),
864 charname_len);
77332b21
MG
865 if (scm_is_true (ch))
866 return ch;
7337d56d
LC
867
868 char_error:
869 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
889975e5 870 scm_list_1 (charname));
7337d56d
LC
871
872 return SCM_UNSPECIFIED;
0f2d19dd 873}
db4b4ca6 874#undef FUNC_NAME
0f2d19dd 875
7337d56d
LC
876static inline SCM
877scm_read_keyword (int chr, SCM port)
878{
879 SCM symbol;
880
881 /* Read the symbol that comprises the keyword. Doing this instead of
882 invoking a specific symbol reader function allows `scm_read_keyword ()'
883 to adapt to the delimiters currently valid of symbols.
1cc91f1b 884
7337d56d
LC
885 XXX: This implementation allows sloppy syntaxes like `#: key'. */
886 symbol = scm_read_expression (port);
887 if (!scm_is_symbol (symbol))
7f74cf9a 888 scm_i_input_error ("scm_read_keyword", port,
7337d56d
LC
889 "keyword prefix `~a' not followed by a symbol: ~s",
890 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
891
892 return (scm_symbol_to_keyword (symbol));
893}
894
895static inline SCM
896scm_read_vector (int chr, SCM port)
09a4f039 897{
7337d56d
LC
898 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
899 guarantee that it's going to do what we want. After all, this is an
900 implementation detail of `scm_read_vector ()', not a desirable
901 property. */
902 return (scm_vector (scm_read_sexp (chr, port)));
903}
09a4f039 904
7337d56d
LC
905static inline SCM
906scm_read_srfi4_vector (int chr, SCM port)
907{
908 return scm_i_read_array (port, chr);
909}
910
0ba0b384 911static SCM
889975e5 912scm_read_bytevector (scm_t_wchar chr, SCM port)
0ba0b384
LC
913{
914 chr = scm_getc (port);
915 if (chr != 'u')
916 goto syntax;
917
918 chr = scm_getc (port);
919 if (chr != '8')
920 goto syntax;
921
922 chr = scm_getc (port);
923 if (chr != '(')
924 goto syntax;
925
926 return scm_u8_list_to_bytevector (scm_read_sexp (chr, port));
927
928 syntax:
929 scm_i_input_error ("read_bytevector", port,
930 "invalid bytevector prefix",
931 SCM_MAKE_CHAR (chr));
932 return SCM_UNSPECIFIED;
933}
934
7337d56d 935static SCM
889975e5 936scm_read_guile_bit_vector (scm_t_wchar chr, SCM port)
7337d56d
LC
937{
938 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
939 terribly inefficient but who cares? */
940 SCM s_bits = SCM_EOL;
941
942 for (chr = scm_getc (port);
943 (chr != EOF) && ((chr == '0') || (chr == '1'));
944 chr = scm_getc (port))
09a4f039 945 {
7337d56d 946 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
09a4f039 947 }
7337d56d
LC
948
949 if (chr != EOF)
950 scm_ungetc (chr, port);
951
952 return scm_bitvector (scm_reverse_x (s_bits, SCM_EOL));
953}
954
955static inline SCM
889975e5 956scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
7337d56d
LC
957{
958 int bang_seen = 0;
959
889975e5
MG
960 /* We can use the get_byte here because there is no need to get the
961 locale correct when reading comments. This presumes that
962 hash and exclamation points always represent themselves no
963 matter what the source encoding is.*/
7337d56d 964 for (;;)
09a4f039 965 {
889975e5 966 int c = scm_get_byte_or_eof (port);
62850ef3 967
7337d56d
LC
968 if (c == EOF)
969 scm_i_input_error ("skip_block_comment", port,
970 "unterminated `#! ... !#' comment", SCM_EOL);
971
972 if (c == '!')
973 bang_seen = 1;
974 else if (c == '#' && bang_seen)
975 break;
976 else
977 bang_seen = 0;
978 }
979
980 return SCM_UNSPECIFIED;
981}
982
34f3d47d 983static SCM
889975e5 984scm_read_commented_expression (scm_t_wchar chr, SCM port)
34f3d47d 985{
889975e5 986 scm_t_wchar c;
34f3d47d
AW
987
988 c = flush_ws (port, (char *) NULL);
989 if (EOF == c)
990 scm_i_input_error ("read_commented_expression", port,
991 "no expression after #; comment", SCM_EOL);
992 scm_ungetc (c, port);
993 scm_read_expression (port);
994 return SCM_UNSPECIFIED;
995}
996
7337d56d 997static SCM
889975e5 998scm_read_extended_symbol (scm_t_wchar chr, SCM port)
7337d56d
LC
999{
1000 /* Guile's extended symbol read syntax looks like this:
1001
1002 #{This is all a symbol name}#
1003
1004 So here, CHR is expected to be `{'. */
7337d56d
LC
1005 int saw_brace = 0, finished = 0;
1006 size_t len = 0;
889975e5 1007 SCM buf = scm_i_make_string (1024, NULL);
7337d56d 1008
889975e5 1009 buf = scm_i_string_start_writing (buf);
7337d56d
LC
1010
1011 while ((chr = scm_getc (port)) != EOF)
1012 {
1013 if (saw_brace)
09a4f039 1014 {
7337d56d
LC
1015 if (chr == '#')
1016 {
1017 finished = 1;
1018 break;
1019 }
1020 else
1021 {
1022 saw_brace = 0;
889975e5
MG
1023 scm_i_string_set_x (buf, len++, '}');
1024 scm_i_string_set_x (buf, len++, chr);
7337d56d 1025 }
09a4f039 1026 }
7337d56d
LC
1027 else if (chr == '}')
1028 saw_brace = 1;
1029 else
889975e5 1030 scm_i_string_set_x (buf, len++, chr);
62850ef3 1031
889975e5 1032 if (len >= scm_i_string_length (buf) - 2)
7337d56d 1033 {
889975e5
MG
1034 scm_i_string_stop_writing ();
1035 SCM addy = scm_i_make_string (1024, NULL);
1036 buf = scm_string_append (scm_list_2 (buf, addy));
7337d56d 1037 len = 0;
889975e5 1038 buf = scm_i_string_start_writing (buf);
7337d56d 1039 }
62850ef3 1040
7337d56d
LC
1041 if (finished)
1042 break;
1043 }
889975e5 1044 scm_i_string_stop_writing ();
7337d56d 1045
889975e5 1046 return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
7337d56d
LC
1047}
1048
1049
1050\f
1051/* Top-level token readers, i.e., dispatchers. */
1052
1053static SCM
1054scm_read_sharp_extension (int chr, SCM port)
1055{
1056 SCM proc;
1057
1058 proc = scm_get_hash_procedure (chr);
1059 if (scm_is_true (scm_procedure_p (proc)))
1060 {
1061 long line = SCM_LINUM (port);
1062 int column = SCM_COL (port) - 2;
1063 SCM got;
1064
1065 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1066 if (!scm_is_eq (got, SCM_UNSPECIFIED))
62850ef3 1067 {
7337d56d
LC
1068 if (SCM_RECORD_POSITIONS_P)
1069 return (recsexpr (got, line, column,
1070 SCM_FILENAME (port)));
1071 else
1072 return got;
62850ef3 1073 }
09a4f039 1074 }
7337d56d
LC
1075
1076 return SCM_UNSPECIFIED;
1077}
1078
1079/* The reader for the sharp `#' character. It basically dispatches reads
1080 among the above token readers. */
1081static SCM
889975e5 1082scm_read_sharp (scm_t_wchar chr, SCM port)
7337d56d
LC
1083#define FUNC_NAME "scm_lreadr"
1084{
1085 SCM result;
1086
1087 chr = scm_getc (port);
1088
1089 result = scm_read_sharp_extension (chr, port);
1090 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1091 return result;
1092
1093 switch (chr)
1094 {
1095 case '\\':
1096 return (scm_read_character (chr, port));
1097 case '(':
1098 return (scm_read_vector (chr, port));
1099 case 's':
1100 case 'u':
1101 case 'f':
1102 /* This one may return either a boolean or an SRFI-4 vector. */
1103 return (scm_read_srfi4_vector (chr, port));
0ba0b384
LC
1104 case 'v':
1105 return (scm_read_bytevector (chr, port));
7337d56d
LC
1106 case '*':
1107 return (scm_read_guile_bit_vector (chr, port));
1108 case 't':
1109 case 'T':
1110 case 'F':
1111 /* This one may return either a boolean or an SRFI-4 vector. */
1112 return (scm_read_boolean (chr, port));
1113 case ':':
1114 return (scm_read_keyword (chr, port));
1115 case '0': case '1': case '2': case '3': case '4':
1116 case '5': case '6': case '7': case '8': case '9':
1117 case '@':
1118#if SCM_ENABLE_DEPRECATED
1119 /* See below for 'i' and 'e'. */
1120 case 'a':
1121 case 'c':
1122 case 'y':
1123 case 'h':
1124 case 'l':
1125#endif
1126 return (scm_i_read_array (port, chr));
1127
1128 case 'i':
1129 case 'e':
1130#if SCM_ENABLE_DEPRECATED
1131 {
1132 /* When next char is '(', it really is an old-style
1133 uniform array. */
889975e5 1134 scm_t_wchar next_c = scm_getc (port);
7337d56d
LC
1135 if (next_c != EOF)
1136 scm_ungetc (next_c, port);
1137 if (next_c == '(')
1138 return scm_i_read_array (port, chr);
1139 /* Fall through. */
1140 }
1141#endif
1142 case 'b':
1143 case 'B':
1144 case 'o':
1145 case 'O':
1146 case 'd':
1147 case 'D':
1148 case 'x':
1149 case 'X':
1150 case 'I':
1151 case 'E':
1152 return (scm_read_number_and_radix (chr, port));
1153 case '{':
1154 return (scm_read_extended_symbol (chr, port));
1155 case '!':
1156 return (scm_read_scsh_block_comment (chr, port));
34f3d47d
AW
1157 case ';':
1158 return (scm_read_commented_expression (chr, port));
1159 case '`':
1160 case '\'':
1161 case ',':
1162 return (scm_read_syntax (chr, port));
7337d56d
LC
1163 default:
1164 result = scm_read_sharp_extension (chr, port);
1165 if (scm_is_eq (result, SCM_UNSPECIFIED))
1166 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1167 scm_list_1 (SCM_MAKE_CHAR (chr)));
1168 else
1169 return result;
1170 }
1171
1172 return SCM_UNSPECIFIED;
1173}
1174#undef FUNC_NAME
1175
1176static SCM
1177scm_read_expression (SCM port)
1178#define FUNC_NAME "scm_read_expression"
1179{
1180 while (1)
1181 {
889975e5 1182 register scm_t_wchar chr;
7337d56d
LC
1183
1184 chr = scm_getc (port);
1185
1186 switch (chr)
1187 {
1188 case SCM_WHITE_SPACES:
1189 case SCM_LINE_INCREMENTORS:
1190 break;
1191 case ';':
1192 (void) scm_read_semicolon_comment (chr, port);
1193 break;
1194 case '(':
1195 return (scm_read_sexp (chr, port));
1196 case '"':
1197 return (scm_read_string (chr, port));
1198 case '\'':
1199 case '`':
1200 case ',':
1201 return (scm_read_quote (chr, port));
1202 case '#':
1203 {
1204 SCM result;
1205 result = scm_read_sharp (chr, port);
1206 if (scm_is_eq (result, SCM_UNSPECIFIED))
1207 /* We read a comment or some such. */
1208 break;
1209 else
1210 return result;
1211 }
1212 case ')':
1213 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1214 break;
1215 case EOF:
1216 return SCM_EOF_VAL;
1217 case ':':
1218 if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1219 return scm_symbol_to_keyword (scm_read_expression (port));
1220 /* Fall through. */
1221
1222 default:
1223 {
1224 if (((chr >= '0') && (chr <= '9'))
1225 || (strchr ("+-.", chr)))
1226 return (scm_read_number (chr, port));
1227 else
1228 return (scm_read_mixed_case_symbol (chr, port));
1229 }
1230 }
1231 }
1232}
1233#undef FUNC_NAME
1234
1235\f
1236/* Actual reader. */
1237
1238SCM_DEFINE (scm_read, "read", 0, 1, 0,
1239 (SCM port),
1240 "Read an s-expression from the input port @var{port}, or from\n"
1241 "the current input port if @var{port} is not specified.\n"
1242 "Any whitespace before the next token is discarded.")
1243#define FUNC_NAME s_scm_read
1244{
1245 int c;
1246
1247 if (SCM_UNBNDP (port))
1248 port = scm_current_input_port ();
1249 SCM_VALIDATE_OPINPORT (1, port);
1250
1251 c = flush_ws (port, (char *) NULL);
1252 if (EOF == c)
1253 return SCM_EOF_VAL;
1254 scm_ungetc (c, port);
1255
1256 return (scm_read_expression (port));
09a4f039 1257}
db4b4ca6 1258#undef FUNC_NAME
09a4f039 1259
0f2d19dd
JB
1260
1261\f
1262
7337d56d
LC
1263/* Used when recording expressions constructed by `scm_read_sharp ()'. */
1264static SCM
1265recsexpr (SCM obj, long line, int column, SCM filename)
1266{
1267 if (!scm_is_pair(obj)) {
1268 return obj;
1269 } else {
1270 SCM tmp = obj, copy;
1271 /* If this sexpr is visible in the read:sharp source, we want to
1272 keep that information, so only record non-constant cons cells
1273 which haven't previously been read by the reader. */
1274 if (scm_is_false (scm_whash_lookup (scm_source_whash, obj)))
1275 {
1276 if (SCM_COPY_SOURCE_P)
1277 {
1278 copy = scm_cons (recsexpr (SCM_CAR (obj), line, column, filename),
1279 SCM_UNDEFINED);
1280 while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1281 {
1282 SCM_SETCDR (copy, scm_cons (recsexpr (SCM_CAR (tmp),
1283 line,
1284 column,
1285 filename),
1286 SCM_UNDEFINED));
1287 copy = SCM_CDR (copy);
1288 }
1289 SCM_SETCDR (copy, tmp);
1290 }
1291 else
1292 {
1293 recsexpr (SCM_CAR (obj), line, column, filename);
1294 while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1295 recsexpr (SCM_CAR (tmp), line, column, filename);
1296 copy = SCM_UNDEFINED;
1297 }
1298 scm_whash_insert (scm_source_whash,
1299 obj,
1300 scm_make_srcprops (line,
1301 column,
1302 filename,
1303 copy,
1304 SCM_EOL));
1305 }
1306 return obj;
1307 }
1308}
1309
14de3b42
GH
1310/* Manipulate the read-hash-procedures alist. This could be written in
1311 Scheme, but maybe it will also be used by C code during initialisation. */
a1ec6916 1312SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1bbd0b84 1313 (SCM chr, SCM proc),
dc7fa443
MG
1314 "Install the procedure @var{proc} for reading expressions\n"
1315 "starting with the character sequence @code{#} and @var{chr}.\n"
1316 "@var{proc} will be called with two arguments: the character\n"
1317 "@var{chr} and the port to read further data from. The object\n"
391f57e6
HWN
1318 "returned will be the return value of @code{read}. \n"
1319 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1320 )
1bbd0b84 1321#define FUNC_NAME s_scm_read_hash_extend
deca31e1 1322{
fed9c9a2
GH
1323 SCM this;
1324 SCM prev;
1325
36284627 1326 SCM_VALIDATE_CHAR (1, chr);
7888309b 1327 SCM_ASSERT (scm_is_false (proc)
bc36d050 1328 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
36284627 1329 proc, SCM_ARG2, FUNC_NAME);
fed9c9a2 1330
14de3b42
GH
1331 /* Check if chr is already in the alist. */
1332 this = *scm_read_hash_procedures;
1333 prev = SCM_BOOL_F;
fed9c9a2
GH
1334 while (1)
1335 {
d2e53ed6 1336 if (scm_is_null (this))
fed9c9a2
GH
1337 {
1338 /* not found, so add it to the beginning. */
7888309b 1339 if (scm_is_true (proc))
fed9c9a2 1340 {
14de3b42
GH
1341 *scm_read_hash_procedures =
1342 scm_cons (scm_cons (chr, proc), *scm_read_hash_procedures);
fed9c9a2
GH
1343 }
1344 break;
1345 }
bc36d050 1346 if (scm_is_eq (chr, SCM_CAAR (this)))
fed9c9a2
GH
1347 {
1348 /* already in the alist. */
7888309b 1349 if (scm_is_false (proc))
14de3b42
GH
1350 {
1351 /* remove it. */
7888309b 1352 if (scm_is_false (prev))
14de3b42
GH
1353 {
1354 *scm_read_hash_procedures =
1355 SCM_CDR (*scm_read_hash_procedures);
1356 }
1357 else
1358 scm_set_cdr_x (prev, SCM_CDR (this));
1359 }
fed9c9a2 1360 else
14de3b42
GH
1361 {
1362 /* replace it. */
1363 scm_set_cdr_x (SCM_CAR (this), proc);
1364 }
fed9c9a2
GH
1365 break;
1366 }
1367 prev = this;
1368 this = SCM_CDR (this);
1369 }
deca31e1 1370
deca31e1
GH
1371 return SCM_UNSPECIFIED;
1372}
1bbd0b84 1373#undef FUNC_NAME
0f2d19dd 1374
deca31e1
GH
1375/* Recover the read-hash procedure corresponding to char c. */
1376static SCM
6e8d25a6 1377scm_get_hash_procedure (int c)
deca31e1 1378{
14de3b42 1379 SCM rest = *scm_read_hash_procedures;
fed9c9a2 1380
deca31e1
GH
1381 while (1)
1382 {
d2e53ed6 1383 if (scm_is_null (rest))
deca31e1
GH
1384 return SCM_BOOL_F;
1385
7866a09b 1386 if (SCM_CHAR (SCM_CAAR (rest)) == c)
deca31e1
GH
1387 return SCM_CDAR (rest);
1388
1389 rest = SCM_CDR (rest);
1390 }
1391}
1cc91f1b 1392
889975e5
MG
1393#define SCM_ENCODING_SEARCH_SIZE (500)
1394
1395/* Search the first few hundred characters of a file for
1396 an emacs-like coding declaration. */
1397char *
1398scm_scan_for_encoding (SCM port)
1399{
1400 char header[SCM_ENCODING_SEARCH_SIZE+1];
1401 size_t bytes_read;
1402 char *encoding = NULL;
1403 int utf8_bom = 0;
1404 char *pos;
1405 int i;
1406 int in_comment;
1407
1408 bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1409 scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1410
1411 if (bytes_read > 3
1412 && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1413 utf8_bom = 1;
1414
1415 /* search past "coding[:=]" */
1416 pos = header;
1417 while (1)
1418 {
1419 if ((pos = strstr(pos, "coding")) == NULL)
1420 return NULL;
1421
1422 pos += strlen("coding");
1423 if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1424 (*pos == ':' || *pos == '='))
1425 {
1426 pos ++;
1427 break;
1428 }
1429 }
1430
1431 /* skip spaces */
1432 while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1433 (*pos == ' ' || *pos == '\t'))
1434 pos ++;
1435
1436 /* grab the next token */
1437 i = 0;
1438 while (pos + i - header <= SCM_ENCODING_SEARCH_SIZE
1439 && (isalnum(pos[i]) || pos[i] == '_' || pos[i] == '-' || pos[i] == '.'))
1440 i++;
1441
1442 if (i == 0)
1443 return NULL;
1444
1445 encoding = scm_malloc (i+1);
1446 memcpy (encoding, pos, i);
1447 encoding[i] ='\0';
1448 for (i = 0; i < strlen(encoding); i++)
1449 encoding[i] = toupper(encoding[i]);
1450
1451 /* push backwards to make sure we were in a comment */
1452 in_comment = 0;
1453 while (pos - i - header > 0)
1454 {
1455 if (*(pos - i) == '\n')
1456 {
1457 /* This wasn't in a semicolon comment. Check for a
1458 hash-bang comment. */
1459 char *beg = strstr (header, "#!");
1460 char *end = strstr (header, "!#");
1461 if (beg < pos && pos < end)
1462 in_comment = 1;
1463 break;
1464 }
1465 if (*(pos - i) == ';')
1466 {
1467 in_comment = 1;
1468 break;
1469 }
1470 i ++;
1471 }
1472 if (!in_comment)
1473 {
1474 /* This wasn't in a comment */
1475 free (encoding);
1476 return NULL;
1477 }
1478 if (utf8_bom && strcmp(encoding, "UTF-8"))
1479 scm_misc_error (NULL,
1480 "the port input declares the encoding ~s but is encoded as UTF-8",
1481 scm_list_1 (scm_from_locale_string (encoding)));
1482
1483 return encoding;
1484}
1485
1486SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1487 (SCM port),
1488 "Scans the port for an EMACS-like character coding declaration\n"
1489 "near the top of the contents of a port with random-acessible contents.\n"
1490 "The coding declaration is of the form\n"
1491 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1492 "\n"
1493 "Returns a string containing the character encoding of the file\n"
1494 "if a declaration was found, or @code{#f} otherwise.\n")
1495#define FUNC_NAME s_scm_file_encoding
1496{
1497 char *enc;
1498 SCM s_enc;
1499
1500 enc = scm_scan_for_encoding (port);
1501 if (enc == NULL)
1502 return SCM_BOOL_F;
1503 else
1504 {
1505 s_enc = scm_from_locale_string (enc);
1506 free (enc);
1507 return s_enc;
1508 }
1509
1510 return SCM_BOOL_F;
1511}
1512#undef FUNC_NAME
1513
0f2d19dd
JB
1514void
1515scm_init_read ()
0f2d19dd 1516{
14de3b42 1517 scm_read_hash_procedures =
86d31dfe 1518 SCM_VARIABLE_LOC (scm_c_define ("read-hash-procedures", SCM_EOL));
fed9c9a2 1519
62560650 1520 scm_init_opts (scm_read_options, scm_read_opts);
a0599745 1521#include "libguile/read.x"
0f2d19dd 1522}
89e00824
ML
1523
1524/*
1525 Local Variables:
1526 c-file-style: "gnu"
1527 End:
1528*/