Fix off-by-one error in processing Emacs-style coding declaration.
[bpt/guile.git] / libguile / read.c
CommitLineData
0ba0b384 1/* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009 Free Software
dd72382c 2 * Foundation, Inc.
0f2d19dd 3 *
73be1d9e 4 * This library is free software; you can redistribute it and/or
53befeb7
NJ
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
0f2d19dd 8 *
53befeb7
NJ
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
73be1d9e
MV
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
0f2d19dd 13 *
73be1d9e
MV
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
53befeb7
NJ
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
73be1d9e 18 */
1bbd0b84 19
1bbd0b84 20
0f2d19dd
JB
21\f
22
7337d56d
LC
23#ifdef HAVE_CONFIG_H
24# include <config.h>
25#endif
26
0f2d19dd 27#include <stdio.h>
7337d56d
LC
28#include <ctype.h>
29#include <string.h>
889975e5
MG
30#include <unistd.h>
31#include <unicase.h>
7337d56d 32
a0599745 33#include "libguile/_scm.h"
0ba0b384 34#include "libguile/bytevectors.h"
a0599745
MD
35#include "libguile/chars.h"
36#include "libguile/eval.h"
2fa901a5 37#include "libguile/arrays.h"
cf396142 38#include "libguile/bitvectors.h"
a0599745
MD
39#include "libguile/keywords.h"
40#include "libguile/alist.h"
41#include "libguile/srcprop.h"
42#include "libguile/hashtab.h"
43#include "libguile/hash.h"
44#include "libguile/ports.h"
45#include "libguile/root.h"
46#include "libguile/strings.h"
ba1b2226 47#include "libguile/strports.h"
a0599745 48#include "libguile/vectors.h"
a0599745 49#include "libguile/validate.h"
a4022e69 50#include "libguile/srfi-4.h"
7337d56d 51#include "libguile/srfi-13.h"
ba1b2226 52
a0599745 53#include "libguile/read.h"
22fc179a
HWN
54#include "libguile/private-options.h"
55
0f2d19dd
JB
56
57\f
58
5bf6a6f0 59SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
c7733771 60SCM_SYMBOL (scm_keyword_prefix, "prefix");
ef4cbc08 61SCM_SYMBOL (scm_keyword_postfix, "postfix");
c7733771 62
92c2555f 63scm_t_option scm_read_opts[] = {
b7ff98dd
MD
64 { SCM_OPTION_BOOLEAN, "copy", 0,
65 "Copy source code expressions." },
ac74fc22 66 { SCM_OPTION_BOOLEAN, "positions", 0,
deca31e1
GH
67 "Record positions of source code expressions." },
68 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
c7733771 69 "Convert symbols to lower case."},
f1267706 70 { SCM_OPTION_SCM, "keywords", SCM_UNPACK (SCM_BOOL_F),
904fabb6 71 "Style of keyword recognition: #f, 'prefix or 'postfix."},
16353acc 72#if SCM_ENABLE_ELISP
16353acc
NJ
73 { SCM_OPTION_BOOLEAN, "elisp-vectors", 0,
74 "Support Elisp vector syntax, namely `[...]'."},
cd21f5eb 75 { SCM_OPTION_BOOLEAN, "elisp-strings", 0,
62560650 76 "Support `\\(' and `\\)' in strings."},
16353acc 77#endif
62560650 78 { 0, },
a16f6fe7
MD
79};
80
39e8f371
HWN
81/*
82 Give meaningful error messages for errors
83
84 We use the format
85
ba1b2226 86 FILE:LINE:COL: MESSAGE
39e8f371
HWN
87 This happened in ....
88
89 This is not standard GNU format, but the test-suite likes the real
90 message to be in front.
91
39e8f371
HWN
92 */
93
94
a4022e69
MV
95void
96scm_i_input_error (char const *function,
97 SCM port, const char *message, SCM arg)
ba1b2226 98{
29a837fd
MV
99 SCM fn = (scm_is_string (SCM_FILENAME(port))
100 ? SCM_FILENAME(port)
101 : scm_from_locale_string ("#<unknown port>"));
ba1b2226 102
29a837fd 103 SCM string_port = scm_open_output_string ();
ba1b2226
HWN
104 SCM string = SCM_EOL;
105 scm_simple_format (string_port,
272632a6 106 scm_from_locale_string ("~A:~S:~S: ~A"),
29a837fd 107 scm_list_4 (fn,
b3aa4626 108 scm_from_long (SCM_LINUM (port) + 1),
b9bd8526 109 scm_from_int (SCM_COL (port) + 1),
272632a6 110 scm_from_locale_string (message)));
ba1b2226
HWN
111
112 string = scm_get_output_string (string_port);
113 scm_close_output_port (string_port);
272632a6 114 scm_error_scm (scm_from_locale_symbol ("read-error"),
a4022e69 115 function? scm_from_locale_string (function) : SCM_BOOL_F,
ba1b2226 116 string,
dd72382c 117 arg,
ba1b2226
HWN
118 SCM_BOOL_F);
119}
39e8f371
HWN
120
121
a1ec6916 122SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
1bbd0b84 123 (SCM setting),
dc7fa443
MG
124 "Option interface for the read options. Instead of using\n"
125 "this procedure directly, use the procedures @code{read-enable},\n"
3939e9df 126 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
1bbd0b84 127#define FUNC_NAME s_scm_read_options
a16f6fe7 128{
b7ff98dd
MD
129 SCM ans = scm_options (setting,
130 scm_read_opts,
1bbd0b84 131 FUNC_NAME);
b7ff98dd
MD
132 if (SCM_COPY_SOURCE_P)
133 SCM_RECORD_POSITIONS_P = 1;
a16f6fe7
MD
134 return ans;
135}
1bbd0b84 136#undef FUNC_NAME
a16f6fe7 137
14de3b42
GH
138/* An association list mapping extra hash characters to procedures. */
139static SCM *scm_read_hash_procedures;
deca31e1 140
0f2d19dd 141
7337d56d
LC
142\f
143/* Token readers. */
0f2d19dd 144
0f2d19dd 145
7337d56d
LC
146/* Size of the C buffer used to read symbols and numbers. */
147#define READER_BUFFER_SIZE 128
0f2d19dd 148
7337d56d
LC
149/* Size of the C buffer used to read strings. */
150#define READER_STRING_BUFFER_SIZE 512
0f2d19dd 151
7337d56d
LC
152/* The maximum size of Scheme character names. */
153#define READER_CHAR_NAME_MAX_SIZE 50
1cc91f1b 154
94115ae3 155
7337d56d
LC
156/* `isblank' is only in C99. */
157#define CHAR_IS_BLANK_(_chr) \
158 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
d41668fa 159 || ((_chr) == '\f') || ((_chr) == '\r'))
7337d56d
LC
160
161#ifdef MSDOS
162# define CHAR_IS_BLANK(_chr) \
163 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
164#else
165# define CHAR_IS_BLANK CHAR_IS_BLANK_
166#endif
167
168
169/* R5RS one-character delimiters (see section 7.1.1, ``Lexical
170 structure''). */
171#define CHAR_IS_R5RS_DELIMITER(c) \
172 (CHAR_IS_BLANK (c) \
173 || (c == ')') || (c == '(') || (c == ';') || (c == '"'))
174
175#define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
176
177/* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
178 Structure''. */
179#define CHAR_IS_EXPONENT_MARKER(_chr) \
180 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
181 || ((_chr) == 'd') || ((_chr) == 'l'))
182
454866e0 183/* Read an SCSH block comment. */
620c8965
LC
184static inline SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
185static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
186static SCM scm_read_commented_expression (scm_t_wchar, SCM);
187static SCM scm_get_hash_procedure (int);
454866e0 188
d41668fa
LC
189/* Read from PORT until a delimiter (e.g., a whitespace) is read. Return
190 zero if the whole token fits in BUF, non-zero otherwise. */
7337d56d 191static inline int
889975e5 192read_token (SCM port, SCM buf, size_t *read)
0520c320 193{
889975e5 194 scm_t_wchar chr;
7337d56d 195 *read = 0;
0520c320 196
889975e5
MG
197 buf = scm_i_string_start_writing (buf);
198 while (*read < scm_i_string_length (buf))
0520c320 199 {
7337d56d 200 chr = scm_getc (port);
7337d56d
LC
201
202 if (chr == EOF)
7337d56d 203 {
889975e5 204 scm_i_string_stop_writing ();
7337d56d
LC
205 return 0;
206 }
889975e5
MG
207
208 chr = (SCM_CASE_INSENSITIVE_P ? uc_tolower (chr) : chr);
209
210 if (CHAR_IS_DELIMITER (chr))
7337d56d 211 {
889975e5
MG
212 scm_i_string_stop_writing ();
213 scm_ungetc (chr, port);
214 return 0;
7337d56d 215 }
889975e5
MG
216
217 scm_i_string_set_x (buf, *read, chr);
218 (*read)++;
0520c320 219 }
889975e5 220 scm_i_string_stop_writing ();
7337d56d
LC
221
222 return 1;
0520c320 223}
1cc91f1b 224
889975e5
MG
225static SCM
226read_complete_token (SCM port, size_t *read)
227{
228 SCM buffer, str = SCM_EOL;
229 size_t len;
230 int overflow;
231
232 buffer = scm_i_make_string (READER_BUFFER_SIZE, NULL);
233 overflow = read_token (port, buffer, read);
234 if (!overflow)
235 return scm_i_substring (buffer, 0, *read);
236
237 str = scm_string_copy (buffer);
238 do
239 {
240 overflow = read_token (port, buffer, &len);
241 str = scm_string_append (scm_list_2 (str, buffer));
242 *read += len;
243 }
244 while (overflow);
245
246 return scm_i_substring (str, 0, *read);
247}
7337d56d
LC
248
249/* Skip whitespace from PORT and return the first non-whitespace character
250 read. Raise an error on end-of-file. */
251static int
252flush_ws (SCM port, const char *eoferr)
0f2d19dd 253{
889975e5 254 register scm_t_wchar c;
0f2d19dd 255 while (1)
b7f3516f 256 switch (c = scm_getc (port))
0f2d19dd
JB
257 {
258 case EOF:
259 goteof:
260 if (eoferr)
d156d3b7 261 {
a4022e69
MV
262 scm_i_input_error (eoferr,
263 port,
264 "end of file",
265 SCM_EOL);
d156d3b7 266 }
0f2d19dd 267 return c;
7337d56d 268
0f2d19dd
JB
269 case ';':
270 lp:
b7f3516f 271 switch (c = scm_getc (port))
0f2d19dd
JB
272 {
273 case EOF:
274 goto goteof;
275 default:
276 goto lp;
277 case SCM_LINE_INCREMENTORS:
278 break;
279 }
280 break;
7337d56d 281
454866e0
LC
282 case '#':
283 switch (c = scm_getc (port))
284 {
285 case EOF:
286 eoferr = "read_sharp";
287 goto goteof;
288 case '!':
289 scm_read_scsh_block_comment (c, port);
290 break;
34f3d47d
AW
291 case ';':
292 scm_read_commented_expression (c, port);
293 break;
620c8965
LC
294 case '|':
295 if (scm_is_false (scm_get_hash_procedure (c)))
296 {
297 scm_read_r6rs_block_comment (c, port);
298 break;
299 }
300 /* fall through */
454866e0
LC
301 default:
302 scm_ungetc (c, port);
303 return '#';
304 }
305 break;
306
0f2d19dd 307 case SCM_LINE_INCREMENTORS:
0f2d19dd 308 case SCM_SINGLE_SPACES:
0f2d19dd 309 case '\t':
0f2d19dd 310 break;
7337d56d 311
0f2d19dd
JB
312 default:
313 return c;
314 }
7337d56d
LC
315
316 return 0;
0f2d19dd
JB
317}
318
319
7337d56d
LC
320\f
321/* Token readers. */
1cc91f1b 322
7337d56d
LC
323static SCM scm_read_expression (SCM port);
324static SCM scm_read_sharp (int chr, SCM port);
7337d56d 325static SCM recsexpr (SCM obj, long line, int column, SCM filename);
0f2d19dd
JB
326
327
09a4f039 328static SCM
889975e5 329scm_read_sexp (scm_t_wchar chr, SCM port)
7337d56d 330#define FUNC_NAME "scm_i_lreadparen"
09a4f039 331{
7337d56d
LC
332 register int c;
333 register SCM tmp;
334 register SCM tl, ans = SCM_EOL;
bd22f1c7 335 SCM tl2 = SCM_EOL, ans2 = SCM_EOL, copy = SCM_BOOL_F;
7337d56d
LC
336 static const int terminating_char = ')';
337
338 /* Need to capture line and column numbers here. */
339 long line = SCM_LINUM (port);
340 int column = SCM_COL (port) - 1;
f9c68a47 341
f9c68a47 342
7337d56d
LC
343 c = flush_ws (port, FUNC_NAME);
344 if (terminating_char == c)
345 return SCM_EOL;
f9c68a47 346
7337d56d
LC
347 scm_ungetc (c, port);
348 if (scm_is_eq (scm_sym_dot,
349 (tmp = scm_read_expression (port))))
350 {
351 ans = scm_read_expression (port);
352 if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
353 scm_i_input_error (FUNC_NAME, port, "missing close paren",
354 SCM_EOL);
355 return ans;
356 }
1cc91f1b 357
7337d56d
LC
358 /* Build the head of the list structure. */
359 ans = tl = scm_cons (tmp, SCM_EOL);
360
361 if (SCM_COPY_SOURCE_P)
362 ans2 = tl2 = scm_cons (scm_is_pair (tmp)
363 ? copy
364 : tmp,
365 SCM_EOL);
366
367 while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
0f2d19dd 368 {
7337d56d 369 SCM new_tail;
0f2d19dd 370
7337d56d
LC
371 scm_ungetc (c, port);
372 if (scm_is_eq (scm_sym_dot,
373 (tmp = scm_read_expression (port))))
0f2d19dd 374 {
7337d56d
LC
375 SCM_SETCDR (tl, tmp = scm_read_expression (port));
376
377 if (SCM_COPY_SOURCE_P)
378 SCM_SETCDR (tl2, scm_cons (scm_is_pair (tmp) ? copy : tmp,
379 SCM_EOL));
380
381 c = flush_ws (port, FUNC_NAME);
382 if (terminating_char != c)
383 scm_i_input_error (FUNC_NAME, port,
384 "in pair: missing close paren", SCM_EOL);
385 goto exit;
0f2d19dd 386 }
b858464a 387
7337d56d
LC
388 new_tail = scm_cons (tmp, SCM_EOL);
389 SCM_SETCDR (tl, new_tail);
390 tl = new_tail;
391
392 if (SCM_COPY_SOURCE_P)
0f2d19dd 393 {
7337d56d
LC
394 SCM new_tail2 = scm_cons (scm_is_pair (tmp)
395 ? copy
396 : tmp, SCM_EOL);
397 SCM_SETCDR (tl2, new_tail2);
398 tl2 = new_tail2;
399 }
400 }
0f2d19dd 401
7337d56d
LC
402 exit:
403 if (SCM_RECORD_POSITIONS_P)
404 scm_whash_insert (scm_source_whash,
405 ans,
406 scm_make_srcprops (line, column,
407 SCM_FILENAME (port),
408 SCM_COPY_SOURCE_P
409 ? ans2
410 : SCM_UNDEFINED,
411 SCM_EOL));
412 return ans;
413}
414#undef FUNC_NAME
a4022e69 415
7337d56d
LC
416static SCM
417scm_read_string (int chr, SCM port)
418#define FUNC_NAME "scm_lreadr"
419{
420 /* For strings smaller than C_STR, this function creates only one Scheme
421 object (the string returned). */
0f2d19dd 422
7337d56d 423 SCM str = SCM_BOOL_F;
7337d56d 424 unsigned c_str_len = 0;
9c44cd45 425 scm_t_wchar c;
eb42ff25 426
9c44cd45 427 str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
7337d56d
LC
428 while ('"' != (c = scm_getc (port)))
429 {
430 if (c == EOF)
9c44cd45
MG
431 {
432 str_eof:
433 scm_i_input_error (FUNC_NAME, port,
434 "end of file in string constant", SCM_EOL);
435 }
0f2d19dd 436
9c44cd45
MG
437 if (c_str_len + 1 >= scm_i_string_length (str))
438 {
439 SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
0f2d19dd 440
9c44cd45
MG
441 str = scm_string_append (scm_list_2 (str, addy));
442 }
0f2d19dd 443
7337d56d 444 if (c == '\\')
9c44cd45
MG
445 {
446 switch (c = scm_getc (port))
447 {
448 case EOF:
449 goto str_eof;
450 case '"':
451 case '\\':
452 break;
16353acc 453#if SCM_ENABLE_ELISP
9c44cd45
MG
454 case '(':
455 case ')':
456 if (SCM_ESCAPED_PARENS_P)
457 break;
458 goto bad_escaped;
16353acc 459#endif
9c44cd45
MG
460 case '\n':
461 continue;
462 case '0':
463 c = '\0';
464 break;
465 case 'f':
466 c = '\f';
467 break;
468 case 'n':
469 c = '\n';
470 break;
471 case 'r':
472 c = '\r';
473 break;
474 case 't':
475 c = '\t';
476 break;
477 case 'a':
478 c = '\007';
479 break;
480 case 'v':
481 c = '\v';
482 break;
483 case 'x':
484 {
485 scm_t_wchar a, b;
486 a = scm_getc (port);
487 if (a == EOF)
488 goto str_eof;
489 b = scm_getc (port);
490 if (b == EOF)
491 goto str_eof;
492 if ('0' <= a && a <= '9')
493 a -= '0';
494 else if ('A' <= a && a <= 'F')
495 a = a - 'A' + 10;
496 else if ('a' <= a && a <= 'f')
497 a = a - 'a' + 10;
498 else
499 {
500 c = a;
501 goto bad_escaped;
502 }
503 if ('0' <= b && b <= '9')
504 b -= '0';
505 else if ('A' <= b && b <= 'F')
506 b = b - 'A' + 10;
507 else if ('a' <= b && b <= 'f')
508 b = b - 'a' + 10;
509 else
510 {
511 c = b;
512 goto bad_escaped;
513 }
514 c = a * 16 + b;
515 break;
516 }
517 case 'u':
518 {
519 scm_t_wchar a;
520 int i;
521 c = 0;
522 for (i = 0; i < 4; i++)
523 {
524 a = scm_getc (port);
525 if (a == EOF)
526 goto str_eof;
527 if ('0' <= a && a <= '9')
528 a -= '0';
529 else if ('A' <= a && a <= 'F')
530 a = a - 'A' + 10;
531 else if ('a' <= a && a <= 'f')
532 a = a - 'a' + 10;
533 else
534 {
535 c = a;
536 goto bad_escaped;
537 }
538 c = c * 16 + a;
539 }
540 break;
541 }
542 case 'U':
543 {
544 scm_t_wchar a;
545 int i;
546 c = 0;
547 for (i = 0; i < 6; i++)
548 {
549 a = scm_getc (port);
550 if (a == EOF)
551 goto str_eof;
552 if ('0' <= a && a <= '9')
553 a -= '0';
554 else if ('A' <= a && a <= 'F')
555 a = a - 'A' + 10;
556 else if ('a' <= a && a <= 'f')
557 a = a - 'a' + 10;
558 else
559 {
560 c = a;
561 goto bad_escaped;
562 }
563 c = c * 16 + a;
564 }
565 break;
566 }
567 default:
568 bad_escaped:
569 scm_i_input_error (FUNC_NAME, port,
570 "illegal character in escape sequence: ~S",
571 scm_list_1 (SCM_MAKE_CHAR (c)));
572 }
573 }
574 str = scm_i_string_start_writing (str);
575 scm_i_string_set_x (str, c_str_len++, c);
576 scm_i_string_stop_writing ();
7337d56d 577 }
f13b4400 578
7337d56d
LC
579 if (c_str_len > 0)
580 {
9c44cd45 581 return scm_i_substring_copy (str, 0, c_str_len);
0f2d19dd 582 }
9c44cd45
MG
583
584 return scm_nullstr;
0f2d19dd 585}
db4b4ca6
DH
586#undef FUNC_NAME
587
0f2d19dd 588
7337d56d 589static SCM
889975e5 590scm_read_number (scm_t_wchar chr, SCM port)
0f2d19dd 591{
889975e5
MG
592 SCM result;
593 SCM buffer;
7337d56d 594 size_t read;
0f2d19dd 595
7337d56d 596 scm_ungetc (chr, port);
889975e5
MG
597 buffer = read_complete_token (port, &read);
598 result = scm_string_to_number (buffer, SCM_UNDEFINED);
599 if (!scm_is_true (result))
600 /* Return a symbol instead of a number. */
601 result = scm_string_to_symbol (buffer);
0f2d19dd 602
7337d56d
LC
603 return result;
604}
0f2d19dd 605
7337d56d 606static SCM
889975e5 607scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
7337d56d 608{
889975e5
MG
609 SCM result;
610 int ends_with_colon = 0;
611 SCM buffer;
7337d56d 612 size_t read = 0;
ef4cbc08 613 int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
7337d56d
LC
614
615 scm_ungetc (chr, port);
889975e5
MG
616 buffer = read_complete_token (port, &read);
617 if (read > 0)
618 ends_with_colon = scm_i_string_ref (buffer, read - 1) == ':';
ef4cbc08 619
889975e5
MG
620 if (postfix && ends_with_colon && (read > 1))
621 result = scm_symbol_to_keyword (scm_string_to_symbol (scm_i_substring (buffer, 0, read - 1)));
7337d56d 622 else
889975e5 623 result = scm_string_to_symbol (buffer);
7337d56d
LC
624
625 return result;
626}
627
628static SCM
889975e5 629scm_read_number_and_radix (scm_t_wchar chr, SCM port)
7337d56d
LC
630#define FUNC_NAME "scm_lreadr"
631{
889975e5 632 SCM result;
7337d56d 633 size_t read;
889975e5 634 SCM buffer = scm_i_make_string (READER_BUFFER_SIZE, NULL);
7337d56d 635 unsigned int radix;
7337d56d
LC
636
637 switch (chr)
638 {
639 case 'B':
640 case 'b':
641 radix = 2;
642 break;
643
644 case 'o':
645 case 'O':
646 radix = 8;
647 break;
648
649 case 'd':
650 case 'D':
651 radix = 10;
652 break;
653
654 case 'x':
655 case 'X':
656 radix = 16;
657 break;
658
659 default:
660 scm_ungetc (chr, port);
661 scm_ungetc ('#', port);
662 radix = 10;
663 }
664
889975e5
MG
665 buffer = read_complete_token (port, &read);
666 result = scm_string_to_number (buffer, scm_from_uint (radix));
7337d56d
LC
667
668 if (scm_is_true (result))
669 return result;
670
671 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
672
673 return SCM_BOOL_F;
674}
675#undef FUNC_NAME
676
677static SCM
678scm_read_quote (int chr, SCM port)
679{
680 SCM p;
492faee1
LC
681 long line = SCM_LINUM (port);
682 int column = SCM_COL (port) - 1;
7337d56d
LC
683
684 switch (chr)
685 {
686 case '`':
687 p = scm_sym_quasiquote;
688 break;
689
690 case '\'':
691 p = scm_sym_quote;
692 break;
693
694 case ',':
695 {
889975e5 696 scm_t_wchar c;
7337d56d
LC
697
698 c = scm_getc (port);
699 if ('@' == c)
700 p = scm_sym_uq_splicing;
701 else
0f2d19dd 702 {
7337d56d
LC
703 scm_ungetc (c, port);
704 p = scm_sym_unquote;
0f2d19dd 705 }
7337d56d
LC
706 break;
707 }
0f2d19dd 708
7337d56d
LC
709 default:
710 fprintf (stderr, "%s: unhandled quote character (%i)\n",
7f74cf9a 711 "scm_read_quote", chr);
7337d56d 712 abort ();
0f2d19dd 713 }
1cc91f1b 714
7337d56d 715 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
492faee1
LC
716 if (SCM_RECORD_POSITIONS_P)
717 scm_whash_insert (scm_source_whash, p,
718 scm_make_srcprops (line, column,
719 SCM_FILENAME (port),
720 SCM_COPY_SOURCE_P
721 ? (scm_cons2 (SCM_CAR (p),
722 SCM_CAR (SCM_CDR (p)),
723 SCM_EOL))
724 : SCM_UNDEFINED,
725 SCM_EOL));
726
0f2d19dd 727
7337d56d
LC
728 return p;
729}
730
34f3d47d
AW
731SCM_SYMBOL (sym_syntax, "syntax");
732SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
733SCM_SYMBOL (sym_unsyntax, "unsyntax");
734SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
735
736static SCM
737scm_read_syntax (int chr, SCM port)
738{
739 SCM p;
740 long line = SCM_LINUM (port);
741 int column = SCM_COL (port) - 1;
742
743 switch (chr)
744 {
745 case '`':
746 p = sym_quasisyntax;
747 break;
748
749 case '\'':
750 p = sym_syntax;
751 break;
752
753 case ',':
754 {
755 int c;
756
757 c = scm_getc (port);
758 if ('@' == c)
759 p = sym_unsyntax_splicing;
760 else
761 {
762 scm_ungetc (c, port);
763 p = sym_unsyntax;
764 }
765 break;
766 }
767
768 default:
769 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
770 "scm_read_syntax", chr);
771 abort ();
772 }
773
774 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
775 if (SCM_RECORD_POSITIONS_P)
776 scm_whash_insert (scm_source_whash, p,
777 scm_make_srcprops (line, column,
778 SCM_FILENAME (port),
779 SCM_COPY_SOURCE_P
780 ? (scm_cons2 (SCM_CAR (p),
781 SCM_CAR (SCM_CDR (p)),
782 SCM_EOL))
783 : SCM_UNDEFINED,
784 SCM_EOL));
785
786
787 return p;
788}
789
7337d56d
LC
790static inline SCM
791scm_read_semicolon_comment (int chr, SCM port)
0f2d19dd 792{
0f2d19dd
JB
793 int c;
794
889975e5
MG
795 /* We use the get_byte here because there is no need to get the
796 locale correct with comment input. This presumes that newline
797 always represents itself no matter what the encoding is. */
798 for (c = scm_get_byte_or_eof (port);
7337d56d
LC
799 (c != EOF) && (c != '\n');
800 c = scm_getc (port));
801
802 return SCM_UNSPECIFIED;
803}
804
805\f
806/* Sharp readers, i.e. readers called after a `#' sign has been read. */
807
808static SCM
809scm_read_boolean (int chr, SCM port)
810{
811 switch (chr)
0f2d19dd 812 {
7337d56d
LC
813 case 't':
814 case 'T':
815 return SCM_BOOL_T;
816
817 case 'f':
818 case 'F':
819 return SCM_BOOL_F;
0f2d19dd 820 }
7337d56d
LC
821
822 return SCM_UNSPECIFIED;
823}
824
825static SCM
889975e5 826scm_read_character (scm_t_wchar chr, SCM port)
7337d56d
LC
827#define FUNC_NAME "scm_lreadr"
828{
889975e5 829 SCM charname = scm_i_make_string (READER_CHAR_NAME_MAX_SIZE, NULL);
7337d56d 830 size_t charname_len;
889975e5
MG
831 scm_t_wchar cp;
832 int overflow;
833
834 overflow = read_token (port, charname, &charname_len);
835 charname = scm_c_substring (charname, 0, charname_len);
7337d56d 836
889975e5 837 if (overflow)
7337d56d
LC
838 goto char_error;
839
840 if (charname_len == 0)
0f2d19dd 841 {
7337d56d
LC
842 chr = scm_getc (port);
843 if (chr == EOF)
844 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
845 "while reading character", SCM_EOL);
846
847 /* CHR must be a token delimiter, like a whitespace. */
848 return (SCM_MAKE_CHAR (chr));
0f2d19dd 849 }
7337d56d
LC
850
851 if (charname_len == 1)
889975e5 852 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 0));
7337d56d 853
889975e5 854 cp = scm_i_string_ref (charname, 0);
0dcd7e61
MG
855 if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
856 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
857
889975e5 858 if (cp >= '0' && cp < '8')
7337d56d
LC
859 {
860 /* Dirk:FIXME:: This type of character syntax is not R5RS
861 * compliant. Further, it should be verified that the constant
0ffc78e3 862 * does only consist of octal digits. */
889975e5 863 SCM p = scm_string_to_number (charname, scm_from_uint (8));
7337d56d 864 if (SCM_I_INUMP (p))
0ffc78e3
MG
865 {
866 scm_t_wchar c = SCM_I_INUM (p);
867 if (SCM_IS_UNICODE_CHAR (c))
868 return SCM_MAKE_CHAR (c);
869 else
870 scm_i_input_error (FUNC_NAME, port,
871 "out-of-range octal character escape: ~a",
872 scm_list_1 (charname));
873 }
7337d56d
LC
874 }
875
889975e5
MG
876 /* The names of characters should never have non-Latin1
877 characters. */
878 if (scm_i_is_narrow_string (charname)
879 || scm_i_try_narrow_string (charname))
4769c9db
AW
880 { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
881 charname_len);
882 if (scm_is_true (ch))
883 return ch;
884 }
7337d56d
LC
885
886 char_error:
887 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
889975e5 888 scm_list_1 (charname));
7337d56d
LC
889
890 return SCM_UNSPECIFIED;
0f2d19dd 891}
db4b4ca6 892#undef FUNC_NAME
0f2d19dd 893
7337d56d
LC
894static inline SCM
895scm_read_keyword (int chr, SCM port)
896{
897 SCM symbol;
898
899 /* Read the symbol that comprises the keyword. Doing this instead of
900 invoking a specific symbol reader function allows `scm_read_keyword ()'
901 to adapt to the delimiters currently valid of symbols.
1cc91f1b 902
7337d56d
LC
903 XXX: This implementation allows sloppy syntaxes like `#: key'. */
904 symbol = scm_read_expression (port);
905 if (!scm_is_symbol (symbol))
7f74cf9a 906 scm_i_input_error ("scm_read_keyword", port,
7337d56d
LC
907 "keyword prefix `~a' not followed by a symbol: ~s",
908 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
909
910 return (scm_symbol_to_keyword (symbol));
911}
912
913static inline SCM
914scm_read_vector (int chr, SCM port)
09a4f039 915{
7337d56d
LC
916 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
917 guarantee that it's going to do what we want. After all, this is an
918 implementation detail of `scm_read_vector ()', not a desirable
919 property. */
920 return (scm_vector (scm_read_sexp (chr, port)));
921}
09a4f039 922
7337d56d
LC
923static inline SCM
924scm_read_srfi4_vector (int chr, SCM port)
925{
926 return scm_i_read_array (port, chr);
927}
928
0ba0b384 929static SCM
889975e5 930scm_read_bytevector (scm_t_wchar chr, SCM port)
0ba0b384
LC
931{
932 chr = scm_getc (port);
933 if (chr != 'u')
934 goto syntax;
935
936 chr = scm_getc (port);
937 if (chr != '8')
938 goto syntax;
939
940 chr = scm_getc (port);
941 if (chr != '(')
942 goto syntax;
943
944 return scm_u8_list_to_bytevector (scm_read_sexp (chr, port));
945
946 syntax:
947 scm_i_input_error ("read_bytevector", port,
948 "invalid bytevector prefix",
949 SCM_MAKE_CHAR (chr));
950 return SCM_UNSPECIFIED;
951}
952
7337d56d 953static SCM
889975e5 954scm_read_guile_bit_vector (scm_t_wchar chr, SCM port)
7337d56d
LC
955{
956 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
957 terribly inefficient but who cares? */
958 SCM s_bits = SCM_EOL;
959
960 for (chr = scm_getc (port);
961 (chr != EOF) && ((chr == '0') || (chr == '1'));
962 chr = scm_getc (port))
09a4f039 963 {
7337d56d 964 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
09a4f039 965 }
7337d56d
LC
966
967 if (chr != EOF)
968 scm_ungetc (chr, port);
969
970 return scm_bitvector (scm_reverse_x (s_bits, SCM_EOL));
971}
972
973static inline SCM
889975e5 974scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
7337d56d
LC
975{
976 int bang_seen = 0;
977
889975e5
MG
978 /* We can use the get_byte here because there is no need to get the
979 locale correct when reading comments. This presumes that
980 hash and exclamation points always represent themselves no
981 matter what the source encoding is.*/
7337d56d 982 for (;;)
09a4f039 983 {
889975e5 984 int c = scm_get_byte_or_eof (port);
62850ef3 985
7337d56d
LC
986 if (c == EOF)
987 scm_i_input_error ("skip_block_comment", port,
988 "unterminated `#! ... !#' comment", SCM_EOL);
989
990 if (c == '!')
991 bang_seen = 1;
992 else if (c == '#' && bang_seen)
993 break;
994 else
995 bang_seen = 0;
996 }
997
998 return SCM_UNSPECIFIED;
999}
1000
620c8965
LC
1001static SCM
1002scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1003{
1004 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1005 nested. So care must be taken. */
1006 int nesting_level = 1;
1007 int opening_seen = 0, closing_seen = 0;
1008
1009 while (nesting_level > 0)
1010 {
1011 int c = scm_getc (port);
1012
1013 if (c == EOF)
1014 scm_i_input_error (__FUNCTION__, port,
1015 "unterminated `#| ... |#' comment", SCM_EOL);
1016
1017 if (opening_seen)
1018 {
1019 if (c == '|')
1020 nesting_level++;
1021 opening_seen = 0;
1022 }
1023 else if (closing_seen)
1024 {
1025 if (c == '#')
1026 nesting_level--;
1027 closing_seen = 0;
1028 }
1029 else if (c == '|')
1030 closing_seen = 1;
1031 else if (c == '#')
1032 opening_seen = 1;
1033 else
1034 opening_seen = closing_seen = 0;
1035 }
1036
1037 return SCM_UNSPECIFIED;
1038}
1039
34f3d47d 1040static SCM
889975e5 1041scm_read_commented_expression (scm_t_wchar chr, SCM port)
34f3d47d 1042{
889975e5 1043 scm_t_wchar c;
34f3d47d
AW
1044
1045 c = flush_ws (port, (char *) NULL);
1046 if (EOF == c)
1047 scm_i_input_error ("read_commented_expression", port,
1048 "no expression after #; comment", SCM_EOL);
1049 scm_ungetc (c, port);
1050 scm_read_expression (port);
1051 return SCM_UNSPECIFIED;
1052}
1053
7337d56d 1054static SCM
889975e5 1055scm_read_extended_symbol (scm_t_wchar chr, SCM port)
7337d56d
LC
1056{
1057 /* Guile's extended symbol read syntax looks like this:
1058
1059 #{This is all a symbol name}#
1060
1061 So here, CHR is expected to be `{'. */
7337d56d
LC
1062 int saw_brace = 0, finished = 0;
1063 size_t len = 0;
889975e5 1064 SCM buf = scm_i_make_string (1024, NULL);
7337d56d 1065
889975e5 1066 buf = scm_i_string_start_writing (buf);
7337d56d
LC
1067
1068 while ((chr = scm_getc (port)) != EOF)
1069 {
1070 if (saw_brace)
09a4f039 1071 {
7337d56d
LC
1072 if (chr == '#')
1073 {
1074 finished = 1;
1075 break;
1076 }
1077 else
1078 {
1079 saw_brace = 0;
889975e5
MG
1080 scm_i_string_set_x (buf, len++, '}');
1081 scm_i_string_set_x (buf, len++, chr);
7337d56d 1082 }
09a4f039 1083 }
7337d56d
LC
1084 else if (chr == '}')
1085 saw_brace = 1;
1086 else
889975e5 1087 scm_i_string_set_x (buf, len++, chr);
62850ef3 1088
889975e5 1089 if (len >= scm_i_string_length (buf) - 2)
7337d56d 1090 {
889975e5
MG
1091 scm_i_string_stop_writing ();
1092 SCM addy = scm_i_make_string (1024, NULL);
1093 buf = scm_string_append (scm_list_2 (buf, addy));
7337d56d 1094 len = 0;
889975e5 1095 buf = scm_i_string_start_writing (buf);
7337d56d 1096 }
62850ef3 1097
7337d56d
LC
1098 if (finished)
1099 break;
1100 }
889975e5 1101 scm_i_string_stop_writing ();
7337d56d 1102
889975e5 1103 return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
7337d56d
LC
1104}
1105
1106
1107\f
1108/* Top-level token readers, i.e., dispatchers. */
1109
1110static SCM
1111scm_read_sharp_extension (int chr, SCM port)
1112{
1113 SCM proc;
1114
1115 proc = scm_get_hash_procedure (chr);
1116 if (scm_is_true (scm_procedure_p (proc)))
1117 {
1118 long line = SCM_LINUM (port);
1119 int column = SCM_COL (port) - 2;
1120 SCM got;
1121
1122 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1123 if (!scm_is_eq (got, SCM_UNSPECIFIED))
62850ef3 1124 {
7337d56d
LC
1125 if (SCM_RECORD_POSITIONS_P)
1126 return (recsexpr (got, line, column,
1127 SCM_FILENAME (port)));
1128 else
1129 return got;
62850ef3 1130 }
09a4f039 1131 }
7337d56d
LC
1132
1133 return SCM_UNSPECIFIED;
1134}
1135
1136/* The reader for the sharp `#' character. It basically dispatches reads
1137 among the above token readers. */
1138static SCM
889975e5 1139scm_read_sharp (scm_t_wchar chr, SCM port)
7337d56d
LC
1140#define FUNC_NAME "scm_lreadr"
1141{
1142 SCM result;
1143
1144 chr = scm_getc (port);
1145
1146 result = scm_read_sharp_extension (chr, port);
1147 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1148 return result;
1149
1150 switch (chr)
1151 {
1152 case '\\':
1153 return (scm_read_character (chr, port));
1154 case '(':
1155 return (scm_read_vector (chr, port));
1156 case 's':
1157 case 'u':
1158 case 'f':
1159 /* This one may return either a boolean or an SRFI-4 vector. */
1160 return (scm_read_srfi4_vector (chr, port));
0ba0b384
LC
1161 case 'v':
1162 return (scm_read_bytevector (chr, port));
7337d56d
LC
1163 case '*':
1164 return (scm_read_guile_bit_vector (chr, port));
1165 case 't':
1166 case 'T':
1167 case 'F':
1168 /* This one may return either a boolean or an SRFI-4 vector. */
1169 return (scm_read_boolean (chr, port));
1170 case ':':
1171 return (scm_read_keyword (chr, port));
1172 case '0': case '1': case '2': case '3': case '4':
1173 case '5': case '6': case '7': case '8': case '9':
1174 case '@':
1175#if SCM_ENABLE_DEPRECATED
1176 /* See below for 'i' and 'e'. */
1177 case 'a':
1178 case 'c':
1179 case 'y':
1180 case 'h':
1181 case 'l':
1182#endif
1183 return (scm_i_read_array (port, chr));
1184
1185 case 'i':
1186 case 'e':
1187#if SCM_ENABLE_DEPRECATED
1188 {
1189 /* When next char is '(', it really is an old-style
1190 uniform array. */
889975e5 1191 scm_t_wchar next_c = scm_getc (port);
7337d56d
LC
1192 if (next_c != EOF)
1193 scm_ungetc (next_c, port);
1194 if (next_c == '(')
1195 return scm_i_read_array (port, chr);
1196 /* Fall through. */
1197 }
1198#endif
1199 case 'b':
1200 case 'B':
1201 case 'o':
1202 case 'O':
1203 case 'd':
1204 case 'D':
1205 case 'x':
1206 case 'X':
1207 case 'I':
1208 case 'E':
1209 return (scm_read_number_and_radix (chr, port));
1210 case '{':
1211 return (scm_read_extended_symbol (chr, port));
1212 case '!':
1213 return (scm_read_scsh_block_comment (chr, port));
34f3d47d
AW
1214 case ';':
1215 return (scm_read_commented_expression (chr, port));
1216 case '`':
1217 case '\'':
1218 case ',':
1219 return (scm_read_syntax (chr, port));
7337d56d
LC
1220 default:
1221 result = scm_read_sharp_extension (chr, port);
1222 if (scm_is_eq (result, SCM_UNSPECIFIED))
620c8965
LC
1223 {
1224 /* To remain compatible with 1.8 and earlier, the following
1225 characters have lower precedence than `read-hash-extend'
1226 characters. */
1227 switch (chr)
1228 {
1229 case '|':
1230 return scm_read_r6rs_block_comment (chr, port);
1231 default:
1232 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1233 scm_list_1 (SCM_MAKE_CHAR (chr)));
1234 }
1235 }
7337d56d
LC
1236 else
1237 return result;
1238 }
1239
1240 return SCM_UNSPECIFIED;
1241}
1242#undef FUNC_NAME
1243
1244static SCM
1245scm_read_expression (SCM port)
1246#define FUNC_NAME "scm_read_expression"
1247{
1248 while (1)
1249 {
889975e5 1250 register scm_t_wchar chr;
7337d56d
LC
1251
1252 chr = scm_getc (port);
1253
1254 switch (chr)
1255 {
1256 case SCM_WHITE_SPACES:
1257 case SCM_LINE_INCREMENTORS:
1258 break;
1259 case ';':
1260 (void) scm_read_semicolon_comment (chr, port);
1261 break;
1262 case '(':
1263 return (scm_read_sexp (chr, port));
1264 case '"':
1265 return (scm_read_string (chr, port));
1266 case '\'':
1267 case '`':
1268 case ',':
1269 return (scm_read_quote (chr, port));
1270 case '#':
1271 {
1272 SCM result;
1273 result = scm_read_sharp (chr, port);
1274 if (scm_is_eq (result, SCM_UNSPECIFIED))
1275 /* We read a comment or some such. */
1276 break;
1277 else
1278 return result;
1279 }
1280 case ')':
1281 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1282 break;
1283 case EOF:
1284 return SCM_EOF_VAL;
1285 case ':':
1286 if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1287 return scm_symbol_to_keyword (scm_read_expression (port));
1288 /* Fall through. */
1289
1290 default:
1291 {
1292 if (((chr >= '0') && (chr <= '9'))
1293 || (strchr ("+-.", chr)))
1294 return (scm_read_number (chr, port));
1295 else
1296 return (scm_read_mixed_case_symbol (chr, port));
1297 }
1298 }
1299 }
1300}
1301#undef FUNC_NAME
1302
1303\f
1304/* Actual reader. */
1305
1306SCM_DEFINE (scm_read, "read", 0, 1, 0,
1307 (SCM port),
1308 "Read an s-expression from the input port @var{port}, or from\n"
1309 "the current input port if @var{port} is not specified.\n"
1310 "Any whitespace before the next token is discarded.")
1311#define FUNC_NAME s_scm_read
1312{
1313 int c;
1314
1315 if (SCM_UNBNDP (port))
1316 port = scm_current_input_port ();
1317 SCM_VALIDATE_OPINPORT (1, port);
1318
1319 c = flush_ws (port, (char *) NULL);
1320 if (EOF == c)
1321 return SCM_EOF_VAL;
1322 scm_ungetc (c, port);
1323
1324 return (scm_read_expression (port));
09a4f039 1325}
db4b4ca6 1326#undef FUNC_NAME
09a4f039 1327
0f2d19dd
JB
1328
1329\f
1330
7337d56d
LC
1331/* Used when recording expressions constructed by `scm_read_sharp ()'. */
1332static SCM
1333recsexpr (SCM obj, long line, int column, SCM filename)
1334{
1335 if (!scm_is_pair(obj)) {
1336 return obj;
1337 } else {
1338 SCM tmp = obj, copy;
1339 /* If this sexpr is visible in the read:sharp source, we want to
1340 keep that information, so only record non-constant cons cells
1341 which haven't previously been read by the reader. */
1342 if (scm_is_false (scm_whash_lookup (scm_source_whash, obj)))
1343 {
1344 if (SCM_COPY_SOURCE_P)
1345 {
1346 copy = scm_cons (recsexpr (SCM_CAR (obj), line, column, filename),
1347 SCM_UNDEFINED);
1348 while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1349 {
1350 SCM_SETCDR (copy, scm_cons (recsexpr (SCM_CAR (tmp),
1351 line,
1352 column,
1353 filename),
1354 SCM_UNDEFINED));
1355 copy = SCM_CDR (copy);
1356 }
1357 SCM_SETCDR (copy, tmp);
1358 }
1359 else
1360 {
1361 recsexpr (SCM_CAR (obj), line, column, filename);
1362 while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1363 recsexpr (SCM_CAR (tmp), line, column, filename);
1364 copy = SCM_UNDEFINED;
1365 }
1366 scm_whash_insert (scm_source_whash,
1367 obj,
1368 scm_make_srcprops (line,
1369 column,
1370 filename,
1371 copy,
1372 SCM_EOL));
1373 }
1374 return obj;
1375 }
1376}
1377
14de3b42
GH
1378/* Manipulate the read-hash-procedures alist. This could be written in
1379 Scheme, but maybe it will also be used by C code during initialisation. */
a1ec6916 1380SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1bbd0b84 1381 (SCM chr, SCM proc),
dc7fa443
MG
1382 "Install the procedure @var{proc} for reading expressions\n"
1383 "starting with the character sequence @code{#} and @var{chr}.\n"
1384 "@var{proc} will be called with two arguments: the character\n"
1385 "@var{chr} and the port to read further data from. The object\n"
391f57e6
HWN
1386 "returned will be the return value of @code{read}. \n"
1387 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1388 )
1bbd0b84 1389#define FUNC_NAME s_scm_read_hash_extend
deca31e1 1390{
fed9c9a2
GH
1391 SCM this;
1392 SCM prev;
1393
36284627 1394 SCM_VALIDATE_CHAR (1, chr);
7888309b 1395 SCM_ASSERT (scm_is_false (proc)
bc36d050 1396 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
36284627 1397 proc, SCM_ARG2, FUNC_NAME);
fed9c9a2 1398
14de3b42
GH
1399 /* Check if chr is already in the alist. */
1400 this = *scm_read_hash_procedures;
1401 prev = SCM_BOOL_F;
fed9c9a2
GH
1402 while (1)
1403 {
d2e53ed6 1404 if (scm_is_null (this))
fed9c9a2
GH
1405 {
1406 /* not found, so add it to the beginning. */
7888309b 1407 if (scm_is_true (proc))
fed9c9a2 1408 {
14de3b42
GH
1409 *scm_read_hash_procedures =
1410 scm_cons (scm_cons (chr, proc), *scm_read_hash_procedures);
fed9c9a2
GH
1411 }
1412 break;
1413 }
bc36d050 1414 if (scm_is_eq (chr, SCM_CAAR (this)))
fed9c9a2
GH
1415 {
1416 /* already in the alist. */
7888309b 1417 if (scm_is_false (proc))
14de3b42
GH
1418 {
1419 /* remove it. */
7888309b 1420 if (scm_is_false (prev))
14de3b42
GH
1421 {
1422 *scm_read_hash_procedures =
1423 SCM_CDR (*scm_read_hash_procedures);
1424 }
1425 else
1426 scm_set_cdr_x (prev, SCM_CDR (this));
1427 }
fed9c9a2 1428 else
14de3b42
GH
1429 {
1430 /* replace it. */
1431 scm_set_cdr_x (SCM_CAR (this), proc);
1432 }
fed9c9a2
GH
1433 break;
1434 }
1435 prev = this;
1436 this = SCM_CDR (this);
1437 }
deca31e1 1438
deca31e1
GH
1439 return SCM_UNSPECIFIED;
1440}
1bbd0b84 1441#undef FUNC_NAME
0f2d19dd 1442
deca31e1
GH
1443/* Recover the read-hash procedure corresponding to char c. */
1444static SCM
6e8d25a6 1445scm_get_hash_procedure (int c)
deca31e1 1446{
14de3b42 1447 SCM rest = *scm_read_hash_procedures;
fed9c9a2 1448
deca31e1
GH
1449 while (1)
1450 {
d2e53ed6 1451 if (scm_is_null (rest))
deca31e1
GH
1452 return SCM_BOOL_F;
1453
7866a09b 1454 if (SCM_CHAR (SCM_CAAR (rest)) == c)
deca31e1
GH
1455 return SCM_CDAR (rest);
1456
1457 rest = SCM_CDR (rest);
1458 }
1459}
1cc91f1b 1460
889975e5
MG
1461#define SCM_ENCODING_SEARCH_SIZE (500)
1462
f8a1c9a8
LC
1463/* Search the first few hundred characters of a file for an Emacs-like coding
1464 declaration. Returns either NULL or a string whose storage has been
1465 allocated with `scm_gc_malloc ()'. */
889975e5 1466char *
f8a1c9a8 1467scm_i_scan_for_encoding (SCM port)
889975e5
MG
1468{
1469 char header[SCM_ENCODING_SEARCH_SIZE+1];
1470 size_t bytes_read;
1471 char *encoding = NULL;
1472 int utf8_bom = 0;
1473 char *pos;
1474 int i;
1475 int in_comment;
1476
1477 bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1478 scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1479
1480 if (bytes_read > 3
1481 && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1482 utf8_bom = 1;
1483
1484 /* search past "coding[:=]" */
1485 pos = header;
1486 while (1)
1487 {
1488 if ((pos = strstr(pos, "coding")) == NULL)
1489 return NULL;
1490
1491 pos += strlen("coding");
1492 if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1493 (*pos == ':' || *pos == '='))
1494 {
1495 pos ++;
1496 break;
1497 }
1498 }
1499
1500 /* skip spaces */
1501 while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1502 (*pos == ' ' || *pos == '\t'))
1503 pos ++;
1504
1505 /* grab the next token */
1506 i = 0;
1507 while (pos + i - header <= SCM_ENCODING_SEARCH_SIZE
060e305a 1508 && pos + i - header < bytes_read
6d736fdb
MG
1509 && (isalnum((int) pos[i]) || pos[i] == '_' || pos[i] == '-'
1510 || pos[i] == '.'))
889975e5
MG
1511 i++;
1512
1513 if (i == 0)
1514 return NULL;
1515
b9fd657a 1516 encoding = scm_gc_strndup (pos, i, "encoding");
026ed239
MG
1517 for (i = 0; i < strlen (encoding); i++)
1518 encoding[i] = toupper ((int) encoding[i]);
889975e5
MG
1519
1520 /* push backwards to make sure we were in a comment */
1521 in_comment = 0;
1522 while (pos - i - header > 0)
1523 {
1524 if (*(pos - i) == '\n')
1525 {
1526 /* This wasn't in a semicolon comment. Check for a
1527 hash-bang comment. */
1528 char *beg = strstr (header, "#!");
1529 char *end = strstr (header, "!#");
1530 if (beg < pos && pos < end)
1531 in_comment = 1;
1532 break;
1533 }
1534 if (*(pos - i) == ';')
1535 {
1536 in_comment = 1;
1537 break;
1538 }
1539 i ++;
1540 }
1541 if (!in_comment)
f8a1c9a8
LC
1542 /* This wasn't in a comment */
1543 return NULL;
1544
889975e5
MG
1545 if (utf8_bom && strcmp(encoding, "UTF-8"))
1546 scm_misc_error (NULL,
1547 "the port input declares the encoding ~s but is encoded as UTF-8",
1548 scm_list_1 (scm_from_locale_string (encoding)));
f8a1c9a8 1549
889975e5
MG
1550 return encoding;
1551}
1552
1553SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1554 (SCM port),
1555 "Scans the port for an EMACS-like character coding declaration\n"
1556 "near the top of the contents of a port with random-acessible contents.\n"
1557 "The coding declaration is of the form\n"
1558 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1559 "\n"
1560 "Returns a string containing the character encoding of the file\n"
1561 "if a declaration was found, or @code{#f} otherwise.\n")
1562#define FUNC_NAME s_scm_file_encoding
1563{
1564 char *enc;
1565 SCM s_enc;
f8a1c9a8
LC
1566
1567 enc = scm_i_scan_for_encoding (port);
889975e5
MG
1568 if (enc == NULL)
1569 return SCM_BOOL_F;
1570 else
1571 {
1572 s_enc = scm_from_locale_string (enc);
889975e5
MG
1573 return s_enc;
1574 }
f8a1c9a8 1575
889975e5
MG
1576 return SCM_BOOL_F;
1577}
1578#undef FUNC_NAME
1579
0f2d19dd
JB
1580void
1581scm_init_read ()
0f2d19dd 1582{
14de3b42 1583 scm_read_hash_procedures =
86d31dfe 1584 SCM_VARIABLE_LOC (scm_c_define ("read-hash-procedures", SCM_EOL));
fed9c9a2 1585
62560650 1586 scm_init_opts (scm_read_options, scm_read_opts);
a0599745 1587#include "libguile/read.x"
0f2d19dd 1588}
89e00824
ML
1589
1590/*
1591 Local Variables:
1592 c-file-style: "gnu"
1593 End:
1594*/