libguile/read.c

   1 /* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009, 2010 Free Software
   2  * Foundation, Inc.
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public License
   6  * as published by the Free Software Foundation; either version 3 of
   7  * the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful, but
  10  * WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  17  * 02110-1301 USA
  18  */
  19
  20
  21 \f
  22
  23 #ifdef HAVE_CONFIG_H
  24 # include <config.h>
  25 #endif
  26
  27 #include <stdio.h>
  28 #include <ctype.h>
  29 #include <string.h>
  30 #include <unistd.h>
  31 #include <unicase.h>
  32
  33 #include "libguile/_scm.h"
  34 #include "libguile/bytevectors.h"
  35 #include "libguile/chars.h"
  36 #include "libguile/eval.h"
  37 #include "libguile/arrays.h"
  38 #include "libguile/bitvectors.h"
  39 #include "libguile/keywords.h"
  40 #include "libguile/alist.h"
  41 #include "libguile/srcprop.h"
  42 #include "libguile/hashtab.h"
  43 #include "libguile/hash.h"
  44 #include "libguile/ports.h"
  45 #include "libguile/fports.h"
  46 #include "libguile/root.h"
  47 #include "libguile/strings.h"
  48 #include "libguile/strports.h"
  49 #include "libguile/vectors.h"
  50 #include "libguile/validate.h"
  51 #include "libguile/srfi-4.h"
  52 #include "libguile/srfi-13.h"
  53
  54 #include "libguile/read.h"
  55 #include "libguile/private-options.h"
  56
  57
  58 \f
  59
  60 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
  61 SCM_SYMBOL (scm_keyword_prefix, "prefix");
  62 SCM_SYMBOL (scm_keyword_postfix, "postfix");
  63 SCM_SYMBOL (sym_nil, "nil");
  64
  65 scm_t_option scm_read_opts[] = {
  66   { SCM_OPTION_BOOLEAN, "copy", 0,
  67     "Copy source code expressions." },
  68   { SCM_OPTION_BOOLEAN, "positions", 0,
  69     "Record positions of source code expressions." },
  70   { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
  71     "Convert symbols to lower case."},
  72   { SCM_OPTION_SCM, "keywords", (unsigned long) SCM_BOOL_F,
  73     "Style of keyword recognition: #f, 'prefix or 'postfix."},
  74   { SCM_OPTION_BOOLEAN, "elisp-vectors", 0,
  75     "Support Elisp vector syntax, namely `[...]'."},
  76   { SCM_OPTION_BOOLEAN, "elisp-strings", 0,
  77     "Support `\\(' and `\\)' in strings."},
  78   { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
  79     "Use R6RS variable-length character and string hex escapes."},
  80   { SCM_OPTION_BOOLEAN, "square-brackets", 1,
  81     "Treat `[' and `]' as parentheses, for R6RS compatibility."},
  82   { 0, },
  83 };
  84
  85 /*
  86   Give meaningful error messages for errors
  87
  88   We use the format
  89
  90   FILE:LINE:COL: MESSAGE
  91   This happened in ....
  92
  93   This is not standard GNU format, but the test-suite likes the real
  94   message to be in front.
  95
  96  */
  97
  98
  99 void
 100 scm_i_input_error (char const *function,
 101                    SCM port, const char *message, SCM arg)
 102 {
 103   SCM fn = (scm_is_string (SCM_FILENAME(port))
 104             ? SCM_FILENAME(port)
 105             : scm_from_locale_string ("#<unknown port>"));
 106
 107   SCM string_port = scm_open_output_string ();
 108   SCM string = SCM_EOL;
 109   scm_simple_format (string_port,
 110                      scm_from_locale_string ("~A:~S:~S: ~A"),
 111                      scm_list_4 (fn,
 112                                  scm_from_long (SCM_LINUM (port) + 1),
 113                                  scm_from_int (SCM_COL (port) + 1),
 114                                  scm_from_locale_string (message)));
 115
 116   string = scm_get_output_string (string_port);
 117   scm_close_output_port (string_port);
 118   scm_error_scm (scm_from_locale_symbol ("read-error"),
 119                  function? scm_from_locale_string (function) : SCM_BOOL_F,
 120                  string,
 121                  arg,
 122                  SCM_BOOL_F);
 123 }
 124
 125
 126 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
 127             (SCM setting),
 128             "Option interface for the read options. Instead of using\n"
 129             "this procedure directly, use the procedures @code{read-enable},\n"
 130             "@code{read-disable}, @code{read-set!} and @code{read-options}.")
 131 #define FUNC_NAME s_scm_read_options
 132 {
 133   SCM ans = scm_options (setting,
 134                          scm_read_opts,
 135                          FUNC_NAME);
 136   if (SCM_COPY_SOURCE_P)
 137     SCM_RECORD_POSITIONS_P = 1;
 138   return ans;
 139 }
 140 #undef FUNC_NAME
 141
 142 /* An association list mapping extra hash characters to procedures.  */
 143 static SCM *scm_read_hash_procedures;
 144
 145
 146 \f
 147 /* Token readers.  */
 148
 149
 150 /* Size of the C buffer used to read symbols and numbers.  */
 151 #define READER_BUFFER_SIZE            128
 152
 153 /* Size of the C buffer used to read strings.  */
 154 #define READER_STRING_BUFFER_SIZE     512
 155
 156 /* The maximum size of Scheme character names.  */
 157 #define READER_CHAR_NAME_MAX_SIZE      50
 158
 159
 160 /* `isblank' is only in C99.  */
 161 #define CHAR_IS_BLANK_(_chr)                                    \
 162   (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n')      \
 163    || ((_chr) == '\f') || ((_chr) == '\r'))
 164
 165 #ifdef MSDOS
 166 # define CHAR_IS_BLANK(_chr)                    \
 167   ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
 168 #else
 169 # define CHAR_IS_BLANK CHAR_IS_BLANK_
 170 #endif
 171
 172
 173 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
 174    structure'').  */
 175 #define CHAR_IS_R5RS_DELIMITER(c)                               \
 176   (CHAR_IS_BLANK (c)                                            \
 177    || (c == ')') || (c == '(') || (c == ';') || (c == '"')      \
 178    || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
 179
 180 #define CHAR_IS_DELIMITER  CHAR_IS_R5RS_DELIMITER
 181
 182 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
 183    Structure''.  */
 184 #define CHAR_IS_EXPONENT_MARKER(_chr)                           \
 185   (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f')        \
 186    || ((_chr) == 'd') || ((_chr) == 'l'))
 187
 188 /* Read an SCSH block comment.  */
 189 static inline SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
 190 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
 191 static SCM scm_read_commented_expression (scm_t_wchar, SCM);
 192 static SCM scm_get_hash_procedure (int);
 193
 194 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 195    result in the pre-allocated buffer BUF.  Return zero if the whole token has
 196    fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
 197    bytes actually read.  */
 198 static inline int
 199 read_token (SCM port, char *buf, const size_t buf_size, size_t *read)
 200  {
 201    *read = 0;
 202
 203    while (*read < buf_size)
 204      {
 205        int chr;
 206
 207        chr = scm_get_byte_or_eof (port);
 208
 209        if (chr == EOF)
 210         return 0;
 211       else if (CHAR_IS_DELIMITER (chr))
 212         {
 213           scm_unget_byte (chr, port);
 214           return 0;
 215         }
 216       else
 217         {
 218           *buf = (char) chr;
 219           buf++, (*read)++;
 220         }
 221      }
 222
 223    return 1;
 224  }
 225
 226 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 227    result in the pre-allocated buffer BUFFER, if the whole token has fewer than
 228    BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
 229    caller.  Return zero if the token fits in BUFFER, non-zero otherwise. READ
 230    will be set the number of bytes actually read.  */
 231 static int
 232 read_complete_token (SCM port, char *buffer, const size_t buffer_size,
 233                            char **overflow_buffer, size_t *read)
 234 {
 235   int overflow = 0;
 236   size_t bytes_read, overflow_size;
 237
 238   *overflow_buffer = NULL;
 239   overflow_size = 0;
 240
 241   do
 242     {
 243       overflow = read_token (port, buffer, buffer_size, &bytes_read);
 244       if (bytes_read == 0)
 245         break;
 246       if (overflow || overflow_size != 0)
 247         {
 248           if (overflow_size == 0)
 249             {
 250               *overflow_buffer = scm_malloc (bytes_read);
 251               memcpy (*overflow_buffer, buffer, bytes_read);
 252               overflow_size = bytes_read;
 253             }
 254           else
 255             {
 256               *overflow_buffer = scm_realloc (*overflow_buffer, overflow_size + bytes_read);
 257               memcpy (*overflow_buffer + overflow_size, buffer, bytes_read);
 258               overflow_size += bytes_read;
 259             }
 260         }
 261     }
 262   while (overflow);
 263
 264   if (overflow_size)
 265     *read = overflow_size;
 266   else
 267     *read = bytes_read;
 268
 269   return (overflow_size != 0);
 270 }
 271
 272 /* Skip whitespace from PORT and return the first non-whitespace character
 273    read.  Raise an error on end-of-file.  */
 274 static int
 275 flush_ws (SCM port, const char *eoferr)
 276 {
 277   register scm_t_wchar c;
 278   while (1)
 279     switch (c = scm_getc (port))
 280       {
 281       case EOF:
 282       goteof:
 283         if (eoferr)
 284           {
 285             scm_i_input_error (eoferr,
 286                                port,
 287                                "end of file",
 288                                SCM_EOL);
 289           }
 290         return c;
 291
 292       case ';':
 293       lp:
 294         switch (c = scm_getc (port))
 295           {
 296           case EOF:
 297             goto goteof;
 298           default:
 299             goto lp;
 300           case SCM_LINE_INCREMENTORS:
 301             break;
 302           }
 303         break;
 304
 305       case '#':
 306         switch (c = scm_getc (port))
 307           {
 308           case EOF:
 309             eoferr = "read_sharp";
 310             goto goteof;
 311           case '!':
 312             scm_read_scsh_block_comment (c, port);
 313             break;
 314           case ';':
 315             scm_read_commented_expression (c, port);
 316             break;
 317           case '|':
 318             if (scm_is_false (scm_get_hash_procedure (c)))
 319               {
 320                 scm_read_r6rs_block_comment (c, port);
 321                 break;
 322               }
 323             /* fall through */
 324           default:
 325             scm_ungetc (c, port);
 326             return '#';
 327           }
 328         break;
 329
 330       case SCM_LINE_INCREMENTORS:
 331       case SCM_SINGLE_SPACES:
 332       case '\t':
 333         break;
 334
 335       default:
 336         return c;
 337       }
 338
 339   return 0;
 340 }
 341
 342
 343 \f
 344 /* Token readers.  */
 345
 346 static SCM scm_read_expression (SCM port);
 347 static SCM scm_read_sharp (int chr, SCM port);
 348 static SCM recsexpr (SCM obj, long line, int column, SCM filename);
 349
 350
 351 static SCM
 352 scm_read_sexp (scm_t_wchar chr, SCM port)
 353 #define FUNC_NAME "scm_i_lreadparen"
 354 {
 355   register int c;
 356   register SCM tmp;
 357   register SCM tl, ans = SCM_EOL;
 358   SCM tl2 = SCM_EOL, ans2 = SCM_EOL, copy = SCM_BOOL_F;
 359   const int terminating_char = ((chr == '[') ? ']' : ')');
 360
 361   /* Need to capture line and column numbers here. */
 362   long line = SCM_LINUM (port);
 363   int column = SCM_COL (port) - 1;
 364
 365
 366   c = flush_ws (port, FUNC_NAME);
 367   if (terminating_char == c)
 368     return SCM_EOL;
 369
 370   scm_ungetc (c, port);
 371   if (scm_is_eq (scm_sym_dot,
 372                  (tmp = scm_read_expression (port))))
 373     {
 374       ans = scm_read_expression (port);
 375       if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
 376         scm_i_input_error (FUNC_NAME, port, "missing close paren",
 377                            SCM_EOL);
 378       return ans;
 379     }
 380
 381   /* Build the head of the list structure. */
 382   ans = tl = scm_cons (tmp, SCM_EOL);
 383
 384   if (SCM_COPY_SOURCE_P)
 385     ans2 = tl2 = scm_cons (scm_is_pair (tmp)
 386                            ? copy
 387                            : tmp,
 388                            SCM_EOL);
 389
 390   while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
 391     {
 392       SCM new_tail;
 393
 394       scm_ungetc (c, port);
 395       if (scm_is_eq (scm_sym_dot,
 396                      (tmp = scm_read_expression (port))))
 397         {
 398           SCM_SETCDR (tl, tmp = scm_read_expression (port));
 399
 400           if (SCM_COPY_SOURCE_P)
 401             SCM_SETCDR (tl2, scm_cons (scm_is_pair (tmp) ? copy : tmp,
 402                                        SCM_EOL));
 403
 404           c = flush_ws (port, FUNC_NAME);
 405           if (terminating_char != c)
 406             scm_i_input_error (FUNC_NAME, port,
 407                                "in pair: missing close paren", SCM_EOL);
 408           goto exit;
 409         }
 410
 411       new_tail = scm_cons (tmp, SCM_EOL);
 412       SCM_SETCDR (tl, new_tail);
 413       tl = new_tail;
 414
 415       if (SCM_COPY_SOURCE_P)
 416         {
 417           SCM new_tail2 = scm_cons (scm_is_pair (tmp)
 418                                     ? copy
 419                                     : tmp, SCM_EOL);
 420           SCM_SETCDR (tl2, new_tail2);
 421           tl2 = new_tail2;
 422         }
 423     }
 424
 425  exit:
 426   if (SCM_RECORD_POSITIONS_P)
 427     scm_whash_insert (scm_source_whash,
 428                       ans,
 429                       scm_make_srcprops (line, column,
 430                                          SCM_FILENAME (port),
 431                                          SCM_COPY_SOURCE_P
 432                                          ? ans2
 433                                          : SCM_UNDEFINED,
 434                                          SCM_EOL));
 435   return ans;
 436 }
 437 #undef FUNC_NAME
 438
 439
 440 /* Read a hexadecimal number NDIGITS in length.  Put its value into the variable
 441    C.  If TERMINATOR is non-null, terminate early if the TERMINATOR character is
 442    found.  */
 443 #define SCM_READ_HEX_ESCAPE(ndigits, terminator)                   \
 444   do                                                               \
 445     {                                                              \
 446       scm_t_wchar a;                                               \
 447       size_t i = 0;                                                \
 448       c = 0;                                                       \
 449       while (i < ndigits)                                          \
 450         {                                                          \
 451           a = scm_getc (port);                                     \
 452           if (a == EOF)                                            \
 453             goto str_eof;                                          \
 454           if (terminator                                           \
 455               && (a == (scm_t_wchar) terminator)                   \
 456               && (i > 0))                                          \
 457             break;                                                 \
 458           if ('0' <= a && a <= '9')                                \
 459             a -= '0';                                              \
 460           else if ('A' <= a && a <= 'F')                           \
 461             a = a - 'A' + 10;                                      \
 462           else if ('a' <= a && a <= 'f')                           \
 463             a = a - 'a' + 10;                                      \
 464           else                                                     \
 465             {                                                      \
 466               c = a;                                               \
 467               goto bad_escaped;                                    \
 468             }                                                      \
 469           c = c * 16 + a;                                          \
 470           i ++;                                                    \
 471         }                                                          \
 472     } while (0)
 473
 474 static SCM
 475 scm_read_string (int chr, SCM port)
 476 #define FUNC_NAME "scm_lreadr"
 477 {
 478   /* For strings smaller than C_STR, this function creates only one Scheme
 479      object (the string returned).  */
 480
 481   SCM str = SCM_BOOL_F;
 482   unsigned c_str_len = 0;
 483   scm_t_wchar c;
 484
 485   str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
 486   while ('"' != (c = scm_getc (port)))
 487     {
 488       if (c == EOF)
 489         {
 490         str_eof:
 491           scm_i_input_error (FUNC_NAME, port,
 492                              "end of file in string constant", SCM_EOL);
 493         }
 494
 495       if (c_str_len + 1 >= scm_i_string_length (str))
 496         {
 497           SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
 498
 499           str = scm_string_append (scm_list_2 (str, addy));
 500         }
 501
 502       if (c == '\\')
 503         {
 504           switch (c = scm_getc (port))
 505             {
 506             case EOF:
 507               goto str_eof;
 508             case '"':
 509             case '\\':
 510               break;
 511             case '(':
 512             case ')':
 513               if (SCM_ESCAPED_PARENS_P)
 514                 break;
 515               goto bad_escaped;
 516             case '\n':
 517               continue;
 518             case '0':
 519               c = '\0';
 520               break;
 521             case 'f':
 522               c = '\f';
 523               break;
 524             case 'n':
 525               c = '\n';
 526               break;
 527             case 'r':
 528               c = '\r';
 529               break;
 530             case 't':
 531               c = '\t';
 532               break;
 533             case 'a':
 534               c = '\007';
 535               break;
 536             case 'v':
 537               c = '\v';
 538               break;
 539             case 'b':
 540               c = '\010';
 541               break;
 542             case 'x':
 543               if (SCM_R6RS_ESCAPES_P)
 544                 SCM_READ_HEX_ESCAPE (10, ';');
 545               else
 546                 SCM_READ_HEX_ESCAPE (2, '\0');
 547               break;
 548             case 'u':
 549               if (!SCM_R6RS_ESCAPES_P)
 550                 {
 551                   SCM_READ_HEX_ESCAPE (4, '\0');
 552                   break;
 553                 }
 554             case 'U':
 555               if (!SCM_R6RS_ESCAPES_P)
 556                 {
 557                   SCM_READ_HEX_ESCAPE (6, '\0');
 558                   break;
 559                 }
 560             default:
 561             bad_escaped:
 562               scm_i_input_error (FUNC_NAME, port,
 563                                  "illegal character in escape sequence: ~S",
 564                                  scm_list_1 (SCM_MAKE_CHAR (c)));
 565             }
 566         }
 567       str = scm_i_string_start_writing (str);
 568       scm_i_string_set_x (str, c_str_len++, c);
 569       scm_i_string_stop_writing ();
 570     }
 571
 572   if (c_str_len > 0)
 573     {
 574       return scm_i_substring_copy (str, 0, c_str_len);
 575     }
 576
 577   return scm_nullstr;
 578 }
 579 #undef FUNC_NAME
 580
 581
 582 static SCM
 583 scm_read_number (scm_t_wchar chr, SCM port)
 584 {
 585   SCM result, str = SCM_EOL;
 586   char buffer[READER_BUFFER_SIZE];
 587   char *overflow_buffer = NULL;
 588   size_t bytes_read;
 589   int overflow;
 590   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 591
 592   scm_ungetc (chr, port);
 593   overflow = read_complete_token (port, buffer, sizeof (buffer),
 594                                   &overflow_buffer, &bytes_read);
 595
 596   if (!overflow)
 597     str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
 598   else
 599     str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
 600                             pt->ilseq_handler);
 601
 602   result = scm_string_to_number (str, SCM_UNDEFINED);
 603   if (!scm_is_true (result))
 604     {
 605       /* Return a symbol instead of a number */
 606       if (SCM_CASE_INSENSITIVE_P)
 607         str = scm_string_downcase_x (str);
 608       result = scm_string_to_symbol (str);
 609     }
 610
 611   if (overflow)
 612     free (overflow_buffer);
 613   SCM_COL (port) += scm_i_string_length (str);
 614   return result;
 615 }
 616
 617 static SCM
 618 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
 619 {
 620   SCM result;
 621   int ends_with_colon = 0;
 622   size_t bytes_read;
 623   int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
 624   int overflow;
 625   char buffer[READER_BUFFER_SIZE], *overflow_buffer;
 626   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 627   SCM str;
 628
 629   scm_ungetc (chr, port);
 630   overflow = read_complete_token (port, buffer, READER_BUFFER_SIZE,
 631                                   &overflow_buffer, &bytes_read);
 632   if (bytes_read > 0)
 633     {
 634       if (!overflow)
 635         ends_with_colon = buffer[bytes_read - 1] == ':';
 636       else
 637         ends_with_colon = overflow_buffer[bytes_read - 1] == ':';
 638     }
 639
 640   if (postfix && ends_with_colon && (bytes_read > 1))
 641     {
 642       if (!overflow)
 643         str = scm_from_stringn (buffer, bytes_read - 1, pt->encoding, pt->ilseq_handler);
 644       else
 645         str = scm_from_stringn (overflow_buffer, bytes_read - 1, pt->encoding,
 646                                 pt->ilseq_handler);
 647
 648       if (SCM_CASE_INSENSITIVE_P)
 649         str = scm_string_downcase_x (str);
 650       result = scm_symbol_to_keyword (scm_string_to_symbol (str));
 651     }
 652   else
 653     {
 654       if (!overflow)
 655         str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
 656       else
 657         str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
 658                                 pt->ilseq_handler);
 659
 660       if (SCM_CASE_INSENSITIVE_P)
 661         str = scm_string_downcase_x (str);
 662       result = scm_string_to_symbol (str);
 663     }
 664
 665   if (overflow)
 666     free (overflow_buffer);
 667   SCM_COL (port) += scm_i_string_length (str);
 668   return result;
 669 }
 670
 671 static SCM
 672 scm_read_number_and_radix (scm_t_wchar chr, SCM port)
 673 #define FUNC_NAME "scm_lreadr"
 674 {
 675   SCM result;
 676   size_t read;
 677   char buffer[READER_BUFFER_SIZE], *overflow_buffer;
 678   int overflow;
 679   unsigned int radix;
 680   SCM str;
 681   scm_t_port *pt;
 682
 683   switch (chr)
 684     {
 685     case 'B':
 686     case 'b':
 687       radix = 2;
 688       break;
 689
 690     case 'o':
 691     case 'O':
 692       radix = 8;
 693       break;
 694
 695     case 'd':
 696     case 'D':
 697       radix = 10;
 698       break;
 699
 700     case 'x':
 701     case 'X':
 702       radix = 16;
 703       break;
 704
 705     default:
 706       scm_ungetc (chr, port);
 707       scm_ungetc ('#', port);
 708       radix = 10;
 709     }
 710
 711   overflow = read_complete_token (port, buffer, sizeof (buffer),
 712                                   &overflow_buffer, &read);
 713
 714   pt = SCM_PTAB_ENTRY (port);
 715   if (!overflow)
 716     str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
 717   else
 718     str = scm_from_stringn (overflow_buffer, read, pt->encoding,
 719                             pt->ilseq_handler);
 720
 721   result = scm_string_to_number (str, scm_from_uint (radix));
 722
 723   if (overflow)
 724     free (overflow_buffer);
 725
 726   SCM_COL (port) += scm_i_string_length (str);
 727
 728   if (scm_is_true (result))
 729     return result;
 730
 731   scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
 732
 733   return SCM_BOOL_F;
 734 }
 735 #undef FUNC_NAME
 736
 737 static SCM
 738 scm_read_quote (int chr, SCM port)
 739 {
 740   SCM p;
 741   long line = SCM_LINUM (port);
 742   int column = SCM_COL (port) - 1;
 743
 744   switch (chr)
 745     {
 746     case '`':
 747       p = scm_sym_quasiquote;
 748       break;
 749
 750     case '\'':
 751       p = scm_sym_quote;
 752       break;
 753
 754     case ',':
 755       {
 756         scm_t_wchar c;
 757
 758         c = scm_getc (port);
 759         if ('@' == c)
 760           p = scm_sym_uq_splicing;
 761         else
 762           {
 763             scm_ungetc (c, port);
 764             p = scm_sym_unquote;
 765           }
 766         break;
 767       }
 768
 769     default:
 770       fprintf (stderr, "%s: unhandled quote character (%i)\n",
 771                "scm_read_quote", chr);
 772       abort ();
 773     }
 774
 775   p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
 776   if (SCM_RECORD_POSITIONS_P)
 777     scm_whash_insert (scm_source_whash, p,
 778                       scm_make_srcprops (line, column,
 779                                          SCM_FILENAME (port),
 780                                          SCM_COPY_SOURCE_P
 781                                          ? (scm_cons2 (SCM_CAR (p),
 782                                                        SCM_CAR (SCM_CDR (p)),
 783                                                        SCM_EOL))
 784                                          : SCM_UNDEFINED,
 785                                          SCM_EOL));
 786
 787
 788   return p;
 789 }
 790
 791 SCM_SYMBOL (sym_syntax, "syntax");
 792 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
 793 SCM_SYMBOL (sym_unsyntax, "unsyntax");
 794 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
 795
 796 static SCM
 797 scm_read_syntax (int chr, SCM port)
 798 {
 799   SCM p;
 800   long line = SCM_LINUM (port);
 801   int column = SCM_COL (port) - 1;
 802
 803   switch (chr)
 804     {
 805     case '`':
 806       p = sym_quasisyntax;
 807       break;
 808
 809     case '\'':
 810       p = sym_syntax;
 811       break;
 812
 813     case ',':
 814       {
 815         int c;
 816
 817         c = scm_getc (port);
 818         if ('@' == c)
 819           p = sym_unsyntax_splicing;
 820         else
 821           {
 822             scm_ungetc (c, port);
 823             p = sym_unsyntax;
 824           }
 825         break;
 826       }
 827
 828     default:
 829       fprintf (stderr, "%s: unhandled syntax character (%i)\n",
 830                "scm_read_syntax", chr);
 831       abort ();
 832     }
 833
 834   p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
 835   if (SCM_RECORD_POSITIONS_P)
 836     scm_whash_insert (scm_source_whash, p,
 837                       scm_make_srcprops (line, column,
 838                                          SCM_FILENAME (port),
 839                                          SCM_COPY_SOURCE_P
 840                                          ? (scm_cons2 (SCM_CAR (p),
 841                                                        SCM_CAR (SCM_CDR (p)),
 842                                                        SCM_EOL))
 843                                          : SCM_UNDEFINED,
 844                                          SCM_EOL));
 845
 846
 847   return p;
 848 }
 849
 850 static inline SCM
 851 scm_read_nil (int chr, SCM port)
 852 {
 853   SCM id = scm_read_mixed_case_symbol (chr, port);
 854
 855   if (!scm_is_eq (id, sym_nil))
 856     scm_i_input_error ("scm_read_nil", port,
 857                        "unexpected input while reading #nil: ~a",
 858                        scm_list_1 (id));
 859
 860   return SCM_ELISP_NIL;
 861 }
 862
 863 static inline SCM
 864 scm_read_semicolon_comment (int chr, SCM port)
 865 {
 866   int c;
 867
 868   /* We use the get_byte here because there is no need to get the
 869      locale correct with comment input. This presumes that newline
 870      always represents itself no matter what the encoding is.  */
 871   for (c = scm_get_byte_or_eof (port);
 872        (c != EOF) && (c != '\n');
 873        c = scm_get_byte_or_eof (port));
 874
 875   return SCM_UNSPECIFIED;
 876 }
 877
 878 \f
 879 /* Sharp readers, i.e. readers called after a `#' sign has been read.  */
 880
 881 static SCM
 882 scm_read_boolean (int chr, SCM port)
 883 {
 884   switch (chr)
 885     {
 886     case 't':
 887     case 'T':
 888       return SCM_BOOL_T;
 889
 890     case 'f':
 891     case 'F':
 892       return SCM_BOOL_F;
 893     }
 894
 895   return SCM_UNSPECIFIED;
 896 }
 897
 898 static SCM
 899 scm_read_character (scm_t_wchar chr, SCM port)
 900 #define FUNC_NAME "scm_lreadr"
 901 {
 902   char buffer[READER_CHAR_NAME_MAX_SIZE];
 903   SCM charname;
 904   size_t charname_len, bytes_read;
 905   scm_t_wchar cp;
 906   int overflow;
 907   scm_t_port *pt;
 908
 909   overflow = read_token (port, buffer, READER_CHAR_NAME_MAX_SIZE, &bytes_read);
 910   if (overflow)
 911     goto char_error;
 912
 913   if (bytes_read == 0)
 914     {
 915       chr = scm_getc (port);
 916       if (chr == EOF)
 917         scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
 918                            "while reading character", SCM_EOL);
 919
 920       /* CHR must be a token delimiter, like a whitespace.  */
 921       return (SCM_MAKE_CHAR (chr));
 922     }
 923
 924   pt = SCM_PTAB_ENTRY (port);
 925
 926   /* Simple ASCII characters can be processed immediately.  Also, simple
 927      ISO-8859-1 characters can be processed immediately if the encoding for this
 928      port is ISO-8859-1.  */
 929   if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
 930     {
 931       SCM_COL (port) += 1;
 932       return SCM_MAKE_CHAR (buffer[0]);
 933     }
 934
 935   /* Otherwise, convert the buffer into a proper scheme string for
 936      processing.  */
 937   charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
 938                                pt->ilseq_handler);
 939   charname_len = scm_i_string_length (charname);
 940   SCM_COL (port) += charname_len;
 941   cp = scm_i_string_ref (charname, 0);
 942   if (charname_len == 1)
 943     return SCM_MAKE_CHAR (cp);
 944
 945   /* Ignore dotted circles, which may be used to keep combining characters from
 946      combining with the backslash in #\charname.  */
 947   if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
 948     return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
 949
 950   if (cp >= '0' && cp < '8')
 951     {
 952       /* Dirk:FIXME::  This type of character syntax is not R5RS
 953        * compliant.  Further, it should be verified that the constant
 954        * does only consist of octal digits.  */
 955       SCM p = scm_string_to_number (charname, scm_from_uint (8));
 956       if (SCM_I_INUMP (p))
 957         {
 958           scm_t_wchar c = SCM_I_INUM (p);
 959           if (SCM_IS_UNICODE_CHAR (c))
 960             return SCM_MAKE_CHAR (c);
 961           else
 962             scm_i_input_error (FUNC_NAME, port,
 963                                "out-of-range octal character escape: ~a",
 964                                scm_list_1 (charname));
 965         }
 966     }
 967
 968   if (cp == 'x' && (charname_len > 1) && SCM_R6RS_ESCAPES_P)
 969     {
 970       SCM p;
 971
 972       /* Convert from hex, skipping the initial 'x' character in CHARNAME */
 973       p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
 974                                 scm_from_uint (16));
 975       if (SCM_I_INUMP (p))
 976         {
 977           scm_t_wchar c = SCM_I_INUM (p);
 978           if (SCM_IS_UNICODE_CHAR (c))
 979             return SCM_MAKE_CHAR (c);
 980           else
 981             scm_i_input_error (FUNC_NAME, port,
 982                                "out-of-range hex character escape: ~a",
 983                                scm_list_1 (charname));
 984         }
 985     }
 986
 987   /* The names of characters should never have non-Latin1
 988      characters.  */
 989   if (scm_i_is_narrow_string (charname)
 990       || scm_i_try_narrow_string (charname))
 991     { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
 992                                        charname_len);
 993       if (scm_is_true (ch))
 994         return ch;
 995     }
 996
 997  char_error:
 998   scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
 999                      scm_list_1 (charname));
1000
1001   return SCM_UNSPECIFIED;
1002 }
1003 #undef FUNC_NAME
1004
1005 static inline SCM
1006 scm_read_keyword (int chr, SCM port)
1007 {
1008   SCM symbol;
1009
1010   /* Read the symbol that comprises the keyword.  Doing this instead of
1011      invoking a specific symbol reader function allows `scm_read_keyword ()'
1012      to adapt to the delimiters currently valid of symbols.
1013
1014      XXX: This implementation allows sloppy syntaxes like `#:  key'.  */
1015   symbol = scm_read_expression (port);
1016   if (!scm_is_symbol (symbol))
1017     scm_i_input_error ("scm_read_keyword", port,
1018                        "keyword prefix `~a' not followed by a symbol: ~s",
1019                        scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1020
1021   return (scm_symbol_to_keyword (symbol));
1022 }
1023
1024 static inline SCM
1025 scm_read_vector (int chr, SCM port)
1026 {
1027   /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1028      guarantee that it's going to do what we want.  After all, this is an
1029      implementation detail of `scm_read_vector ()', not a desirable
1030      property.  */
1031   return (scm_vector (scm_read_sexp (chr, port)));
1032 }
1033
1034 static inline SCM
1035 scm_read_srfi4_vector (int chr, SCM port)
1036 {
1037   return scm_i_read_array (port, chr);
1038 }
1039
1040 static SCM
1041 scm_read_bytevector (scm_t_wchar chr, SCM port)
1042 {
1043   chr = scm_getc (port);
1044   if (chr != 'u')
1045     goto syntax;
1046
1047   chr = scm_getc (port);
1048   if (chr != '8')
1049     goto syntax;
1050
1051   chr = scm_getc (port);
1052   if (chr != '(')
1053     goto syntax;
1054
1055   return scm_u8_list_to_bytevector (scm_read_sexp (chr, port));
1056
1057  syntax:
1058   scm_i_input_error ("read_bytevector", port,
1059                      "invalid bytevector prefix",
1060                      SCM_MAKE_CHAR (chr));
1061   return SCM_UNSPECIFIED;
1062 }
1063
1064 static SCM
1065 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port)
1066 {
1067   /* Read the `#*10101'-style read syntax for bit vectors in Guile.  This is
1068      terribly inefficient but who cares?  */
1069   SCM s_bits = SCM_EOL;
1070
1071   for (chr = scm_getc (port);
1072        (chr != EOF) && ((chr == '0') || (chr == '1'));
1073        chr = scm_getc (port))
1074     {
1075       s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1076     }
1077
1078   if (chr != EOF)
1079     scm_ungetc (chr, port);
1080
1081   return scm_bitvector (scm_reverse_x (s_bits, SCM_EOL));
1082 }
1083
1084 static inline SCM
1085 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1086 {
1087   int bang_seen = 0;
1088
1089   /* We can use the get_byte here because there is no need to get the
1090      locale correct when reading comments. This presumes that
1091      hash and exclamation points always represent themselves no
1092      matter what the source encoding is.*/
1093   for (;;)
1094     {
1095       int c = scm_get_byte_or_eof (port);
1096
1097       if (c == EOF)
1098         scm_i_input_error ("skip_block_comment", port,
1099                            "unterminated `#! ... !#' comment", SCM_EOL);
1100
1101       if (c == '!')
1102         bang_seen = 1;
1103       else if (c == '#' && bang_seen)
1104         break;
1105       else
1106         bang_seen = 0;
1107     }
1108
1109   return SCM_UNSPECIFIED;
1110 }
1111
1112 static SCM
1113 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1114 {
1115   /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1116      nested.  So care must be taken.  */
1117   int nesting_level = 1;
1118   int opening_seen = 0, closing_seen = 0;
1119
1120   while (nesting_level > 0)
1121     {
1122       int c = scm_getc (port);
1123
1124       if (c == EOF)
1125         scm_i_input_error ("scm_read_r6rs_block_comment", port,
1126                            "unterminated `#| ... |#' comment", SCM_EOL);
1127
1128       if (opening_seen)
1129         {
1130           if (c == '|')
1131             nesting_level++;
1132           opening_seen = 0;
1133         }
1134       else if (closing_seen)
1135         {
1136           if (c == '#')
1137             nesting_level--;
1138           closing_seen = 0;
1139         }
1140       else if (c == '|')
1141         closing_seen = 1;
1142       else if (c == '#')
1143         opening_seen = 1;
1144       else
1145         opening_seen = closing_seen = 0;
1146     }
1147
1148   return SCM_UNSPECIFIED;
1149 }
1150
1151 static SCM
1152 scm_read_commented_expression (scm_t_wchar chr, SCM port)
1153 {
1154   scm_t_wchar c;
1155
1156   c = flush_ws (port, (char *) NULL);
1157   if (EOF == c)
1158     scm_i_input_error ("read_commented_expression", port,
1159                        "no expression after #; comment", SCM_EOL);
1160   scm_ungetc (c, port);
1161   scm_read_expression (port);
1162   return SCM_UNSPECIFIED;
1163 }
1164
1165 static SCM
1166 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1167 {
1168   /* Guile's extended symbol read syntax looks like this:
1169
1170        #{This is all a symbol name}#
1171
1172      So here, CHR is expected to be `{'.  */
1173   int saw_brace = 0, finished = 0;
1174   size_t len = 0;
1175   SCM buf = scm_i_make_string (1024, NULL);
1176
1177   buf = scm_i_string_start_writing (buf);
1178
1179   while ((chr = scm_getc (port)) != EOF)
1180     {
1181       if (saw_brace)
1182         {
1183           if (chr == '#')
1184             {
1185               finished = 1;
1186               break;
1187             }
1188           else
1189             {
1190               saw_brace = 0;
1191               scm_i_string_set_x (buf, len++, '}');
1192               scm_i_string_set_x (buf, len++, chr);
1193             }
1194         }
1195       else if (chr == '}')
1196         saw_brace = 1;
1197       else
1198         scm_i_string_set_x (buf, len++, chr);
1199
1200       if (len >= scm_i_string_length (buf) - 2)
1201         {
1202           SCM addy;
1203
1204           scm_i_string_stop_writing ();
1205           addy = scm_i_make_string (1024, NULL);
1206           buf = scm_string_append (scm_list_2 (buf, addy));
1207           len = 0;
1208           buf = scm_i_string_start_writing (buf);
1209         }
1210
1211       if (finished)
1212         break;
1213     }
1214   scm_i_string_stop_writing ();
1215
1216   return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1217 }
1218
1219
1220 \f
1221 /* Top-level token readers, i.e., dispatchers.  */
1222
1223 static SCM
1224 scm_read_sharp_extension (int chr, SCM port)
1225 {
1226   SCM proc;
1227
1228   proc = scm_get_hash_procedure (chr);
1229   if (scm_is_true (scm_procedure_p (proc)))
1230     {
1231       long line = SCM_LINUM (port);
1232       int column = SCM_COL (port) - 2;
1233       SCM got;
1234
1235       got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1236       if (!scm_is_eq (got, SCM_UNSPECIFIED))
1237         {
1238           if (SCM_RECORD_POSITIONS_P)
1239             return (recsexpr (got, line, column,
1240                               SCM_FILENAME (port)));
1241           else
1242             return got;
1243         }
1244     }
1245
1246   return SCM_UNSPECIFIED;
1247 }
1248
1249 /* The reader for the sharp `#' character.  It basically dispatches reads
1250    among the above token readers.   */
1251 static SCM
1252 scm_read_sharp (scm_t_wchar chr, SCM port)
1253 #define FUNC_NAME "scm_lreadr"
1254 {
1255   SCM result;
1256
1257   chr = scm_getc (port);
1258
1259   result = scm_read_sharp_extension (chr, port);
1260   if (!scm_is_eq (result, SCM_UNSPECIFIED))
1261     return result;
1262
1263   switch (chr)
1264     {
1265     case '\\':
1266       return (scm_read_character (chr, port));
1267     case '(':
1268       return (scm_read_vector (chr, port));
1269     case 's':
1270     case 'u':
1271     case 'f':
1272       /* This one may return either a boolean or an SRFI-4 vector.  */
1273       return (scm_read_srfi4_vector (chr, port));
1274     case 'v':
1275       return (scm_read_bytevector (chr, port));
1276     case '*':
1277       return (scm_read_guile_bit_vector (chr, port));
1278     case 't':
1279     case 'T':
1280     case 'F':
1281       /* This one may return either a boolean or an SRFI-4 vector.  */
1282       return (scm_read_boolean (chr, port));
1283     case ':':
1284       return (scm_read_keyword (chr, port));
1285     case '0': case '1': case '2': case '3': case '4':
1286     case '5': case '6': case '7': case '8': case '9':
1287     case '@':
1288 #if SCM_ENABLE_DEPRECATED
1289       /* See below for 'i' and 'e'. */
1290     case 'a':
1291     case 'c':
1292     case 'y':
1293     case 'h':
1294     case 'l':
1295 #endif
1296       return (scm_i_read_array (port, chr));
1297
1298     case 'i':
1299     case 'e':
1300 #if SCM_ENABLE_DEPRECATED
1301       {
1302         /* When next char is '(', it really is an old-style
1303            uniform array. */
1304         scm_t_wchar next_c = scm_getc (port);
1305         if (next_c != EOF)
1306           scm_ungetc (next_c, port);
1307         if (next_c == '(')
1308           return scm_i_read_array (port, chr);
1309         /* Fall through. */
1310       }
1311 #endif
1312     case 'b':
1313     case 'B':
1314     case 'o':
1315     case 'O':
1316     case 'd':
1317     case 'D':
1318     case 'x':
1319     case 'X':
1320     case 'I':
1321     case 'E':
1322       return (scm_read_number_and_radix (chr, port));
1323     case '{':
1324       return (scm_read_extended_symbol (chr, port));
1325     case '!':
1326       return (scm_read_scsh_block_comment (chr, port));
1327     case ';':
1328       return (scm_read_commented_expression (chr, port));
1329     case '`':
1330     case '\'':
1331     case ',':
1332       return (scm_read_syntax (chr, port));
1333     case 'n':
1334       return (scm_read_nil (chr, port));
1335     default:
1336       result = scm_read_sharp_extension (chr, port);
1337       if (scm_is_eq (result, SCM_UNSPECIFIED))
1338         {
1339           /* To remain compatible with 1.8 and earlier, the following
1340              characters have lower precedence than `read-hash-extend'
1341              characters.  */
1342           switch (chr)
1343             {
1344             case '|':
1345               return scm_read_r6rs_block_comment (chr, port);
1346             default:
1347               scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1348                                  scm_list_1 (SCM_MAKE_CHAR (chr)));
1349             }
1350         }
1351       else
1352         return result;
1353     }
1354
1355   return SCM_UNSPECIFIED;
1356 }
1357 #undef FUNC_NAME
1358
1359 static SCM
1360 scm_read_expression (SCM port)
1361 #define FUNC_NAME "scm_read_expression"
1362 {
1363   while (1)
1364     {
1365       register scm_t_wchar chr;
1366
1367       chr = scm_getc (port);
1368
1369       switch (chr)
1370         {
1371         case SCM_WHITE_SPACES:
1372         case SCM_LINE_INCREMENTORS:
1373           break;
1374         case ';':
1375           (void) scm_read_semicolon_comment (chr, port);
1376           break;
1377         case '[':
1378           if (!SCM_SQUARE_BRACKETS_P)
1379             return (scm_read_mixed_case_symbol (chr, port));
1380           /* otherwise fall through */
1381         case '(':
1382           return (scm_read_sexp (chr, port));
1383         case '"':
1384           return (scm_read_string (chr, port));
1385         case '\'':
1386         case '`':
1387         case ',':
1388           return (scm_read_quote (chr, port));
1389         case '#':
1390           {
1391             SCM result;
1392             result = scm_read_sharp (chr, port);
1393             if (scm_is_eq (result, SCM_UNSPECIFIED))
1394               /* We read a comment or some such.  */
1395               break;
1396             else
1397               return result;
1398           }
1399         case ')':
1400           scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1401           break;
1402         case EOF:
1403           return SCM_EOF_VAL;
1404         case ':':
1405           if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1406             return scm_symbol_to_keyword (scm_read_expression (port));
1407           /* Fall through.  */
1408
1409         default:
1410           {
1411             if (((chr >= '0') && (chr <= '9'))
1412                 || (strchr ("+-.", chr)))
1413               return (scm_read_number (chr, port));
1414             else
1415               return (scm_read_mixed_case_symbol (chr, port));
1416           }
1417         }
1418     }
1419 }
1420 #undef FUNC_NAME
1421
1422 \f
1423 /* Actual reader.  */
1424
1425 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1426             (SCM port),
1427             "Read an s-expression from the input port @var{port}, or from\n"
1428             "the current input port if @var{port} is not specified.\n"
1429             "Any whitespace before the next token is discarded.")
1430 #define FUNC_NAME s_scm_read
1431 {
1432   int c;
1433
1434   if (SCM_UNBNDP (port))
1435     port = scm_current_input_port ();
1436   SCM_VALIDATE_OPINPORT (1, port);
1437
1438   c = flush_ws (port, (char *) NULL);
1439   if (EOF == c)
1440     return SCM_EOF_VAL;
1441   scm_ungetc (c, port);
1442
1443   return (scm_read_expression (port));
1444 }
1445 #undef FUNC_NAME
1446
1447
1448 \f
1449
1450 /* Used when recording expressions constructed by `scm_read_sharp ()'.  */
1451 static SCM
1452 recsexpr (SCM obj, long line, int column, SCM filename)
1453 {
1454   if (!scm_is_pair(obj)) {
1455     return obj;
1456   } else {
1457     SCM tmp = obj, copy;
1458     /* If this sexpr is visible in the read:sharp source, we want to
1459        keep that information, so only record non-constant cons cells
1460        which haven't previously been read by the reader. */
1461     if (scm_is_false (scm_whash_lookup (scm_source_whash, obj)))
1462       {
1463         if (SCM_COPY_SOURCE_P)
1464           {
1465             copy = scm_cons (recsexpr (SCM_CAR (obj), line, column, filename),
1466                              SCM_UNDEFINED);
1467             while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1468               {
1469                 SCM_SETCDR (copy, scm_cons (recsexpr (SCM_CAR (tmp),
1470                                                       line,
1471                                                       column,
1472                                                       filename),
1473                                             SCM_UNDEFINED));
1474                 copy = SCM_CDR (copy);
1475               }
1476             SCM_SETCDR (copy, tmp);
1477           }
1478         else
1479           {
1480             recsexpr (SCM_CAR (obj), line, column, filename);
1481             while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1482               recsexpr (SCM_CAR (tmp), line, column, filename);
1483             copy = SCM_UNDEFINED;
1484           }
1485         scm_whash_insert (scm_source_whash,
1486                           obj,
1487                           scm_make_srcprops (line,
1488                                              column,
1489                                              filename,
1490                                              copy,
1491                                              SCM_EOL));
1492       }
1493     return obj;
1494   }
1495 }
1496
1497 /* Manipulate the read-hash-procedures alist.  This could be written in
1498    Scheme, but maybe it will also be used by C code during initialisation.  */
1499 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1500             (SCM chr, SCM proc),
1501             "Install the procedure @var{proc} for reading expressions\n"
1502             "starting with the character sequence @code{#} and @var{chr}.\n"
1503             "@var{proc} will be called with two arguments:  the character\n"
1504             "@var{chr} and the port to read further data from. The object\n"
1505             "returned will be the return value of @code{read}. \n"
1506             "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1507             )
1508 #define FUNC_NAME s_scm_read_hash_extend
1509 {
1510   SCM this;
1511   SCM prev;
1512
1513   SCM_VALIDATE_CHAR (1, chr);
1514   SCM_ASSERT (scm_is_false (proc)
1515               || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1516               proc, SCM_ARG2, FUNC_NAME);
1517
1518   /* Check if chr is already in the alist.  */
1519   this = *scm_read_hash_procedures;
1520   prev = SCM_BOOL_F;
1521   while (1)
1522     {
1523       if (scm_is_null (this))
1524         {
1525           /* not found, so add it to the beginning.  */
1526           if (scm_is_true (proc))
1527             {
1528               *scm_read_hash_procedures =
1529                 scm_cons (scm_cons (chr, proc), *scm_read_hash_procedures);
1530             }
1531           break;
1532         }
1533       if (scm_is_eq (chr, SCM_CAAR (this)))
1534         {
1535           /* already in the alist.  */
1536           if (scm_is_false (proc))
1537             {
1538               /* remove it.  */
1539               if (scm_is_false (prev))
1540                 {
1541                   *scm_read_hash_procedures =
1542                     SCM_CDR (*scm_read_hash_procedures);
1543                 }
1544               else
1545                 scm_set_cdr_x (prev, SCM_CDR (this));
1546             }
1547           else
1548             {
1549               /* replace it.  */
1550               scm_set_cdr_x (SCM_CAR (this), proc);
1551             }
1552           break;
1553         }
1554       prev = this;
1555       this = SCM_CDR (this);
1556     }
1557
1558   return SCM_UNSPECIFIED;
1559 }
1560 #undef FUNC_NAME
1561
1562 /* Recover the read-hash procedure corresponding to char c.  */
1563 static SCM
1564 scm_get_hash_procedure (int c)
1565 {
1566   SCM rest = *scm_read_hash_procedures;
1567
1568   while (1)
1569     {
1570       if (scm_is_null (rest))
1571         return SCM_BOOL_F;
1572
1573       if (SCM_CHAR (SCM_CAAR (rest)) == c)
1574         return SCM_CDAR (rest);
1575
1576       rest = SCM_CDR (rest);
1577     }
1578 }
1579
1580 #define SCM_ENCODING_SEARCH_SIZE (500)
1581
1582 /* Search the first few hundred characters of a file for an Emacs-like coding
1583    declaration.  Returns either NULL or a string whose storage has been
1584    allocated with `scm_gc_malloc ()'.  */
1585 char *
1586 scm_i_scan_for_encoding (SCM port)
1587 {
1588   char header[SCM_ENCODING_SEARCH_SIZE+1];
1589   size_t bytes_read;
1590   char *encoding = NULL;
1591   int utf8_bom = 0;
1592   char *pos;
1593   int i;
1594   int in_comment;
1595
1596   if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1597     /* PORT is a non-seekable file port (e.g., as created by Bash when using
1598        "guile <(echo '(display "hello")')") so bail out.  */
1599     return NULL;
1600
1601   bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1602
1603   scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1604
1605   if (bytes_read > 3
1606       && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1607     utf8_bom = 1;
1608
1609   /* search past "coding[:=]" */
1610   pos = header;
1611   while (1)
1612     {
1613       if ((pos = strstr(pos, "coding")) == NULL)
1614         return NULL;
1615
1616       pos += strlen("coding");
1617       if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1618           (*pos == ':' || *pos == '='))
1619         {
1620           pos ++;
1621           break;
1622         }
1623     }
1624
1625   /* skip spaces */
1626   while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1627          (*pos == ' ' || *pos == '\t'))
1628     pos ++;
1629
1630   /* grab the next token */
1631   i = 0;
1632   while (pos + i - header <= SCM_ENCODING_SEARCH_SIZE
1633          && pos + i - header < bytes_read
1634          && (isalnum ((int) pos[i]) || strchr ("_-.:/,+=()", pos[i]) != NULL))
1635     i++;
1636
1637   if (i == 0)
1638     return NULL;
1639
1640   encoding = scm_gc_strndup (pos, i, "encoding");
1641   for (i = 0; i < strlen (encoding); i++)
1642     encoding[i] = toupper ((int) encoding[i]);
1643
1644   /* push backwards to make sure we were in a comment */
1645   in_comment = 0;
1646   while (pos - i - header > 0)
1647     {
1648       if (*(pos - i) == '\n')
1649         {
1650           /* This wasn't in a semicolon comment. Check for a
1651            hash-bang comment. */
1652           char *beg = strstr (header, "#!");
1653           char *end = strstr (header, "!#");
1654           if (beg < pos && pos < end)
1655             in_comment = 1;
1656           break;
1657         }
1658       if (*(pos - i) == ';')
1659         {
1660           in_comment = 1;
1661           break;
1662         }
1663       i ++;
1664     }
1665   if (!in_comment)
1666     /* This wasn't in a comment */
1667     return NULL;
1668
1669   if (utf8_bom && strcmp(encoding, "UTF-8"))
1670     scm_misc_error (NULL,
1671                     "the port input declares the encoding ~s but is encoded as UTF-8",
1672                     scm_list_1 (scm_from_locale_string (encoding)));
1673
1674   return encoding;
1675 }
1676
1677 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1678             (SCM port),
1679             "Scans the port for an Emacs-like character coding declaration\n"
1680             "near the top of the contents of a port with random-acessible contents.\n"
1681             "The coding declaration is of the form\n"
1682             "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1683             "\n"
1684             "Returns a string containing the character encoding of the file\n"
1685             "if a declaration was found, or @code{#f} otherwise.\n")
1686 #define FUNC_NAME s_scm_file_encoding
1687 {
1688   char *enc;
1689   SCM s_enc;
1690
1691   enc = scm_i_scan_for_encoding (port);
1692   if (enc == NULL)
1693     return SCM_BOOL_F;
1694   else
1695     {
1696       s_enc = scm_from_locale_string (enc);
1697       return s_enc;
1698     }
1699
1700   return SCM_BOOL_F;
1701 }
1702 #undef FUNC_NAME
1703
1704 void
1705 scm_init_read ()
1706 {
1707   scm_read_hash_procedures =
1708     SCM_VARIABLE_LOC (scm_c_define ("read-hash-procedures", SCM_EOL));
1709
1710   scm_init_opts (scm_read_options, scm_read_opts);
1711 #include "libguile/read.x"
1712 }
1713
1714 /*
1715   Local Variables:
1716   c-file-style: "gnu"
1717   End:
1718 */