libguile/read.c

   1 /* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
   2  * Foundation, Inc.
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public License
   6  * as published by the Free Software Foundation; either version 3 of
   7  * the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful, but
  10  * WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  17  * 02110-1301 USA
  18  */
  19
  20
  21 \f
  22
  23 #ifdef HAVE_CONFIG_H
  24 # include <config.h>
  25 #endif
  26
  27 #include <stdio.h>
  28 #include <ctype.h>
  29 #include <string.h>
  30 #include <unistd.h>
  31 #include <unicase.h>
  32 #include <unictype.h>
  33
  34 #include "libguile/_scm.h"
  35 #include "libguile/bytevectors.h"
  36 #include "libguile/chars.h"
  37 #include "libguile/eval.h"
  38 #include "libguile/arrays.h"
  39 #include "libguile/bitvectors.h"
  40 #include "libguile/keywords.h"
  41 #include "libguile/alist.h"
  42 #include "libguile/srcprop.h"
  43 #include "libguile/hashtab.h"
  44 #include "libguile/hash.h"
  45 #include "libguile/ports.h"
  46 #include "libguile/fports.h"
  47 #include "libguile/root.h"
  48 #include "libguile/strings.h"
  49 #include "libguile/strports.h"
  50 #include "libguile/vectors.h"
  51 #include "libguile/validate.h"
  52 #include "libguile/srfi-4.h"
  53 #include "libguile/srfi-13.h"
  54
  55 #include "libguile/read.h"
  56 #include "libguile/private-options.h"
  57
  58
  59 \f
  60
  61 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
  62 SCM_SYMBOL (scm_keyword_prefix, "prefix");
  63 SCM_SYMBOL (scm_keyword_postfix, "postfix");
  64 SCM_SYMBOL (sym_nil, "nil");
  65
  66 scm_t_option scm_read_opts[] = {
  67   { SCM_OPTION_BOOLEAN, "copy", 0,
  68     "Copy source code expressions." },
  69   { SCM_OPTION_BOOLEAN, "positions", 0,
  70     "Record positions of source code expressions." },
  71   { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
  72     "Convert symbols to lower case."},
  73   { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F,
  74     "Style of keyword recognition: #f, 'prefix or 'postfix."},
  75   { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
  76     "Use R6RS variable-length character and string hex escapes."},
  77   { SCM_OPTION_BOOLEAN, "square-brackets", 1,
  78     "Treat `[' and `]' as parentheses, for R6RS compatibility."},
  79   { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
  80     "In strings, consume leading whitespace after an escaped end-of-line."},
  81   { 0, },
  82 };
  83
  84 /*
  85   Give meaningful error messages for errors
  86
  87   We use the format
  88
  89   FILE:LINE:COL: MESSAGE
  90   This happened in ....
  91
  92   This is not standard GNU format, but the test-suite likes the real
  93   message to be in front.
  94
  95  */
  96
  97
  98 void
  99 scm_i_input_error (char const *function,
 100                    SCM port, const char *message, SCM arg)
 101 {
 102   SCM fn = (scm_is_string (SCM_FILENAME(port))
 103             ? SCM_FILENAME(port)
 104             : scm_from_locale_string ("#<unknown port>"));
 105
 106   SCM string_port = scm_open_output_string ();
 107   SCM string = SCM_EOL;
 108   scm_simple_format (string_port,
 109                      scm_from_locale_string ("~A:~S:~S: ~A"),
 110                      scm_list_4 (fn,
 111                                  scm_from_long (SCM_LINUM (port) + 1),
 112                                  scm_from_int (SCM_COL (port) + 1),
 113                                  scm_from_locale_string (message)));
 114
 115   string = scm_get_output_string (string_port);
 116   scm_close_output_port (string_port);
 117   scm_error_scm (scm_from_latin1_symbol ("read-error"),
 118                  function? scm_from_locale_string (function) : SCM_BOOL_F,
 119                  string,
 120                  arg,
 121                  SCM_BOOL_F);
 122 }
 123
 124
 125 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
 126             (SCM setting),
 127             "Option interface for the read options. Instead of using\n"
 128             "this procedure directly, use the procedures @code{read-enable},\n"
 129             "@code{read-disable}, @code{read-set!} and @code{read-options}.")
 130 #define FUNC_NAME s_scm_read_options
 131 {
 132   SCM ans = scm_options (setting,
 133                          scm_read_opts,
 134                          FUNC_NAME);
 135   if (SCM_COPY_SOURCE_P)
 136     SCM_RECORD_POSITIONS_P = 1;
 137   return ans;
 138 }
 139 #undef FUNC_NAME
 140
 141 /* A fluid referring to an association list mapping extra hash
 142    characters to procedures.  */
 143 static SCM *scm_i_read_hash_procedures;
 144
 145 static inline SCM
 146 scm_i_read_hash_procedures_ref (void)
 147 {
 148   return scm_fluid_ref (*scm_i_read_hash_procedures);
 149 }
 150
 151 static inline void
 152 scm_i_read_hash_procedures_set_x (SCM value)
 153 {
 154   scm_fluid_set_x (*scm_i_read_hash_procedures, value);
 155 }
 156
 157 \f
 158 /* Token readers.  */
 159
 160
 161 /* Size of the C buffer used to read symbols and numbers.  */
 162 #define READER_BUFFER_SIZE            128
 163
 164 /* Size of the C buffer used to read strings.  */
 165 #define READER_STRING_BUFFER_SIZE     512
 166
 167 /* The maximum size of Scheme character names.  */
 168 #define READER_CHAR_NAME_MAX_SIZE      50
 169
 170
 171 /* `isblank' is only in C99.  */
 172 #define CHAR_IS_BLANK_(_chr)                                    \
 173   (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n')      \
 174    || ((_chr) == '\f') || ((_chr) == '\r'))
 175
 176 #ifdef MSDOS
 177 # define CHAR_IS_BLANK(_chr)                    \
 178   ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
 179 #else
 180 # define CHAR_IS_BLANK CHAR_IS_BLANK_
 181 #endif
 182
 183
 184 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
 185    structure'').  */
 186 #define CHAR_IS_R5RS_DELIMITER(c)                               \
 187   (CHAR_IS_BLANK (c)                                            \
 188    || (c == ')') || (c == '(') || (c == ';') || (c == '"')      \
 189    || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
 190
 191 #define CHAR_IS_DELIMITER  CHAR_IS_R5RS_DELIMITER
 192
 193 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
 194    Structure''.  */
 195 #define CHAR_IS_EXPONENT_MARKER(_chr)                           \
 196   (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f')        \
 197    || ((_chr) == 'd') || ((_chr) == 'l'))
 198
 199 /* Read an SCSH block comment.  */
 200 static inline SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
 201 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
 202 static SCM scm_read_commented_expression (scm_t_wchar, SCM);
 203 static SCM scm_read_shebang (scm_t_wchar, SCM);
 204 static SCM scm_get_hash_procedure (int);
 205
 206 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 207    result in the pre-allocated buffer BUF.  Return zero if the whole token has
 208    fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
 209    bytes actually read.  */
 210 static inline int
 211 read_token (SCM port, char *buf, const size_t buf_size, size_t *read)
 212  {
 213    *read = 0;
 214
 215    while (*read < buf_size)
 216      {
 217        int chr;
 218
 219        chr = scm_get_byte_or_eof (port);
 220
 221        if (chr == EOF)
 222         return 0;
 223       else if (CHAR_IS_DELIMITER (chr))
 224         {
 225           scm_unget_byte (chr, port);
 226           return 0;
 227         }
 228       else
 229         {
 230           *buf = (char) chr;
 231           buf++, (*read)++;
 232         }
 233      }
 234
 235    return 1;
 236  }
 237
 238 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 239    result in the pre-allocated buffer BUFFER, if the whole token has fewer than
 240    BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
 241    caller.  Return zero if the token fits in BUFFER, non-zero otherwise. READ
 242    will be set the number of bytes actually read.  */
 243 static int
 244 read_complete_token (SCM port, char *buffer, const size_t buffer_size,
 245                            char **overflow_buffer, size_t *read)
 246 {
 247   int overflow = 0;
 248   size_t bytes_read, overflow_size;
 249
 250   *overflow_buffer = NULL;
 251   overflow_size = 0;
 252
 253   do
 254     {
 255       overflow = read_token (port, buffer, buffer_size, &bytes_read);
 256       if (bytes_read == 0)
 257         break;
 258       if (overflow || overflow_size != 0)
 259         {
 260           if (overflow_size == 0)
 261             {
 262               *overflow_buffer = scm_malloc (bytes_read);
 263               memcpy (*overflow_buffer, buffer, bytes_read);
 264               overflow_size = bytes_read;
 265             }
 266           else
 267             {
 268               *overflow_buffer = scm_realloc (*overflow_buffer, overflow_size + bytes_read);
 269               memcpy (*overflow_buffer + overflow_size, buffer, bytes_read);
 270               overflow_size += bytes_read;
 271             }
 272         }
 273     }
 274   while (overflow);
 275
 276   if (overflow_size)
 277     *read = overflow_size;
 278   else
 279     *read = bytes_read;
 280
 281   return (overflow_size != 0);
 282 }
 283
 284 /* Skip whitespace from PORT and return the first non-whitespace character
 285    read.  Raise an error on end-of-file.  */
 286 static int
 287 flush_ws (SCM port, const char *eoferr)
 288 {
 289   register scm_t_wchar c;
 290   while (1)
 291     switch (c = scm_getc (port))
 292       {
 293       case EOF:
 294       goteof:
 295         if (eoferr)
 296           {
 297             scm_i_input_error (eoferr,
 298                                port,
 299                                "end of file",
 300                                SCM_EOL);
 301           }
 302         return c;
 303
 304       case ';':
 305       lp:
 306         switch (c = scm_getc (port))
 307           {
 308           case EOF:
 309             goto goteof;
 310           default:
 311             goto lp;
 312           case SCM_LINE_INCREMENTORS:
 313             break;
 314           }
 315         break;
 316
 317       case '#':
 318         switch (c = scm_getc (port))
 319           {
 320           case EOF:
 321             eoferr = "read_sharp";
 322             goto goteof;
 323           case '!':
 324             scm_read_shebang (c, port);
 325             break;
 326           case ';':
 327             scm_read_commented_expression (c, port);
 328             break;
 329           case '|':
 330             if (scm_is_false (scm_get_hash_procedure (c)))
 331               {
 332                 scm_read_r6rs_block_comment (c, port);
 333                 break;
 334               }
 335             /* fall through */
 336           default:
 337             scm_ungetc (c, port);
 338             return '#';
 339           }
 340         break;
 341
 342       case SCM_LINE_INCREMENTORS:
 343       case SCM_SINGLE_SPACES:
 344       case '\t':
 345         break;
 346
 347       default:
 348         return c;
 349       }
 350
 351   return 0;
 352 }
 353
 354
 355 \f
 356 /* Token readers.  */
 357
 358 static SCM scm_read_expression (SCM port);
 359 static SCM scm_read_sharp (int chr, SCM port);
 360 static SCM recsexpr (SCM obj, long line, int column, SCM filename);
 361
 362
 363 static SCM
 364 scm_read_sexp (scm_t_wchar chr, SCM port)
 365 #define FUNC_NAME "scm_i_lreadparen"
 366 {
 367   register int c;
 368   register SCM tmp;
 369   register SCM tl, ans = SCM_EOL;
 370   SCM tl2 = SCM_EOL, ans2 = SCM_EOL, copy = SCM_BOOL_F;
 371   const int terminating_char = ((chr == '[') ? ']' : ')');
 372
 373   /* Need to capture line and column numbers here. */
 374   long line = SCM_LINUM (port);
 375   int column = SCM_COL (port) - 1;
 376
 377
 378   c = flush_ws (port, FUNC_NAME);
 379   if (terminating_char == c)
 380     return SCM_EOL;
 381
 382   scm_ungetc (c, port);
 383   if (scm_is_eq (scm_sym_dot,
 384                  (tmp = scm_read_expression (port))))
 385     {
 386       ans = scm_read_expression (port);
 387       if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
 388         scm_i_input_error (FUNC_NAME, port, "missing close paren",
 389                            SCM_EOL);
 390       return ans;
 391     }
 392
 393   /* Build the head of the list structure. */
 394   ans = tl = scm_cons (tmp, SCM_EOL);
 395
 396   if (SCM_COPY_SOURCE_P)
 397     ans2 = tl2 = scm_cons (scm_is_pair (tmp)
 398                            ? copy
 399                            : tmp,
 400                            SCM_EOL);
 401
 402   while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
 403     {
 404       SCM new_tail;
 405
 406       if (c == ')' || (SCM_SQUARE_BRACKETS_P && c == ']'))
 407         scm_i_input_error (FUNC_NAME, port,
 408                            "in pair: mismatched close paren: ~A",
 409                            scm_list_1 (SCM_MAKE_CHAR (c)));
 410
 411       scm_ungetc (c, port);
 412       tmp = scm_read_expression (port);
 413
 414       if (scm_is_eq (scm_sym_dot, tmp))
 415         {
 416           SCM_SETCDR (tl, tmp = scm_read_expression (port));
 417
 418           if (SCM_COPY_SOURCE_P)
 419             SCM_SETCDR (tl2, scm_cons (scm_is_pair (tmp) ? copy : tmp,
 420                                        SCM_EOL));
 421
 422           c = flush_ws (port, FUNC_NAME);
 423           if (terminating_char != c)
 424             scm_i_input_error (FUNC_NAME, port,
 425                                "in pair: missing close paren", SCM_EOL);
 426           goto exit;
 427         }
 428
 429       new_tail = scm_cons (tmp, SCM_EOL);
 430       SCM_SETCDR (tl, new_tail);
 431       tl = new_tail;
 432
 433       if (SCM_COPY_SOURCE_P)
 434         {
 435           SCM new_tail2 = scm_cons (scm_is_pair (tmp)
 436                                     ? copy
 437                                     : tmp, SCM_EOL);
 438           SCM_SETCDR (tl2, new_tail2);
 439           tl2 = new_tail2;
 440         }
 441     }
 442
 443  exit:
 444   if (SCM_RECORD_POSITIONS_P)
 445     scm_whash_insert (scm_source_whash,
 446                       ans,
 447                       scm_make_srcprops (line, column,
 448                                          SCM_FILENAME (port),
 449                                          SCM_COPY_SOURCE_P
 450                                          ? ans2
 451                                          : SCM_UNDEFINED,
 452                                          SCM_EOL));
 453   return ans;
 454 }
 455 #undef FUNC_NAME
 456
 457
 458 /* Read a hexadecimal number NDIGITS in length.  Put its value into the variable
 459    C.  If TERMINATOR is non-null, terminate early if the TERMINATOR character is
 460    found.  */
 461 #define SCM_READ_HEX_ESCAPE(ndigits, terminator)                   \
 462   do                                                               \
 463     {                                                              \
 464       scm_t_wchar a;                                               \
 465       size_t i = 0;                                                \
 466       c = 0;                                                       \
 467       while (i < ndigits)                                          \
 468         {                                                          \
 469           a = scm_getc (port);                                     \
 470           if (a == EOF)                                            \
 471             goto str_eof;                                          \
 472           if (terminator                                           \
 473               && (a == (scm_t_wchar) terminator)                   \
 474               && (i > 0))                                          \
 475             break;                                                 \
 476           if ('0' <= a && a <= '9')                                \
 477             a -= '0';                                              \
 478           else if ('A' <= a && a <= 'F')                           \
 479             a = a - 'A' + 10;                                      \
 480           else if ('a' <= a && a <= 'f')                           \
 481             a = a - 'a' + 10;                                      \
 482           else                                                     \
 483             {                                                      \
 484               c = a;                                               \
 485               goto bad_escaped;                                    \
 486             }                                                      \
 487           c = c * 16 + a;                                          \
 488           i ++;                                                    \
 489         }                                                          \
 490     } while (0)
 491
 492 static void
 493 skip_intraline_whitespace (SCM port)
 494 {
 495   scm_t_wchar c;
 496
 497   do
 498     {
 499       c = scm_getc (port);
 500       if (c == EOF)
 501         return;
 502     }
 503   while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
 504
 505   scm_ungetc (c, port);
 506 }
 507
 508 static SCM
 509 scm_read_string (int chr, SCM port)
 510 #define FUNC_NAME "scm_lreadr"
 511 {
 512   /* For strings smaller than C_STR, this function creates only one Scheme
 513      object (the string returned).  */
 514
 515   SCM str = SCM_BOOL_F;
 516   unsigned c_str_len = 0;
 517   scm_t_wchar c;
 518
 519   str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
 520   while ('"' != (c = scm_getc (port)))
 521     {
 522       if (c == EOF)
 523         {
 524         str_eof:
 525           scm_i_input_error (FUNC_NAME, port,
 526                              "end of file in string constant", SCM_EOL);
 527         }
 528
 529       if (c_str_len + 1 >= scm_i_string_length (str))
 530         {
 531           SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
 532
 533           str = scm_string_append (scm_list_2 (str, addy));
 534         }
 535
 536       if (c == '\\')
 537         {
 538           switch (c = scm_getc (port))
 539             {
 540             case EOF:
 541               goto str_eof;
 542             case '"':
 543             case '\\':
 544               break;
 545             case '\n':
 546               if (SCM_HUNGRY_EOL_ESCAPES_P)
 547                 skip_intraline_whitespace (port);
 548               continue;
 549             case '0':
 550               c = '\0';
 551               break;
 552             case 'f':
 553               c = '\f';
 554               break;
 555             case 'n':
 556               c = '\n';
 557               break;
 558             case 'r':
 559               c = '\r';
 560               break;
 561             case 't':
 562               c = '\t';
 563               break;
 564             case 'a':
 565               c = '\007';
 566               break;
 567             case 'v':
 568               c = '\v';
 569               break;
 570             case 'b':
 571               c = '\010';
 572               break;
 573             case 'x':
 574               if (SCM_R6RS_ESCAPES_P)
 575                 SCM_READ_HEX_ESCAPE (10, ';');
 576               else
 577                 SCM_READ_HEX_ESCAPE (2, '\0');
 578               break;
 579             case 'u':
 580               if (!SCM_R6RS_ESCAPES_P)
 581                 {
 582                   SCM_READ_HEX_ESCAPE (4, '\0');
 583                   break;
 584                 }
 585             case 'U':
 586               if (!SCM_R6RS_ESCAPES_P)
 587                 {
 588                   SCM_READ_HEX_ESCAPE (6, '\0');
 589                   break;
 590                 }
 591             default:
 592             bad_escaped:
 593               scm_i_input_error (FUNC_NAME, port,
 594                                  "illegal character in escape sequence: ~S",
 595                                  scm_list_1 (SCM_MAKE_CHAR (c)));
 596             }
 597         }
 598       str = scm_i_string_start_writing (str);
 599       scm_i_string_set_x (str, c_str_len++, c);
 600       scm_i_string_stop_writing ();
 601     }
 602
 603   if (c_str_len > 0)
 604     {
 605       return scm_i_substring_copy (str, 0, c_str_len);
 606     }
 607
 608   return scm_nullstr;
 609 }
 610 #undef FUNC_NAME
 611
 612
 613 static SCM
 614 scm_read_number (scm_t_wchar chr, SCM port)
 615 {
 616   SCM result, str = SCM_EOL;
 617   char buffer[READER_BUFFER_SIZE];
 618   char *overflow_buffer = NULL;
 619   size_t bytes_read;
 620   int overflow;
 621   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 622
 623   scm_ungetc (chr, port);
 624   overflow = read_complete_token (port, buffer, sizeof (buffer),
 625                                   &overflow_buffer, &bytes_read);
 626
 627   if (!overflow)
 628     str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
 629   else
 630     str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
 631                             pt->ilseq_handler);
 632
 633   result = scm_string_to_number (str, SCM_UNDEFINED);
 634   if (!scm_is_true (result))
 635     {
 636       /* Return a symbol instead of a number */
 637       if (SCM_CASE_INSENSITIVE_P)
 638         str = scm_string_downcase_x (str);
 639       result = scm_string_to_symbol (str);
 640     }
 641
 642   if (overflow)
 643     free (overflow_buffer);
 644   SCM_COL (port) += scm_i_string_length (str);
 645   return result;
 646 }
 647
 648 static SCM
 649 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
 650 {
 651   SCM result;
 652   int ends_with_colon = 0;
 653   size_t bytes_read;
 654   int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
 655   int overflow;
 656   char buffer[READER_BUFFER_SIZE], *overflow_buffer;
 657   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 658   SCM str;
 659
 660   scm_ungetc (chr, port);
 661   overflow = read_complete_token (port, buffer, READER_BUFFER_SIZE,
 662                                   &overflow_buffer, &bytes_read);
 663   if (bytes_read > 0)
 664     {
 665       if (!overflow)
 666         ends_with_colon = buffer[bytes_read - 1] == ':';
 667       else
 668         ends_with_colon = overflow_buffer[bytes_read - 1] == ':';
 669     }
 670
 671   if (postfix && ends_with_colon && (bytes_read > 1))
 672     {
 673       if (!overflow)
 674         str = scm_from_stringn (buffer, bytes_read - 1, pt->encoding, pt->ilseq_handler);
 675       else
 676         str = scm_from_stringn (overflow_buffer, bytes_read - 1, pt->encoding,
 677                                 pt->ilseq_handler);
 678
 679       if (SCM_CASE_INSENSITIVE_P)
 680         str = scm_string_downcase_x (str);
 681       result = scm_symbol_to_keyword (scm_string_to_symbol (str));
 682     }
 683   else
 684     {
 685       if (!overflow)
 686         str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
 687       else
 688         str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
 689                                 pt->ilseq_handler);
 690
 691       if (SCM_CASE_INSENSITIVE_P)
 692         str = scm_string_downcase_x (str);
 693       result = scm_string_to_symbol (str);
 694     }
 695
 696   if (overflow)
 697     free (overflow_buffer);
 698   SCM_COL (port) += scm_i_string_length (str);
 699   return result;
 700 }
 701
 702 static SCM
 703 scm_read_number_and_radix (scm_t_wchar chr, SCM port)
 704 #define FUNC_NAME "scm_lreadr"
 705 {
 706   SCM result;
 707   size_t read;
 708   char buffer[READER_BUFFER_SIZE], *overflow_buffer;
 709   int overflow;
 710   unsigned int radix;
 711   SCM str;
 712   scm_t_port *pt;
 713
 714   switch (chr)
 715     {
 716     case 'B':
 717     case 'b':
 718       radix = 2;
 719       break;
 720
 721     case 'o':
 722     case 'O':
 723       radix = 8;
 724       break;
 725
 726     case 'd':
 727     case 'D':
 728       radix = 10;
 729       break;
 730
 731     case 'x':
 732     case 'X':
 733       radix = 16;
 734       break;
 735
 736     default:
 737       scm_ungetc (chr, port);
 738       scm_ungetc ('#', port);
 739       radix = 10;
 740     }
 741
 742   overflow = read_complete_token (port, buffer, sizeof (buffer),
 743                                   &overflow_buffer, &read);
 744
 745   pt = SCM_PTAB_ENTRY (port);
 746   if (!overflow)
 747     str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
 748   else
 749     str = scm_from_stringn (overflow_buffer, read, pt->encoding,
 750                             pt->ilseq_handler);
 751
 752   result = scm_string_to_number (str, scm_from_uint (radix));
 753
 754   if (overflow)
 755     free (overflow_buffer);
 756
 757   SCM_COL (port) += scm_i_string_length (str);
 758
 759   if (scm_is_true (result))
 760     return result;
 761
 762   scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
 763
 764   return SCM_BOOL_F;
 765 }
 766 #undef FUNC_NAME
 767
 768 static SCM
 769 scm_read_quote (int chr, SCM port)
 770 {
 771   SCM p;
 772   long line = SCM_LINUM (port);
 773   int column = SCM_COL (port) - 1;
 774
 775   switch (chr)
 776     {
 777     case '`':
 778       p = scm_sym_quasiquote;
 779       break;
 780
 781     case '\'':
 782       p = scm_sym_quote;
 783       break;
 784
 785     case ',':
 786       {
 787         scm_t_wchar c;
 788
 789         c = scm_getc (port);
 790         if ('@' == c)
 791           p = scm_sym_uq_splicing;
 792         else
 793           {
 794             scm_ungetc (c, port);
 795             p = scm_sym_unquote;
 796           }
 797         break;
 798       }
 799
 800     default:
 801       fprintf (stderr, "%s: unhandled quote character (%i)\n",
 802                "scm_read_quote", chr);
 803       abort ();
 804     }
 805
 806   p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
 807   if (SCM_RECORD_POSITIONS_P)
 808     scm_whash_insert (scm_source_whash, p,
 809                       scm_make_srcprops (line, column,
 810                                          SCM_FILENAME (port),
 811                                          SCM_COPY_SOURCE_P
 812                                          ? (scm_cons2 (SCM_CAR (p),
 813                                                        SCM_CAR (SCM_CDR (p)),
 814                                                        SCM_EOL))
 815                                          : SCM_UNDEFINED,
 816                                          SCM_EOL));
 817
 818
 819   return p;
 820 }
 821
 822 SCM_SYMBOL (sym_syntax, "syntax");
 823 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
 824 SCM_SYMBOL (sym_unsyntax, "unsyntax");
 825 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
 826
 827 static SCM
 828 scm_read_syntax (int chr, SCM port)
 829 {
 830   SCM p;
 831   long line = SCM_LINUM (port);
 832   int column = SCM_COL (port) - 1;
 833
 834   switch (chr)
 835     {
 836     case '`':
 837       p = sym_quasisyntax;
 838       break;
 839
 840     case '\'':
 841       p = sym_syntax;
 842       break;
 843
 844     case ',':
 845       {
 846         int c;
 847
 848         c = scm_getc (port);
 849         if ('@' == c)
 850           p = sym_unsyntax_splicing;
 851         else
 852           {
 853             scm_ungetc (c, port);
 854             p = sym_unsyntax;
 855           }
 856         break;
 857       }
 858
 859     default:
 860       fprintf (stderr, "%s: unhandled syntax character (%i)\n",
 861                "scm_read_syntax", chr);
 862       abort ();
 863     }
 864
 865   p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
 866   if (SCM_RECORD_POSITIONS_P)
 867     scm_whash_insert (scm_source_whash, p,
 868                       scm_make_srcprops (line, column,
 869                                          SCM_FILENAME (port),
 870                                          SCM_COPY_SOURCE_P
 871                                          ? (scm_cons2 (SCM_CAR (p),
 872                                                        SCM_CAR (SCM_CDR (p)),
 873                                                        SCM_EOL))
 874                                          : SCM_UNDEFINED,
 875                                          SCM_EOL));
 876
 877
 878   return p;
 879 }
 880
 881 static inline SCM
 882 scm_read_nil (int chr, SCM port)
 883 {
 884   SCM id = scm_read_mixed_case_symbol (chr, port);
 885
 886   if (!scm_is_eq (id, sym_nil))
 887     scm_i_input_error ("scm_read_nil", port,
 888                        "unexpected input while reading #nil: ~a",
 889                        scm_list_1 (id));
 890
 891   return SCM_ELISP_NIL;
 892 }
 893
 894 static inline SCM
 895 scm_read_semicolon_comment (int chr, SCM port)
 896 {
 897   int c;
 898
 899   /* We use the get_byte here because there is no need to get the
 900      locale correct with comment input. This presumes that newline
 901      always represents itself no matter what the encoding is.  */
 902   for (c = scm_get_byte_or_eof (port);
 903        (c != EOF) && (c != '\n');
 904        c = scm_get_byte_or_eof (port));
 905
 906   return SCM_UNSPECIFIED;
 907 }
 908
 909 \f
 910 /* Sharp readers, i.e. readers called after a `#' sign has been read.  */
 911
 912 static SCM
 913 scm_read_boolean (int chr, SCM port)
 914 {
 915   switch (chr)
 916     {
 917     case 't':
 918     case 'T':
 919       return SCM_BOOL_T;
 920
 921     case 'f':
 922     case 'F':
 923       return SCM_BOOL_F;
 924     }
 925
 926   return SCM_UNSPECIFIED;
 927 }
 928
 929 static SCM
 930 scm_read_character (scm_t_wchar chr, SCM port)
 931 #define FUNC_NAME "scm_lreadr"
 932 {
 933   char buffer[READER_CHAR_NAME_MAX_SIZE];
 934   SCM charname;
 935   size_t charname_len, bytes_read;
 936   scm_t_wchar cp;
 937   int overflow;
 938   scm_t_port *pt;
 939
 940   overflow = read_token (port, buffer, READER_CHAR_NAME_MAX_SIZE, &bytes_read);
 941   if (overflow)
 942     scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
 943
 944   if (bytes_read == 0)
 945     {
 946       chr = scm_getc (port);
 947       if (chr == EOF)
 948         scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
 949                            "while reading character", SCM_EOL);
 950
 951       /* CHR must be a token delimiter, like a whitespace.  */
 952       return (SCM_MAKE_CHAR (chr));
 953     }
 954
 955   pt = SCM_PTAB_ENTRY (port);
 956
 957   /* Simple ASCII characters can be processed immediately.  Also, simple
 958      ISO-8859-1 characters can be processed immediately if the encoding for this
 959      port is ISO-8859-1.  */
 960   if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
 961     {
 962       SCM_COL (port) += 1;
 963       return SCM_MAKE_CHAR (buffer[0]);
 964     }
 965
 966   /* Otherwise, convert the buffer into a proper scheme string for
 967      processing.  */
 968   charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
 969                                pt->ilseq_handler);
 970   charname_len = scm_i_string_length (charname);
 971   SCM_COL (port) += charname_len;
 972   cp = scm_i_string_ref (charname, 0);
 973   if (charname_len == 1)
 974     return SCM_MAKE_CHAR (cp);
 975
 976   /* Ignore dotted circles, which may be used to keep combining characters from
 977      combining with the backslash in #\charname.  */
 978   if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
 979     return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
 980
 981   if (cp >= '0' && cp < '8')
 982     {
 983       /* Dirk:FIXME::  This type of character syntax is not R5RS
 984        * compliant.  Further, it should be verified that the constant
 985        * does only consist of octal digits.  */
 986       SCM p = scm_string_to_number (charname, scm_from_uint (8));
 987       if (SCM_I_INUMP (p))
 988         {
 989           scm_t_wchar c = scm_to_uint32 (p);
 990           if (SCM_IS_UNICODE_CHAR (c))
 991             return SCM_MAKE_CHAR (c);
 992           else
 993             scm_i_input_error (FUNC_NAME, port,
 994                                "out-of-range octal character escape: ~a",
 995                                scm_list_1 (charname));
 996         }
 997     }
 998
 999   if (cp == 'x' && (charname_len > 1))
1000     {
1001       SCM p;
1002
1003       /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1004       p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
1005                                 scm_from_uint (16));
1006       if (SCM_I_INUMP (p))
1007         {
1008           scm_t_wchar c = scm_to_uint32 (p);
1009           if (SCM_IS_UNICODE_CHAR (c))
1010             return SCM_MAKE_CHAR (c);
1011           else
1012             scm_i_input_error (FUNC_NAME, port,
1013                                "out-of-range hex character escape: ~a",
1014                                scm_list_1 (charname));
1015         }
1016     }
1017
1018   /* The names of characters should never have non-Latin1
1019      characters.  */
1020   if (scm_i_is_narrow_string (charname)
1021       || scm_i_try_narrow_string (charname))
1022     { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
1023                                        charname_len);
1024       if (scm_is_true (ch))
1025         return ch;
1026     }
1027
1028   scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
1029                      scm_list_1 (charname));
1030
1031   return SCM_UNSPECIFIED;
1032 }
1033 #undef FUNC_NAME
1034
1035 static inline SCM
1036 scm_read_keyword (int chr, SCM port)
1037 {
1038   SCM symbol;
1039
1040   /* Read the symbol that comprises the keyword.  Doing this instead of
1041      invoking a specific symbol reader function allows `scm_read_keyword ()'
1042      to adapt to the delimiters currently valid of symbols.
1043
1044      XXX: This implementation allows sloppy syntaxes like `#:  key'.  */
1045   symbol = scm_read_expression (port);
1046   if (!scm_is_symbol (symbol))
1047     scm_i_input_error ("scm_read_keyword", port,
1048                        "keyword prefix `~a' not followed by a symbol: ~s",
1049                        scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1050
1051   return (scm_symbol_to_keyword (symbol));
1052 }
1053
1054 static inline SCM
1055 scm_read_vector (int chr, SCM port)
1056 {
1057   /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1058      guarantee that it's going to do what we want.  After all, this is an
1059      implementation detail of `scm_read_vector ()', not a desirable
1060      property.  */
1061   return (scm_vector (scm_read_sexp (chr, port)));
1062 }
1063
1064 static inline SCM
1065 scm_read_srfi4_vector (int chr, SCM port)
1066 {
1067   return scm_i_read_array (port, chr);
1068 }
1069
1070 static SCM
1071 scm_read_bytevector (scm_t_wchar chr, SCM port)
1072 {
1073   chr = scm_getc (port);
1074   if (chr != 'u')
1075     goto syntax;
1076
1077   chr = scm_getc (port);
1078   if (chr != '8')
1079     goto syntax;
1080
1081   chr = scm_getc (port);
1082   if (chr != '(')
1083     goto syntax;
1084
1085   return scm_u8_list_to_bytevector (scm_read_sexp (chr, port));
1086
1087  syntax:
1088   scm_i_input_error ("read_bytevector", port,
1089                      "invalid bytevector prefix",
1090                      SCM_MAKE_CHAR (chr));
1091   return SCM_UNSPECIFIED;
1092 }
1093
1094 static SCM
1095 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port)
1096 {
1097   /* Read the `#*10101'-style read syntax for bit vectors in Guile.  This is
1098      terribly inefficient but who cares?  */
1099   SCM s_bits = SCM_EOL;
1100
1101   for (chr = scm_getc (port);
1102        (chr != EOF) && ((chr == '0') || (chr == '1'));
1103        chr = scm_getc (port))
1104     {
1105       s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1106     }
1107
1108   if (chr != EOF)
1109     scm_ungetc (chr, port);
1110
1111   return scm_bitvector (scm_reverse_x (s_bits, SCM_EOL));
1112 }
1113
1114 static inline SCM
1115 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1116 {
1117   int bang_seen = 0;
1118
1119   /* We can use the get_byte here because there is no need to get the
1120      locale correct when reading comments. This presumes that
1121      hash and exclamation points always represent themselves no
1122      matter what the source encoding is.*/
1123   for (;;)
1124     {
1125       int c = scm_get_byte_or_eof (port);
1126
1127       if (c == EOF)
1128         scm_i_input_error ("skip_block_comment", port,
1129                            "unterminated `#! ... !#' comment", SCM_EOL);
1130
1131       if (c == '!')
1132         bang_seen = 1;
1133       else if (c == '#' && bang_seen)
1134         break;
1135       else
1136         bang_seen = 0;
1137     }
1138
1139   return SCM_UNSPECIFIED;
1140 }
1141
1142 static inline SCM
1143 scm_read_shebang (scm_t_wchar chr, SCM port)
1144 {
1145   int c = 0;
1146   if ((c = scm_get_byte_or_eof (port)) != 'r')
1147     {
1148       scm_ungetc (c, port);
1149       return scm_read_scsh_block_comment (chr, port);
1150     }
1151   if ((c = scm_get_byte_or_eof (port)) != '6')
1152     {
1153       scm_ungetc (c, port);
1154       scm_ungetc ('r', port);
1155       return scm_read_scsh_block_comment (chr, port);
1156     }
1157   if ((c = scm_get_byte_or_eof (port)) != 'r')
1158     {
1159       scm_ungetc (c, port);
1160       scm_ungetc ('6', port);
1161       scm_ungetc ('r', port);
1162       return scm_read_scsh_block_comment (chr, port);
1163     }
1164   if ((c = scm_get_byte_or_eof (port)) != 's')
1165     {
1166       scm_ungetc (c, port);
1167       scm_ungetc ('r', port);
1168       scm_ungetc ('6', port);
1169       scm_ungetc ('r', port);
1170       return scm_read_scsh_block_comment (chr, port);
1171     }
1172
1173   return SCM_UNSPECIFIED;
1174 }
1175
1176 static SCM
1177 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1178 {
1179   /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1180      nested.  So care must be taken.  */
1181   int nesting_level = 1;
1182   int opening_seen = 0, closing_seen = 0;
1183
1184   while (nesting_level > 0)
1185     {
1186       int c = scm_getc (port);
1187
1188       if (c == EOF)
1189         scm_i_input_error ("scm_read_r6rs_block_comment", port,
1190                            "unterminated `#| ... |#' comment", SCM_EOL);
1191
1192       if (opening_seen)
1193         {
1194           if (c == '|')
1195             nesting_level++;
1196           opening_seen = 0;
1197         }
1198       else if (closing_seen)
1199         {
1200           if (c == '#')
1201             nesting_level--;
1202           closing_seen = 0;
1203         }
1204       else if (c == '|')
1205         closing_seen = 1;
1206       else if (c == '#')
1207         opening_seen = 1;
1208       else
1209         opening_seen = closing_seen = 0;
1210     }
1211
1212   return SCM_UNSPECIFIED;
1213 }
1214
1215 static SCM
1216 scm_read_commented_expression (scm_t_wchar chr, SCM port)
1217 {
1218   scm_t_wchar c;
1219
1220   c = flush_ws (port, (char *) NULL);
1221   if (EOF == c)
1222     scm_i_input_error ("read_commented_expression", port,
1223                        "no expression after #; comment", SCM_EOL);
1224   scm_ungetc (c, port);
1225   scm_read_expression (port);
1226   return SCM_UNSPECIFIED;
1227 }
1228
1229 static SCM
1230 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1231 {
1232   /* Guile's extended symbol read syntax looks like this:
1233
1234        #{This is all a symbol name}#
1235
1236      So here, CHR is expected to be `{'.  */
1237   int saw_brace = 0, finished = 0;
1238   size_t len = 0;
1239   SCM buf = scm_i_make_string (1024, NULL);
1240
1241   buf = scm_i_string_start_writing (buf);
1242
1243   while ((chr = scm_getc (port)) != EOF)
1244     {
1245       if (saw_brace)
1246         {
1247           if (chr == '#')
1248             {
1249               finished = 1;
1250               break;
1251             }
1252           else
1253             {
1254               saw_brace = 0;
1255               scm_i_string_set_x (buf, len++, '}');
1256               scm_i_string_set_x (buf, len++, chr);
1257             }
1258         }
1259       else if (chr == '}')
1260         saw_brace = 1;
1261       else
1262         scm_i_string_set_x (buf, len++, chr);
1263
1264       if (len >= scm_i_string_length (buf) - 2)
1265         {
1266           SCM addy;
1267
1268           scm_i_string_stop_writing ();
1269           addy = scm_i_make_string (1024, NULL);
1270           buf = scm_string_append (scm_list_2 (buf, addy));
1271           len = 0;
1272           buf = scm_i_string_start_writing (buf);
1273         }
1274
1275       if (finished)
1276         break;
1277     }
1278   scm_i_string_stop_writing ();
1279
1280   return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1281 }
1282
1283
1284 \f
1285 /* Top-level token readers, i.e., dispatchers.  */
1286
1287 static SCM
1288 scm_read_sharp_extension (int chr, SCM port)
1289 {
1290   SCM proc;
1291
1292   proc = scm_get_hash_procedure (chr);
1293   if (scm_is_true (scm_procedure_p (proc)))
1294     {
1295       long line = SCM_LINUM (port);
1296       int column = SCM_COL (port) - 2;
1297       SCM got;
1298
1299       got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1300       if (!scm_is_eq (got, SCM_UNSPECIFIED))
1301         {
1302           if (SCM_RECORD_POSITIONS_P)
1303             return (recsexpr (got, line, column,
1304                               SCM_FILENAME (port)));
1305           else
1306             return got;
1307         }
1308     }
1309
1310   return SCM_UNSPECIFIED;
1311 }
1312
1313 /* The reader for the sharp `#' character.  It basically dispatches reads
1314    among the above token readers.   */
1315 static SCM
1316 scm_read_sharp (scm_t_wchar chr, SCM port)
1317 #define FUNC_NAME "scm_lreadr"
1318 {
1319   SCM result;
1320
1321   chr = scm_getc (port);
1322
1323   result = scm_read_sharp_extension (chr, port);
1324   if (!scm_is_eq (result, SCM_UNSPECIFIED))
1325     return result;
1326
1327   switch (chr)
1328     {
1329     case '\\':
1330       return (scm_read_character (chr, port));
1331     case '(':
1332       return (scm_read_vector (chr, port));
1333     case 's':
1334     case 'u':
1335     case 'f':
1336       /* This one may return either a boolean or an SRFI-4 vector.  */
1337       return (scm_read_srfi4_vector (chr, port));
1338     case 'v':
1339       return (scm_read_bytevector (chr, port));
1340     case '*':
1341       return (scm_read_guile_bit_vector (chr, port));
1342     case 't':
1343     case 'T':
1344     case 'F':
1345       /* This one may return either a boolean or an SRFI-4 vector.  */
1346       return (scm_read_boolean (chr, port));
1347     case ':':
1348       return (scm_read_keyword (chr, port));
1349     case '0': case '1': case '2': case '3': case '4':
1350     case '5': case '6': case '7': case '8': case '9':
1351     case '@':
1352 #if SCM_ENABLE_DEPRECATED
1353       /* See below for 'i' and 'e'. */
1354     case 'a':
1355     case 'c':
1356     case 'y':
1357     case 'h':
1358     case 'l':
1359 #endif
1360       return (scm_i_read_array (port, chr));
1361
1362     case 'i':
1363     case 'e':
1364 #if SCM_ENABLE_DEPRECATED
1365       {
1366         /* When next char is '(', it really is an old-style
1367            uniform array. */
1368         scm_t_wchar next_c = scm_getc (port);
1369         if (next_c != EOF)
1370           scm_ungetc (next_c, port);
1371         if (next_c == '(')
1372           return scm_i_read_array (port, chr);
1373         /* Fall through. */
1374       }
1375 #endif
1376     case 'b':
1377     case 'B':
1378     case 'o':
1379     case 'O':
1380     case 'd':
1381     case 'D':
1382     case 'x':
1383     case 'X':
1384     case 'I':
1385     case 'E':
1386       return (scm_read_number_and_radix (chr, port));
1387     case '{':
1388       return (scm_read_extended_symbol (chr, port));
1389     case '!':
1390       return (scm_read_shebang (chr, port));
1391     case ';':
1392       return (scm_read_commented_expression (chr, port));
1393     case '`':
1394     case '\'':
1395     case ',':
1396       return (scm_read_syntax (chr, port));
1397     case 'n':
1398       return (scm_read_nil (chr, port));
1399     default:
1400       result = scm_read_sharp_extension (chr, port);
1401       if (scm_is_eq (result, SCM_UNSPECIFIED))
1402         {
1403           /* To remain compatible with 1.8 and earlier, the following
1404              characters have lower precedence than `read-hash-extend'
1405              characters.  */
1406           switch (chr)
1407             {
1408             case '|':
1409               return scm_read_r6rs_block_comment (chr, port);
1410             default:
1411               scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1412                                  scm_list_1 (SCM_MAKE_CHAR (chr)));
1413             }
1414         }
1415       else
1416         return result;
1417     }
1418
1419   return SCM_UNSPECIFIED;
1420 }
1421 #undef FUNC_NAME
1422
1423 static SCM
1424 scm_read_expression (SCM port)
1425 #define FUNC_NAME "scm_read_expression"
1426 {
1427   while (1)
1428     {
1429       register scm_t_wchar chr;
1430
1431       chr = scm_getc (port);
1432
1433       switch (chr)
1434         {
1435         case SCM_WHITE_SPACES:
1436         case SCM_LINE_INCREMENTORS:
1437           break;
1438         case ';':
1439           (void) scm_read_semicolon_comment (chr, port);
1440           break;
1441         case '[':
1442           if (!SCM_SQUARE_BRACKETS_P)
1443             return (scm_read_mixed_case_symbol (chr, port));
1444           /* otherwise fall through */
1445         case '(':
1446           return (scm_read_sexp (chr, port));
1447         case '"':
1448           return (scm_read_string (chr, port));
1449         case '\'':
1450         case '`':
1451         case ',':
1452           return (scm_read_quote (chr, port));
1453         case '#':
1454           {
1455             SCM result;
1456             result = scm_read_sharp (chr, port);
1457             if (scm_is_eq (result, SCM_UNSPECIFIED))
1458               /* We read a comment or some such.  */
1459               break;
1460             else
1461               return result;
1462           }
1463         case ')':
1464           scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1465           break;
1466         case ']':
1467           if (SCM_SQUARE_BRACKETS_P)
1468             scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1469           /* otherwise fall through */
1470         case EOF:
1471           return SCM_EOF_VAL;
1472         case ':':
1473           if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1474             return scm_symbol_to_keyword (scm_read_expression (port));
1475           /* Fall through.  */
1476
1477         default:
1478           {
1479             if (((chr >= '0') && (chr <= '9'))
1480                 || (strchr ("+-.", chr)))
1481               return (scm_read_number (chr, port));
1482             else
1483               return (scm_read_mixed_case_symbol (chr, port));
1484           }
1485         }
1486     }
1487 }
1488 #undef FUNC_NAME
1489
1490 \f
1491 /* Actual reader.  */
1492
1493 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1494             (SCM port),
1495             "Read an s-expression from the input port @var{port}, or from\n"
1496             "the current input port if @var{port} is not specified.\n"
1497             "Any whitespace before the next token is discarded.")
1498 #define FUNC_NAME s_scm_read
1499 {
1500   int c;
1501
1502   if (SCM_UNBNDP (port))
1503     port = scm_current_input_port ();
1504   SCM_VALIDATE_OPINPORT (1, port);
1505
1506   c = flush_ws (port, (char *) NULL);
1507   if (EOF == c)
1508     return SCM_EOF_VAL;
1509   scm_ungetc (c, port);
1510
1511   return (scm_read_expression (port));
1512 }
1513 #undef FUNC_NAME
1514
1515
1516 \f
1517
1518 /* Used when recording expressions constructed by `scm_read_sharp ()'.  */
1519 static SCM
1520 recsexpr (SCM obj, long line, int column, SCM filename)
1521 {
1522   if (!scm_is_pair(obj)) {
1523     return obj;
1524   } else {
1525     SCM tmp = obj, copy;
1526     /* If this sexpr is visible in the read:sharp source, we want to
1527        keep that information, so only record non-constant cons cells
1528        which haven't previously been read by the reader. */
1529     if (scm_is_false (scm_whash_lookup (scm_source_whash, obj)))
1530       {
1531         if (SCM_COPY_SOURCE_P)
1532           {
1533             copy = scm_cons (recsexpr (SCM_CAR (obj), line, column, filename),
1534                              SCM_UNDEFINED);
1535             while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1536               {
1537                 SCM_SETCDR (copy, scm_cons (recsexpr (SCM_CAR (tmp),
1538                                                       line,
1539                                                       column,
1540                                                       filename),
1541                                             SCM_UNDEFINED));
1542                 copy = SCM_CDR (copy);
1543               }
1544             SCM_SETCDR (copy, tmp);
1545           }
1546         else
1547           {
1548             recsexpr (SCM_CAR (obj), line, column, filename);
1549             while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1550               recsexpr (SCM_CAR (tmp), line, column, filename);
1551             copy = SCM_UNDEFINED;
1552           }
1553         scm_whash_insert (scm_source_whash,
1554                           obj,
1555                           scm_make_srcprops (line,
1556                                              column,
1557                                              filename,
1558                                              copy,
1559                                              SCM_EOL));
1560       }
1561     return obj;
1562   }
1563 }
1564
1565 /* Manipulate the read-hash-procedures alist.  This could be written in
1566    Scheme, but maybe it will also be used by C code during initialisation.  */
1567 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1568             (SCM chr, SCM proc),
1569             "Install the procedure @var{proc} for reading expressions\n"
1570             "starting with the character sequence @code{#} and @var{chr}.\n"
1571             "@var{proc} will be called with two arguments:  the character\n"
1572             "@var{chr} and the port to read further data from. The object\n"
1573             "returned will be the return value of @code{read}. \n"
1574             "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1575             )
1576 #define FUNC_NAME s_scm_read_hash_extend
1577 {
1578   SCM this;
1579   SCM prev;
1580
1581   SCM_VALIDATE_CHAR (1, chr);
1582   SCM_ASSERT (scm_is_false (proc)
1583               || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1584               proc, SCM_ARG2, FUNC_NAME);
1585
1586   /* Check if chr is already in the alist.  */
1587   this = scm_i_read_hash_procedures_ref ();
1588   prev = SCM_BOOL_F;
1589   while (1)
1590     {
1591       if (scm_is_null (this))
1592         {
1593           /* not found, so add it to the beginning.  */
1594           if (scm_is_true (proc))
1595             {
1596               SCM new = scm_cons (scm_cons (chr, proc),
1597                                   scm_i_read_hash_procedures_ref ());
1598               scm_i_read_hash_procedures_set_x (new);
1599             }
1600           break;
1601         }
1602       if (scm_is_eq (chr, SCM_CAAR (this)))
1603         {
1604           /* already in the alist.  */
1605           if (scm_is_false (proc))
1606             {
1607               /* remove it.  */
1608               if (scm_is_false (prev))
1609                 {
1610                   SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1611                   scm_i_read_hash_procedures_set_x (rest);
1612                 }
1613               else
1614                 scm_set_cdr_x (prev, SCM_CDR (this));
1615             }
1616           else
1617             {
1618               /* replace it.  */
1619               scm_set_cdr_x (SCM_CAR (this), proc);
1620             }
1621           break;
1622         }
1623       prev = this;
1624       this = SCM_CDR (this);
1625     }
1626
1627   return SCM_UNSPECIFIED;
1628 }
1629 #undef FUNC_NAME
1630
1631 /* Recover the read-hash procedure corresponding to char c.  */
1632 static SCM
1633 scm_get_hash_procedure (int c)
1634 {
1635   SCM rest = scm_i_read_hash_procedures_ref ();
1636
1637   while (1)
1638     {
1639       if (scm_is_null (rest))
1640         return SCM_BOOL_F;
1641
1642       if (SCM_CHAR (SCM_CAAR (rest)) == c)
1643         return SCM_CDAR (rest);
1644
1645       rest = SCM_CDR (rest);
1646     }
1647 }
1648
1649 #define SCM_ENCODING_SEARCH_SIZE (500)
1650
1651 /* Search the first few hundred characters of a file for an Emacs-like coding
1652    declaration.  Returns either NULL or a string whose storage has been
1653    allocated with `scm_gc_malloc ()'.  */
1654 char *
1655 scm_i_scan_for_encoding (SCM port)
1656 {
1657   char header[SCM_ENCODING_SEARCH_SIZE+1];
1658   size_t bytes_read, encoding_length, i;
1659   char *encoding = NULL;
1660   int utf8_bom = 0;
1661   char *pos, *encoding_start;
1662   int in_comment;
1663
1664   if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1665     /* PORT is a non-seekable file port (e.g., as created by Bash when using
1666        "guile <(echo '(display "hello")')") so bail out.  */
1667     return NULL;
1668
1669   bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1670   header[bytes_read] = '\0';
1671
1672   scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1673
1674   if (bytes_read > 3
1675       && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1676     utf8_bom = 1;
1677
1678   /* search past "coding[:=]" */
1679   pos = header;
1680   while (1)
1681     {
1682       if ((pos = strstr(pos, "coding")) == NULL)
1683         return NULL;
1684
1685       pos += strlen("coding");
1686       if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1687           (*pos == ':' || *pos == '='))
1688         {
1689           pos ++;
1690           break;
1691         }
1692     }
1693
1694   /* skip spaces */
1695   while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1696          (*pos == ' ' || *pos == '\t'))
1697     pos ++;
1698
1699   /* grab the next token */
1700   encoding_start = pos;
1701   i = 0;
1702   while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
1703          && encoding_start + i - header < bytes_read
1704          && (isalnum ((int) encoding_start[i])
1705              || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
1706     i++;
1707
1708   encoding_length = i;
1709   if (encoding_length == 0)
1710     return NULL;
1711
1712   encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
1713   for (i = 0; i < encoding_length; i++)
1714     encoding[i] = toupper ((int) encoding[i]);
1715
1716   /* push backwards to make sure we were in a comment */
1717   in_comment = 0;
1718   pos = encoding_start;
1719   while (pos >= header)
1720     {
1721       if (*pos == '\n')
1722         {
1723           /* This wasn't in a semicolon comment. Check for a
1724            hash-bang comment. */
1725           char *beg = strstr (header, "#!");
1726           char *end = strstr (header, "!#");
1727           if (beg < encoding_start && encoding_start + encoding_length < end)
1728             in_comment = 1;
1729           break;
1730         }
1731       if (*pos == ';')
1732         {
1733           in_comment = 1;
1734           break;
1735         }
1736       pos --;
1737     }
1738   if (!in_comment)
1739     /* This wasn't in a comment */
1740     return NULL;
1741
1742   if (utf8_bom && strcmp(encoding, "UTF-8"))
1743     scm_misc_error (NULL,
1744                     "the port input declares the encoding ~s but is encoded as UTF-8",
1745                     scm_list_1 (scm_from_locale_string (encoding)));
1746
1747   return encoding;
1748 }
1749
1750 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1751             (SCM port),
1752             "Scans the port for an Emacs-like character coding declaration\n"
1753             "near the top of the contents of a port with random-acessible contents.\n"
1754             "The coding declaration is of the form\n"
1755             "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1756             "\n"
1757             "Returns a string containing the character encoding of the file\n"
1758             "if a declaration was found, or @code{#f} otherwise.\n")
1759 #define FUNC_NAME s_scm_file_encoding
1760 {
1761   char *enc;
1762   SCM s_enc;
1763
1764   enc = scm_i_scan_for_encoding (port);
1765   if (enc == NULL)
1766     return SCM_BOOL_F;
1767   else
1768     {
1769       s_enc = scm_from_locale_string (enc);
1770       return s_enc;
1771     }
1772
1773   return SCM_BOOL_F;
1774 }
1775 #undef FUNC_NAME
1776
1777 void
1778 scm_init_read ()
1779 {
1780   SCM read_hash_procs;
1781
1782   read_hash_procs = scm_make_fluid ();
1783   scm_fluid_set_x (read_hash_procs, SCM_EOL);
1784
1785   scm_i_read_hash_procedures =
1786     SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
1787
1788   scm_init_opts (scm_read_options, scm_read_opts);
1789 #include "libguile/read.x"
1790 }
1791
1792 /*
1793   Local Variables:
1794   c-file-style: "gnu"
1795   End:
1796 */