libguile/read.c

   1 /* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
   2  * Foundation, Inc.
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public License
   6  * as published by the Free Software Foundation; either version 3 of
   7  * the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful, but
  10  * WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  17  * 02110-1301 USA
  18  */
  19
  20
  21 \f
  22
  23 #ifdef HAVE_CONFIG_H
  24 # include <config.h>
  25 #endif
  26
  27 #include <stdio.h>
  28 #include <ctype.h>
  29 #include <string.h>
  30 #include <unistd.h>
  31 #include <unicase.h>
  32 #include <unictype.h>
  33
  34 #include "libguile/_scm.h"
  35 #include "libguile/bytevectors.h"
  36 #include "libguile/chars.h"
  37 #include "libguile/eval.h"
  38 #include "libguile/arrays.h"
  39 #include "libguile/bitvectors.h"
  40 #include "libguile/keywords.h"
  41 #include "libguile/alist.h"
  42 #include "libguile/srcprop.h"
  43 #include "libguile/hashtab.h"
  44 #include "libguile/hash.h"
  45 #include "libguile/ports.h"
  46 #include "libguile/fports.h"
  47 #include "libguile/root.h"
  48 #include "libguile/strings.h"
  49 #include "libguile/strports.h"
  50 #include "libguile/vectors.h"
  51 #include "libguile/validate.h"
  52 #include "libguile/srfi-4.h"
  53 #include "libguile/srfi-13.h"
  54
  55 #include "libguile/read.h"
  56 #include "libguile/private-options.h"
  57
  58
  59 \f
  60
  61 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
  62 SCM_SYMBOL (scm_keyword_prefix, "prefix");
  63 SCM_SYMBOL (scm_keyword_postfix, "postfix");
  64 SCM_SYMBOL (sym_nil, "nil");
  65
  66 scm_t_option scm_read_opts[] = {
  67   { SCM_OPTION_BOOLEAN, "copy", 0,
  68     "Copy source code expressions." },
  69   { SCM_OPTION_BOOLEAN, "positions", 1,
  70     "Record positions of source code expressions." },
  71   { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
  72     "Convert symbols to lower case."},
  73   { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
  74     "Style of keyword recognition: #f, 'prefix or 'postfix."},
  75   { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
  76     "Use R6RS variable-length character and string hex escapes."},
  77   { SCM_OPTION_BOOLEAN, "square-brackets", 1,
  78     "Treat `[' and `]' as parentheses, for R6RS compatibility."},
  79   { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
  80     "In strings, consume leading whitespace after an escaped end-of-line."},
  81   { 0, },
  82 };
  83
  84 /*
  85   Give meaningful error messages for errors
  86
  87   We use the format
  88
  89   FILE:LINE:COL: MESSAGE
  90   This happened in ....
  91
  92   This is not standard GNU format, but the test-suite likes the real
  93   message to be in front.
  94
  95  */
  96
  97
  98 void
  99 scm_i_input_error (char const *function,
 100                    SCM port, const char *message, SCM arg)
 101 {
 102   SCM fn = (scm_is_string (SCM_FILENAME(port))
 103             ? SCM_FILENAME(port)
 104             : scm_from_locale_string ("#<unknown port>"));
 105
 106   SCM string_port = scm_open_output_string ();
 107   SCM string = SCM_EOL;
 108   scm_simple_format (string_port,
 109                      scm_from_locale_string ("~A:~S:~S: ~A"),
 110                      scm_list_4 (fn,
 111                                  scm_from_long (SCM_LINUM (port) + 1),
 112                                  scm_from_int (SCM_COL (port) + 1),
 113                                  scm_from_locale_string (message)));
 114
 115   string = scm_get_output_string (string_port);
 116   scm_close_output_port (string_port);
 117   scm_error_scm (scm_from_latin1_symbol ("read-error"),
 118                  function? scm_from_locale_string (function) : SCM_BOOL_F,
 119                  string,
 120                  arg,
 121                  SCM_BOOL_F);
 122 }
 123
 124
 125 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
 126             (SCM setting),
 127             "Option interface for the read options. Instead of using\n"
 128             "this procedure directly, use the procedures @code{read-enable},\n"
 129             "@code{read-disable}, @code{read-set!} and @code{read-options}.")
 130 #define FUNC_NAME s_scm_read_options
 131 {
 132   SCM ans = scm_options (setting,
 133                          scm_read_opts,
 134                          FUNC_NAME);
 135   if (SCM_COPY_SOURCE_P)
 136     SCM_RECORD_POSITIONS_P = 1;
 137   return ans;
 138 }
 139 #undef FUNC_NAME
 140
 141 /* A fluid referring to an association list mapping extra hash
 142    characters to procedures.  */
 143 static SCM *scm_i_read_hash_procedures;
 144
 145 static inline SCM
 146 scm_i_read_hash_procedures_ref (void)
 147 {
 148   return scm_fluid_ref (*scm_i_read_hash_procedures);
 149 }
 150
 151 static inline void
 152 scm_i_read_hash_procedures_set_x (SCM value)
 153 {
 154   scm_fluid_set_x (*scm_i_read_hash_procedures, value);
 155 }
 156
 157 \f
 158 /* Token readers.  */
 159
 160
 161 /* Size of the C buffer used to read symbols and numbers.  */
 162 #define READER_BUFFER_SIZE            128
 163
 164 /* Size of the C buffer used to read strings.  */
 165 #define READER_STRING_BUFFER_SIZE     512
 166
 167 /* The maximum size of Scheme character names.  */
 168 #define READER_CHAR_NAME_MAX_SIZE      50
 169
 170
 171 /* `isblank' is only in C99.  */
 172 #define CHAR_IS_BLANK_(_chr)                                    \
 173   (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n')      \
 174    || ((_chr) == '\f') || ((_chr) == '\r'))
 175
 176 #ifdef MSDOS
 177 # define CHAR_IS_BLANK(_chr)                    \
 178   ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
 179 #else
 180 # define CHAR_IS_BLANK CHAR_IS_BLANK_
 181 #endif
 182
 183
 184 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
 185    structure'').  */
 186 #define CHAR_IS_R5RS_DELIMITER(c)                               \
 187   (CHAR_IS_BLANK (c)                                            \
 188    || (c == ')') || (c == '(') || (c == ';') || (c == '"')      \
 189    || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
 190
 191 #define CHAR_IS_DELIMITER  CHAR_IS_R5RS_DELIMITER
 192
 193 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
 194    Structure''.  */
 195 #define CHAR_IS_EXPONENT_MARKER(_chr)                           \
 196   (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f')        \
 197    || ((_chr) == 'd') || ((_chr) == 'l'))
 198
 199 /* Read an SCSH block comment.  */
 200 static inline SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
 201 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
 202 static SCM scm_read_commented_expression (scm_t_wchar, SCM);
 203 static SCM scm_read_shebang (scm_t_wchar, SCM);
 204 static SCM scm_get_hash_procedure (int);
 205
 206 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 207    result in the pre-allocated buffer BUF.  Return zero if the whole token has
 208    fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
 209    bytes actually read.  */
 210 static inline int
 211 read_token (SCM port, char *buf, const size_t buf_size, size_t *read)
 212  {
 213    *read = 0;
 214
 215    while (*read < buf_size)
 216      {
 217        int chr;
 218
 219        chr = scm_get_byte_or_eof (port);
 220
 221        if (chr == EOF)
 222         return 0;
 223       else if (CHAR_IS_DELIMITER (chr))
 224         {
 225           scm_unget_byte (chr, port);
 226           return 0;
 227         }
 228       else
 229         {
 230           *buf = (char) chr;
 231           buf++, (*read)++;
 232         }
 233      }
 234
 235    return 1;
 236  }
 237
 238 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 239    result in the pre-allocated buffer BUFFER, if the whole token has fewer than
 240    BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
 241    caller.  Return zero if the token fits in BUFFER, non-zero otherwise. READ
 242    will be set the number of bytes actually read.  */
 243 static int
 244 read_complete_token (SCM port, char *buffer, const size_t buffer_size,
 245                            char **overflow_buffer, size_t *read)
 246 {
 247   int overflow = 0;
 248   size_t bytes_read, overflow_size;
 249
 250   *overflow_buffer = NULL;
 251   overflow_size = 0;
 252
 253   do
 254     {
 255       overflow = read_token (port, buffer, buffer_size, &bytes_read);
 256       if (bytes_read == 0)
 257         break;
 258       if (overflow || overflow_size != 0)
 259         {
 260           if (overflow_size == 0)
 261             {
 262               *overflow_buffer = scm_malloc (bytes_read);
 263               memcpy (*overflow_buffer, buffer, bytes_read);
 264               overflow_size = bytes_read;
 265             }
 266           else
 267             {
 268               *overflow_buffer = scm_realloc (*overflow_buffer, overflow_size + bytes_read);
 269               memcpy (*overflow_buffer + overflow_size, buffer, bytes_read);
 270               overflow_size += bytes_read;
 271             }
 272         }
 273     }
 274   while (overflow);
 275
 276   if (overflow_size)
 277     *read = overflow_size;
 278   else
 279     *read = bytes_read;
 280
 281   return (overflow_size != 0);
 282 }
 283
 284 /* Skip whitespace from PORT and return the first non-whitespace character
 285    read.  Raise an error on end-of-file.  */
 286 static int
 287 flush_ws (SCM port, const char *eoferr)
 288 {
 289   register scm_t_wchar c;
 290   while (1)
 291     switch (c = scm_getc (port))
 292       {
 293       case EOF:
 294       goteof:
 295         if (eoferr)
 296           {
 297             scm_i_input_error (eoferr,
 298                                port,
 299                                "end of file",
 300                                SCM_EOL);
 301           }
 302         return c;
 303
 304       case ';':
 305       lp:
 306         switch (c = scm_getc (port))
 307           {
 308           case EOF:
 309             goto goteof;
 310           default:
 311             goto lp;
 312           case SCM_LINE_INCREMENTORS:
 313             break;
 314           }
 315         break;
 316
 317       case '#':
 318         switch (c = scm_getc (port))
 319           {
 320           case EOF:
 321             eoferr = "read_sharp";
 322             goto goteof;
 323           case '!':
 324             scm_read_shebang (c, port);
 325             break;
 326           case ';':
 327             scm_read_commented_expression (c, port);
 328             break;
 329           case '|':
 330             if (scm_is_false (scm_get_hash_procedure (c)))
 331               {
 332                 scm_read_r6rs_block_comment (c, port);
 333                 break;
 334               }
 335             /* fall through */
 336           default:
 337             scm_ungetc (c, port);
 338             return '#';
 339           }
 340         break;
 341
 342       case SCM_LINE_INCREMENTORS:
 343       case SCM_SINGLE_SPACES:
 344       case '\t':
 345         break;
 346
 347       default:
 348         return c;
 349       }
 350
 351   return 0;
 352 }
 353
 354
 355 \f
 356 /* Token readers.  */
 357
 358 static SCM scm_read_expression (SCM port);
 359 static SCM scm_read_sharp (int chr, SCM port);
 360 static SCM recsexpr (SCM obj, long line, int column, SCM filename);
 361
 362
 363 static SCM
 364 scm_read_sexp (scm_t_wchar chr, SCM port)
 365 #define FUNC_NAME "scm_i_lreadparen"
 366 {
 367   register int c;
 368   register SCM tmp;
 369   register SCM tl, ans = SCM_EOL;
 370   SCM tl2 = SCM_EOL, ans2 = SCM_EOL, copy = SCM_BOOL_F;
 371   const int terminating_char = ((chr == '[') ? ']' : ')');
 372
 373   /* Need to capture line and column numbers here. */
 374   long line = SCM_LINUM (port);
 375   int column = SCM_COL (port) - 1;
 376
 377
 378   c = flush_ws (port, FUNC_NAME);
 379   if (terminating_char == c)
 380     return SCM_EOL;
 381
 382   scm_ungetc (c, port);
 383   if (scm_is_eq (scm_sym_dot,
 384                  (tmp = scm_read_expression (port))))
 385     {
 386       ans = scm_read_expression (port);
 387       if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
 388         scm_i_input_error (FUNC_NAME, port, "missing close paren",
 389                            SCM_EOL);
 390       return ans;
 391     }
 392
 393   /* Build the head of the list structure. */
 394   ans = tl = scm_cons (tmp, SCM_EOL);
 395
 396   if (SCM_COPY_SOURCE_P)
 397     ans2 = tl2 = scm_cons (scm_is_pair (tmp)
 398                            ? copy
 399                            : tmp,
 400                            SCM_EOL);
 401
 402   while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
 403     {
 404       SCM new_tail;
 405
 406       if (c == ')' || (SCM_SQUARE_BRACKETS_P && c == ']'))
 407         scm_i_input_error (FUNC_NAME, port,
 408                            "in pair: mismatched close paren: ~A",
 409                            scm_list_1 (SCM_MAKE_CHAR (c)));
 410
 411       scm_ungetc (c, port);
 412       tmp = scm_read_expression (port);
 413
 414       if (scm_is_eq (scm_sym_dot, tmp))
 415         {
 416           SCM_SETCDR (tl, tmp = scm_read_expression (port));
 417
 418           if (SCM_COPY_SOURCE_P)
 419             SCM_SETCDR (tl2, scm_cons (scm_is_pair (tmp) ? copy : tmp,
 420                                        SCM_EOL));
 421
 422           c = flush_ws (port, FUNC_NAME);
 423           if (terminating_char != c)
 424             scm_i_input_error (FUNC_NAME, port,
 425                                "in pair: missing close paren", SCM_EOL);
 426           goto exit;
 427         }
 428
 429       new_tail = scm_cons (tmp, SCM_EOL);
 430       SCM_SETCDR (tl, new_tail);
 431       tl = new_tail;
 432
 433       if (SCM_COPY_SOURCE_P)
 434         {
 435           SCM new_tail2 = scm_cons (scm_is_pair (tmp)
 436                                     ? copy
 437                                     : tmp, SCM_EOL);
 438           SCM_SETCDR (tl2, new_tail2);
 439           tl2 = new_tail2;
 440         }
 441     }
 442
 443  exit:
 444   if (SCM_RECORD_POSITIONS_P)
 445     scm_hashq_set_x (scm_source_whash,
 446                      ans,
 447                      scm_make_srcprops (line, column,
 448                                         SCM_FILENAME (port),
 449                                         SCM_COPY_SOURCE_P
 450                                         ? ans2
 451                                         : SCM_UNDEFINED,
 452                                         SCM_EOL));
 453   return ans;
 454 }
 455 #undef FUNC_NAME
 456
 457
 458 /* Read a hexadecimal number NDIGITS in length.  Put its value into the variable
 459    C.  If TERMINATOR is non-null, terminate early if the TERMINATOR character is
 460    found.  */
 461 #define SCM_READ_HEX_ESCAPE(ndigits, terminator)                   \
 462   do                                                               \
 463     {                                                              \
 464       scm_t_wchar a;                                               \
 465       size_t i = 0;                                                \
 466       c = 0;                                                       \
 467       while (i < ndigits)                                          \
 468         {                                                          \
 469           a = scm_getc (port);                                     \
 470           if (a == EOF)                                            \
 471             goto str_eof;                                          \
 472           if (terminator                                           \
 473               && (a == (scm_t_wchar) terminator)                   \
 474               && (i > 0))                                          \
 475             break;                                                 \
 476           if ('0' <= a && a <= '9')                                \
 477             a -= '0';                                              \
 478           else if ('A' <= a && a <= 'F')                           \
 479             a = a - 'A' + 10;                                      \
 480           else if ('a' <= a && a <= 'f')                           \
 481             a = a - 'a' + 10;                                      \
 482           else                                                     \
 483             {                                                      \
 484               c = a;                                               \
 485               goto bad_escaped;                                    \
 486             }                                                      \
 487           c = c * 16 + a;                                          \
 488           i ++;                                                    \
 489         }                                                          \
 490     } while (0)
 491
 492 static void
 493 skip_intraline_whitespace (SCM port)
 494 {
 495   scm_t_wchar c;
 496
 497   do
 498     {
 499       c = scm_getc (port);
 500       if (c == EOF)
 501         return;
 502     }
 503   while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
 504
 505   scm_ungetc (c, port);
 506 }
 507
 508 static SCM
 509 scm_read_string (int chr, SCM port)
 510 #define FUNC_NAME "scm_lreadr"
 511 {
 512   /* For strings smaller than C_STR, this function creates only one Scheme
 513      object (the string returned).  */
 514
 515   SCM str = SCM_BOOL_F;
 516   unsigned c_str_len = 0;
 517   scm_t_wchar c;
 518
 519   str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
 520   while ('"' != (c = scm_getc (port)))
 521     {
 522       if (c == EOF)
 523         {
 524         str_eof:
 525           scm_i_input_error (FUNC_NAME, port,
 526                              "end of file in string constant", SCM_EOL);
 527         }
 528
 529       if (c_str_len + 1 >= scm_i_string_length (str))
 530         {
 531           SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
 532
 533           str = scm_string_append (scm_list_2 (str, addy));
 534         }
 535
 536       if (c == '\\')
 537         {
 538           switch (c = scm_getc (port))
 539             {
 540             case EOF:
 541               goto str_eof;
 542             case '"':
 543             case '\\':
 544               break;
 545             case '\n':
 546               if (SCM_HUNGRY_EOL_ESCAPES_P)
 547                 skip_intraline_whitespace (port);
 548               continue;
 549             case '0':
 550               c = '\0';
 551               break;
 552             case 'f':
 553               c = '\f';
 554               break;
 555             case 'n':
 556               c = '\n';
 557               break;
 558             case 'r':
 559               c = '\r';
 560               break;
 561             case 't':
 562               c = '\t';
 563               break;
 564             case 'a':
 565               c = '\007';
 566               break;
 567             case 'v':
 568               c = '\v';
 569               break;
 570             case 'b':
 571               c = '\010';
 572               break;
 573             case 'x':
 574               if (SCM_R6RS_ESCAPES_P)
 575                 SCM_READ_HEX_ESCAPE (10, ';');
 576               else
 577                 SCM_READ_HEX_ESCAPE (2, '\0');
 578               break;
 579             case 'u':
 580               if (!SCM_R6RS_ESCAPES_P)
 581                 {
 582                   SCM_READ_HEX_ESCAPE (4, '\0');
 583                   break;
 584                 }
 585             case 'U':
 586               if (!SCM_R6RS_ESCAPES_P)
 587                 {
 588                   SCM_READ_HEX_ESCAPE (6, '\0');
 589                   break;
 590                 }
 591             default:
 592             bad_escaped:
 593               scm_i_input_error (FUNC_NAME, port,
 594                                  "illegal character in escape sequence: ~S",
 595                                  scm_list_1 (SCM_MAKE_CHAR (c)));
 596             }
 597         }
 598       str = scm_i_string_start_writing (str);
 599       scm_i_string_set_x (str, c_str_len++, c);
 600       scm_i_string_stop_writing ();
 601     }
 602
 603   if (c_str_len > 0)
 604     {
 605       return scm_i_substring_copy (str, 0, c_str_len);
 606     }
 607
 608   return scm_nullstr;
 609 }
 610 #undef FUNC_NAME
 611
 612
 613 static SCM
 614 scm_read_number (scm_t_wchar chr, SCM port)
 615 {
 616   SCM result, str = SCM_EOL;
 617   char buffer[READER_BUFFER_SIZE];
 618   char *overflow_buffer = NULL;
 619   size_t bytes_read;
 620   int overflow;
 621   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 622
 623   scm_ungetc (chr, port);
 624   overflow = read_complete_token (port, buffer, sizeof (buffer),
 625                                   &overflow_buffer, &bytes_read);
 626
 627   if (!overflow)
 628     str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
 629   else
 630     str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
 631                             pt->ilseq_handler);
 632
 633   result = scm_string_to_number (str, SCM_UNDEFINED);
 634   if (!scm_is_true (result))
 635     {
 636       /* Return a symbol instead of a number */
 637       if (SCM_CASE_INSENSITIVE_P)
 638         str = scm_string_downcase_x (str);
 639       result = scm_string_to_symbol (str);
 640     }
 641
 642   if (overflow)
 643     free (overflow_buffer);
 644   SCM_COL (port) += scm_i_string_length (str);
 645   return result;
 646 }
 647
 648 static SCM
 649 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
 650 {
 651   SCM result;
 652   int ends_with_colon = 0;
 653   size_t bytes_read;
 654   int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
 655   int overflow;
 656   char buffer[READER_BUFFER_SIZE], *overflow_buffer;
 657   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 658   SCM str;
 659
 660   scm_ungetc (chr, port);
 661   overflow = read_complete_token (port, buffer, READER_BUFFER_SIZE,
 662                                   &overflow_buffer, &bytes_read);
 663   if (bytes_read > 0)
 664     {
 665       if (!overflow)
 666         ends_with_colon = buffer[bytes_read - 1] == ':';
 667       else
 668         ends_with_colon = overflow_buffer[bytes_read - 1] == ':';
 669     }
 670
 671   if (postfix && ends_with_colon && (bytes_read > 1))
 672     {
 673       if (!overflow)
 674         str = scm_from_stringn (buffer, bytes_read - 1, pt->encoding, pt->ilseq_handler);
 675       else
 676         str = scm_from_stringn (overflow_buffer, bytes_read - 1, pt->encoding,
 677                                 pt->ilseq_handler);
 678
 679       if (SCM_CASE_INSENSITIVE_P)
 680         str = scm_string_downcase_x (str);
 681       result = scm_symbol_to_keyword (scm_string_to_symbol (str));
 682     }
 683   else
 684     {
 685       if (!overflow)
 686         str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
 687       else
 688         str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
 689                                 pt->ilseq_handler);
 690
 691       if (SCM_CASE_INSENSITIVE_P)
 692         str = scm_string_downcase_x (str);
 693       result = scm_string_to_symbol (str);
 694     }
 695
 696   if (overflow)
 697     free (overflow_buffer);
 698   SCM_COL (port) += scm_i_string_length (str);
 699   return result;
 700 }
 701
 702 static SCM
 703 scm_read_number_and_radix (scm_t_wchar chr, SCM port)
 704 #define FUNC_NAME "scm_lreadr"
 705 {
 706   SCM result;
 707   size_t read;
 708   char buffer[READER_BUFFER_SIZE], *overflow_buffer;
 709   int overflow;
 710   unsigned int radix;
 711   SCM str;
 712   scm_t_port *pt;
 713
 714   switch (chr)
 715     {
 716     case 'B':
 717     case 'b':
 718       radix = 2;
 719       break;
 720
 721     case 'o':
 722     case 'O':
 723       radix = 8;
 724       break;
 725
 726     case 'd':
 727     case 'D':
 728       radix = 10;
 729       break;
 730
 731     case 'x':
 732     case 'X':
 733       radix = 16;
 734       break;
 735
 736     default:
 737       scm_ungetc (chr, port);
 738       scm_ungetc ('#', port);
 739       radix = 10;
 740     }
 741
 742   overflow = read_complete_token (port, buffer, sizeof (buffer),
 743                                   &overflow_buffer, &read);
 744
 745   pt = SCM_PTAB_ENTRY (port);
 746   if (!overflow)
 747     str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
 748   else
 749     str = scm_from_stringn (overflow_buffer, read, pt->encoding,
 750                             pt->ilseq_handler);
 751
 752   result = scm_string_to_number (str, scm_from_uint (radix));
 753
 754   if (overflow)
 755     free (overflow_buffer);
 756
 757   SCM_COL (port) += scm_i_string_length (str);
 758
 759   if (scm_is_true (result))
 760     return result;
 761
 762   scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
 763
 764   return SCM_BOOL_F;
 765 }
 766 #undef FUNC_NAME
 767
 768 static SCM
 769 scm_read_quote (int chr, SCM port)
 770 {
 771   SCM p;
 772   long line = SCM_LINUM (port);
 773   int column = SCM_COL (port) - 1;
 774
 775   switch (chr)
 776     {
 777     case '`':
 778       p = scm_sym_quasiquote;
 779       break;
 780
 781     case '\'':
 782       p = scm_sym_quote;
 783       break;
 784
 785     case ',':
 786       {
 787         scm_t_wchar c;
 788
 789         c = scm_getc (port);
 790         if ('@' == c)
 791           p = scm_sym_uq_splicing;
 792         else
 793           {
 794             scm_ungetc (c, port);
 795             p = scm_sym_unquote;
 796           }
 797         break;
 798       }
 799
 800     default:
 801       fprintf (stderr, "%s: unhandled quote character (%i)\n",
 802                "scm_read_quote", chr);
 803       abort ();
 804     }
 805
 806   p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
 807   if (SCM_RECORD_POSITIONS_P)
 808     scm_hashq_set_x (scm_source_whash, p,
 809                      scm_make_srcprops (line, column,
 810                                         SCM_FILENAME (port),
 811                                         SCM_COPY_SOURCE_P
 812                                         ? (scm_cons2 (SCM_CAR (p),
 813                                                       SCM_CAR (SCM_CDR (p)),
 814                                                       SCM_EOL))
 815                                         : SCM_UNDEFINED,
 816                                         SCM_EOL));
 817
 818
 819   return p;
 820 }
 821
 822 SCM_SYMBOL (sym_syntax, "syntax");
 823 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
 824 SCM_SYMBOL (sym_unsyntax, "unsyntax");
 825 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
 826
 827 static SCM
 828 scm_read_syntax (int chr, SCM port)
 829 {
 830   SCM p;
 831   long line = SCM_LINUM (port);
 832   int column = SCM_COL (port) - 1;
 833
 834   switch (chr)
 835     {
 836     case '`':
 837       p = sym_quasisyntax;
 838       break;
 839
 840     case '\'':
 841       p = sym_syntax;
 842       break;
 843
 844     case ',':
 845       {
 846         int c;
 847
 848         c = scm_getc (port);
 849         if ('@' == c)
 850           p = sym_unsyntax_splicing;
 851         else
 852           {
 853             scm_ungetc (c, port);
 854             p = sym_unsyntax;
 855           }
 856         break;
 857       }
 858
 859     default:
 860       fprintf (stderr, "%s: unhandled syntax character (%i)\n",
 861                "scm_read_syntax", chr);
 862       abort ();
 863     }
 864
 865   p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
 866   if (SCM_RECORD_POSITIONS_P)
 867     scm_hashq_set_x (scm_source_whash, p,
 868                      scm_make_srcprops (line, column,
 869                                         SCM_FILENAME (port),
 870                                         SCM_COPY_SOURCE_P
 871                                         ? (scm_cons2 (SCM_CAR (p),
 872                                                       SCM_CAR (SCM_CDR (p)),
 873                                                       SCM_EOL))
 874                                         : SCM_UNDEFINED,
 875                                         SCM_EOL));
 876
 877
 878   return p;
 879 }
 880
 881 static inline SCM
 882 scm_read_nil (int chr, SCM port)
 883 {
 884   SCM id = scm_read_mixed_case_symbol (chr, port);
 885
 886   if (!scm_is_eq (id, sym_nil))
 887     scm_i_input_error ("scm_read_nil", port,
 888                        "unexpected input while reading #nil: ~a",
 889                        scm_list_1 (id));
 890
 891   return SCM_ELISP_NIL;
 892 }
 893
 894 static inline SCM
 895 scm_read_semicolon_comment (int chr, SCM port)
 896 {
 897   int c;
 898
 899   /* We use the get_byte here because there is no need to get the
 900      locale correct with comment input. This presumes that newline
 901      always represents itself no matter what the encoding is.  */
 902   for (c = scm_get_byte_or_eof (port);
 903        (c != EOF) && (c != '\n');
 904        c = scm_get_byte_or_eof (port));
 905
 906   return SCM_UNSPECIFIED;
 907 }
 908
 909 \f
 910 /* Sharp readers, i.e. readers called after a `#' sign has been read.  */
 911
 912 static SCM
 913 scm_read_boolean (int chr, SCM port)
 914 {
 915   switch (chr)
 916     {
 917     case 't':
 918     case 'T':
 919       return SCM_BOOL_T;
 920
 921     case 'f':
 922     case 'F':
 923       return SCM_BOOL_F;
 924     }
 925
 926   return SCM_UNSPECIFIED;
 927 }
 928
 929 static SCM
 930 scm_read_character (scm_t_wchar chr, SCM port)
 931 #define FUNC_NAME "scm_lreadr"
 932 {
 933   char buffer[READER_CHAR_NAME_MAX_SIZE];
 934   SCM charname;
 935   size_t charname_len, bytes_read;
 936   scm_t_wchar cp;
 937   int overflow;
 938   scm_t_port *pt;
 939
 940   overflow = read_token (port, buffer, READER_CHAR_NAME_MAX_SIZE, &bytes_read);
 941   if (overflow)
 942     scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
 943
 944   if (bytes_read == 0)
 945     {
 946       chr = scm_getc (port);
 947       if (chr == EOF)
 948         scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
 949                            "while reading character", SCM_EOL);
 950
 951       /* CHR must be a token delimiter, like a whitespace.  */
 952       return (SCM_MAKE_CHAR (chr));
 953     }
 954
 955   pt = SCM_PTAB_ENTRY (port);
 956
 957   /* Simple ASCII characters can be processed immediately.  Also, simple
 958      ISO-8859-1 characters can be processed immediately if the encoding for this
 959      port is ISO-8859-1.  */
 960   if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
 961     {
 962       SCM_COL (port) += 1;
 963       return SCM_MAKE_CHAR (buffer[0]);
 964     }
 965
 966   /* Otherwise, convert the buffer into a proper scheme string for
 967      processing.  */
 968   charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
 969                                pt->ilseq_handler);
 970   charname_len = scm_i_string_length (charname);
 971   SCM_COL (port) += charname_len;
 972   cp = scm_i_string_ref (charname, 0);
 973   if (charname_len == 1)
 974     return SCM_MAKE_CHAR (cp);
 975
 976   /* Ignore dotted circles, which may be used to keep combining characters from
 977      combining with the backslash in #\charname.  */
 978   if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
 979     return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
 980
 981   if (cp >= '0' && cp < '8')
 982     {
 983       /* Dirk:FIXME::  This type of character syntax is not R5RS
 984        * compliant.  Further, it should be verified that the constant
 985        * does only consist of octal digits.  */
 986       SCM p = scm_string_to_number (charname, scm_from_uint (8));
 987       if (SCM_I_INUMP (p))
 988         {
 989           scm_t_wchar c = scm_to_uint32 (p);
 990           if (SCM_IS_UNICODE_CHAR (c))
 991             return SCM_MAKE_CHAR (c);
 992           else
 993             scm_i_input_error (FUNC_NAME, port,
 994                                "out-of-range octal character escape: ~a",
 995                                scm_list_1 (charname));
 996         }
 997     }
 998
 999   if (cp == 'x' && (charname_len > 1))
1000     {
1001       SCM p;
1002
1003       /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1004       p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
1005                                 scm_from_uint (16));
1006       if (SCM_I_INUMP (p))
1007         {
1008           scm_t_wchar c = scm_to_uint32 (p);
1009           if (SCM_IS_UNICODE_CHAR (c))
1010             return SCM_MAKE_CHAR (c);
1011           else
1012             scm_i_input_error (FUNC_NAME, port,
1013                                "out-of-range hex character escape: ~a",
1014                                scm_list_1 (charname));
1015         }
1016     }
1017
1018   /* The names of characters should never have non-Latin1
1019      characters.  */
1020   if (scm_i_is_narrow_string (charname)
1021       || scm_i_try_narrow_string (charname))
1022     { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
1023                                        charname_len);
1024       if (scm_is_true (ch))
1025         return ch;
1026     }
1027
1028   scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
1029                      scm_list_1 (charname));
1030
1031   return SCM_UNSPECIFIED;
1032 }
1033 #undef FUNC_NAME
1034
1035 static inline SCM
1036 scm_read_keyword (int chr, SCM port)
1037 {
1038   SCM symbol;
1039
1040   /* Read the symbol that comprises the keyword.  Doing this instead of
1041      invoking a specific symbol reader function allows `scm_read_keyword ()'
1042      to adapt to the delimiters currently valid of symbols.
1043
1044      XXX: This implementation allows sloppy syntaxes like `#:  key'.  */
1045   symbol = scm_read_expression (port);
1046   if (!scm_is_symbol (symbol))
1047     scm_i_input_error ("scm_read_keyword", port,
1048                        "keyword prefix `~a' not followed by a symbol: ~s",
1049                        scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1050
1051   return (scm_symbol_to_keyword (symbol));
1052 }
1053
1054 static inline SCM
1055 scm_read_vector (int chr, SCM port)
1056 {
1057   /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1058      guarantee that it's going to do what we want.  After all, this is an
1059      implementation detail of `scm_read_vector ()', not a desirable
1060      property.  */
1061   return (scm_vector (scm_read_sexp (chr, port)));
1062 }
1063
1064 static inline SCM
1065 scm_read_srfi4_vector (int chr, SCM port)
1066 {
1067   return scm_i_read_array (port, chr);
1068 }
1069
1070 static SCM
1071 scm_read_bytevector (scm_t_wchar chr, SCM port)
1072 {
1073   chr = scm_getc (port);
1074   if (chr != 'u')
1075     goto syntax;
1076
1077   chr = scm_getc (port);
1078   if (chr != '8')
1079     goto syntax;
1080
1081   chr = scm_getc (port);
1082   if (chr != '(')
1083     goto syntax;
1084
1085   return scm_u8_list_to_bytevector (scm_read_sexp (chr, port));
1086
1087  syntax:
1088   scm_i_input_error ("read_bytevector", port,
1089                      "invalid bytevector prefix",
1090                      SCM_MAKE_CHAR (chr));
1091   return SCM_UNSPECIFIED;
1092 }
1093
1094 static SCM
1095 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port)
1096 {
1097   /* Read the `#*10101'-style read syntax for bit vectors in Guile.  This is
1098      terribly inefficient but who cares?  */
1099   SCM s_bits = SCM_EOL;
1100
1101   for (chr = scm_getc (port);
1102        (chr != EOF) && ((chr == '0') || (chr == '1'));
1103        chr = scm_getc (port))
1104     {
1105       s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1106     }
1107
1108   if (chr != EOF)
1109     scm_ungetc (chr, port);
1110
1111   return scm_bitvector (scm_reverse_x (s_bits, SCM_EOL));
1112 }
1113
1114 static inline SCM
1115 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1116 {
1117   int bang_seen = 0;
1118
1119   for (;;)
1120     {
1121       int c = scm_getc (port);
1122
1123       if (c == EOF)
1124         scm_i_input_error ("skip_block_comment", port,
1125                            "unterminated `#! ... !#' comment", SCM_EOL);
1126
1127       if (c == '!')
1128         bang_seen = 1;
1129       else if (c == '#' && bang_seen)
1130         break;
1131       else
1132         bang_seen = 0;
1133     }
1134
1135   return SCM_UNSPECIFIED;
1136 }
1137
1138 static SCM
1139 scm_read_shebang (scm_t_wchar chr, SCM port)
1140 {
1141   int c = 0;
1142   if ((c = scm_get_byte_or_eof (port)) != 'r')
1143     {
1144       scm_ungetc (c, port);
1145       return scm_read_scsh_block_comment (chr, port);
1146     }
1147   if ((c = scm_get_byte_or_eof (port)) != '6')
1148     {
1149       scm_ungetc (c, port);
1150       scm_ungetc ('r', port);
1151       return scm_read_scsh_block_comment (chr, port);
1152     }
1153   if ((c = scm_get_byte_or_eof (port)) != 'r')
1154     {
1155       scm_ungetc (c, port);
1156       scm_ungetc ('6', port);
1157       scm_ungetc ('r', port);
1158       return scm_read_scsh_block_comment (chr, port);
1159     }
1160   if ((c = scm_get_byte_or_eof (port)) != 's')
1161     {
1162       scm_ungetc (c, port);
1163       scm_ungetc ('r', port);
1164       scm_ungetc ('6', port);
1165       scm_ungetc ('r', port);
1166       return scm_read_scsh_block_comment (chr, port);
1167     }
1168
1169   return SCM_UNSPECIFIED;
1170 }
1171
1172 static SCM
1173 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1174 {
1175   /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1176      nested.  So care must be taken.  */
1177   int nesting_level = 1;
1178   int opening_seen = 0, closing_seen = 0;
1179
1180   while (nesting_level > 0)
1181     {
1182       int c = scm_getc (port);
1183
1184       if (c == EOF)
1185         scm_i_input_error ("scm_read_r6rs_block_comment", port,
1186                            "unterminated `#| ... |#' comment", SCM_EOL);
1187
1188       if (opening_seen)
1189         {
1190           if (c == '|')
1191             nesting_level++;
1192           opening_seen = 0;
1193         }
1194       else if (closing_seen)
1195         {
1196           if (c == '#')
1197             nesting_level--;
1198           closing_seen = 0;
1199         }
1200       else if (c == '|')
1201         closing_seen = 1;
1202       else if (c == '#')
1203         opening_seen = 1;
1204       else
1205         opening_seen = closing_seen = 0;
1206     }
1207
1208   return SCM_UNSPECIFIED;
1209 }
1210
1211 static SCM
1212 scm_read_commented_expression (scm_t_wchar chr, SCM port)
1213 {
1214   scm_t_wchar c;
1215
1216   c = flush_ws (port, (char *) NULL);
1217   if (EOF == c)
1218     scm_i_input_error ("read_commented_expression", port,
1219                        "no expression after #; comment", SCM_EOL);
1220   scm_ungetc (c, port);
1221   scm_read_expression (port);
1222   return SCM_UNSPECIFIED;
1223 }
1224
1225 static SCM
1226 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1227 {
1228   /* Guile's extended symbol read syntax looks like this:
1229
1230        #{This is all a symbol name}#
1231
1232      So here, CHR is expected to be `{'.  */
1233   int saw_brace = 0;
1234   size_t len = 0;
1235   SCM buf = scm_i_make_string (1024, NULL, 0);
1236
1237   buf = scm_i_string_start_writing (buf);
1238
1239   while ((chr = scm_getc (port)) != EOF)
1240     {
1241       if (saw_brace)
1242         {
1243           if (chr == '#')
1244             {
1245               break;
1246             }
1247           else
1248             {
1249               saw_brace = 0;
1250               scm_i_string_set_x (buf, len++, '}');
1251             }
1252         }
1253
1254       if (chr == '}')
1255         saw_brace = 1;
1256       else if (chr == '\\')
1257         {
1258           /* It used to be that print.c would print extended-read-syntax
1259              symbols with backslashes before "non-standard" chars, but
1260              this routine wouldn't do anything with those escapes.
1261              Bummer.  What we've done is to change print.c to output
1262              R6RS hex escapes for those characters, relying on the fact
1263              that the extended read syntax would never put a `\' before
1264              an `x'.  For now, we just ignore other instances of
1265              backslash in the string.  */
1266           switch ((chr = scm_getc (port)))
1267             {
1268             case EOF:
1269               goto done;
1270             case 'x':
1271               {
1272                 scm_t_wchar c;
1273
1274                 SCM_READ_HEX_ESCAPE (10, ';');
1275                 scm_i_string_set_x (buf, len++, c);
1276                 break;
1277
1278               str_eof:
1279                 chr = EOF;
1280                 goto done;
1281
1282               bad_escaped:
1283                 scm_i_string_stop_writing ();
1284                 scm_i_input_error ("scm_read_extended_symbol", port,
1285                                    "illegal character in escape sequence: ~S",
1286                                    scm_list_1 (SCM_MAKE_CHAR (c)));
1287                 break;
1288               }
1289             default:
1290               scm_i_string_set_x (buf, len++, chr);
1291               break;
1292             }
1293         }
1294       else
1295         scm_i_string_set_x (buf, len++, chr);
1296
1297       if (len >= scm_i_string_length (buf) - 2)
1298         {
1299           SCM addy;
1300
1301           scm_i_string_stop_writing ();
1302           addy = scm_i_make_string (1024, NULL, 0);
1303           buf = scm_string_append (scm_list_2 (buf, addy));
1304           len = 0;
1305           buf = scm_i_string_start_writing (buf);
1306         }
1307     }
1308
1309  done:
1310   scm_i_string_stop_writing ();
1311   if (chr == EOF)
1312     scm_i_input_error ("scm_read_extended_symbol", port,
1313                        "end of file while reading symbol", SCM_EOL);
1314
1315   return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1316 }
1317
1318
1319 \f
1320 /* Top-level token readers, i.e., dispatchers.  */
1321
1322 static SCM
1323 scm_read_sharp_extension (int chr, SCM port)
1324 {
1325   SCM proc;
1326
1327   proc = scm_get_hash_procedure (chr);
1328   if (scm_is_true (scm_procedure_p (proc)))
1329     {
1330       long line = SCM_LINUM (port);
1331       int column = SCM_COL (port) - 2;
1332       SCM got;
1333
1334       got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1335       if (!scm_is_eq (got, SCM_UNSPECIFIED))
1336         {
1337           if (SCM_RECORD_POSITIONS_P)
1338             return (recsexpr (got, line, column,
1339                               SCM_FILENAME (port)));
1340           else
1341             return got;
1342         }
1343     }
1344
1345   return SCM_UNSPECIFIED;
1346 }
1347
1348 /* The reader for the sharp `#' character.  It basically dispatches reads
1349    among the above token readers.   */
1350 static SCM
1351 scm_read_sharp (scm_t_wchar chr, SCM port)
1352 #define FUNC_NAME "scm_lreadr"
1353 {
1354   SCM result;
1355
1356   chr = scm_getc (port);
1357
1358   result = scm_read_sharp_extension (chr, port);
1359   if (!scm_is_eq (result, SCM_UNSPECIFIED))
1360     return result;
1361
1362   switch (chr)
1363     {
1364     case '\\':
1365       return (scm_read_character (chr, port));
1366     case '(':
1367       return (scm_read_vector (chr, port));
1368     case 's':
1369     case 'u':
1370     case 'f':
1371     case 'c':
1372       /* This one may return either a boolean or an SRFI-4 vector.  */
1373       return (scm_read_srfi4_vector (chr, port));
1374     case 'v':
1375       return (scm_read_bytevector (chr, port));
1376     case '*':
1377       return (scm_read_guile_bit_vector (chr, port));
1378     case 't':
1379     case 'T':
1380     case 'F':
1381       /* This one may return either a boolean or an SRFI-4 vector.  */
1382       return (scm_read_boolean (chr, port));
1383     case ':':
1384       return (scm_read_keyword (chr, port));
1385     case '0': case '1': case '2': case '3': case '4':
1386     case '5': case '6': case '7': case '8': case '9':
1387     case '@':
1388       return (scm_i_read_array (port, chr));
1389
1390     case 'i':
1391     case 'e':
1392     case 'b':
1393     case 'B':
1394     case 'o':
1395     case 'O':
1396     case 'd':
1397     case 'D':
1398     case 'x':
1399     case 'X':
1400     case 'I':
1401     case 'E':
1402       return (scm_read_number_and_radix (chr, port));
1403     case '{':
1404       return (scm_read_extended_symbol (chr, port));
1405     case '!':
1406       return (scm_read_shebang (chr, port));
1407     case ';':
1408       return (scm_read_commented_expression (chr, port));
1409     case '`':
1410     case '\'':
1411     case ',':
1412       return (scm_read_syntax (chr, port));
1413     case 'n':
1414       return (scm_read_nil (chr, port));
1415     default:
1416       result = scm_read_sharp_extension (chr, port);
1417       if (scm_is_eq (result, SCM_UNSPECIFIED))
1418         {
1419           /* To remain compatible with 1.8 and earlier, the following
1420              characters have lower precedence than `read-hash-extend'
1421              characters.  */
1422           switch (chr)
1423             {
1424             case '|':
1425               return scm_read_r6rs_block_comment (chr, port);
1426             default:
1427               scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1428                                  scm_list_1 (SCM_MAKE_CHAR (chr)));
1429             }
1430         }
1431       else
1432         return result;
1433     }
1434
1435   return SCM_UNSPECIFIED;
1436 }
1437 #undef FUNC_NAME
1438
1439 static SCM
1440 scm_read_expression (SCM port)
1441 #define FUNC_NAME "scm_read_expression"
1442 {
1443   while (1)
1444     {
1445       register scm_t_wchar chr;
1446
1447       chr = scm_getc (port);
1448
1449       switch (chr)
1450         {
1451         case SCM_WHITE_SPACES:
1452         case SCM_LINE_INCREMENTORS:
1453           break;
1454         case ';':
1455           (void) scm_read_semicolon_comment (chr, port);
1456           break;
1457         case '[':
1458           if (!SCM_SQUARE_BRACKETS_P)
1459             return (scm_read_mixed_case_symbol (chr, port));
1460           /* otherwise fall through */
1461         case '(':
1462           return (scm_read_sexp (chr, port));
1463         case '"':
1464           return (scm_read_string (chr, port));
1465         case '\'':
1466         case '`':
1467         case ',':
1468           return (scm_read_quote (chr, port));
1469         case '#':
1470           {
1471             SCM result;
1472             result = scm_read_sharp (chr, port);
1473             if (scm_is_eq (result, SCM_UNSPECIFIED))
1474               /* We read a comment or some such.  */
1475               break;
1476             else
1477               return result;
1478           }
1479         case ')':
1480           scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1481           break;
1482         case ']':
1483           if (SCM_SQUARE_BRACKETS_P)
1484             scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1485           /* otherwise fall through */
1486         case EOF:
1487           return SCM_EOF_VAL;
1488         case ':':
1489           if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1490             return scm_symbol_to_keyword (scm_read_expression (port));
1491           /* Fall through.  */
1492
1493         default:
1494           {
1495             if (((chr >= '0') && (chr <= '9'))
1496                 || (strchr ("+-.", chr)))
1497               return (scm_read_number (chr, port));
1498             else
1499               return (scm_read_mixed_case_symbol (chr, port));
1500           }
1501         }
1502     }
1503 }
1504 #undef FUNC_NAME
1505
1506 \f
1507 /* Actual reader.  */
1508
1509 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1510             (SCM port),
1511             "Read an s-expression from the input port @var{port}, or from\n"
1512             "the current input port if @var{port} is not specified.\n"
1513             "Any whitespace before the next token is discarded.")
1514 #define FUNC_NAME s_scm_read
1515 {
1516   int c;
1517
1518   if (SCM_UNBNDP (port))
1519     port = scm_current_input_port ();
1520   SCM_VALIDATE_OPINPORT (1, port);
1521
1522   c = flush_ws (port, (char *) NULL);
1523   if (EOF == c)
1524     return SCM_EOF_VAL;
1525   scm_ungetc (c, port);
1526
1527   return (scm_read_expression (port));
1528 }
1529 #undef FUNC_NAME
1530
1531
1532 \f
1533
1534 /* Used when recording expressions constructed by `scm_read_sharp ()'.  */
1535 static SCM
1536 recsexpr (SCM obj, long line, int column, SCM filename)
1537 {
1538   if (!scm_is_pair(obj)) {
1539     return obj;
1540   } else {
1541     SCM tmp, copy;
1542     /* If this sexpr is visible in the read:sharp source, we want to
1543        keep that information, so only record non-constant cons cells
1544        which haven't previously been read by the reader. */
1545     if (scm_is_false (scm_hashq_ref (scm_source_whash, obj, SCM_BOOL_F)))
1546       {
1547         if (SCM_COPY_SOURCE_P)
1548           {
1549             copy = scm_cons (recsexpr (SCM_CAR (obj), line, column, filename),
1550                              SCM_UNDEFINED);
1551             for (tmp = obj; scm_is_pair (tmp); tmp = SCM_CDR (tmp))
1552               {
1553                 SCM_SETCDR (copy, scm_cons (recsexpr (SCM_CAR (tmp),
1554                                                       line,
1555                                                       column,
1556                                                       filename),
1557                                             SCM_UNDEFINED));
1558                 copy = SCM_CDR (copy);
1559               }
1560             SCM_SETCDR (copy, tmp);
1561           }
1562         else
1563           {
1564             recsexpr (SCM_CAR (obj), line, column, filename);
1565             for (tmp = obj; scm_is_pair (tmp); tmp = SCM_CDR (tmp))
1566               recsexpr (SCM_CAR (tmp), line, column, filename);
1567             copy = SCM_UNDEFINED;
1568           }
1569         scm_hashq_set_x (scm_source_whash,
1570                          obj,
1571                          scm_make_srcprops (line,
1572                                             column,
1573                                             filename,
1574                                             copy,
1575                                             SCM_EOL));
1576       }
1577     return obj;
1578   }
1579 }
1580
1581 /* Manipulate the read-hash-procedures alist.  This could be written in
1582    Scheme, but maybe it will also be used by C code during initialisation.  */
1583 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1584             (SCM chr, SCM proc),
1585             "Install the procedure @var{proc} for reading expressions\n"
1586             "starting with the character sequence @code{#} and @var{chr}.\n"
1587             "@var{proc} will be called with two arguments:  the character\n"
1588             "@var{chr} and the port to read further data from. The object\n"
1589             "returned will be the return value of @code{read}. \n"
1590             "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1591             )
1592 #define FUNC_NAME s_scm_read_hash_extend
1593 {
1594   SCM this;
1595   SCM prev;
1596
1597   SCM_VALIDATE_CHAR (1, chr);
1598   SCM_ASSERT (scm_is_false (proc)
1599               || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1600               proc, SCM_ARG2, FUNC_NAME);
1601
1602   /* Check if chr is already in the alist.  */
1603   this = scm_i_read_hash_procedures_ref ();
1604   prev = SCM_BOOL_F;
1605   while (1)
1606     {
1607       if (scm_is_null (this))
1608         {
1609           /* not found, so add it to the beginning.  */
1610           if (scm_is_true (proc))
1611             {
1612               SCM new = scm_cons (scm_cons (chr, proc),
1613                                   scm_i_read_hash_procedures_ref ());
1614               scm_i_read_hash_procedures_set_x (new);
1615             }
1616           break;
1617         }
1618       if (scm_is_eq (chr, SCM_CAAR (this)))
1619         {
1620           /* already in the alist.  */
1621           if (scm_is_false (proc))
1622             {
1623               /* remove it.  */
1624               if (scm_is_false (prev))
1625                 {
1626                   SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1627                   scm_i_read_hash_procedures_set_x (rest);
1628                 }
1629               else
1630                 scm_set_cdr_x (prev, SCM_CDR (this));
1631             }
1632           else
1633             {
1634               /* replace it.  */
1635               scm_set_cdr_x (SCM_CAR (this), proc);
1636             }
1637           break;
1638         }
1639       prev = this;
1640       this = SCM_CDR (this);
1641     }
1642
1643   return SCM_UNSPECIFIED;
1644 }
1645 #undef FUNC_NAME
1646
1647 /* Recover the read-hash procedure corresponding to char c.  */
1648 static SCM
1649 scm_get_hash_procedure (int c)
1650 {
1651   SCM rest = scm_i_read_hash_procedures_ref ();
1652
1653   while (1)
1654     {
1655       if (scm_is_null (rest))
1656         return SCM_BOOL_F;
1657
1658       if (SCM_CHAR (SCM_CAAR (rest)) == c)
1659         return SCM_CDAR (rest);
1660
1661       rest = SCM_CDR (rest);
1662     }
1663 }
1664
1665 #define SCM_ENCODING_SEARCH_SIZE (500)
1666
1667 /* Search the first few hundred characters of a file for an Emacs-like coding
1668    declaration.  Returns either NULL or a string whose storage has been
1669    allocated with `scm_gc_malloc ()'.  */
1670 char *
1671 scm_i_scan_for_encoding (SCM port)
1672 {
1673   scm_t_port *pt;
1674   char header[SCM_ENCODING_SEARCH_SIZE+1];
1675   size_t bytes_read, encoding_length, i;
1676   char *encoding = NULL;
1677   int utf8_bom = 0;
1678   char *pos, *encoding_start;
1679   int in_comment;
1680
1681   pt = SCM_PTAB_ENTRY (port);
1682
1683   if (pt->rw_active == SCM_PORT_WRITE)
1684     scm_flush (port);
1685
1686   if (pt->rw_random)
1687     pt->rw_active = SCM_PORT_READ;
1688
1689   if (pt->read_pos == pt->read_end)
1690     {
1691       /* We can use the read buffer, and thus avoid a seek. */
1692       if (scm_fill_input (port) == EOF)
1693         return NULL;
1694
1695       bytes_read = pt->read_end - pt->read_pos;
1696       if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
1697         bytes_read = SCM_ENCODING_SEARCH_SIZE;
1698
1699       if (bytes_read <= 1)
1700         /* An unbuffered port -- don't scan.  */
1701         return NULL;
1702
1703       memcpy (header, pt->read_pos, bytes_read);
1704       header[bytes_read] = '\0';
1705     }
1706   else
1707     {
1708       /* Try to read some bytes and then seek back.  Not all ports
1709          support seeking back; and indeed some file ports (like
1710          /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
1711          check performed by SCM_FPORT_FDES---but fail to seek
1712          backwards.  Hence this block comes second.  We prefer to use
1713          the read buffer in-place.  */
1714       if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1715         return NULL;
1716
1717       bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1718       header[bytes_read] = '\0';
1719       scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1720     }
1721
1722   if (bytes_read > 3
1723       && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1724     utf8_bom = 1;
1725
1726   /* search past "coding[:=]" */
1727   pos = header;
1728   while (1)
1729     {
1730       if ((pos = strstr(pos, "coding")) == NULL)
1731         return NULL;
1732
1733       pos += strlen("coding");
1734       if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1735           (*pos == ':' || *pos == '='))
1736         {
1737           pos ++;
1738           break;
1739         }
1740     }
1741
1742   /* skip spaces */
1743   while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1744          (*pos == ' ' || *pos == '\t'))
1745     pos ++;
1746
1747   /* grab the next token */
1748   encoding_start = pos;
1749   i = 0;
1750   while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
1751          && encoding_start + i - header < bytes_read
1752          && (isalnum ((int) encoding_start[i])
1753              || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
1754     i++;
1755
1756   encoding_length = i;
1757   if (encoding_length == 0)
1758     return NULL;
1759
1760   encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
1761   for (i = 0; i < encoding_length; i++)
1762     encoding[i] = toupper ((int) encoding[i]);
1763
1764   /* push backwards to make sure we were in a comment */
1765   in_comment = 0;
1766   pos = encoding_start;
1767   while (pos >= header)
1768     {
1769       if (*pos == ';')
1770         {
1771           in_comment = 1;
1772           break;
1773         }
1774       else if (*pos == '\n' || pos == header)
1775         {
1776           /* This wasn't in a semicolon comment. Check for a
1777            hash-bang comment. */
1778           char *beg = strstr (header, "#!");
1779           char *end = strstr (header, "!#");
1780           if (beg < encoding_start && encoding_start + encoding_length <= end)
1781             in_comment = 1;
1782           break;
1783         }
1784       else
1785         {
1786           pos --;
1787           continue;
1788         }
1789     }
1790   if (!in_comment)
1791     /* This wasn't in a comment */
1792     return NULL;
1793
1794   if (utf8_bom && strcmp(encoding, "UTF-8"))
1795     scm_misc_error (NULL,
1796                     "the port input declares the encoding ~s but is encoded as UTF-8",
1797                     scm_list_1 (scm_from_locale_string (encoding)));
1798
1799   return encoding;
1800 }
1801
1802 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1803             (SCM port),
1804             "Scans the port for an Emacs-like character coding declaration\n"
1805             "near the top of the contents of a port with random-accessible contents.\n"
1806             "The coding declaration is of the form\n"
1807             "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1808             "\n"
1809             "Returns a string containing the character encoding of the file\n"
1810             "if a declaration was found, or @code{#f} otherwise.\n")
1811 #define FUNC_NAME s_scm_file_encoding
1812 {
1813   char *enc;
1814   SCM s_enc;
1815
1816   SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
1817
1818   enc = scm_i_scan_for_encoding (port);
1819   if (enc == NULL)
1820     return SCM_BOOL_F;
1821   else
1822     {
1823       s_enc = scm_from_locale_string (enc);
1824       return s_enc;
1825     }
1826
1827   return SCM_BOOL_F;
1828 }
1829 #undef FUNC_NAME
1830
1831 void
1832 scm_init_read ()
1833 {
1834   SCM read_hash_procs;
1835
1836   read_hash_procs = scm_make_fluid ();
1837   scm_fluid_set_x (read_hash_procs, SCM_EOL);
1838
1839   scm_i_read_hash_procedures =
1840     SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
1841
1842   scm_init_opts (scm_read_options, scm_read_opts);
1843 #include "libguile/read.x"
1844 }
1845
1846 /*
1847   Local Variables:
1848   c-file-style: "gnu"
1849   End:
1850 */