libguile/read.c

   1 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
   2  *   2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public License
   6  * as published by the Free Software Foundation; either version 3 of
   7  * the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful, but
  10  * WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  17  * 02110-1301 USA
  18  */
  19
  20
  21 \f
  22
  23 #ifdef HAVE_CONFIG_H
  24 # include <config.h>
  25 #endif
  26
  27 #include <stdio.h>
  28 #include <ctype.h>
  29 #include <string.h>
  30 #include <unistd.h>
  31 #include <unicase.h>
  32 #include <unictype.h>
  33
  34 #include "libguile/_scm.h"
  35 #include "libguile/bytevectors.h"
  36 #include "libguile/chars.h"
  37 #include "libguile/eval.h"
  38 #include "libguile/arrays.h"
  39 #include "libguile/bitvectors.h"
  40 #include "libguile/keywords.h"
  41 #include "libguile/alist.h"
  42 #include "libguile/srcprop.h"
  43 #include "libguile/hashtab.h"
  44 #include "libguile/hash.h"
  45 #include "libguile/ports.h"
  46 #include "libguile/fports.h"
  47 #include "libguile/root.h"
  48 #include "libguile/strings.h"
  49 #include "libguile/strports.h"
  50 #include "libguile/vectors.h"
  51 #include "libguile/validate.h"
  52 #include "libguile/srfi-4.h"
  53 #include "libguile/srfi-13.h"
  54
  55 #include "libguile/read.h"
  56 #include "libguile/private-options.h"
  57
  58
  59 \f
  60
  61 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
  62 SCM_SYMBOL (scm_keyword_prefix, "prefix");
  63 SCM_SYMBOL (scm_keyword_postfix, "postfix");
  64 SCM_SYMBOL (sym_nil, "nil");
  65
  66 scm_t_option scm_read_opts[] = {
  67   { SCM_OPTION_BOOLEAN, "copy", 0,
  68     "Copy source code expressions." },
  69   { SCM_OPTION_BOOLEAN, "positions", 1,
  70     "Record positions of source code expressions." },
  71   { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
  72     "Convert symbols to lower case."},
  73   { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
  74     "Style of keyword recognition: #f, 'prefix or 'postfix."},
  75   { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
  76     "Use R6RS variable-length character and string hex escapes."},
  77   { SCM_OPTION_BOOLEAN, "square-brackets", 1,
  78     "Treat `[' and `]' as parentheses, for R6RS compatibility."},
  79   { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
  80     "In strings, consume leading whitespace after an escaped end-of-line."},
  81   { 0, },
  82 };
  83
  84 /*
  85   Give meaningful error messages for errors
  86
  87   We use the format
  88
  89   FILE:LINE:COL: MESSAGE
  90   This happened in ....
  91
  92   This is not standard GNU format, but the test-suite likes the real
  93   message to be in front.
  94
  95  */
  96
  97
  98 void
  99 scm_i_input_error (char const *function,
 100                    SCM port, const char *message, SCM arg)
 101 {
 102   SCM fn = (scm_is_string (SCM_FILENAME(port))
 103             ? SCM_FILENAME(port)
 104             : scm_from_locale_string ("#<unknown port>"));
 105
 106   SCM string_port = scm_open_output_string ();
 107   SCM string = SCM_EOL;
 108   scm_simple_format (string_port,
 109                      scm_from_locale_string ("~A:~S:~S: ~A"),
 110                      scm_list_4 (fn,
 111                                  scm_from_long (SCM_LINUM (port) + 1),
 112                                  scm_from_int (SCM_COL (port) + 1),
 113                                  scm_from_locale_string (message)));
 114
 115   string = scm_get_output_string (string_port);
 116   scm_close_output_port (string_port);
 117   scm_error_scm (scm_from_latin1_symbol ("read-error"),
 118                  function? scm_from_locale_string (function) : SCM_BOOL_F,
 119                  string,
 120                  arg,
 121                  SCM_BOOL_F);
 122 }
 123
 124
 125 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
 126             (SCM setting),
 127             "Option interface for the read options. Instead of using\n"
 128             "this procedure directly, use the procedures @code{read-enable},\n"
 129             "@code{read-disable}, @code{read-set!} and @code{read-options}.")
 130 #define FUNC_NAME s_scm_read_options
 131 {
 132   SCM ans = scm_options (setting,
 133                          scm_read_opts,
 134                          FUNC_NAME);
 135   if (SCM_COPY_SOURCE_P)
 136     SCM_RECORD_POSITIONS_P = 1;
 137   return ans;
 138 }
 139 #undef FUNC_NAME
 140
 141 /* A fluid referring to an association list mapping extra hash
 142    characters to procedures.  */
 143 static SCM *scm_i_read_hash_procedures;
 144
 145 static SCM
 146 scm_i_read_hash_procedures_ref (void)
 147 {
 148   return scm_fluid_ref (*scm_i_read_hash_procedures);
 149 }
 150
 151 static void
 152 scm_i_read_hash_procedures_set_x (SCM value)
 153 {
 154   scm_fluid_set_x (*scm_i_read_hash_procedures, value);
 155 }
 156
 157 \f
 158 /* Token readers.  */
 159
 160
 161 /* Size of the C buffer used to read symbols and numbers.  */
 162 #define READER_BUFFER_SIZE            128
 163
 164 /* Size of the C buffer used to read strings.  */
 165 #define READER_STRING_BUFFER_SIZE     512
 166
 167 /* The maximum size of Scheme character names.  */
 168 #define READER_CHAR_NAME_MAX_SIZE      50
 169
 170
 171 /* `isblank' is only in C99.  */
 172 #define CHAR_IS_BLANK_(_chr)                                    \
 173   (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n')      \
 174    || ((_chr) == '\f') || ((_chr) == '\r'))
 175
 176 #ifdef MSDOS
 177 # define CHAR_IS_BLANK(_chr)                    \
 178   ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
 179 #else
 180 # define CHAR_IS_BLANK CHAR_IS_BLANK_
 181 #endif
 182
 183
 184 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
 185    structure'').  */
 186 #define CHAR_IS_R5RS_DELIMITER(c)                               \
 187   (CHAR_IS_BLANK (c)                                            \
 188    || (c == ')') || (c == '(') || (c == ';') || (c == '"')      \
 189    || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
 190
 191 #define CHAR_IS_DELIMITER  CHAR_IS_R5RS_DELIMITER
 192
 193 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
 194    Structure''.  */
 195 #define CHAR_IS_EXPONENT_MARKER(_chr)                           \
 196   (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f')        \
 197    || ((_chr) == 'd') || ((_chr) == 'l'))
 198
 199 /* Read an SCSH block comment.  */
 200 static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
 201 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
 202 static SCM scm_read_commented_expression (scm_t_wchar, SCM);
 203 static SCM scm_read_shebang (scm_t_wchar, SCM);
 204 static SCM scm_get_hash_procedure (int);
 205
 206 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 207    result in the pre-allocated buffer BUF.  Return zero if the whole token has
 208    fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
 209    bytes actually read.  */
 210 static int
 211 read_token (SCM port, char *buf, const size_t buf_size, size_t *read)
 212  {
 213    *read = 0;
 214
 215    while (*read < buf_size)
 216      {
 217        int chr;
 218
 219        chr = scm_get_byte_or_eof (port);
 220
 221        if (chr == EOF)
 222         return 0;
 223       else if (CHAR_IS_DELIMITER (chr))
 224         {
 225           scm_unget_byte (chr, port);
 226           return 0;
 227         }
 228       else
 229         {
 230           *buf = (char) chr;
 231           buf++, (*read)++;
 232         }
 233      }
 234
 235    return 1;
 236  }
 237
 238 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 239    result in the pre-allocated buffer BUFFER, if the whole token has fewer than
 240    BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
 241    caller.  Return zero if the token fits in BUFFER, non-zero otherwise. READ
 242    will be set the number of bytes actually read.  */
 243 static int
 244 read_complete_token (SCM port, char *buffer, const size_t buffer_size,
 245                            char **overflow_buffer, size_t *read)
 246 {
 247   int overflow = 0;
 248   size_t bytes_read, overflow_size;
 249
 250   *overflow_buffer = NULL;
 251   overflow_size = 0;
 252
 253   do
 254     {
 255       overflow = read_token (port, buffer, buffer_size, &bytes_read);
 256       if (bytes_read == 0)
 257         break;
 258       if (overflow || overflow_size != 0)
 259         {
 260           if (overflow_size == 0)
 261             {
 262               *overflow_buffer = scm_malloc (bytes_read);
 263               memcpy (*overflow_buffer, buffer, bytes_read);
 264               overflow_size = bytes_read;
 265             }
 266           else
 267             {
 268               *overflow_buffer = scm_realloc (*overflow_buffer, overflow_size + bytes_read);
 269               memcpy (*overflow_buffer + overflow_size, buffer, bytes_read);
 270               overflow_size += bytes_read;
 271             }
 272         }
 273     }
 274   while (overflow);
 275
 276   if (overflow_size)
 277     *read = overflow_size;
 278   else
 279     *read = bytes_read;
 280
 281   return (overflow_size != 0);
 282 }
 283
 284 /* Skip whitespace from PORT and return the first non-whitespace character
 285    read.  Raise an error on end-of-file.  */
 286 static int
 287 flush_ws (SCM port, const char *eoferr)
 288 {
 289   scm_t_wchar c;
 290   while (1)
 291     switch (c = scm_getc (port))
 292       {
 293       case EOF:
 294       goteof:
 295         if (eoferr)
 296           {
 297             scm_i_input_error (eoferr,
 298                                port,
 299                                "end of file",
 300                                SCM_EOL);
 301           }
 302         return c;
 303
 304       case ';':
 305       lp:
 306         switch (c = scm_getc (port))
 307           {
 308           case EOF:
 309             goto goteof;
 310           default:
 311             goto lp;
 312           case SCM_LINE_INCREMENTORS:
 313             break;
 314           }
 315         break;
 316
 317       case '#':
 318         switch (c = scm_getc (port))
 319           {
 320           case EOF:
 321             eoferr = "read_sharp";
 322             goto goteof;
 323           case '!':
 324             scm_read_shebang (c, port);
 325             break;
 326           case ';':
 327             scm_read_commented_expression (c, port);
 328             break;
 329           case '|':
 330             if (scm_is_false (scm_get_hash_procedure (c)))
 331               {
 332                 scm_read_r6rs_block_comment (c, port);
 333                 break;
 334               }
 335             /* fall through */
 336           default:
 337             scm_ungetc (c, port);
 338             return '#';
 339           }
 340         break;
 341
 342       case SCM_LINE_INCREMENTORS:
 343       case SCM_SINGLE_SPACES:
 344       case '\t':
 345         break;
 346
 347       default:
 348         return c;
 349       }
 350
 351   return 0;
 352 }
 353
 354
 355 \f
 356 /* Token readers.  */
 357
 358 static SCM scm_read_expression (SCM port);
 359 static SCM scm_read_sharp (int chr, SCM port);
 360
 361
 362 static SCM
 363 maybe_annotate_source (SCM x, SCM port, long line, int column)
 364 {
 365   if (SCM_RECORD_POSITIONS_P)
 366     scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
 367   return x;
 368 }
 369
 370 static SCM
 371 scm_read_sexp (scm_t_wchar chr, SCM port)
 372 #define FUNC_NAME "scm_i_lreadparen"
 373 {
 374   int c;
 375   SCM tmp, tl, ans = SCM_EOL;
 376   const int terminating_char = ((chr == '[') ? ']' : ')');
 377
 378   /* Need to capture line and column numbers here. */
 379   long line = SCM_LINUM (port);
 380   int column = SCM_COL (port) - 1;
 381
 382   c = flush_ws (port, FUNC_NAME);
 383   if (terminating_char == c)
 384     return SCM_EOL;
 385
 386   scm_ungetc (c, port);
 387   tmp = scm_read_expression (port);
 388
 389   /* Note that it is possible for scm_read_expression to return
 390      scm_sym_dot, but not as part of a dotted pair: as in #{.}#.  So
 391      check that it's a real dot by checking `c'.  */
 392   if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
 393     {
 394       ans = scm_read_expression (port);
 395       if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
 396         scm_i_input_error (FUNC_NAME, port, "missing close paren",
 397                            SCM_EOL);
 398       return ans;
 399     }
 400
 401   /* Build the head of the list structure. */
 402   ans = tl = scm_cons (tmp, SCM_EOL);
 403
 404   while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
 405     {
 406       SCM new_tail;
 407
 408       if (c == ')' || (SCM_SQUARE_BRACKETS_P && c == ']'))
 409         scm_i_input_error (FUNC_NAME, port,
 410                            "in pair: mismatched close paren: ~A",
 411                            scm_list_1 (SCM_MAKE_CHAR (c)));
 412
 413       scm_ungetc (c, port);
 414       tmp = scm_read_expression (port);
 415
 416       /* See above note about scm_sym_dot.  */
 417       if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
 418         {
 419           SCM_SETCDR (tl, scm_read_expression (port));
 420
 421           c = flush_ws (port, FUNC_NAME);
 422           if (terminating_char != c)
 423             scm_i_input_error (FUNC_NAME, port,
 424                                "in pair: missing close paren", SCM_EOL);
 425           goto exit;
 426         }
 427
 428       new_tail = scm_cons (tmp, SCM_EOL);
 429       SCM_SETCDR (tl, new_tail);
 430       tl = new_tail;
 431     }
 432
 433  exit:
 434   return maybe_annotate_source (ans, port, line, column);
 435 }
 436 #undef FUNC_NAME
 437
 438
 439 /* Read a hexadecimal number NDIGITS in length.  Put its value into the variable
 440    C.  If TERMINATOR is non-null, terminate early if the TERMINATOR character is
 441    found.  */
 442 #define SCM_READ_HEX_ESCAPE(ndigits, terminator)                   \
 443   do                                                               \
 444     {                                                              \
 445       scm_t_wchar a;                                               \
 446       size_t i = 0;                                                \
 447       c = 0;                                                       \
 448       while (i < ndigits)                                          \
 449         {                                                          \
 450           a = scm_getc (port);                                     \
 451           if (a == EOF)                                            \
 452             goto str_eof;                                          \
 453           if (terminator                                           \
 454               && (a == (scm_t_wchar) terminator)                   \
 455               && (i > 0))                                          \
 456             break;                                                 \
 457           if ('0' <= a && a <= '9')                                \
 458             a -= '0';                                              \
 459           else if ('A' <= a && a <= 'F')                           \
 460             a = a - 'A' + 10;                                      \
 461           else if ('a' <= a && a <= 'f')                           \
 462             a = a - 'a' + 10;                                      \
 463           else                                                     \
 464             {                                                      \
 465               c = a;                                               \
 466               goto bad_escaped;                                    \
 467             }                                                      \
 468           c = c * 16 + a;                                          \
 469           i ++;                                                    \
 470         }                                                          \
 471     } while (0)
 472
 473 static void
 474 skip_intraline_whitespace (SCM port)
 475 {
 476   scm_t_wchar c;
 477
 478   do
 479     {
 480       c = scm_getc (port);
 481       if (c == EOF)
 482         return;
 483     }
 484   while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
 485
 486   scm_ungetc (c, port);
 487 }
 488
 489 static SCM
 490 scm_read_string (int chr, SCM port)
 491 #define FUNC_NAME "scm_lreadr"
 492 {
 493   /* For strings smaller than C_STR, this function creates only one Scheme
 494      object (the string returned).  */
 495
 496   SCM str = SCM_BOOL_F;
 497   unsigned c_str_len = 0;
 498   scm_t_wchar c;
 499
 500   str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
 501   while ('"' != (c = scm_getc (port)))
 502     {
 503       if (c == EOF)
 504         {
 505         str_eof:
 506           scm_i_input_error (FUNC_NAME, port,
 507                              "end of file in string constant", SCM_EOL);
 508         }
 509
 510       if (c_str_len + 1 >= scm_i_string_length (str))
 511         {
 512           SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
 513
 514           str = scm_string_append (scm_list_2 (str, addy));
 515         }
 516
 517       if (c == '\\')
 518         {
 519           switch (c = scm_getc (port))
 520             {
 521             case EOF:
 522               goto str_eof;
 523             case '"':
 524             case '\\':
 525               break;
 526             case '\n':
 527               if (SCM_HUNGRY_EOL_ESCAPES_P)
 528                 skip_intraline_whitespace (port);
 529               continue;
 530             case '0':
 531               c = '\0';
 532               break;
 533             case 'f':
 534               c = '\f';
 535               break;
 536             case 'n':
 537               c = '\n';
 538               break;
 539             case 'r':
 540               c = '\r';
 541               break;
 542             case 't':
 543               c = '\t';
 544               break;
 545             case 'a':
 546               c = '\007';
 547               break;
 548             case 'v':
 549               c = '\v';
 550               break;
 551             case 'b':
 552               c = '\010';
 553               break;
 554             case 'x':
 555               if (SCM_R6RS_ESCAPES_P)
 556                 SCM_READ_HEX_ESCAPE (10, ';');
 557               else
 558                 SCM_READ_HEX_ESCAPE (2, '\0');
 559               break;
 560             case 'u':
 561               if (!SCM_R6RS_ESCAPES_P)
 562                 {
 563                   SCM_READ_HEX_ESCAPE (4, '\0');
 564                   break;
 565                 }
 566             case 'U':
 567               if (!SCM_R6RS_ESCAPES_P)
 568                 {
 569                   SCM_READ_HEX_ESCAPE (6, '\0');
 570                   break;
 571                 }
 572             default:
 573             bad_escaped:
 574               scm_i_input_error (FUNC_NAME, port,
 575                                  "illegal character in escape sequence: ~S",
 576                                  scm_list_1 (SCM_MAKE_CHAR (c)));
 577             }
 578         }
 579       str = scm_i_string_start_writing (str);
 580       scm_i_string_set_x (str, c_str_len++, c);
 581       scm_i_string_stop_writing ();
 582     }
 583   return scm_i_substring_copy (str, 0, c_str_len);
 584 }
 585 #undef FUNC_NAME
 586
 587
 588 static SCM
 589 scm_read_number (scm_t_wchar chr, SCM port)
 590 {
 591   SCM result, str = SCM_EOL;
 592   char buffer[READER_BUFFER_SIZE];
 593   char *overflow_buffer = NULL;
 594   size_t bytes_read;
 595   int overflow;
 596   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 597
 598   scm_ungetc (chr, port);
 599   overflow = read_complete_token (port, buffer, sizeof (buffer),
 600                                   &overflow_buffer, &bytes_read);
 601
 602   if (!overflow)
 603     str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
 604   else
 605     str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
 606                             pt->ilseq_handler);
 607
 608   result = scm_string_to_number (str, SCM_UNDEFINED);
 609   if (!scm_is_true (result))
 610     {
 611       /* Return a symbol instead of a number */
 612       if (SCM_CASE_INSENSITIVE_P)
 613         str = scm_string_downcase_x (str);
 614       result = scm_string_to_symbol (str);
 615     }
 616
 617   if (overflow)
 618     free (overflow_buffer);
 619   SCM_COL (port) += scm_i_string_length (str);
 620   return result;
 621 }
 622
 623 static SCM
 624 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
 625 {
 626   SCM result;
 627   int ends_with_colon = 0;
 628   size_t bytes_read;
 629   int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
 630   int overflow;
 631   char buffer[READER_BUFFER_SIZE], *overflow_buffer;
 632   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 633   SCM str;
 634
 635   scm_ungetc (chr, port);
 636   overflow = read_complete_token (port, buffer, READER_BUFFER_SIZE,
 637                                   &overflow_buffer, &bytes_read);
 638   if (bytes_read > 0)
 639     {
 640       if (!overflow)
 641         ends_with_colon = buffer[bytes_read - 1] == ':';
 642       else
 643         ends_with_colon = overflow_buffer[bytes_read - 1] == ':';
 644     }
 645
 646   if (postfix && ends_with_colon && (bytes_read > 1))
 647     {
 648       if (!overflow)
 649         str = scm_from_stringn (buffer, bytes_read - 1, pt->encoding, pt->ilseq_handler);
 650       else
 651         str = scm_from_stringn (overflow_buffer, bytes_read - 1, pt->encoding,
 652                                 pt->ilseq_handler);
 653
 654       if (SCM_CASE_INSENSITIVE_P)
 655         str = scm_string_downcase_x (str);
 656       result = scm_symbol_to_keyword (scm_string_to_symbol (str));
 657     }
 658   else
 659     {
 660       if (!overflow)
 661         str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
 662       else
 663         str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
 664                                 pt->ilseq_handler);
 665
 666       if (SCM_CASE_INSENSITIVE_P)
 667         str = scm_string_downcase_x (str);
 668       result = scm_string_to_symbol (str);
 669     }
 670
 671   if (overflow)
 672     free (overflow_buffer);
 673   SCM_COL (port) += scm_i_string_length (str);
 674   return result;
 675 }
 676
 677 static SCM
 678 scm_read_number_and_radix (scm_t_wchar chr, SCM port)
 679 #define FUNC_NAME "scm_lreadr"
 680 {
 681   SCM result;
 682   size_t read;
 683   char buffer[READER_BUFFER_SIZE], *overflow_buffer;
 684   int overflow;
 685   unsigned int radix;
 686   SCM str;
 687   scm_t_port *pt;
 688
 689   switch (chr)
 690     {
 691     case 'B':
 692     case 'b':
 693       radix = 2;
 694       break;
 695
 696     case 'o':
 697     case 'O':
 698       radix = 8;
 699       break;
 700
 701     case 'd':
 702     case 'D':
 703       radix = 10;
 704       break;
 705
 706     case 'x':
 707     case 'X':
 708       radix = 16;
 709       break;
 710
 711     default:
 712       scm_ungetc (chr, port);
 713       scm_ungetc ('#', port);
 714       radix = 10;
 715     }
 716
 717   overflow = read_complete_token (port, buffer, sizeof (buffer),
 718                                   &overflow_buffer, &read);
 719
 720   pt = SCM_PTAB_ENTRY (port);
 721   if (!overflow)
 722     str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
 723   else
 724     str = scm_from_stringn (overflow_buffer, read, pt->encoding,
 725                             pt->ilseq_handler);
 726
 727   result = scm_string_to_number (str, scm_from_uint (radix));
 728
 729   if (overflow)
 730     free (overflow_buffer);
 731
 732   SCM_COL (port) += scm_i_string_length (str);
 733
 734   if (scm_is_true (result))
 735     return result;
 736
 737   scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
 738
 739   return SCM_BOOL_F;
 740 }
 741 #undef FUNC_NAME
 742
 743 static SCM
 744 scm_read_quote (int chr, SCM port)
 745 {
 746   SCM p;
 747   long line = SCM_LINUM (port);
 748   int column = SCM_COL (port) - 1;
 749
 750   switch (chr)
 751     {
 752     case '`':
 753       p = scm_sym_quasiquote;
 754       break;
 755
 756     case '\'':
 757       p = scm_sym_quote;
 758       break;
 759
 760     case ',':
 761       {
 762         scm_t_wchar c;
 763
 764         c = scm_getc (port);
 765         if ('@' == c)
 766           p = scm_sym_uq_splicing;
 767         else
 768           {
 769             scm_ungetc (c, port);
 770             p = scm_sym_unquote;
 771           }
 772         break;
 773       }
 774
 775     default:
 776       fprintf (stderr, "%s: unhandled quote character (%i)\n",
 777                "scm_read_quote", chr);
 778       abort ();
 779     }
 780
 781   p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
 782   return maybe_annotate_source (p, port, line, column);
 783 }
 784
 785 SCM_SYMBOL (sym_syntax, "syntax");
 786 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
 787 SCM_SYMBOL (sym_unsyntax, "unsyntax");
 788 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
 789
 790 static SCM
 791 scm_read_syntax (int chr, SCM port)
 792 {
 793   SCM p;
 794   long line = SCM_LINUM (port);
 795   int column = SCM_COL (port) - 1;
 796
 797   switch (chr)
 798     {
 799     case '`':
 800       p = sym_quasisyntax;
 801       break;
 802
 803     case '\'':
 804       p = sym_syntax;
 805       break;
 806
 807     case ',':
 808       {
 809         int c;
 810
 811         c = scm_getc (port);
 812         if ('@' == c)
 813           p = sym_unsyntax_splicing;
 814         else
 815           {
 816             scm_ungetc (c, port);
 817             p = sym_unsyntax;
 818           }
 819         break;
 820       }
 821
 822     default:
 823       fprintf (stderr, "%s: unhandled syntax character (%i)\n",
 824                "scm_read_syntax", chr);
 825       abort ();
 826     }
 827
 828   p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
 829   return maybe_annotate_source (p, port, line, column);
 830 }
 831
 832 static SCM
 833 scm_read_nil (int chr, SCM port)
 834 {
 835   SCM id = scm_read_mixed_case_symbol (chr, port);
 836
 837   if (!scm_is_eq (id, sym_nil))
 838     scm_i_input_error ("scm_read_nil", port,
 839                        "unexpected input while reading #nil: ~a",
 840                        scm_list_1 (id));
 841
 842   return SCM_ELISP_NIL;
 843 }
 844
 845 static SCM
 846 scm_read_semicolon_comment (int chr, SCM port)
 847 {
 848   int c;
 849
 850   /* We use the get_byte here because there is no need to get the
 851      locale correct with comment input. This presumes that newline
 852      always represents itself no matter what the encoding is.  */
 853   for (c = scm_get_byte_or_eof (port);
 854        (c != EOF) && (c != '\n');
 855        c = scm_get_byte_or_eof (port));
 856
 857   return SCM_UNSPECIFIED;
 858 }
 859
 860 \f
 861 /* Sharp readers, i.e. readers called after a `#' sign has been read.  */
 862
 863 static SCM
 864 scm_read_boolean (int chr, SCM port)
 865 {
 866   switch (chr)
 867     {
 868     case 't':
 869     case 'T':
 870       return SCM_BOOL_T;
 871
 872     case 'f':
 873     case 'F':
 874       return SCM_BOOL_F;
 875     }
 876
 877   return SCM_UNSPECIFIED;
 878 }
 879
 880 static SCM
 881 scm_read_character (scm_t_wchar chr, SCM port)
 882 #define FUNC_NAME "scm_lreadr"
 883 {
 884   char buffer[READER_CHAR_NAME_MAX_SIZE];
 885   SCM charname;
 886   size_t charname_len, bytes_read;
 887   scm_t_wchar cp;
 888   int overflow;
 889   scm_t_port *pt;
 890
 891   overflow = read_token (port, buffer, READER_CHAR_NAME_MAX_SIZE, &bytes_read);
 892   if (overflow)
 893     scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
 894
 895   if (bytes_read == 0)
 896     {
 897       chr = scm_getc (port);
 898       if (chr == EOF)
 899         scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
 900                            "while reading character", SCM_EOL);
 901
 902       /* CHR must be a token delimiter, like a whitespace.  */
 903       return (SCM_MAKE_CHAR (chr));
 904     }
 905
 906   pt = SCM_PTAB_ENTRY (port);
 907
 908   /* Simple ASCII characters can be processed immediately.  Also, simple
 909      ISO-8859-1 characters can be processed immediately if the encoding for this
 910      port is ISO-8859-1.  */
 911   if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
 912     {
 913       SCM_COL (port) += 1;
 914       return SCM_MAKE_CHAR (buffer[0]);
 915     }
 916
 917   /* Otherwise, convert the buffer into a proper scheme string for
 918      processing.  */
 919   charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
 920                                pt->ilseq_handler);
 921   charname_len = scm_i_string_length (charname);
 922   SCM_COL (port) += charname_len;
 923   cp = scm_i_string_ref (charname, 0);
 924   if (charname_len == 1)
 925     return SCM_MAKE_CHAR (cp);
 926
 927   /* Ignore dotted circles, which may be used to keep combining characters from
 928      combining with the backslash in #\charname.  */
 929   if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
 930     return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
 931
 932   if (cp >= '0' && cp < '8')
 933     {
 934       /* Dirk:FIXME::  This type of character syntax is not R5RS
 935        * compliant.  Further, it should be verified that the constant
 936        * does only consist of octal digits.  */
 937       SCM p = scm_string_to_number (charname, scm_from_uint (8));
 938       if (SCM_I_INUMP (p))
 939         {
 940           scm_t_wchar c = scm_to_uint32 (p);
 941           if (SCM_IS_UNICODE_CHAR (c))
 942             return SCM_MAKE_CHAR (c);
 943           else
 944             scm_i_input_error (FUNC_NAME, port,
 945                                "out-of-range octal character escape: ~a",
 946                                scm_list_1 (charname));
 947         }
 948     }
 949
 950   if (cp == 'x' && (charname_len > 1))
 951     {
 952       SCM p;
 953
 954       /* Convert from hex, skipping the initial 'x' character in CHARNAME */
 955       p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
 956                                 scm_from_uint (16));
 957       if (SCM_I_INUMP (p))
 958         {
 959           scm_t_wchar c = scm_to_uint32 (p);
 960           if (SCM_IS_UNICODE_CHAR (c))
 961             return SCM_MAKE_CHAR (c);
 962           else
 963             scm_i_input_error (FUNC_NAME, port,
 964                                "out-of-range hex character escape: ~a",
 965                                scm_list_1 (charname));
 966         }
 967     }
 968
 969   /* The names of characters should never have non-Latin1
 970      characters.  */
 971   if (scm_i_is_narrow_string (charname)
 972       || scm_i_try_narrow_string (charname))
 973     { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
 974                                        charname_len);
 975       if (scm_is_true (ch))
 976         return ch;
 977     }
 978
 979   scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
 980                      scm_list_1 (charname));
 981
 982   return SCM_UNSPECIFIED;
 983 }
 984 #undef FUNC_NAME
 985
 986 static SCM
 987 scm_read_keyword (int chr, SCM port)
 988 {
 989   SCM symbol;
 990
 991   /* Read the symbol that comprises the keyword.  Doing this instead of
 992      invoking a specific symbol reader function allows `scm_read_keyword ()'
 993      to adapt to the delimiters currently valid of symbols.
 994
 995      XXX: This implementation allows sloppy syntaxes like `#:  key'.  */
 996   symbol = scm_read_expression (port);
 997   if (!scm_is_symbol (symbol))
 998     scm_i_input_error ("scm_read_keyword", port,
 999                        "keyword prefix `~a' not followed by a symbol: ~s",
1000                        scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1001
1002   return (scm_symbol_to_keyword (symbol));
1003 }
1004
1005 static SCM
1006 scm_read_vector (int chr, SCM port)
1007 {
1008   /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1009      guarantee that it's going to do what we want.  After all, this is an
1010      implementation detail of `scm_read_vector ()', not a desirable
1011      property.  */
1012   return (scm_vector (scm_read_sexp (chr, port)));
1013 }
1014
1015 static SCM
1016 scm_read_srfi4_vector (int chr, SCM port)
1017 {
1018   return scm_i_read_array (port, chr);
1019 }
1020
1021 static SCM
1022 scm_read_bytevector (scm_t_wchar chr, SCM port)
1023 {
1024   chr = scm_getc (port);
1025   if (chr != 'u')
1026     goto syntax;
1027
1028   chr = scm_getc (port);
1029   if (chr != '8')
1030     goto syntax;
1031
1032   chr = scm_getc (port);
1033   if (chr != '(')
1034     goto syntax;
1035
1036   return scm_u8_list_to_bytevector (scm_read_sexp (chr, port));
1037
1038  syntax:
1039   scm_i_input_error ("read_bytevector", port,
1040                      "invalid bytevector prefix",
1041                      SCM_MAKE_CHAR (chr));
1042   return SCM_UNSPECIFIED;
1043 }
1044
1045 static SCM
1046 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port)
1047 {
1048   /* Read the `#*10101'-style read syntax for bit vectors in Guile.  This is
1049      terribly inefficient but who cares?  */
1050   SCM s_bits = SCM_EOL;
1051
1052   for (chr = scm_getc (port);
1053        (chr != EOF) && ((chr == '0') || (chr == '1'));
1054        chr = scm_getc (port))
1055     {
1056       s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1057     }
1058
1059   if (chr != EOF)
1060     scm_ungetc (chr, port);
1061
1062   return scm_bitvector (scm_reverse_x (s_bits, SCM_EOL));
1063 }
1064
1065 static SCM
1066 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1067 {
1068   int bang_seen = 0;
1069
1070   for (;;)
1071     {
1072       int c = scm_getc (port);
1073
1074       if (c == EOF)
1075         scm_i_input_error ("skip_block_comment", port,
1076                            "unterminated `#! ... !#' comment", SCM_EOL);
1077
1078       if (c == '!')
1079         bang_seen = 1;
1080       else if (c == '#' && bang_seen)
1081         break;
1082       else
1083         bang_seen = 0;
1084     }
1085
1086   return SCM_UNSPECIFIED;
1087 }
1088
1089 static SCM
1090 scm_read_shebang (scm_t_wchar chr, SCM port)
1091 {
1092   int c = 0;
1093   if ((c = scm_get_byte_or_eof (port)) != 'r')
1094     {
1095       scm_ungetc (c, port);
1096       return scm_read_scsh_block_comment (chr, port);
1097     }
1098   if ((c = scm_get_byte_or_eof (port)) != '6')
1099     {
1100       scm_ungetc (c, port);
1101       scm_ungetc ('r', port);
1102       return scm_read_scsh_block_comment (chr, port);
1103     }
1104   if ((c = scm_get_byte_or_eof (port)) != 'r')
1105     {
1106       scm_ungetc (c, port);
1107       scm_ungetc ('6', port);
1108       scm_ungetc ('r', port);
1109       return scm_read_scsh_block_comment (chr, port);
1110     }
1111   if ((c = scm_get_byte_or_eof (port)) != 's')
1112     {
1113       scm_ungetc (c, port);
1114       scm_ungetc ('r', port);
1115       scm_ungetc ('6', port);
1116       scm_ungetc ('r', port);
1117       return scm_read_scsh_block_comment (chr, port);
1118     }
1119
1120   return SCM_UNSPECIFIED;
1121 }
1122
1123 static SCM
1124 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1125 {
1126   /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1127      nested.  So care must be taken.  */
1128   int nesting_level = 1;
1129
1130   int a = scm_getc (port);
1131
1132   if (a == EOF)
1133     scm_i_input_error ("scm_read_r6rs_block_comment", port,
1134                        "unterminated `#| ... |#' comment", SCM_EOL);
1135
1136   while (nesting_level > 0)
1137     {
1138       int b = scm_getc (port);
1139
1140       if (b == EOF)
1141         scm_i_input_error ("scm_read_r6rs_block_comment", port,
1142                            "unterminated `#| ... |#' comment", SCM_EOL);
1143
1144       if (a == '|' && b == '#')
1145         {
1146           nesting_level--;
1147           b = EOF;
1148         }
1149       else if (a == '#' && b == '|')
1150         {
1151           nesting_level++;
1152           b = EOF;
1153         }
1154
1155       a = b;
1156     }
1157
1158   return SCM_UNSPECIFIED;
1159 }
1160
1161 static SCM
1162 scm_read_commented_expression (scm_t_wchar chr, SCM port)
1163 {
1164   scm_t_wchar c;
1165
1166   c = flush_ws (port, (char *) NULL);
1167   if (EOF == c)
1168     scm_i_input_error ("read_commented_expression", port,
1169                        "no expression after #; comment", SCM_EOL);
1170   scm_ungetc (c, port);
1171   scm_read_expression (port);
1172   return SCM_UNSPECIFIED;
1173 }
1174
1175 static SCM
1176 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1177 {
1178   /* Guile's extended symbol read syntax looks like this:
1179
1180        #{This is all a symbol name}#
1181
1182      So here, CHR is expected to be `{'.  */
1183   int saw_brace = 0;
1184   size_t len = 0;
1185   SCM buf = scm_i_make_string (1024, NULL, 0);
1186
1187   buf = scm_i_string_start_writing (buf);
1188
1189   while ((chr = scm_getc (port)) != EOF)
1190     {
1191       if (saw_brace)
1192         {
1193           if (chr == '#')
1194             {
1195               break;
1196             }
1197           else
1198             {
1199               saw_brace = 0;
1200               scm_i_string_set_x (buf, len++, '}');
1201             }
1202         }
1203
1204       if (chr == '}')
1205         saw_brace = 1;
1206       else if (chr == '\\')
1207         {
1208           /* It used to be that print.c would print extended-read-syntax
1209              symbols with backslashes before "non-standard" chars, but
1210              this routine wouldn't do anything with those escapes.
1211              Bummer.  What we've done is to change print.c to output
1212              R6RS hex escapes for those characters, relying on the fact
1213              that the extended read syntax would never put a `\' before
1214              an `x'.  For now, we just ignore other instances of
1215              backslash in the string.  */
1216           switch ((chr = scm_getc (port)))
1217             {
1218             case EOF:
1219               goto done;
1220             case 'x':
1221               {
1222                 scm_t_wchar c;
1223
1224                 SCM_READ_HEX_ESCAPE (10, ';');
1225                 scm_i_string_set_x (buf, len++, c);
1226                 break;
1227
1228               str_eof:
1229                 chr = EOF;
1230                 goto done;
1231
1232               bad_escaped:
1233                 scm_i_string_stop_writing ();
1234                 scm_i_input_error ("scm_read_extended_symbol", port,
1235                                    "illegal character in escape sequence: ~S",
1236                                    scm_list_1 (SCM_MAKE_CHAR (c)));
1237                 break;
1238               }
1239             default:
1240               scm_i_string_set_x (buf, len++, chr);
1241               break;
1242             }
1243         }
1244       else
1245         scm_i_string_set_x (buf, len++, chr);
1246
1247       if (len >= scm_i_string_length (buf) - 2)
1248         {
1249           SCM addy;
1250
1251           scm_i_string_stop_writing ();
1252           addy = scm_i_make_string (1024, NULL, 0);
1253           buf = scm_string_append (scm_list_2 (buf, addy));
1254           len = 0;
1255           buf = scm_i_string_start_writing (buf);
1256         }
1257     }
1258
1259  done:
1260   scm_i_string_stop_writing ();
1261   if (chr == EOF)
1262     scm_i_input_error ("scm_read_extended_symbol", port,
1263                        "end of file while reading symbol", SCM_EOL);
1264
1265   return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1266 }
1267
1268
1269 \f
1270 /* Top-level token readers, i.e., dispatchers.  */
1271
1272 static SCM
1273 scm_read_sharp_extension (int chr, SCM port)
1274 {
1275   SCM proc;
1276
1277   proc = scm_get_hash_procedure (chr);
1278   if (scm_is_true (scm_procedure_p (proc)))
1279     {
1280       long line = SCM_LINUM (port);
1281       int column = SCM_COL (port) - 2;
1282       SCM got;
1283
1284       got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1285
1286       if (scm_is_pair (got) && !scm_i_has_source_properties (got))
1287         scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1288
1289       return got;
1290     }
1291
1292   return SCM_UNSPECIFIED;
1293 }
1294
1295 /* The reader for the sharp `#' character.  It basically dispatches reads
1296    among the above token readers.   */
1297 static SCM
1298 scm_read_sharp (scm_t_wchar chr, SCM port)
1299 #define FUNC_NAME "scm_lreadr"
1300 {
1301   SCM result;
1302
1303   chr = scm_getc (port);
1304
1305   result = scm_read_sharp_extension (chr, port);
1306   if (!scm_is_eq (result, SCM_UNSPECIFIED))
1307     return result;
1308
1309   switch (chr)
1310     {
1311     case '\\':
1312       return (scm_read_character (chr, port));
1313     case '(':
1314       return (scm_read_vector (chr, port));
1315     case 's':
1316     case 'u':
1317     case 'f':
1318     case 'c':
1319       /* This one may return either a boolean or an SRFI-4 vector.  */
1320       return (scm_read_srfi4_vector (chr, port));
1321     case 'v':
1322       return (scm_read_bytevector (chr, port));
1323     case '*':
1324       return (scm_read_guile_bit_vector (chr, port));
1325     case 't':
1326     case 'T':
1327     case 'F':
1328       return (scm_read_boolean (chr, port));
1329     case ':':
1330       return (scm_read_keyword (chr, port));
1331     case '0': case '1': case '2': case '3': case '4':
1332     case '5': case '6': case '7': case '8': case '9':
1333     case '@':
1334 #if SCM_ENABLE_DEPRECATED
1335       /* See below for 'i' and 'e'. */
1336     case 'a':
1337     case 'y':
1338     case 'h':
1339     case 'l':
1340 #endif
1341       return (scm_i_read_array (port, chr));
1342
1343     case 'i':
1344     case 'e':
1345 #if SCM_ENABLE_DEPRECATED
1346       {
1347         /* When next char is '(', it really is an old-style
1348            uniform array. */
1349         scm_t_wchar next_c = scm_getc (port);
1350         if (next_c != EOF)
1351           scm_ungetc (next_c, port);
1352         if (next_c == '(')
1353           return scm_i_read_array (port, chr);
1354         /* Fall through. */
1355       }
1356 #endif
1357     case 'b':
1358     case 'B':
1359     case 'o':
1360     case 'O':
1361     case 'd':
1362     case 'D':
1363     case 'x':
1364     case 'X':
1365     case 'I':
1366     case 'E':
1367       return (scm_read_number_and_radix (chr, port));
1368     case '{':
1369       return (scm_read_extended_symbol (chr, port));
1370     case '!':
1371       return (scm_read_shebang (chr, port));
1372     case ';':
1373       return (scm_read_commented_expression (chr, port));
1374     case '`':
1375     case '\'':
1376     case ',':
1377       return (scm_read_syntax (chr, port));
1378     case 'n':
1379       return (scm_read_nil (chr, port));
1380     default:
1381       result = scm_read_sharp_extension (chr, port);
1382       if (scm_is_eq (result, SCM_UNSPECIFIED))
1383         {
1384           /* To remain compatible with 1.8 and earlier, the following
1385              characters have lower precedence than `read-hash-extend'
1386              characters.  */
1387           switch (chr)
1388             {
1389             case '|':
1390               return scm_read_r6rs_block_comment (chr, port);
1391             default:
1392               scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1393                                  scm_list_1 (SCM_MAKE_CHAR (chr)));
1394             }
1395         }
1396       else
1397         return result;
1398     }
1399
1400   return SCM_UNSPECIFIED;
1401 }
1402 #undef FUNC_NAME
1403
1404 static SCM
1405 scm_read_expression (SCM port)
1406 #define FUNC_NAME "scm_read_expression"
1407 {
1408   while (1)
1409     {
1410       scm_t_wchar chr;
1411
1412       chr = scm_getc (port);
1413
1414       switch (chr)
1415         {
1416         case SCM_WHITE_SPACES:
1417         case SCM_LINE_INCREMENTORS:
1418           break;
1419         case ';':
1420           (void) scm_read_semicolon_comment (chr, port);
1421           break;
1422         case '[':
1423           if (!SCM_SQUARE_BRACKETS_P)
1424             return (scm_read_mixed_case_symbol (chr, port));
1425           /* otherwise fall through */
1426         case '(':
1427           return (scm_read_sexp (chr, port));
1428         case '"':
1429           return (scm_read_string (chr, port));
1430         case '\'':
1431         case '`':
1432         case ',':
1433           return (scm_read_quote (chr, port));
1434         case '#':
1435           {
1436             SCM result;
1437             result = scm_read_sharp (chr, port);
1438             if (scm_is_eq (result, SCM_UNSPECIFIED))
1439               /* We read a comment or some such.  */
1440               break;
1441             else
1442               return result;
1443           }
1444         case ')':
1445           scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1446           break;
1447         case ']':
1448           if (SCM_SQUARE_BRACKETS_P)
1449             scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1450           /* otherwise fall through */
1451         case EOF:
1452           return SCM_EOF_VAL;
1453         case ':':
1454           if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1455             return scm_symbol_to_keyword (scm_read_expression (port));
1456           /* Fall through.  */
1457
1458         default:
1459           {
1460             if (((chr >= '0') && (chr <= '9'))
1461                 || (strchr ("+-.", chr)))
1462               return (scm_read_number (chr, port));
1463             else
1464               return (scm_read_mixed_case_symbol (chr, port));
1465           }
1466         }
1467     }
1468 }
1469 #undef FUNC_NAME
1470
1471 \f
1472 /* Actual reader.  */
1473
1474 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1475             (SCM port),
1476             "Read an s-expression from the input port @var{port}, or from\n"
1477             "the current input port if @var{port} is not specified.\n"
1478             "Any whitespace before the next token is discarded.")
1479 #define FUNC_NAME s_scm_read
1480 {
1481   int c;
1482
1483   if (SCM_UNBNDP (port))
1484     port = scm_current_input_port ();
1485   SCM_VALIDATE_OPINPORT (1, port);
1486
1487   c = flush_ws (port, (char *) NULL);
1488   if (EOF == c)
1489     return SCM_EOF_VAL;
1490   scm_ungetc (c, port);
1491
1492   return (scm_read_expression (port));
1493 }
1494 #undef FUNC_NAME
1495
1496
1497 \f
1498
1499 /* Manipulate the read-hash-procedures alist.  This could be written in
1500    Scheme, but maybe it will also be used by C code during initialisation.  */
1501 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1502             (SCM chr, SCM proc),
1503             "Install the procedure @var{proc} for reading expressions\n"
1504             "starting with the character sequence @code{#} and @var{chr}.\n"
1505             "@var{proc} will be called with two arguments:  the character\n"
1506             "@var{chr} and the port to read further data from. The object\n"
1507             "returned will be the return value of @code{read}. \n"
1508             "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1509             )
1510 #define FUNC_NAME s_scm_read_hash_extend
1511 {
1512   SCM this;
1513   SCM prev;
1514
1515   SCM_VALIDATE_CHAR (1, chr);
1516   SCM_ASSERT (scm_is_false (proc)
1517               || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1518               proc, SCM_ARG2, FUNC_NAME);
1519
1520   /* Check if chr is already in the alist.  */
1521   this = scm_i_read_hash_procedures_ref ();
1522   prev = SCM_BOOL_F;
1523   while (1)
1524     {
1525       if (scm_is_null (this))
1526         {
1527           /* not found, so add it to the beginning.  */
1528           if (scm_is_true (proc))
1529             {
1530               SCM new = scm_cons (scm_cons (chr, proc),
1531                                   scm_i_read_hash_procedures_ref ());
1532               scm_i_read_hash_procedures_set_x (new);
1533             }
1534           break;
1535         }
1536       if (scm_is_eq (chr, SCM_CAAR (this)))
1537         {
1538           /* already in the alist.  */
1539           if (scm_is_false (proc))
1540             {
1541               /* remove it.  */
1542               if (scm_is_false (prev))
1543                 {
1544                   SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1545                   scm_i_read_hash_procedures_set_x (rest);
1546                 }
1547               else
1548                 scm_set_cdr_x (prev, SCM_CDR (this));
1549             }
1550           else
1551             {
1552               /* replace it.  */
1553               scm_set_cdr_x (SCM_CAR (this), proc);
1554             }
1555           break;
1556         }
1557       prev = this;
1558       this = SCM_CDR (this);
1559     }
1560
1561   return SCM_UNSPECIFIED;
1562 }
1563 #undef FUNC_NAME
1564
1565 /* Recover the read-hash procedure corresponding to char c.  */
1566 static SCM
1567 scm_get_hash_procedure (int c)
1568 {
1569   SCM rest = scm_i_read_hash_procedures_ref ();
1570
1571   while (1)
1572     {
1573       if (scm_is_null (rest))
1574         return SCM_BOOL_F;
1575
1576       if (SCM_CHAR (SCM_CAAR (rest)) == c)
1577         return SCM_CDAR (rest);
1578
1579       rest = SCM_CDR (rest);
1580     }
1581 }
1582
1583 #define SCM_ENCODING_SEARCH_SIZE (500)
1584
1585 /* Search the first few hundred characters of a file for an Emacs-like coding
1586    declaration.  Returns either NULL or a string whose storage has been
1587    allocated with `scm_gc_malloc ()'.  */
1588 char *
1589 scm_i_scan_for_encoding (SCM port)
1590 {
1591   scm_t_port *pt;
1592   char header[SCM_ENCODING_SEARCH_SIZE+1];
1593   size_t bytes_read, encoding_length, i;
1594   char *encoding = NULL;
1595   int utf8_bom = 0;
1596   char *pos, *encoding_start;
1597   int in_comment;
1598
1599   pt = SCM_PTAB_ENTRY (port);
1600
1601   if (pt->rw_active == SCM_PORT_WRITE)
1602     scm_flush (port);
1603
1604   if (pt->rw_random)
1605     pt->rw_active = SCM_PORT_READ;
1606
1607   if (pt->read_pos == pt->read_end)
1608     {
1609       /* We can use the read buffer, and thus avoid a seek. */
1610       if (scm_fill_input (port) == EOF)
1611         return NULL;
1612
1613       bytes_read = pt->read_end - pt->read_pos;
1614       if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
1615         bytes_read = SCM_ENCODING_SEARCH_SIZE;
1616
1617       if (bytes_read <= 1)
1618         /* An unbuffered port -- don't scan.  */
1619         return NULL;
1620
1621       memcpy (header, pt->read_pos, bytes_read);
1622       header[bytes_read] = '\0';
1623     }
1624   else
1625     {
1626       /* Try to read some bytes and then seek back.  Not all ports
1627          support seeking back; and indeed some file ports (like
1628          /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
1629          check performed by SCM_FPORT_FDES---but fail to seek
1630          backwards.  Hence this block comes second.  We prefer to use
1631          the read buffer in-place.  */
1632       if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1633         return NULL;
1634
1635       bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1636       header[bytes_read] = '\0';
1637       scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1638     }
1639
1640   if (bytes_read > 3
1641       && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1642     utf8_bom = 1;
1643
1644   /* search past "coding[:=]" */
1645   pos = header;
1646   while (1)
1647     {
1648       if ((pos = strstr(pos, "coding")) == NULL)
1649         return NULL;
1650
1651       pos += strlen("coding");
1652       if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1653           (*pos == ':' || *pos == '='))
1654         {
1655           pos ++;
1656           break;
1657         }
1658     }
1659
1660   /* skip spaces */
1661   while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1662          (*pos == ' ' || *pos == '\t'))
1663     pos ++;
1664
1665   /* grab the next token */
1666   encoding_start = pos;
1667   i = 0;
1668   while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
1669          && encoding_start + i - header < bytes_read
1670          && (isalnum ((int) encoding_start[i])
1671              || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
1672     i++;
1673
1674   encoding_length = i;
1675   if (encoding_length == 0)
1676     return NULL;
1677
1678   encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
1679   for (i = 0; i < encoding_length; i++)
1680     encoding[i] = toupper ((int) encoding[i]);
1681
1682   /* push backwards to make sure we were in a comment */
1683   in_comment = 0;
1684   pos = encoding_start;
1685   while (pos >= header)
1686     {
1687       if (*pos == ';')
1688         {
1689           in_comment = 1;
1690           break;
1691         }
1692       else if (*pos == '\n' || pos == header)
1693         {
1694           /* This wasn't in a semicolon comment. Check for a
1695            hash-bang comment. */
1696           char *beg = strstr (header, "#!");
1697           char *end = strstr (header, "!#");
1698           if (beg < encoding_start && encoding_start + encoding_length <= end)
1699             in_comment = 1;
1700           break;
1701         }
1702       else
1703         {
1704           pos --;
1705           continue;
1706         }
1707     }
1708   if (!in_comment)
1709     /* This wasn't in a comment */
1710     return NULL;
1711
1712   if (utf8_bom && strcmp(encoding, "UTF-8"))
1713     scm_misc_error (NULL,
1714                     "the port input declares the encoding ~s but is encoded as UTF-8",
1715                     scm_list_1 (scm_from_locale_string (encoding)));
1716
1717   return encoding;
1718 }
1719
1720 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1721             (SCM port),
1722             "Scans the port for an Emacs-like character coding declaration\n"
1723             "near the top of the contents of a port with random-accessible contents.\n"
1724             "The coding declaration is of the form\n"
1725             "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1726             "\n"
1727             "Returns a string containing the character encoding of the file\n"
1728             "if a declaration was found, or @code{#f} otherwise.\n")
1729 #define FUNC_NAME s_scm_file_encoding
1730 {
1731   char *enc;
1732   SCM s_enc;
1733
1734   SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
1735
1736   enc = scm_i_scan_for_encoding (port);
1737   if (enc == NULL)
1738     return SCM_BOOL_F;
1739   else
1740     {
1741       s_enc = scm_from_locale_string (enc);
1742       return s_enc;
1743     }
1744
1745   return SCM_BOOL_F;
1746 }
1747 #undef FUNC_NAME
1748
1749 void
1750 scm_init_read ()
1751 {
1752   SCM read_hash_procs;
1753
1754   read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
1755
1756   scm_i_read_hash_procedures =
1757     SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
1758
1759   scm_init_opts (scm_read_options, scm_read_opts);
1760 #include "libguile/read.x"
1761 }
1762
1763 /*
1764   Local Variables:
1765   c-file-style: "gnu"
1766   End:
1767 */