libguile/read.c

   1 /* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
   2  * Foundation, Inc.
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public License
   6  * as published by the Free Software Foundation; either version 3 of
   7  * the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful, but
  10  * WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  17  * 02110-1301 USA
  18  */
  19
  20
  21 \f
  22
  23 #ifdef HAVE_CONFIG_H
  24 # include <config.h>
  25 #endif
  26
  27 #include <stdio.h>
  28 #include <ctype.h>
  29 #include <string.h>
  30 #include <unistd.h>
  31 #include <unicase.h>
  32 #include <unictype.h>
  33
  34 #include "libguile/_scm.h"
  35 #include "libguile/bytevectors.h"
  36 #include "libguile/chars.h"
  37 #include "libguile/eval.h"
  38 #include "libguile/arrays.h"
  39 #include "libguile/bitvectors.h"
  40 #include "libguile/keywords.h"
  41 #include "libguile/alist.h"
  42 #include "libguile/srcprop.h"
  43 #include "libguile/hashtab.h"
  44 #include "libguile/hash.h"
  45 #include "libguile/ports.h"
  46 #include "libguile/fports.h"
  47 #include "libguile/root.h"
  48 #include "libguile/strings.h"
  49 #include "libguile/strports.h"
  50 #include "libguile/vectors.h"
  51 #include "libguile/validate.h"
  52 #include "libguile/srfi-4.h"
  53 #include "libguile/srfi-13.h"
  54
  55 #include "libguile/read.h"
  56 #include "libguile/private-options.h"
  57
  58
  59 \f
  60
  61 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
  62 SCM_SYMBOL (scm_keyword_prefix, "prefix");
  63 SCM_SYMBOL (scm_keyword_postfix, "postfix");
  64 SCM_SYMBOL (sym_nil, "nil");
  65
  66 scm_t_option scm_read_opts[] = {
  67   { SCM_OPTION_BOOLEAN, "copy", 0,
  68     "Copy source code expressions." },
  69   { SCM_OPTION_BOOLEAN, "positions", 1,
  70     "Record positions of source code expressions." },
  71   { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
  72     "Convert symbols to lower case."},
  73   { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
  74     "Style of keyword recognition: #f, 'prefix or 'postfix."},
  75   { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
  76     "Use R6RS variable-length character and string hex escapes."},
  77   { SCM_OPTION_BOOLEAN, "square-brackets", 1,
  78     "Treat `[' and `]' as parentheses, for R6RS compatibility."},
  79   { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
  80     "In strings, consume leading whitespace after an escaped end-of-line."},
  81   { 0, },
  82 };
  83
  84 /*
  85   Give meaningful error messages for errors
  86
  87   We use the format
  88
  89   FILE:LINE:COL: MESSAGE
  90   This happened in ....
  91
  92   This is not standard GNU format, but the test-suite likes the real
  93   message to be in front.
  94
  95  */
  96
  97
  98 void
  99 scm_i_input_error (char const *function,
 100                    SCM port, const char *message, SCM arg)
 101 {
 102   SCM fn = (scm_is_string (SCM_FILENAME(port))
 103             ? SCM_FILENAME(port)
 104             : scm_from_locale_string ("#<unknown port>"));
 105
 106   SCM string_port = scm_open_output_string ();
 107   SCM string = SCM_EOL;
 108   scm_simple_format (string_port,
 109                      scm_from_locale_string ("~A:~S:~S: ~A"),
 110                      scm_list_4 (fn,
 111                                  scm_from_long (SCM_LINUM (port) + 1),
 112                                  scm_from_int (SCM_COL (port) + 1),
 113                                  scm_from_locale_string (message)));
 114
 115   string = scm_get_output_string (string_port);
 116   scm_close_output_port (string_port);
 117   scm_error_scm (scm_from_latin1_symbol ("read-error"),
 118                  function? scm_from_locale_string (function) : SCM_BOOL_F,
 119                  string,
 120                  arg,
 121                  SCM_BOOL_F);
 122 }
 123
 124
 125 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
 126             (SCM setting),
 127             "Option interface for the read options. Instead of using\n"
 128             "this procedure directly, use the procedures @code{read-enable},\n"
 129             "@code{read-disable}, @code{read-set!} and @code{read-options}.")
 130 #define FUNC_NAME s_scm_read_options
 131 {
 132   SCM ans = scm_options (setting,
 133                          scm_read_opts,
 134                          FUNC_NAME);
 135   if (SCM_COPY_SOURCE_P)
 136     SCM_RECORD_POSITIONS_P = 1;
 137   return ans;
 138 }
 139 #undef FUNC_NAME
 140
 141 /* A fluid referring to an association list mapping extra hash
 142    characters to procedures.  */
 143 static SCM *scm_i_read_hash_procedures;
 144
 145 static inline SCM
 146 scm_i_read_hash_procedures_ref (void)
 147 {
 148   return scm_fluid_ref (*scm_i_read_hash_procedures);
 149 }
 150
 151 static inline void
 152 scm_i_read_hash_procedures_set_x (SCM value)
 153 {
 154   scm_fluid_set_x (*scm_i_read_hash_procedures, value);
 155 }
 156
 157 \f
 158 /* Token readers.  */
 159
 160
 161 /* Size of the C buffer used to read symbols and numbers.  */
 162 #define READER_BUFFER_SIZE            128
 163
 164 /* Size of the C buffer used to read strings.  */
 165 #define READER_STRING_BUFFER_SIZE     512
 166
 167 /* The maximum size of Scheme character names.  */
 168 #define READER_CHAR_NAME_MAX_SIZE      50
 169
 170
 171 /* `isblank' is only in C99.  */
 172 #define CHAR_IS_BLANK_(_chr)                                    \
 173   (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n')      \
 174    || ((_chr) == '\f') || ((_chr) == '\r'))
 175
 176 #ifdef MSDOS
 177 # define CHAR_IS_BLANK(_chr)                    \
 178   ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
 179 #else
 180 # define CHAR_IS_BLANK CHAR_IS_BLANK_
 181 #endif
 182
 183
 184 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
 185    structure'').  */
 186 #define CHAR_IS_R5RS_DELIMITER(c)                               \
 187   (CHAR_IS_BLANK (c)                                            \
 188    || (c == ')') || (c == '(') || (c == ';') || (c == '"')      \
 189    || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
 190
 191 #define CHAR_IS_DELIMITER  CHAR_IS_R5RS_DELIMITER
 192
 193 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
 194    Structure''.  */
 195 #define CHAR_IS_EXPONENT_MARKER(_chr)                           \
 196   (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f')        \
 197    || ((_chr) == 'd') || ((_chr) == 'l'))
 198
 199 /* Read an SCSH block comment.  */
 200 static inline SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
 201 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
 202 static SCM scm_read_commented_expression (scm_t_wchar, SCM);
 203 static SCM scm_read_shebang (scm_t_wchar, SCM);
 204 static SCM scm_get_hash_procedure (int);
 205
 206 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 207    result in the pre-allocated buffer BUF.  Return zero if the whole token has
 208    fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
 209    bytes actually read.  */
 210 static inline int
 211 read_token (SCM port, char *buf, const size_t buf_size, size_t *read)
 212  {
 213    *read = 0;
 214
 215    while (*read < buf_size)
 216      {
 217        int chr;
 218
 219        chr = scm_get_byte_or_eof (port);
 220
 221        if (chr == EOF)
 222         return 0;
 223       else if (CHAR_IS_DELIMITER (chr))
 224         {
 225           scm_unget_byte (chr, port);
 226           return 0;
 227         }
 228       else
 229         {
 230           *buf = (char) chr;
 231           buf++, (*read)++;
 232         }
 233      }
 234
 235    return 1;
 236  }
 237
 238 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 239    result in the pre-allocated buffer BUFFER, if the whole token has fewer than
 240    BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
 241    caller.  Return zero if the token fits in BUFFER, non-zero otherwise. READ
 242    will be set the number of bytes actually read.  */
 243 static int
 244 read_complete_token (SCM port, char *buffer, const size_t buffer_size,
 245                            char **overflow_buffer, size_t *read)
 246 {
 247   int overflow = 0;
 248   size_t bytes_read, overflow_size;
 249
 250   *overflow_buffer = NULL;
 251   overflow_size = 0;
 252
 253   do
 254     {
 255       overflow = read_token (port, buffer, buffer_size, &bytes_read);
 256       if (bytes_read == 0)
 257         break;
 258       if (overflow || overflow_size != 0)
 259         {
 260           if (overflow_size == 0)
 261             {
 262               *overflow_buffer = scm_malloc (bytes_read);
 263               memcpy (*overflow_buffer, buffer, bytes_read);
 264               overflow_size = bytes_read;
 265             }
 266           else
 267             {
 268               *overflow_buffer = scm_realloc (*overflow_buffer, overflow_size + bytes_read);
 269               memcpy (*overflow_buffer + overflow_size, buffer, bytes_read);
 270               overflow_size += bytes_read;
 271             }
 272         }
 273     }
 274   while (overflow);
 275
 276   if (overflow_size)
 277     *read = overflow_size;
 278   else
 279     *read = bytes_read;
 280
 281   return (overflow_size != 0);
 282 }
 283
 284 /* Skip whitespace from PORT and return the first non-whitespace character
 285    read.  Raise an error on end-of-file.  */
 286 static int
 287 flush_ws (SCM port, const char *eoferr)
 288 {
 289   register scm_t_wchar c;
 290   while (1)
 291     switch (c = scm_getc (port))
 292       {
 293       case EOF:
 294       goteof:
 295         if (eoferr)
 296           {
 297             scm_i_input_error (eoferr,
 298                                port,
 299                                "end of file",
 300                                SCM_EOL);
 301           }
 302         return c;
 303
 304       case ';':
 305       lp:
 306         switch (c = scm_getc (port))
 307           {
 308           case EOF:
 309             goto goteof;
 310           default:
 311             goto lp;
 312           case SCM_LINE_INCREMENTORS:
 313             break;
 314           }
 315         break;
 316
 317       case '#':
 318         switch (c = scm_getc (port))
 319           {
 320           case EOF:
 321             eoferr = "read_sharp";
 322             goto goteof;
 323           case '!':
 324             scm_read_shebang (c, port);
 325             break;
 326           case ';':
 327             scm_read_commented_expression (c, port);
 328             break;
 329           case '|':
 330             if (scm_is_false (scm_get_hash_procedure (c)))
 331               {
 332                 scm_read_r6rs_block_comment (c, port);
 333                 break;
 334               }
 335             /* fall through */
 336           default:
 337             scm_ungetc (c, port);
 338             return '#';
 339           }
 340         break;
 341
 342       case SCM_LINE_INCREMENTORS:
 343       case SCM_SINGLE_SPACES:
 344       case '\t':
 345         break;
 346
 347       default:
 348         return c;
 349       }
 350
 351   return 0;
 352 }
 353
 354
 355 \f
 356 /* Token readers.  */
 357
 358 static SCM scm_read_expression (SCM port);
 359 static SCM scm_read_sharp (int chr, SCM port);
 360
 361
 362 static SCM
 363 scm_read_sexp (scm_t_wchar chr, SCM port)
 364 #define FUNC_NAME "scm_i_lreadparen"
 365 {
 366   int c;
 367   SCM tmp, tl, ans = SCM_EOL;
 368   const int terminating_char = ((chr == '[') ? ']' : ')');
 369
 370   /* Need to capture line and column numbers here. */
 371   long line = SCM_LINUM (port);
 372   int column = SCM_COL (port) - 1;
 373
 374   c = flush_ws (port, FUNC_NAME);
 375   if (terminating_char == c)
 376     return SCM_EOL;
 377
 378   scm_ungetc (c, port);
 379   tmp = scm_read_expression (port);
 380
 381   /* Note that it is possible for scm_read_expression to return
 382      scm_sym_dot, but not as part of a dotted pair: as in #{.}#.  So
 383      check that it's a real dot by checking `c'.  */
 384   if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
 385     {
 386       ans = scm_read_expression (port);
 387       if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
 388         scm_i_input_error (FUNC_NAME, port, "missing close paren",
 389                            SCM_EOL);
 390       return ans;
 391     }
 392
 393   /* Build the head of the list structure. */
 394   ans = tl = scm_cons (tmp, SCM_EOL);
 395
 396   while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
 397     {
 398       SCM new_tail;
 399
 400       if (c == ')' || (SCM_SQUARE_BRACKETS_P && c == ']'))
 401         scm_i_input_error (FUNC_NAME, port,
 402                            "in pair: mismatched close paren: ~A",
 403                            scm_list_1 (SCM_MAKE_CHAR (c)));
 404
 405       scm_ungetc (c, port);
 406       tmp = scm_read_expression (port);
 407
 408       /* See above note about scm_sym_dot.  */
 409       if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
 410         {
 411           SCM_SETCDR (tl, scm_read_expression (port));
 412
 413           c = flush_ws (port, FUNC_NAME);
 414           if (terminating_char != c)
 415             scm_i_input_error (FUNC_NAME, port,
 416                                "in pair: missing close paren", SCM_EOL);
 417           goto exit;
 418         }
 419
 420       new_tail = scm_cons (tmp, SCM_EOL);
 421       SCM_SETCDR (tl, new_tail);
 422       tl = new_tail;
 423     }
 424
 425  exit:
 426   if (SCM_RECORD_POSITIONS_P)
 427     scm_i_set_source_properties_x (ans, line, column, SCM_FILENAME (port));
 428
 429   return ans;
 430 }
 431 #undef FUNC_NAME
 432
 433
 434 /* Read a hexadecimal number NDIGITS in length.  Put its value into the variable
 435    C.  If TERMINATOR is non-null, terminate early if the TERMINATOR character is
 436    found.  */
 437 #define SCM_READ_HEX_ESCAPE(ndigits, terminator)                   \
 438   do                                                               \
 439     {                                                              \
 440       scm_t_wchar a;                                               \
 441       size_t i = 0;                                                \
 442       c = 0;                                                       \
 443       while (i < ndigits)                                          \
 444         {                                                          \
 445           a = scm_getc (port);                                     \
 446           if (a == EOF)                                            \
 447             goto str_eof;                                          \
 448           if (terminator                                           \
 449               && (a == (scm_t_wchar) terminator)                   \
 450               && (i > 0))                                          \
 451             break;                                                 \
 452           if ('0' <= a && a <= '9')                                \
 453             a -= '0';                                              \
 454           else if ('A' <= a && a <= 'F')                           \
 455             a = a - 'A' + 10;                                      \
 456           else if ('a' <= a && a <= 'f')                           \
 457             a = a - 'a' + 10;                                      \
 458           else                                                     \
 459             {                                                      \
 460               c = a;                                               \
 461               goto bad_escaped;                                    \
 462             }                                                      \
 463           c = c * 16 + a;                                          \
 464           i ++;                                                    \
 465         }                                                          \
 466     } while (0)
 467
 468 static void
 469 skip_intraline_whitespace (SCM port)
 470 {
 471   scm_t_wchar c;
 472
 473   do
 474     {
 475       c = scm_getc (port);
 476       if (c == EOF)
 477         return;
 478     }
 479   while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
 480
 481   scm_ungetc (c, port);
 482 }
 483
 484 static SCM
 485 scm_read_string (int chr, SCM port)
 486 #define FUNC_NAME "scm_lreadr"
 487 {
 488   /* For strings smaller than C_STR, this function creates only one Scheme
 489      object (the string returned).  */
 490
 491   SCM str = SCM_BOOL_F;
 492   unsigned c_str_len = 0;
 493   scm_t_wchar c;
 494
 495   str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
 496   while ('"' != (c = scm_getc (port)))
 497     {
 498       if (c == EOF)
 499         {
 500         str_eof:
 501           scm_i_input_error (FUNC_NAME, port,
 502                              "end of file in string constant", SCM_EOL);
 503         }
 504
 505       if (c_str_len + 1 >= scm_i_string_length (str))
 506         {
 507           SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
 508
 509           str = scm_string_append (scm_list_2 (str, addy));
 510         }
 511
 512       if (c == '\\')
 513         {
 514           switch (c = scm_getc (port))
 515             {
 516             case EOF:
 517               goto str_eof;
 518             case '"':
 519             case '\\':
 520               break;
 521             case '\n':
 522               if (SCM_HUNGRY_EOL_ESCAPES_P)
 523                 skip_intraline_whitespace (port);
 524               continue;
 525             case '0':
 526               c = '\0';
 527               break;
 528             case 'f':
 529               c = '\f';
 530               break;
 531             case 'n':
 532               c = '\n';
 533               break;
 534             case 'r':
 535               c = '\r';
 536               break;
 537             case 't':
 538               c = '\t';
 539               break;
 540             case 'a':
 541               c = '\007';
 542               break;
 543             case 'v':
 544               c = '\v';
 545               break;
 546             case 'b':
 547               c = '\010';
 548               break;
 549             case 'x':
 550               if (SCM_R6RS_ESCAPES_P)
 551                 SCM_READ_HEX_ESCAPE (10, ';');
 552               else
 553                 SCM_READ_HEX_ESCAPE (2, '\0');
 554               break;
 555             case 'u':
 556               if (!SCM_R6RS_ESCAPES_P)
 557                 {
 558                   SCM_READ_HEX_ESCAPE (4, '\0');
 559                   break;
 560                 }
 561             case 'U':
 562               if (!SCM_R6RS_ESCAPES_P)
 563                 {
 564                   SCM_READ_HEX_ESCAPE (6, '\0');
 565                   break;
 566                 }
 567             default:
 568             bad_escaped:
 569               scm_i_input_error (FUNC_NAME, port,
 570                                  "illegal character in escape sequence: ~S",
 571                                  scm_list_1 (SCM_MAKE_CHAR (c)));
 572             }
 573         }
 574       str = scm_i_string_start_writing (str);
 575       scm_i_string_set_x (str, c_str_len++, c);
 576       scm_i_string_stop_writing ();
 577     }
 578
 579   if (c_str_len > 0)
 580     {
 581       return scm_i_substring_copy (str, 0, c_str_len);
 582     }
 583
 584   return scm_nullstr;
 585 }
 586 #undef FUNC_NAME
 587
 588
 589 static SCM
 590 scm_read_number (scm_t_wchar chr, SCM port)
 591 {
 592   SCM result, str = SCM_EOL;
 593   char buffer[READER_BUFFER_SIZE];
 594   char *overflow_buffer = NULL;
 595   size_t bytes_read;
 596   int overflow;
 597   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 598
 599   scm_ungetc (chr, port);
 600   overflow = read_complete_token (port, buffer, sizeof (buffer),
 601                                   &overflow_buffer, &bytes_read);
 602
 603   if (!overflow)
 604     str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
 605   else
 606     str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
 607                             pt->ilseq_handler);
 608
 609   result = scm_string_to_number (str, SCM_UNDEFINED);
 610   if (!scm_is_true (result))
 611     {
 612       /* Return a symbol instead of a number */
 613       if (SCM_CASE_INSENSITIVE_P)
 614         str = scm_string_downcase_x (str);
 615       result = scm_string_to_symbol (str);
 616     }
 617
 618   if (overflow)
 619     free (overflow_buffer);
 620   SCM_COL (port) += scm_i_string_length (str);
 621   return result;
 622 }
 623
 624 static SCM
 625 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
 626 {
 627   SCM result;
 628   int ends_with_colon = 0;
 629   size_t bytes_read;
 630   int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
 631   int overflow;
 632   char buffer[READER_BUFFER_SIZE], *overflow_buffer;
 633   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 634   SCM str;
 635
 636   scm_ungetc (chr, port);
 637   overflow = read_complete_token (port, buffer, READER_BUFFER_SIZE,
 638                                   &overflow_buffer, &bytes_read);
 639   if (bytes_read > 0)
 640     {
 641       if (!overflow)
 642         ends_with_colon = buffer[bytes_read - 1] == ':';
 643       else
 644         ends_with_colon = overflow_buffer[bytes_read - 1] == ':';
 645     }
 646
 647   if (postfix && ends_with_colon && (bytes_read > 1))
 648     {
 649       if (!overflow)
 650         str = scm_from_stringn (buffer, bytes_read - 1, pt->encoding, pt->ilseq_handler);
 651       else
 652         str = scm_from_stringn (overflow_buffer, bytes_read - 1, pt->encoding,
 653                                 pt->ilseq_handler);
 654
 655       if (SCM_CASE_INSENSITIVE_P)
 656         str = scm_string_downcase_x (str);
 657       result = scm_symbol_to_keyword (scm_string_to_symbol (str));
 658     }
 659   else
 660     {
 661       if (!overflow)
 662         str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
 663       else
 664         str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
 665                                 pt->ilseq_handler);
 666
 667       if (SCM_CASE_INSENSITIVE_P)
 668         str = scm_string_downcase_x (str);
 669       result = scm_string_to_symbol (str);
 670     }
 671
 672   if (overflow)
 673     free (overflow_buffer);
 674   SCM_COL (port) += scm_i_string_length (str);
 675   return result;
 676 }
 677
 678 static SCM
 679 scm_read_number_and_radix (scm_t_wchar chr, SCM port)
 680 #define FUNC_NAME "scm_lreadr"
 681 {
 682   SCM result;
 683   size_t read;
 684   char buffer[READER_BUFFER_SIZE], *overflow_buffer;
 685   int overflow;
 686   unsigned int radix;
 687   SCM str;
 688   scm_t_port *pt;
 689
 690   switch (chr)
 691     {
 692     case 'B':
 693     case 'b':
 694       radix = 2;
 695       break;
 696
 697     case 'o':
 698     case 'O':
 699       radix = 8;
 700       break;
 701
 702     case 'd':
 703     case 'D':
 704       radix = 10;
 705       break;
 706
 707     case 'x':
 708     case 'X':
 709       radix = 16;
 710       break;
 711
 712     default:
 713       scm_ungetc (chr, port);
 714       scm_ungetc ('#', port);
 715       radix = 10;
 716     }
 717
 718   overflow = read_complete_token (port, buffer, sizeof (buffer),
 719                                   &overflow_buffer, &read);
 720
 721   pt = SCM_PTAB_ENTRY (port);
 722   if (!overflow)
 723     str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
 724   else
 725     str = scm_from_stringn (overflow_buffer, read, pt->encoding,
 726                             pt->ilseq_handler);
 727
 728   result = scm_string_to_number (str, scm_from_uint (radix));
 729
 730   if (overflow)
 731     free (overflow_buffer);
 732
 733   SCM_COL (port) += scm_i_string_length (str);
 734
 735   if (scm_is_true (result))
 736     return result;
 737
 738   scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
 739
 740   return SCM_BOOL_F;
 741 }
 742 #undef FUNC_NAME
 743
 744 static SCM
 745 scm_read_quote (int chr, SCM port)
 746 {
 747   SCM p;
 748   long line = SCM_LINUM (port);
 749   int column = SCM_COL (port) - 1;
 750
 751   switch (chr)
 752     {
 753     case '`':
 754       p = scm_sym_quasiquote;
 755       break;
 756
 757     case '\'':
 758       p = scm_sym_quote;
 759       break;
 760
 761     case ',':
 762       {
 763         scm_t_wchar c;
 764
 765         c = scm_getc (port);
 766         if ('@' == c)
 767           p = scm_sym_uq_splicing;
 768         else
 769           {
 770             scm_ungetc (c, port);
 771             p = scm_sym_unquote;
 772           }
 773         break;
 774       }
 775
 776     default:
 777       fprintf (stderr, "%s: unhandled quote character (%i)\n",
 778                "scm_read_quote", chr);
 779       abort ();
 780     }
 781
 782   p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
 783   if (SCM_RECORD_POSITIONS_P)
 784     scm_i_set_source_properties_x (p, line, column, SCM_FILENAME (port));
 785
 786   return p;
 787 }
 788
 789 SCM_SYMBOL (sym_syntax, "syntax");
 790 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
 791 SCM_SYMBOL (sym_unsyntax, "unsyntax");
 792 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
 793
 794 static SCM
 795 scm_read_syntax (int chr, SCM port)
 796 {
 797   SCM p;
 798   long line = SCM_LINUM (port);
 799   int column = SCM_COL (port) - 1;
 800
 801   switch (chr)
 802     {
 803     case '`':
 804       p = sym_quasisyntax;
 805       break;
 806
 807     case '\'':
 808       p = sym_syntax;
 809       break;
 810
 811     case ',':
 812       {
 813         int c;
 814
 815         c = scm_getc (port);
 816         if ('@' == c)
 817           p = sym_unsyntax_splicing;
 818         else
 819           {
 820             scm_ungetc (c, port);
 821             p = sym_unsyntax;
 822           }
 823         break;
 824       }
 825
 826     default:
 827       fprintf (stderr, "%s: unhandled syntax character (%i)\n",
 828                "scm_read_syntax", chr);
 829       abort ();
 830     }
 831
 832   p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
 833   if (SCM_RECORD_POSITIONS_P)
 834     scm_i_set_source_properties_x (p, line, column, SCM_FILENAME (port));
 835
 836   return p;
 837 }
 838
 839 static inline SCM
 840 scm_read_nil (int chr, SCM port)
 841 {
 842   SCM id = scm_read_mixed_case_symbol (chr, port);
 843
 844   if (!scm_is_eq (id, sym_nil))
 845     scm_i_input_error ("scm_read_nil", port,
 846                        "unexpected input while reading #nil: ~a",
 847                        scm_list_1 (id));
 848
 849   return SCM_ELISP_NIL;
 850 }
 851
 852 static inline SCM
 853 scm_read_semicolon_comment (int chr, SCM port)
 854 {
 855   int c;
 856
 857   /* We use the get_byte here because there is no need to get the
 858      locale correct with comment input. This presumes that newline
 859      always represents itself no matter what the encoding is.  */
 860   for (c = scm_get_byte_or_eof (port);
 861        (c != EOF) && (c != '\n');
 862        c = scm_get_byte_or_eof (port));
 863
 864   return SCM_UNSPECIFIED;
 865 }
 866
 867 \f
 868 /* Sharp readers, i.e. readers called after a `#' sign has been read.  */
 869
 870 static SCM
 871 scm_read_boolean (int chr, SCM port)
 872 {
 873   switch (chr)
 874     {
 875     case 't':
 876     case 'T':
 877       return SCM_BOOL_T;
 878
 879     case 'f':
 880     case 'F':
 881       return SCM_BOOL_F;
 882     }
 883
 884   return SCM_UNSPECIFIED;
 885 }
 886
 887 static SCM
 888 scm_read_character (scm_t_wchar chr, SCM port)
 889 #define FUNC_NAME "scm_lreadr"
 890 {
 891   char buffer[READER_CHAR_NAME_MAX_SIZE];
 892   SCM charname;
 893   size_t charname_len, bytes_read;
 894   scm_t_wchar cp;
 895   int overflow;
 896   scm_t_port *pt;
 897
 898   overflow = read_token (port, buffer, READER_CHAR_NAME_MAX_SIZE, &bytes_read);
 899   if (overflow)
 900     scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
 901
 902   if (bytes_read == 0)
 903     {
 904       chr = scm_getc (port);
 905       if (chr == EOF)
 906         scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
 907                            "while reading character", SCM_EOL);
 908
 909       /* CHR must be a token delimiter, like a whitespace.  */
 910       return (SCM_MAKE_CHAR (chr));
 911     }
 912
 913   pt = SCM_PTAB_ENTRY (port);
 914
 915   /* Simple ASCII characters can be processed immediately.  Also, simple
 916      ISO-8859-1 characters can be processed immediately if the encoding for this
 917      port is ISO-8859-1.  */
 918   if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
 919     {
 920       SCM_COL (port) += 1;
 921       return SCM_MAKE_CHAR (buffer[0]);
 922     }
 923
 924   /* Otherwise, convert the buffer into a proper scheme string for
 925      processing.  */
 926   charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
 927                                pt->ilseq_handler);
 928   charname_len = scm_i_string_length (charname);
 929   SCM_COL (port) += charname_len;
 930   cp = scm_i_string_ref (charname, 0);
 931   if (charname_len == 1)
 932     return SCM_MAKE_CHAR (cp);
 933
 934   /* Ignore dotted circles, which may be used to keep combining characters from
 935      combining with the backslash in #\charname.  */
 936   if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
 937     return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
 938
 939   if (cp >= '0' && cp < '8')
 940     {
 941       /* Dirk:FIXME::  This type of character syntax is not R5RS
 942        * compliant.  Further, it should be verified that the constant
 943        * does only consist of octal digits.  */
 944       SCM p = scm_string_to_number (charname, scm_from_uint (8));
 945       if (SCM_I_INUMP (p))
 946         {
 947           scm_t_wchar c = scm_to_uint32 (p);
 948           if (SCM_IS_UNICODE_CHAR (c))
 949             return SCM_MAKE_CHAR (c);
 950           else
 951             scm_i_input_error (FUNC_NAME, port,
 952                                "out-of-range octal character escape: ~a",
 953                                scm_list_1 (charname));
 954         }
 955     }
 956
 957   if (cp == 'x' && (charname_len > 1))
 958     {
 959       SCM p;
 960
 961       /* Convert from hex, skipping the initial 'x' character in CHARNAME */
 962       p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
 963                                 scm_from_uint (16));
 964       if (SCM_I_INUMP (p))
 965         {
 966           scm_t_wchar c = scm_to_uint32 (p);
 967           if (SCM_IS_UNICODE_CHAR (c))
 968             return SCM_MAKE_CHAR (c);
 969           else
 970             scm_i_input_error (FUNC_NAME, port,
 971                                "out-of-range hex character escape: ~a",
 972                                scm_list_1 (charname));
 973         }
 974     }
 975
 976   /* The names of characters should never have non-Latin1
 977      characters.  */
 978   if (scm_i_is_narrow_string (charname)
 979       || scm_i_try_narrow_string (charname))
 980     { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
 981                                        charname_len);
 982       if (scm_is_true (ch))
 983         return ch;
 984     }
 985
 986   scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
 987                      scm_list_1 (charname));
 988
 989   return SCM_UNSPECIFIED;
 990 }
 991 #undef FUNC_NAME
 992
 993 static inline SCM
 994 scm_read_keyword (int chr, SCM port)
 995 {
 996   SCM symbol;
 997
 998   /* Read the symbol that comprises the keyword.  Doing this instead of
 999      invoking a specific symbol reader function allows `scm_read_keyword ()'
1000      to adapt to the delimiters currently valid of symbols.
1001
1002      XXX: This implementation allows sloppy syntaxes like `#:  key'.  */
1003   symbol = scm_read_expression (port);
1004   if (!scm_is_symbol (symbol))
1005     scm_i_input_error ("scm_read_keyword", port,
1006                        "keyword prefix `~a' not followed by a symbol: ~s",
1007                        scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1008
1009   return (scm_symbol_to_keyword (symbol));
1010 }
1011
1012 static inline SCM
1013 scm_read_vector (int chr, SCM port)
1014 {
1015   /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1016      guarantee that it's going to do what we want.  After all, this is an
1017      implementation detail of `scm_read_vector ()', not a desirable
1018      property.  */
1019   return (scm_vector (scm_read_sexp (chr, port)));
1020 }
1021
1022 static inline SCM
1023 scm_read_srfi4_vector (int chr, SCM port)
1024 {
1025   return scm_i_read_array (port, chr);
1026 }
1027
1028 static SCM
1029 scm_read_bytevector (scm_t_wchar chr, SCM port)
1030 {
1031   chr = scm_getc (port);
1032   if (chr != 'u')
1033     goto syntax;
1034
1035   chr = scm_getc (port);
1036   if (chr != '8')
1037     goto syntax;
1038
1039   chr = scm_getc (port);
1040   if (chr != '(')
1041     goto syntax;
1042
1043   return scm_u8_list_to_bytevector (scm_read_sexp (chr, port));
1044
1045  syntax:
1046   scm_i_input_error ("read_bytevector", port,
1047                      "invalid bytevector prefix",
1048                      SCM_MAKE_CHAR (chr));
1049   return SCM_UNSPECIFIED;
1050 }
1051
1052 static SCM
1053 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port)
1054 {
1055   /* Read the `#*10101'-style read syntax for bit vectors in Guile.  This is
1056      terribly inefficient but who cares?  */
1057   SCM s_bits = SCM_EOL;
1058
1059   for (chr = scm_getc (port);
1060        (chr != EOF) && ((chr == '0') || (chr == '1'));
1061        chr = scm_getc (port))
1062     {
1063       s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1064     }
1065
1066   if (chr != EOF)
1067     scm_ungetc (chr, port);
1068
1069   return scm_bitvector (scm_reverse_x (s_bits, SCM_EOL));
1070 }
1071
1072 static inline SCM
1073 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1074 {
1075   int bang_seen = 0;
1076
1077   for (;;)
1078     {
1079       int c = scm_getc (port);
1080
1081       if (c == EOF)
1082         scm_i_input_error ("skip_block_comment", port,
1083                            "unterminated `#! ... !#' comment", SCM_EOL);
1084
1085       if (c == '!')
1086         bang_seen = 1;
1087       else if (c == '#' && bang_seen)
1088         break;
1089       else
1090         bang_seen = 0;
1091     }
1092
1093   return SCM_UNSPECIFIED;
1094 }
1095
1096 static SCM
1097 scm_read_shebang (scm_t_wchar chr, SCM port)
1098 {
1099   int c = 0;
1100   if ((c = scm_get_byte_or_eof (port)) != 'r')
1101     {
1102       scm_ungetc (c, port);
1103       return scm_read_scsh_block_comment (chr, port);
1104     }
1105   if ((c = scm_get_byte_or_eof (port)) != '6')
1106     {
1107       scm_ungetc (c, port);
1108       scm_ungetc ('r', port);
1109       return scm_read_scsh_block_comment (chr, port);
1110     }
1111   if ((c = scm_get_byte_or_eof (port)) != 'r')
1112     {
1113       scm_ungetc (c, port);
1114       scm_ungetc ('6', port);
1115       scm_ungetc ('r', port);
1116       return scm_read_scsh_block_comment (chr, port);
1117     }
1118   if ((c = scm_get_byte_or_eof (port)) != 's')
1119     {
1120       scm_ungetc (c, port);
1121       scm_ungetc ('r', port);
1122       scm_ungetc ('6', port);
1123       scm_ungetc ('r', port);
1124       return scm_read_scsh_block_comment (chr, port);
1125     }
1126
1127   return SCM_UNSPECIFIED;
1128 }
1129
1130 static SCM
1131 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1132 {
1133   /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1134      nested.  So care must be taken.  */
1135   int nesting_level = 1;
1136   int opening_seen = 0, closing_seen = 0;
1137
1138   while (nesting_level > 0)
1139     {
1140       int c = scm_getc (port);
1141
1142       if (c == EOF)
1143         scm_i_input_error ("scm_read_r6rs_block_comment", port,
1144                            "unterminated `#| ... |#' comment", SCM_EOL);
1145
1146       if (opening_seen)
1147         {
1148           if (c == '|')
1149             nesting_level++;
1150           opening_seen = 0;
1151         }
1152       else if (closing_seen)
1153         {
1154           if (c == '#')
1155             nesting_level--;
1156           closing_seen = 0;
1157         }
1158       else if (c == '|')
1159         closing_seen = 1;
1160       else if (c == '#')
1161         opening_seen = 1;
1162       else
1163         opening_seen = closing_seen = 0;
1164     }
1165
1166   return SCM_UNSPECIFIED;
1167 }
1168
1169 static SCM
1170 scm_read_commented_expression (scm_t_wchar chr, SCM port)
1171 {
1172   scm_t_wchar c;
1173
1174   c = flush_ws (port, (char *) NULL);
1175   if (EOF == c)
1176     scm_i_input_error ("read_commented_expression", port,
1177                        "no expression after #; comment", SCM_EOL);
1178   scm_ungetc (c, port);
1179   scm_read_expression (port);
1180   return SCM_UNSPECIFIED;
1181 }
1182
1183 static SCM
1184 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1185 {
1186   /* Guile's extended symbol read syntax looks like this:
1187
1188        #{This is all a symbol name}#
1189
1190      So here, CHR is expected to be `{'.  */
1191   int saw_brace = 0;
1192   size_t len = 0;
1193   SCM buf = scm_i_make_string (1024, NULL, 0);
1194
1195   buf = scm_i_string_start_writing (buf);
1196
1197   while ((chr = scm_getc (port)) != EOF)
1198     {
1199       if (saw_brace)
1200         {
1201           if (chr == '#')
1202             {
1203               break;
1204             }
1205           else
1206             {
1207               saw_brace = 0;
1208               scm_i_string_set_x (buf, len++, '}');
1209             }
1210         }
1211
1212       if (chr == '}')
1213         saw_brace = 1;
1214       else if (chr == '\\')
1215         {
1216           /* It used to be that print.c would print extended-read-syntax
1217              symbols with backslashes before "non-standard" chars, but
1218              this routine wouldn't do anything with those escapes.
1219              Bummer.  What we've done is to change print.c to output
1220              R6RS hex escapes for those characters, relying on the fact
1221              that the extended read syntax would never put a `\' before
1222              an `x'.  For now, we just ignore other instances of
1223              backslash in the string.  */
1224           switch ((chr = scm_getc (port)))
1225             {
1226             case EOF:
1227               goto done;
1228             case 'x':
1229               {
1230                 scm_t_wchar c;
1231
1232                 SCM_READ_HEX_ESCAPE (10, ';');
1233                 scm_i_string_set_x (buf, len++, c);
1234                 break;
1235
1236               str_eof:
1237                 chr = EOF;
1238                 goto done;
1239
1240               bad_escaped:
1241                 scm_i_string_stop_writing ();
1242                 scm_i_input_error ("scm_read_extended_symbol", port,
1243                                    "illegal character in escape sequence: ~S",
1244                                    scm_list_1 (SCM_MAKE_CHAR (c)));
1245                 break;
1246               }
1247             default:
1248               scm_i_string_set_x (buf, len++, chr);
1249               break;
1250             }
1251         }
1252       else
1253         scm_i_string_set_x (buf, len++, chr);
1254
1255       if (len >= scm_i_string_length (buf) - 2)
1256         {
1257           SCM addy;
1258
1259           scm_i_string_stop_writing ();
1260           addy = scm_i_make_string (1024, NULL, 0);
1261           buf = scm_string_append (scm_list_2 (buf, addy));
1262           len = 0;
1263           buf = scm_i_string_start_writing (buf);
1264         }
1265     }
1266
1267  done:
1268   scm_i_string_stop_writing ();
1269   if (chr == EOF)
1270     scm_i_input_error ("scm_read_extended_symbol", port,
1271                        "end of file while reading symbol", SCM_EOL);
1272
1273   return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1274 }
1275
1276
1277 \f
1278 /* Top-level token readers, i.e., dispatchers.  */
1279
1280 static SCM
1281 scm_read_sharp_extension (int chr, SCM port)
1282 {
1283   SCM proc;
1284
1285   proc = scm_get_hash_procedure (chr);
1286   if (scm_is_true (scm_procedure_p (proc)))
1287     {
1288       long line = SCM_LINUM (port);
1289       int column = SCM_COL (port) - 2;
1290       SCM got;
1291
1292       got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1293
1294       if (scm_is_pair (got) && !scm_i_has_source_properties (got))
1295         scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1296
1297       return got;
1298     }
1299
1300   return SCM_UNSPECIFIED;
1301 }
1302
1303 /* The reader for the sharp `#' character.  It basically dispatches reads
1304    among the above token readers.   */
1305 static SCM
1306 scm_read_sharp (scm_t_wchar chr, SCM port)
1307 #define FUNC_NAME "scm_lreadr"
1308 {
1309   SCM result;
1310
1311   chr = scm_getc (port);
1312
1313   result = scm_read_sharp_extension (chr, port);
1314   if (!scm_is_eq (result, SCM_UNSPECIFIED))
1315     return result;
1316
1317   switch (chr)
1318     {
1319     case '\\':
1320       return (scm_read_character (chr, port));
1321     case '(':
1322       return (scm_read_vector (chr, port));
1323     case 's':
1324     case 'u':
1325     case 'f':
1326     case 'c':
1327       /* This one may return either a boolean or an SRFI-4 vector.  */
1328       return (scm_read_srfi4_vector (chr, port));
1329     case 'v':
1330       return (scm_read_bytevector (chr, port));
1331     case '*':
1332       return (scm_read_guile_bit_vector (chr, port));
1333     case 't':
1334     case 'T':
1335     case 'F':
1336       /* This one may return either a boolean or an SRFI-4 vector.  */
1337       return (scm_read_boolean (chr, port));
1338     case ':':
1339       return (scm_read_keyword (chr, port));
1340     case '0': case '1': case '2': case '3': case '4':
1341     case '5': case '6': case '7': case '8': case '9':
1342     case '@':
1343 #if SCM_ENABLE_DEPRECATED
1344       /* See below for 'i' and 'e'. */
1345     case 'a':
1346     case 'y':
1347     case 'h':
1348     case 'l':
1349 #endif
1350       return (scm_i_read_array (port, chr));
1351
1352     case 'i':
1353     case 'e':
1354 #if SCM_ENABLE_DEPRECATED
1355       {
1356         /* When next char is '(', it really is an old-style
1357            uniform array. */
1358         scm_t_wchar next_c = scm_getc (port);
1359         if (next_c != EOF)
1360           scm_ungetc (next_c, port);
1361         if (next_c == '(')
1362           return scm_i_read_array (port, chr);
1363         /* Fall through. */
1364       }
1365 #endif
1366     case 'b':
1367     case 'B':
1368     case 'o':
1369     case 'O':
1370     case 'd':
1371     case 'D':
1372     case 'x':
1373     case 'X':
1374     case 'I':
1375     case 'E':
1376       return (scm_read_number_and_radix (chr, port));
1377     case '{':
1378       return (scm_read_extended_symbol (chr, port));
1379     case '!':
1380       return (scm_read_shebang (chr, port));
1381     case ';':
1382       return (scm_read_commented_expression (chr, port));
1383     case '`':
1384     case '\'':
1385     case ',':
1386       return (scm_read_syntax (chr, port));
1387     case 'n':
1388       return (scm_read_nil (chr, port));
1389     default:
1390       result = scm_read_sharp_extension (chr, port);
1391       if (scm_is_eq (result, SCM_UNSPECIFIED))
1392         {
1393           /* To remain compatible with 1.8 and earlier, the following
1394              characters have lower precedence than `read-hash-extend'
1395              characters.  */
1396           switch (chr)
1397             {
1398             case '|':
1399               return scm_read_r6rs_block_comment (chr, port);
1400             default:
1401               scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1402                                  scm_list_1 (SCM_MAKE_CHAR (chr)));
1403             }
1404         }
1405       else
1406         return result;
1407     }
1408
1409   return SCM_UNSPECIFIED;
1410 }
1411 #undef FUNC_NAME
1412
1413 static SCM
1414 scm_read_expression (SCM port)
1415 #define FUNC_NAME "scm_read_expression"
1416 {
1417   while (1)
1418     {
1419       register scm_t_wchar chr;
1420
1421       chr = scm_getc (port);
1422
1423       switch (chr)
1424         {
1425         case SCM_WHITE_SPACES:
1426         case SCM_LINE_INCREMENTORS:
1427           break;
1428         case ';':
1429           (void) scm_read_semicolon_comment (chr, port);
1430           break;
1431         case '[':
1432           if (!SCM_SQUARE_BRACKETS_P)
1433             return (scm_read_mixed_case_symbol (chr, port));
1434           /* otherwise fall through */
1435         case '(':
1436           return (scm_read_sexp (chr, port));
1437         case '"':
1438           return (scm_read_string (chr, port));
1439         case '\'':
1440         case '`':
1441         case ',':
1442           return (scm_read_quote (chr, port));
1443         case '#':
1444           {
1445             SCM result;
1446             result = scm_read_sharp (chr, port);
1447             if (scm_is_eq (result, SCM_UNSPECIFIED))
1448               /* We read a comment or some such.  */
1449               break;
1450             else
1451               return result;
1452           }
1453         case ')':
1454           scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1455           break;
1456         case ']':
1457           if (SCM_SQUARE_BRACKETS_P)
1458             scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1459           /* otherwise fall through */
1460         case EOF:
1461           return SCM_EOF_VAL;
1462         case ':':
1463           if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1464             return scm_symbol_to_keyword (scm_read_expression (port));
1465           /* Fall through.  */
1466
1467         default:
1468           {
1469             if (((chr >= '0') && (chr <= '9'))
1470                 || (strchr ("+-.", chr)))
1471               return (scm_read_number (chr, port));
1472             else
1473               return (scm_read_mixed_case_symbol (chr, port));
1474           }
1475         }
1476     }
1477 }
1478 #undef FUNC_NAME
1479
1480 \f
1481 /* Actual reader.  */
1482
1483 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1484             (SCM port),
1485             "Read an s-expression from the input port @var{port}, or from\n"
1486             "the current input port if @var{port} is not specified.\n"
1487             "Any whitespace before the next token is discarded.")
1488 #define FUNC_NAME s_scm_read
1489 {
1490   int c;
1491
1492   if (SCM_UNBNDP (port))
1493     port = scm_current_input_port ();
1494   SCM_VALIDATE_OPINPORT (1, port);
1495
1496   c = flush_ws (port, (char *) NULL);
1497   if (EOF == c)
1498     return SCM_EOF_VAL;
1499   scm_ungetc (c, port);
1500
1501   return (scm_read_expression (port));
1502 }
1503 #undef FUNC_NAME
1504
1505
1506 \f
1507
1508 /* Manipulate the read-hash-procedures alist.  This could be written in
1509    Scheme, but maybe it will also be used by C code during initialisation.  */
1510 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1511             (SCM chr, SCM proc),
1512             "Install the procedure @var{proc} for reading expressions\n"
1513             "starting with the character sequence @code{#} and @var{chr}.\n"
1514             "@var{proc} will be called with two arguments:  the character\n"
1515             "@var{chr} and the port to read further data from. The object\n"
1516             "returned will be the return value of @code{read}. \n"
1517             "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1518             )
1519 #define FUNC_NAME s_scm_read_hash_extend
1520 {
1521   SCM this;
1522   SCM prev;
1523
1524   SCM_VALIDATE_CHAR (1, chr);
1525   SCM_ASSERT (scm_is_false (proc)
1526               || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1527               proc, SCM_ARG2, FUNC_NAME);
1528
1529   /* Check if chr is already in the alist.  */
1530   this = scm_i_read_hash_procedures_ref ();
1531   prev = SCM_BOOL_F;
1532   while (1)
1533     {
1534       if (scm_is_null (this))
1535         {
1536           /* not found, so add it to the beginning.  */
1537           if (scm_is_true (proc))
1538             {
1539               SCM new = scm_cons (scm_cons (chr, proc),
1540                                   scm_i_read_hash_procedures_ref ());
1541               scm_i_read_hash_procedures_set_x (new);
1542             }
1543           break;
1544         }
1545       if (scm_is_eq (chr, SCM_CAAR (this)))
1546         {
1547           /* already in the alist.  */
1548           if (scm_is_false (proc))
1549             {
1550               /* remove it.  */
1551               if (scm_is_false (prev))
1552                 {
1553                   SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1554                   scm_i_read_hash_procedures_set_x (rest);
1555                 }
1556               else
1557                 scm_set_cdr_x (prev, SCM_CDR (this));
1558             }
1559           else
1560             {
1561               /* replace it.  */
1562               scm_set_cdr_x (SCM_CAR (this), proc);
1563             }
1564           break;
1565         }
1566       prev = this;
1567       this = SCM_CDR (this);
1568     }
1569
1570   return SCM_UNSPECIFIED;
1571 }
1572 #undef FUNC_NAME
1573
1574 /* Recover the read-hash procedure corresponding to char c.  */
1575 static SCM
1576 scm_get_hash_procedure (int c)
1577 {
1578   SCM rest = scm_i_read_hash_procedures_ref ();
1579
1580   while (1)
1581     {
1582       if (scm_is_null (rest))
1583         return SCM_BOOL_F;
1584
1585       if (SCM_CHAR (SCM_CAAR (rest)) == c)
1586         return SCM_CDAR (rest);
1587
1588       rest = SCM_CDR (rest);
1589     }
1590 }
1591
1592 #define SCM_ENCODING_SEARCH_SIZE (500)
1593
1594 /* Search the first few hundred characters of a file for an Emacs-like coding
1595    declaration.  Returns either NULL or a string whose storage has been
1596    allocated with `scm_gc_malloc ()'.  */
1597 char *
1598 scm_i_scan_for_encoding (SCM port)
1599 {
1600   scm_t_port *pt;
1601   char header[SCM_ENCODING_SEARCH_SIZE+1];
1602   size_t bytes_read, encoding_length, i;
1603   char *encoding = NULL;
1604   int utf8_bom = 0;
1605   char *pos, *encoding_start;
1606   int in_comment;
1607
1608   pt = SCM_PTAB_ENTRY (port);
1609
1610   if (pt->rw_active == SCM_PORT_WRITE)
1611     scm_flush (port);
1612
1613   if (pt->rw_random)
1614     pt->rw_active = SCM_PORT_READ;
1615
1616   if (pt->read_pos == pt->read_end)
1617     {
1618       /* We can use the read buffer, and thus avoid a seek. */
1619       if (scm_fill_input (port) == EOF)
1620         return NULL;
1621
1622       bytes_read = pt->read_end - pt->read_pos;
1623       if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
1624         bytes_read = SCM_ENCODING_SEARCH_SIZE;
1625
1626       if (bytes_read <= 1)
1627         /* An unbuffered port -- don't scan.  */
1628         return NULL;
1629
1630       memcpy (header, pt->read_pos, bytes_read);
1631       header[bytes_read] = '\0';
1632     }
1633   else
1634     {
1635       /* Try to read some bytes and then seek back.  Not all ports
1636          support seeking back; and indeed some file ports (like
1637          /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
1638          check performed by SCM_FPORT_FDES---but fail to seek
1639          backwards.  Hence this block comes second.  We prefer to use
1640          the read buffer in-place.  */
1641       if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1642         return NULL;
1643
1644       bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1645       header[bytes_read] = '\0';
1646       scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1647     }
1648
1649   if (bytes_read > 3
1650       && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1651     utf8_bom = 1;
1652
1653   /* search past "coding[:=]" */
1654   pos = header;
1655   while (1)
1656     {
1657       if ((pos = strstr(pos, "coding")) == NULL)
1658         return NULL;
1659
1660       pos += strlen("coding");
1661       if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1662           (*pos == ':' || *pos == '='))
1663         {
1664           pos ++;
1665           break;
1666         }
1667     }
1668
1669   /* skip spaces */
1670   while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1671          (*pos == ' ' || *pos == '\t'))
1672     pos ++;
1673
1674   /* grab the next token */
1675   encoding_start = pos;
1676   i = 0;
1677   while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
1678          && encoding_start + i - header < bytes_read
1679          && (isalnum ((int) encoding_start[i])
1680              || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
1681     i++;
1682
1683   encoding_length = i;
1684   if (encoding_length == 0)
1685     return NULL;
1686
1687   encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
1688   for (i = 0; i < encoding_length; i++)
1689     encoding[i] = toupper ((int) encoding[i]);
1690
1691   /* push backwards to make sure we were in a comment */
1692   in_comment = 0;
1693   pos = encoding_start;
1694   while (pos >= header)
1695     {
1696       if (*pos == ';')
1697         {
1698           in_comment = 1;
1699           break;
1700         }
1701       else if (*pos == '\n' || pos == header)
1702         {
1703           /* This wasn't in a semicolon comment. Check for a
1704            hash-bang comment. */
1705           char *beg = strstr (header, "#!");
1706           char *end = strstr (header, "!#");
1707           if (beg < encoding_start && encoding_start + encoding_length <= end)
1708             in_comment = 1;
1709           break;
1710         }
1711       else
1712         {
1713           pos --;
1714           continue;
1715         }
1716     }
1717   if (!in_comment)
1718     /* This wasn't in a comment */
1719     return NULL;
1720
1721   if (utf8_bom && strcmp(encoding, "UTF-8"))
1722     scm_misc_error (NULL,
1723                     "the port input declares the encoding ~s but is encoded as UTF-8",
1724                     scm_list_1 (scm_from_locale_string (encoding)));
1725
1726   return encoding;
1727 }
1728
1729 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1730             (SCM port),
1731             "Scans the port for an Emacs-like character coding declaration\n"
1732             "near the top of the contents of a port with random-accessible contents.\n"
1733             "The coding declaration is of the form\n"
1734             "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1735             "\n"
1736             "Returns a string containing the character encoding of the file\n"
1737             "if a declaration was found, or @code{#f} otherwise.\n")
1738 #define FUNC_NAME s_scm_file_encoding
1739 {
1740   char *enc;
1741   SCM s_enc;
1742
1743   SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
1744
1745   enc = scm_i_scan_for_encoding (port);
1746   if (enc == NULL)
1747     return SCM_BOOL_F;
1748   else
1749     {
1750       s_enc = scm_from_locale_string (enc);
1751       return s_enc;
1752     }
1753
1754   return SCM_BOOL_F;
1755 }
1756 #undef FUNC_NAME
1757
1758 void
1759 scm_init_read ()
1760 {
1761   SCM read_hash_procs;
1762
1763   read_hash_procs = scm_make_fluid ();
1764   scm_fluid_set_x (read_hash_procs, SCM_EOL);
1765
1766   scm_i_read_hash_procedures =
1767     SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
1768
1769   scm_init_opts (scm_read_options, scm_read_opts);
1770 #include "libguile/read.x"
1771 }
1772
1773 /*
1774   Local Variables:
1775   c-file-style: "gnu"
1776   End:
1777 */