libguile/read.c

   1 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
   2  *   2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public License
   6  * as published by the Free Software Foundation; either version 3 of
   7  * the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful, but
  10  * WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  17  * 02110-1301 USA
  18  */
  19
  20
  21 \f
  22
  23 #ifdef HAVE_CONFIG_H
  24 # include <config.h>
  25 #endif
  26
  27 #include <stdio.h>
  28 #include <ctype.h>
  29 #include <string.h>
  30 #include <unistd.h>
  31 #include <unicase.h>
  32 #include <unictype.h>
  33
  34 #include "libguile/_scm.h"
  35 #include "libguile/bytevectors.h"
  36 #include "libguile/chars.h"
  37 #include "libguile/eval.h"
  38 #include "libguile/arrays.h"
  39 #include "libguile/bitvectors.h"
  40 #include "libguile/keywords.h"
  41 #include "libguile/alist.h"
  42 #include "libguile/srcprop.h"
  43 #include "libguile/hashtab.h"
  44 #include "libguile/hash.h"
  45 #include "libguile/ports.h"
  46 #include "libguile/fports.h"
  47 #include "libguile/root.h"
  48 #include "libguile/strings.h"
  49 #include "libguile/strports.h"
  50 #include "libguile/vectors.h"
  51 #include "libguile/validate.h"
  52 #include "libguile/srfi-4.h"
  53 #include "libguile/srfi-13.h"
  54
  55 #include "libguile/read.h"
  56 #include "libguile/private-options.h"
  57
  58
  59 \f
  60
  61 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
  62 SCM_SYMBOL (scm_keyword_prefix, "prefix");
  63 SCM_SYMBOL (scm_keyword_postfix, "postfix");
  64 SCM_SYMBOL (sym_nil, "nil");
  65
  66 scm_t_option scm_read_opts[] = {
  67   { SCM_OPTION_BOOLEAN, "copy", 0,
  68     "Copy source code expressions." },
  69   { SCM_OPTION_BOOLEAN, "positions", 1,
  70     "Record positions of source code expressions." },
  71   { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
  72     "Convert symbols to lower case."},
  73   { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
  74     "Style of keyword recognition: #f, 'prefix or 'postfix."},
  75   { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
  76     "Use R6RS variable-length character and string hex escapes."},
  77   { SCM_OPTION_BOOLEAN, "square-brackets", 1,
  78     "Treat `[' and `]' as parentheses, for R6RS compatibility."},
  79   { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
  80     "In strings, consume leading whitespace after an escaped end-of-line."},
  81   { 0, },
  82 };
  83
  84 /*
  85   Give meaningful error messages for errors
  86
  87   We use the format
  88
  89   FILE:LINE:COL: MESSAGE
  90   This happened in ....
  91
  92   This is not standard GNU format, but the test-suite likes the real
  93   message to be in front.
  94
  95  */
  96
  97
  98 void
  99 scm_i_input_error (char const *function,
 100                    SCM port, const char *message, SCM arg)
 101 {
 102   SCM fn = (scm_is_string (SCM_FILENAME(port))
 103             ? SCM_FILENAME(port)
 104             : scm_from_locale_string ("#<unknown port>"));
 105
 106   SCM string_port = scm_open_output_string ();
 107   SCM string = SCM_EOL;
 108   scm_simple_format (string_port,
 109                      scm_from_locale_string ("~A:~S:~S: ~A"),
 110                      scm_list_4 (fn,
 111                                  scm_from_long (SCM_LINUM (port) + 1),
 112                                  scm_from_int (SCM_COL (port) + 1),
 113                                  scm_from_locale_string (message)));
 114
 115   string = scm_get_output_string (string_port);
 116   scm_close_output_port (string_port);
 117   scm_error_scm (scm_from_latin1_symbol ("read-error"),
 118                  function? scm_from_locale_string (function) : SCM_BOOL_F,
 119                  string,
 120                  arg,
 121                  SCM_BOOL_F);
 122 }
 123
 124
 125 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
 126             (SCM setting),
 127             "Option interface for the read options. Instead of using\n"
 128             "this procedure directly, use the procedures @code{read-enable},\n"
 129             "@code{read-disable}, @code{read-set!} and @code{read-options}.")
 130 #define FUNC_NAME s_scm_read_options
 131 {
 132   SCM ans = scm_options (setting,
 133                          scm_read_opts,
 134                          FUNC_NAME);
 135   if (SCM_COPY_SOURCE_P)
 136     SCM_RECORD_POSITIONS_P = 1;
 137   return ans;
 138 }
 139 #undef FUNC_NAME
 140
 141 /* A fluid referring to an association list mapping extra hash
 142    characters to procedures.  */
 143 static SCM *scm_i_read_hash_procedures;
 144
 145 static SCM
 146 scm_i_read_hash_procedures_ref (void)
 147 {
 148   return scm_fluid_ref (*scm_i_read_hash_procedures);
 149 }
 150
 151 static void
 152 scm_i_read_hash_procedures_set_x (SCM value)
 153 {
 154   scm_fluid_set_x (*scm_i_read_hash_procedures, value);
 155 }
 156
 157 \f
 158 /* Token readers.  */
 159
 160
 161 /* Size of the C buffer used to read symbols and numbers.  */
 162 #define READER_BUFFER_SIZE            128
 163
 164 /* Size of the C buffer used to read strings.  */
 165 #define READER_STRING_BUFFER_SIZE     512
 166
 167 /* The maximum size of Scheme character names.  */
 168 #define READER_CHAR_NAME_MAX_SIZE      50
 169
 170
 171 /* `isblank' is only in C99.  */
 172 #define CHAR_IS_BLANK_(_chr)                                    \
 173   (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n')      \
 174    || ((_chr) == '\f') || ((_chr) == '\r'))
 175
 176 #ifdef MSDOS
 177 # define CHAR_IS_BLANK(_chr)                    \
 178   ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
 179 #else
 180 # define CHAR_IS_BLANK CHAR_IS_BLANK_
 181 #endif
 182
 183
 184 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
 185    structure'').  */
 186 #define CHAR_IS_R5RS_DELIMITER(c)                               \
 187   (CHAR_IS_BLANK (c)                                            \
 188    || (c == ')') || (c == '(') || (c == ';') || (c == '"')      \
 189    || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
 190
 191 #define CHAR_IS_DELIMITER  CHAR_IS_R5RS_DELIMITER
 192
 193 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
 194    Structure''.  */
 195 #define CHAR_IS_EXPONENT_MARKER(_chr)                           \
 196   (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f')        \
 197    || ((_chr) == 'd') || ((_chr) == 'l'))
 198
 199 /* Read an SCSH block comment.  */
 200 static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
 201 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
 202 static SCM scm_read_commented_expression (scm_t_wchar, SCM);
 203 static SCM scm_read_shebang (scm_t_wchar, SCM);
 204 static SCM scm_get_hash_procedure (int);
 205
 206 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 207    result in the pre-allocated buffer BUF.  Return zero if the whole token has
 208    fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
 209    bytes actually read.  */
 210 static int
 211 read_token (SCM port, char *buf, const size_t buf_size, size_t *read)
 212  {
 213    *read = 0;
 214
 215    while (*read < buf_size)
 216      {
 217        int chr;
 218
 219        chr = scm_get_byte_or_eof (port);
 220
 221        if (chr == EOF)
 222         return 0;
 223       else if (CHAR_IS_DELIMITER (chr))
 224         {
 225           scm_unget_byte (chr, port);
 226           return 0;
 227         }
 228       else
 229         {
 230           *buf = (char) chr;
 231           buf++, (*read)++;
 232         }
 233      }
 234
 235    return 1;
 236  }
 237
 238 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 239    result in the pre-allocated buffer BUFFER, if the whole token has fewer than
 240    BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
 241    caller.  Return zero if the token fits in BUFFER, non-zero otherwise. READ
 242    will be set the number of bytes actually read.  */
 243 static int
 244 read_complete_token (SCM port, char *buffer, const size_t buffer_size,
 245                            char **overflow_buffer, size_t *read)
 246 {
 247   int overflow = 0;
 248   size_t bytes_read, overflow_size;
 249
 250   *overflow_buffer = NULL;
 251   overflow_size = 0;
 252
 253   do
 254     {
 255       overflow = read_token (port, buffer, buffer_size, &bytes_read);
 256       if (bytes_read == 0)
 257         break;
 258       if (overflow || overflow_size != 0)
 259         {
 260           if (overflow_size == 0)
 261             {
 262               *overflow_buffer = scm_malloc (bytes_read);
 263               memcpy (*overflow_buffer, buffer, bytes_read);
 264               overflow_size = bytes_read;
 265             }
 266           else
 267             {
 268               *overflow_buffer = scm_realloc (*overflow_buffer, overflow_size + bytes_read);
 269               memcpy (*overflow_buffer + overflow_size, buffer, bytes_read);
 270               overflow_size += bytes_read;
 271             }
 272         }
 273     }
 274   while (overflow);
 275
 276   if (overflow_size)
 277     *read = overflow_size;
 278   else
 279     *read = bytes_read;
 280
 281   return (overflow_size != 0);
 282 }
 283
 284 /* Skip whitespace from PORT and return the first non-whitespace character
 285    read.  Raise an error on end-of-file.  */
 286 static int
 287 flush_ws (SCM port, const char *eoferr)
 288 {
 289   scm_t_wchar c;
 290   while (1)
 291     switch (c = scm_getc (port))
 292       {
 293       case EOF:
 294       goteof:
 295         if (eoferr)
 296           {
 297             scm_i_input_error (eoferr,
 298                                port,
 299                                "end of file",
 300                                SCM_EOL);
 301           }
 302         return c;
 303
 304       case ';':
 305       lp:
 306         switch (c = scm_getc (port))
 307           {
 308           case EOF:
 309             goto goteof;
 310           default:
 311             goto lp;
 312           case SCM_LINE_INCREMENTORS:
 313             break;
 314           }
 315         break;
 316
 317       case '#':
 318         switch (c = scm_getc (port))
 319           {
 320           case EOF:
 321             eoferr = "read_sharp";
 322             goto goteof;
 323           case '!':
 324             scm_read_shebang (c, port);
 325             break;
 326           case ';':
 327             scm_read_commented_expression (c, port);
 328             break;
 329           case '|':
 330             if (scm_is_false (scm_get_hash_procedure (c)))
 331               {
 332                 scm_read_r6rs_block_comment (c, port);
 333                 break;
 334               }
 335             /* fall through */
 336           default:
 337             scm_ungetc (c, port);
 338             return '#';
 339           }
 340         break;
 341
 342       case SCM_LINE_INCREMENTORS:
 343       case SCM_SINGLE_SPACES:
 344       case '\t':
 345         break;
 346
 347       default:
 348         return c;
 349       }
 350
 351   return 0;
 352 }
 353
 354
 355 \f
 356 /* Token readers.  */
 357
 358 static SCM scm_read_expression (SCM port);
 359 static SCM scm_read_sharp (int chr, SCM port, long line, int column);
 360
 361
 362 static SCM
 363 maybe_annotate_source (SCM x, SCM port, long line, int column)
 364 {
 365   if (SCM_RECORD_POSITIONS_P)
 366     scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
 367   return x;
 368 }
 369
 370 static SCM
 371 scm_read_sexp (scm_t_wchar chr, SCM port)
 372 #define FUNC_NAME "scm_i_lreadparen"
 373 {
 374   int c;
 375   SCM tmp, tl, ans = SCM_EOL;
 376   const int terminating_char = ((chr == '[') ? ']' : ')');
 377
 378   /* Need to capture line and column numbers here. */
 379   long line = SCM_LINUM (port);
 380   int column = SCM_COL (port) - 1;
 381
 382   c = flush_ws (port, FUNC_NAME);
 383   if (terminating_char == c)
 384     return SCM_EOL;
 385
 386   scm_ungetc (c, port);
 387   tmp = scm_read_expression (port);
 388
 389   /* Note that it is possible for scm_read_expression to return
 390      scm_sym_dot, but not as part of a dotted pair: as in #{.}#.  So
 391      check that it's a real dot by checking `c'.  */
 392   if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
 393     {
 394       ans = scm_read_expression (port);
 395       if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
 396         scm_i_input_error (FUNC_NAME, port, "missing close paren",
 397                            SCM_EOL);
 398       return ans;
 399     }
 400
 401   /* Build the head of the list structure. */
 402   ans = tl = scm_cons (tmp, SCM_EOL);
 403
 404   while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
 405     {
 406       SCM new_tail;
 407
 408       if (c == ')' || (SCM_SQUARE_BRACKETS_P && c == ']'))
 409         scm_i_input_error (FUNC_NAME, port,
 410                            "in pair: mismatched close paren: ~A",
 411                            scm_list_1 (SCM_MAKE_CHAR (c)));
 412
 413       scm_ungetc (c, port);
 414       tmp = scm_read_expression (port);
 415
 416       /* See above note about scm_sym_dot.  */
 417       if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
 418         {
 419           SCM_SETCDR (tl, scm_read_expression (port));
 420
 421           c = flush_ws (port, FUNC_NAME);
 422           if (terminating_char != c)
 423             scm_i_input_error (FUNC_NAME, port,
 424                                "in pair: missing close paren", SCM_EOL);
 425           goto exit;
 426         }
 427
 428       new_tail = scm_cons (tmp, SCM_EOL);
 429       SCM_SETCDR (tl, new_tail);
 430       tl = new_tail;
 431     }
 432
 433  exit:
 434   return maybe_annotate_source (ans, port, line, column);
 435 }
 436 #undef FUNC_NAME
 437
 438
 439 /* Read a hexadecimal number NDIGITS in length.  Put its value into the variable
 440    C.  If TERMINATOR is non-null, terminate early if the TERMINATOR character is
 441    found.  */
 442 #define SCM_READ_HEX_ESCAPE(ndigits, terminator)                   \
 443   do                                                               \
 444     {                                                              \
 445       scm_t_wchar a;                                               \
 446       size_t i = 0;                                                \
 447       c = 0;                                                       \
 448       while (i < ndigits)                                          \
 449         {                                                          \
 450           a = scm_getc (port);                                     \
 451           if (a == EOF)                                            \
 452             goto str_eof;                                          \
 453           if (terminator                                           \
 454               && (a == (scm_t_wchar) terminator)                   \
 455               && (i > 0))                                          \
 456             break;                                                 \
 457           if ('0' <= a && a <= '9')                                \
 458             a -= '0';                                              \
 459           else if ('A' <= a && a <= 'F')                           \
 460             a = a - 'A' + 10;                                      \
 461           else if ('a' <= a && a <= 'f')                           \
 462             a = a - 'a' + 10;                                      \
 463           else                                                     \
 464             {                                                      \
 465               c = a;                                               \
 466               goto bad_escaped;                                    \
 467             }                                                      \
 468           c = c * 16 + a;                                          \
 469           i ++;                                                    \
 470         }                                                          \
 471     } while (0)
 472
 473 static void
 474 skip_intraline_whitespace (SCM port)
 475 {
 476   scm_t_wchar c;
 477
 478   do
 479     {
 480       c = scm_getc (port);
 481       if (c == EOF)
 482         return;
 483     }
 484   while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
 485
 486   scm_ungetc (c, port);
 487 }
 488
 489 static SCM
 490 scm_read_string (int chr, SCM port)
 491 #define FUNC_NAME "scm_lreadr"
 492 {
 493   /* For strings smaller than C_STR, this function creates only one Scheme
 494      object (the string returned).  */
 495
 496   SCM str = SCM_BOOL_F;
 497   unsigned c_str_len = 0;
 498   scm_t_wchar c;
 499
 500   /* Need to capture line and column numbers here. */
 501   long line = SCM_LINUM (port);
 502   int column = SCM_COL (port) - 1;
 503
 504   str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
 505   while ('"' != (c = scm_getc (port)))
 506     {
 507       if (c == EOF)
 508         {
 509         str_eof:
 510           scm_i_input_error (FUNC_NAME, port,
 511                              "end of file in string constant", SCM_EOL);
 512         }
 513
 514       if (c_str_len + 1 >= scm_i_string_length (str))
 515         {
 516           SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
 517
 518           str = scm_string_append (scm_list_2 (str, addy));
 519         }
 520
 521       if (c == '\\')
 522         {
 523           switch (c = scm_getc (port))
 524             {
 525             case EOF:
 526               goto str_eof;
 527             case '"':
 528             case '\\':
 529               break;
 530             case '\n':
 531               if (SCM_HUNGRY_EOL_ESCAPES_P)
 532                 skip_intraline_whitespace (port);
 533               continue;
 534             case '0':
 535               c = '\0';
 536               break;
 537             case 'f':
 538               c = '\f';
 539               break;
 540             case 'n':
 541               c = '\n';
 542               break;
 543             case 'r':
 544               c = '\r';
 545               break;
 546             case 't':
 547               c = '\t';
 548               break;
 549             case 'a':
 550               c = '\007';
 551               break;
 552             case 'v':
 553               c = '\v';
 554               break;
 555             case 'b':
 556               c = '\010';
 557               break;
 558             case 'x':
 559               if (SCM_R6RS_ESCAPES_P)
 560                 SCM_READ_HEX_ESCAPE (10, ';');
 561               else
 562                 SCM_READ_HEX_ESCAPE (2, '\0');
 563               break;
 564             case 'u':
 565               if (!SCM_R6RS_ESCAPES_P)
 566                 {
 567                   SCM_READ_HEX_ESCAPE (4, '\0');
 568                   break;
 569                 }
 570             case 'U':
 571               if (!SCM_R6RS_ESCAPES_P)
 572                 {
 573                   SCM_READ_HEX_ESCAPE (6, '\0');
 574                   break;
 575                 }
 576             default:
 577             bad_escaped:
 578               scm_i_input_error (FUNC_NAME, port,
 579                                  "illegal character in escape sequence: ~S",
 580                                  scm_list_1 (SCM_MAKE_CHAR (c)));
 581             }
 582         }
 583       str = scm_i_string_start_writing (str);
 584       scm_i_string_set_x (str, c_str_len++, c);
 585       scm_i_string_stop_writing ();
 586     }
 587   return maybe_annotate_source (scm_i_substring_copy (str, 0, c_str_len),
 588                                 port, line, column);
 589 }
 590 #undef FUNC_NAME
 591
 592
 593 static SCM
 594 scm_read_number (scm_t_wchar chr, SCM port)
 595 {
 596   SCM result, str = SCM_EOL;
 597   char buffer[READER_BUFFER_SIZE];
 598   char *overflow_buffer = NULL;
 599   size_t bytes_read;
 600   int overflow;
 601   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 602
 603   scm_ungetc (chr, port);
 604   overflow = read_complete_token (port, buffer, sizeof (buffer),
 605                                   &overflow_buffer, &bytes_read);
 606
 607   if (!overflow)
 608     str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
 609   else
 610     str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
 611                             pt->ilseq_handler);
 612
 613   result = scm_string_to_number (str, SCM_UNDEFINED);
 614   if (!scm_is_true (result))
 615     {
 616       /* Return a symbol instead of a number */
 617       if (SCM_CASE_INSENSITIVE_P)
 618         str = scm_string_downcase_x (str);
 619       result = scm_string_to_symbol (str);
 620     }
 621
 622   if (overflow)
 623     free (overflow_buffer);
 624   SCM_COL (port) += scm_i_string_length (str);
 625   return result;
 626 }
 627
 628 static SCM
 629 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
 630 {
 631   SCM result;
 632   int ends_with_colon = 0;
 633   size_t bytes_read;
 634   int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
 635   int overflow;
 636   char buffer[READER_BUFFER_SIZE], *overflow_buffer;
 637   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 638   SCM str;
 639
 640   scm_ungetc (chr, port);
 641   overflow = read_complete_token (port, buffer, READER_BUFFER_SIZE,
 642                                   &overflow_buffer, &bytes_read);
 643   if (bytes_read > 0)
 644     {
 645       if (!overflow)
 646         ends_with_colon = buffer[bytes_read - 1] == ':';
 647       else
 648         ends_with_colon = overflow_buffer[bytes_read - 1] == ':';
 649     }
 650
 651   if (postfix && ends_with_colon && (bytes_read > 1))
 652     {
 653       if (!overflow)
 654         str = scm_from_stringn (buffer, bytes_read - 1, pt->encoding, pt->ilseq_handler);
 655       else
 656         str = scm_from_stringn (overflow_buffer, bytes_read - 1, pt->encoding,
 657                                 pt->ilseq_handler);
 658
 659       if (SCM_CASE_INSENSITIVE_P)
 660         str = scm_string_downcase_x (str);
 661       result = scm_symbol_to_keyword (scm_string_to_symbol (str));
 662     }
 663   else
 664     {
 665       if (!overflow)
 666         str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
 667       else
 668         str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
 669                                 pt->ilseq_handler);
 670
 671       if (SCM_CASE_INSENSITIVE_P)
 672         str = scm_string_downcase_x (str);
 673       result = scm_string_to_symbol (str);
 674     }
 675
 676   if (overflow)
 677     free (overflow_buffer);
 678   SCM_COL (port) += scm_i_string_length (str);
 679   return result;
 680 }
 681
 682 static SCM
 683 scm_read_number_and_radix (scm_t_wchar chr, SCM port)
 684 #define FUNC_NAME "scm_lreadr"
 685 {
 686   SCM result;
 687   size_t read;
 688   char buffer[READER_BUFFER_SIZE], *overflow_buffer;
 689   int overflow;
 690   unsigned int radix;
 691   SCM str;
 692   scm_t_port *pt;
 693
 694   switch (chr)
 695     {
 696     case 'B':
 697     case 'b':
 698       radix = 2;
 699       break;
 700
 701     case 'o':
 702     case 'O':
 703       radix = 8;
 704       break;
 705
 706     case 'd':
 707     case 'D':
 708       radix = 10;
 709       break;
 710
 711     case 'x':
 712     case 'X':
 713       radix = 16;
 714       break;
 715
 716     default:
 717       scm_ungetc (chr, port);
 718       scm_ungetc ('#', port);
 719       radix = 10;
 720     }
 721
 722   overflow = read_complete_token (port, buffer, sizeof (buffer),
 723                                   &overflow_buffer, &read);
 724
 725   pt = SCM_PTAB_ENTRY (port);
 726   if (!overflow)
 727     str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
 728   else
 729     str = scm_from_stringn (overflow_buffer, read, pt->encoding,
 730                             pt->ilseq_handler);
 731
 732   result = scm_string_to_number (str, scm_from_uint (radix));
 733
 734   if (overflow)
 735     free (overflow_buffer);
 736
 737   SCM_COL (port) += scm_i_string_length (str);
 738
 739   if (scm_is_true (result))
 740     return result;
 741
 742   scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
 743
 744   return SCM_BOOL_F;
 745 }
 746 #undef FUNC_NAME
 747
 748 static SCM
 749 scm_read_quote (int chr, SCM port)
 750 {
 751   SCM p;
 752   long line = SCM_LINUM (port);
 753   int column = SCM_COL (port) - 1;
 754
 755   switch (chr)
 756     {
 757     case '`':
 758       p = scm_sym_quasiquote;
 759       break;
 760
 761     case '\'':
 762       p = scm_sym_quote;
 763       break;
 764
 765     case ',':
 766       {
 767         scm_t_wchar c;
 768
 769         c = scm_getc (port);
 770         if ('@' == c)
 771           p = scm_sym_uq_splicing;
 772         else
 773           {
 774             scm_ungetc (c, port);
 775             p = scm_sym_unquote;
 776           }
 777         break;
 778       }
 779
 780     default:
 781       fprintf (stderr, "%s: unhandled quote character (%i)\n",
 782                "scm_read_quote", chr);
 783       abort ();
 784     }
 785
 786   p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
 787   return maybe_annotate_source (p, port, line, column);
 788 }
 789
 790 SCM_SYMBOL (sym_syntax, "syntax");
 791 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
 792 SCM_SYMBOL (sym_unsyntax, "unsyntax");
 793 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
 794
 795 static SCM
 796 scm_read_syntax (int chr, SCM port)
 797 {
 798   SCM p;
 799   long line = SCM_LINUM (port);
 800   int column = SCM_COL (port) - 1;
 801
 802   switch (chr)
 803     {
 804     case '`':
 805       p = sym_quasisyntax;
 806       break;
 807
 808     case '\'':
 809       p = sym_syntax;
 810       break;
 811
 812     case ',':
 813       {
 814         int c;
 815
 816         c = scm_getc (port);
 817         if ('@' == c)
 818           p = sym_unsyntax_splicing;
 819         else
 820           {
 821             scm_ungetc (c, port);
 822             p = sym_unsyntax;
 823           }
 824         break;
 825       }
 826
 827     default:
 828       fprintf (stderr, "%s: unhandled syntax character (%i)\n",
 829                "scm_read_syntax", chr);
 830       abort ();
 831     }
 832
 833   p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
 834   return maybe_annotate_source (p, port, line, column);
 835 }
 836
 837 static SCM
 838 scm_read_nil (int chr, SCM port)
 839 {
 840   SCM id = scm_read_mixed_case_symbol (chr, port);
 841
 842   if (!scm_is_eq (id, sym_nil))
 843     scm_i_input_error ("scm_read_nil", port,
 844                        "unexpected input while reading #nil: ~a",
 845                        scm_list_1 (id));
 846
 847   return SCM_ELISP_NIL;
 848 }
 849
 850 static SCM
 851 scm_read_semicolon_comment (int chr, SCM port)
 852 {
 853   int c;
 854
 855   /* We use the get_byte here because there is no need to get the
 856      locale correct with comment input. This presumes that newline
 857      always represents itself no matter what the encoding is.  */
 858   for (c = scm_get_byte_or_eof (port);
 859        (c != EOF) && (c != '\n');
 860        c = scm_get_byte_or_eof (port));
 861
 862   return SCM_UNSPECIFIED;
 863 }
 864
 865 \f
 866 /* Sharp readers, i.e. readers called after a `#' sign has been read.  */
 867
 868 static SCM
 869 scm_read_boolean (int chr, SCM port)
 870 {
 871   switch (chr)
 872     {
 873     case 't':
 874     case 'T':
 875       return SCM_BOOL_T;
 876
 877     case 'f':
 878     case 'F':
 879       return SCM_BOOL_F;
 880     }
 881
 882   return SCM_UNSPECIFIED;
 883 }
 884
 885 static SCM
 886 scm_read_character (scm_t_wchar chr, SCM port)
 887 #define FUNC_NAME "scm_lreadr"
 888 {
 889   char buffer[READER_CHAR_NAME_MAX_SIZE];
 890   SCM charname;
 891   size_t charname_len, bytes_read;
 892   scm_t_wchar cp;
 893   int overflow;
 894   scm_t_port *pt;
 895
 896   overflow = read_token (port, buffer, READER_CHAR_NAME_MAX_SIZE, &bytes_read);
 897   if (overflow)
 898     scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
 899
 900   if (bytes_read == 0)
 901     {
 902       chr = scm_getc (port);
 903       if (chr == EOF)
 904         scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
 905                            "while reading character", SCM_EOL);
 906
 907       /* CHR must be a token delimiter, like a whitespace.  */
 908       return (SCM_MAKE_CHAR (chr));
 909     }
 910
 911   pt = SCM_PTAB_ENTRY (port);
 912
 913   /* Simple ASCII characters can be processed immediately.  Also, simple
 914      ISO-8859-1 characters can be processed immediately if the encoding for this
 915      port is ISO-8859-1.  */
 916   if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
 917     {
 918       SCM_COL (port) += 1;
 919       return SCM_MAKE_CHAR (buffer[0]);
 920     }
 921
 922   /* Otherwise, convert the buffer into a proper scheme string for
 923      processing.  */
 924   charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
 925                                pt->ilseq_handler);
 926   charname_len = scm_i_string_length (charname);
 927   SCM_COL (port) += charname_len;
 928   cp = scm_i_string_ref (charname, 0);
 929   if (charname_len == 1)
 930     return SCM_MAKE_CHAR (cp);
 931
 932   /* Ignore dotted circles, which may be used to keep combining characters from
 933      combining with the backslash in #\charname.  */
 934   if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
 935     return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
 936
 937   if (cp >= '0' && cp < '8')
 938     {
 939       /* Dirk:FIXME::  This type of character syntax is not R5RS
 940        * compliant.  Further, it should be verified that the constant
 941        * does only consist of octal digits.  */
 942       SCM p = scm_string_to_number (charname, scm_from_uint (8));
 943       if (SCM_I_INUMP (p))
 944         {
 945           scm_t_wchar c = scm_to_uint32 (p);
 946           if (SCM_IS_UNICODE_CHAR (c))
 947             return SCM_MAKE_CHAR (c);
 948           else
 949             scm_i_input_error (FUNC_NAME, port,
 950                                "out-of-range octal character escape: ~a",
 951                                scm_list_1 (charname));
 952         }
 953     }
 954
 955   if (cp == 'x' && (charname_len > 1))
 956     {
 957       SCM p;
 958
 959       /* Convert from hex, skipping the initial 'x' character in CHARNAME */
 960       p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
 961                                 scm_from_uint (16));
 962       if (SCM_I_INUMP (p))
 963         {
 964           scm_t_wchar c = scm_to_uint32 (p);
 965           if (SCM_IS_UNICODE_CHAR (c))
 966             return SCM_MAKE_CHAR (c);
 967           else
 968             scm_i_input_error (FUNC_NAME, port,
 969                                "out-of-range hex character escape: ~a",
 970                                scm_list_1 (charname));
 971         }
 972     }
 973
 974   /* The names of characters should never have non-Latin1
 975      characters.  */
 976   if (scm_i_is_narrow_string (charname)
 977       || scm_i_try_narrow_string (charname))
 978     { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
 979                                        charname_len);
 980       if (scm_is_true (ch))
 981         return ch;
 982     }
 983
 984   scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
 985                      scm_list_1 (charname));
 986
 987   return SCM_UNSPECIFIED;
 988 }
 989 #undef FUNC_NAME
 990
 991 static SCM
 992 scm_read_keyword (int chr, SCM port)
 993 {
 994   SCM symbol;
 995
 996   /* Read the symbol that comprises the keyword.  Doing this instead of
 997      invoking a specific symbol reader function allows `scm_read_keyword ()'
 998      to adapt to the delimiters currently valid of symbols.
 999
1000      XXX: This implementation allows sloppy syntaxes like `#:  key'.  */
1001   symbol = scm_read_expression (port);
1002   if (!scm_is_symbol (symbol))
1003     scm_i_input_error ("scm_read_keyword", port,
1004                        "keyword prefix `~a' not followed by a symbol: ~s",
1005                        scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1006
1007   return (scm_symbol_to_keyword (symbol));
1008 }
1009
1010 static SCM
1011 scm_read_vector (int chr, SCM port, long line, int column)
1012 {
1013   /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1014      guarantee that it's going to do what we want.  After all, this is an
1015      implementation detail of `scm_read_vector ()', not a desirable
1016      property.  */
1017   return maybe_annotate_source (scm_vector (scm_read_sexp (chr, port)),
1018                                 port, line, column);
1019 }
1020
1021 static SCM
1022 scm_read_array (int chr, SCM port, long line, int column)
1023 {
1024   SCM result = scm_i_read_array (port, chr);
1025   if (scm_is_false (result))
1026     return result;
1027   else
1028     return maybe_annotate_source (result, port, line, column);
1029 }
1030
1031 static SCM
1032 scm_read_srfi4_vector (int chr, SCM port, long line, int column)
1033 {
1034   return scm_read_array (chr, port, line, column);
1035 }
1036
1037 static SCM
1038 scm_read_bytevector (scm_t_wchar chr, SCM port, long line, int column)
1039 {
1040   chr = scm_getc (port);
1041   if (chr != 'u')
1042     goto syntax;
1043
1044   chr = scm_getc (port);
1045   if (chr != '8')
1046     goto syntax;
1047
1048   chr = scm_getc (port);
1049   if (chr != '(')
1050     goto syntax;
1051
1052   return maybe_annotate_source
1053     (scm_u8_list_to_bytevector (scm_read_sexp (chr, port)),
1054      port, line, column);
1055
1056  syntax:
1057   scm_i_input_error ("read_bytevector", port,
1058                      "invalid bytevector prefix",
1059                      SCM_MAKE_CHAR (chr));
1060   return SCM_UNSPECIFIED;
1061 }
1062
1063 static SCM
1064 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port, long line, int column)
1065 {
1066   /* Read the `#*10101'-style read syntax for bit vectors in Guile.  This is
1067      terribly inefficient but who cares?  */
1068   SCM s_bits = SCM_EOL;
1069
1070   for (chr = scm_getc (port);
1071        (chr != EOF) && ((chr == '0') || (chr == '1'));
1072        chr = scm_getc (port))
1073     {
1074       s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1075     }
1076
1077   if (chr != EOF)
1078     scm_ungetc (chr, port);
1079
1080   return maybe_annotate_source
1081     (scm_bitvector (scm_reverse_x (s_bits, SCM_EOL)),
1082      port, line, column);
1083 }
1084
1085 static SCM
1086 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1087 {
1088   int bang_seen = 0;
1089
1090   for (;;)
1091     {
1092       int c = scm_getc (port);
1093
1094       if (c == EOF)
1095         scm_i_input_error ("skip_block_comment", port,
1096                            "unterminated `#! ... !#' comment", SCM_EOL);
1097
1098       if (c == '!')
1099         bang_seen = 1;
1100       else if (c == '#' && bang_seen)
1101         break;
1102       else
1103         bang_seen = 0;
1104     }
1105
1106   return SCM_UNSPECIFIED;
1107 }
1108
1109 static SCM
1110 scm_read_shebang (scm_t_wchar chr, SCM port)
1111 {
1112   int c = 0;
1113   if ((c = scm_get_byte_or_eof (port)) != 'r')
1114     {
1115       scm_ungetc (c, port);
1116       return scm_read_scsh_block_comment (chr, port);
1117     }
1118   if ((c = scm_get_byte_or_eof (port)) != '6')
1119     {
1120       scm_ungetc (c, port);
1121       scm_ungetc ('r', port);
1122       return scm_read_scsh_block_comment (chr, port);
1123     }
1124   if ((c = scm_get_byte_or_eof (port)) != 'r')
1125     {
1126       scm_ungetc (c, port);
1127       scm_ungetc ('6', port);
1128       scm_ungetc ('r', port);
1129       return scm_read_scsh_block_comment (chr, port);
1130     }
1131   if ((c = scm_get_byte_or_eof (port)) != 's')
1132     {
1133       scm_ungetc (c, port);
1134       scm_ungetc ('r', port);
1135       scm_ungetc ('6', port);
1136       scm_ungetc ('r', port);
1137       return scm_read_scsh_block_comment (chr, port);
1138     }
1139
1140   return SCM_UNSPECIFIED;
1141 }
1142
1143 static SCM
1144 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1145 {
1146   /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1147      nested.  So care must be taken.  */
1148   int nesting_level = 1;
1149
1150   int a = scm_getc (port);
1151
1152   if (a == EOF)
1153     scm_i_input_error ("scm_read_r6rs_block_comment", port,
1154                        "unterminated `#| ... |#' comment", SCM_EOL);
1155
1156   while (nesting_level > 0)
1157     {
1158       int b = scm_getc (port);
1159
1160       if (b == EOF)
1161         scm_i_input_error ("scm_read_r6rs_block_comment", port,
1162                            "unterminated `#| ... |#' comment", SCM_EOL);
1163
1164       if (a == '|' && b == '#')
1165         {
1166           nesting_level--;
1167           b = EOF;
1168         }
1169       else if (a == '#' && b == '|')
1170         {
1171           nesting_level++;
1172           b = EOF;
1173         }
1174
1175       a = b;
1176     }
1177
1178   return SCM_UNSPECIFIED;
1179 }
1180
1181 static SCM
1182 scm_read_commented_expression (scm_t_wchar chr, SCM port)
1183 {
1184   scm_t_wchar c;
1185
1186   c = flush_ws (port, (char *) NULL);
1187   if (EOF == c)
1188     scm_i_input_error ("read_commented_expression", port,
1189                        "no expression after #; comment", SCM_EOL);
1190   scm_ungetc (c, port);
1191   scm_read_expression (port);
1192   return SCM_UNSPECIFIED;
1193 }
1194
1195 static SCM
1196 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1197 {
1198   /* Guile's extended symbol read syntax looks like this:
1199
1200        #{This is all a symbol name}#
1201
1202      So here, CHR is expected to be `{'.  */
1203   int saw_brace = 0;
1204   size_t len = 0;
1205   SCM buf = scm_i_make_string (1024, NULL, 0);
1206
1207   buf = scm_i_string_start_writing (buf);
1208
1209   while ((chr = scm_getc (port)) != EOF)
1210     {
1211       if (saw_brace)
1212         {
1213           if (chr == '#')
1214             {
1215               break;
1216             }
1217           else
1218             {
1219               saw_brace = 0;
1220               scm_i_string_set_x (buf, len++, '}');
1221             }
1222         }
1223
1224       if (chr == '}')
1225         saw_brace = 1;
1226       else if (chr == '\\')
1227         {
1228           /* It used to be that print.c would print extended-read-syntax
1229              symbols with backslashes before "non-standard" chars, but
1230              this routine wouldn't do anything with those escapes.
1231              Bummer.  What we've done is to change print.c to output
1232              R6RS hex escapes for those characters, relying on the fact
1233              that the extended read syntax would never put a `\' before
1234              an `x'.  For now, we just ignore other instances of
1235              backslash in the string.  */
1236           switch ((chr = scm_getc (port)))
1237             {
1238             case EOF:
1239               goto done;
1240             case 'x':
1241               {
1242                 scm_t_wchar c;
1243
1244                 SCM_READ_HEX_ESCAPE (10, ';');
1245                 scm_i_string_set_x (buf, len++, c);
1246                 break;
1247
1248               str_eof:
1249                 chr = EOF;
1250                 goto done;
1251
1252               bad_escaped:
1253                 scm_i_string_stop_writing ();
1254                 scm_i_input_error ("scm_read_extended_symbol", port,
1255                                    "illegal character in escape sequence: ~S",
1256                                    scm_list_1 (SCM_MAKE_CHAR (c)));
1257                 break;
1258               }
1259             default:
1260               scm_i_string_set_x (buf, len++, chr);
1261               break;
1262             }
1263         }
1264       else
1265         scm_i_string_set_x (buf, len++, chr);
1266
1267       if (len >= scm_i_string_length (buf) - 2)
1268         {
1269           SCM addy;
1270
1271           scm_i_string_stop_writing ();
1272           addy = scm_i_make_string (1024, NULL, 0);
1273           buf = scm_string_append (scm_list_2 (buf, addy));
1274           len = 0;
1275           buf = scm_i_string_start_writing (buf);
1276         }
1277     }
1278
1279  done:
1280   scm_i_string_stop_writing ();
1281   if (chr == EOF)
1282     scm_i_input_error ("scm_read_extended_symbol", port,
1283                        "end of file while reading symbol", SCM_EOL);
1284
1285   return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1286 }
1287
1288
1289 \f
1290 /* Top-level token readers, i.e., dispatchers.  */
1291
1292 static SCM
1293 scm_read_sharp_extension (int chr, SCM port)
1294 {
1295   SCM proc;
1296
1297   proc = scm_get_hash_procedure (chr);
1298   if (scm_is_true (scm_procedure_p (proc)))
1299     {
1300       long line = SCM_LINUM (port);
1301       int column = SCM_COL (port) - 2;
1302       SCM got;
1303
1304       got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1305
1306       if (scm_is_pair (got) && !scm_i_has_source_properties (got))
1307         scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1308
1309       return got;
1310     }
1311
1312   return SCM_UNSPECIFIED;
1313 }
1314
1315 /* The reader for the sharp `#' character.  It basically dispatches reads
1316    among the above token readers.   */
1317 static SCM
1318 scm_read_sharp (scm_t_wchar chr, SCM port, long line, int column)
1319 #define FUNC_NAME "scm_lreadr"
1320 {
1321   SCM result;
1322
1323   chr = scm_getc (port);
1324
1325   result = scm_read_sharp_extension (chr, port);
1326   if (!scm_is_eq (result, SCM_UNSPECIFIED))
1327     return result;
1328
1329   switch (chr)
1330     {
1331     case '\\':
1332       return (scm_read_character (chr, port));
1333     case '(':
1334       return (scm_read_vector (chr, port, line, column));
1335     case 's':
1336     case 'u':
1337     case 'f':
1338     case 'c':
1339       /* This one may return either a boolean or an SRFI-4 vector.  */
1340       return (scm_read_srfi4_vector (chr, port, line, column));
1341     case 'v':
1342       return (scm_read_bytevector (chr, port, line, column));
1343     case '*':
1344       return (scm_read_guile_bit_vector (chr, port, line, column));
1345     case 't':
1346     case 'T':
1347     case 'F':
1348       return (scm_read_boolean (chr, port));
1349     case ':':
1350       return (scm_read_keyword (chr, port));
1351     case '0': case '1': case '2': case '3': case '4':
1352     case '5': case '6': case '7': case '8': case '9':
1353     case '@':
1354 #if SCM_ENABLE_DEPRECATED
1355       /* See below for 'i' and 'e'. */
1356     case 'a':
1357     case 'y':
1358     case 'h':
1359     case 'l':
1360 #endif
1361       return (scm_read_array (chr, port, line, column));
1362
1363     case 'i':
1364     case 'e':
1365 #if SCM_ENABLE_DEPRECATED
1366       {
1367         /* When next char is '(', it really is an old-style
1368            uniform array. */
1369         scm_t_wchar next_c = scm_getc (port);
1370         if (next_c != EOF)
1371           scm_ungetc (next_c, port);
1372         if (next_c == '(')
1373           return scm_read_array (chr, port, line, column);
1374         /* Fall through. */
1375       }
1376 #endif
1377     case 'b':
1378     case 'B':
1379     case 'o':
1380     case 'O':
1381     case 'd':
1382     case 'D':
1383     case 'x':
1384     case 'X':
1385     case 'I':
1386     case 'E':
1387       return (scm_read_number_and_radix (chr, port));
1388     case '{':
1389       return (scm_read_extended_symbol (chr, port));
1390     case '!':
1391       return (scm_read_shebang (chr, port));
1392     case ';':
1393       return (scm_read_commented_expression (chr, port));
1394     case '`':
1395     case '\'':
1396     case ',':
1397       return (scm_read_syntax (chr, port));
1398     case 'n':
1399       return (scm_read_nil (chr, port));
1400     default:
1401       result = scm_read_sharp_extension (chr, port);
1402       if (scm_is_eq (result, SCM_UNSPECIFIED))
1403         {
1404           /* To remain compatible with 1.8 and earlier, the following
1405              characters have lower precedence than `read-hash-extend'
1406              characters.  */
1407           switch (chr)
1408             {
1409             case '|':
1410               return scm_read_r6rs_block_comment (chr, port);
1411             default:
1412               scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1413                                  scm_list_1 (SCM_MAKE_CHAR (chr)));
1414             }
1415         }
1416       else
1417         return result;
1418     }
1419
1420   return SCM_UNSPECIFIED;
1421 }
1422 #undef FUNC_NAME
1423
1424 static SCM
1425 scm_read_expression (SCM port)
1426 #define FUNC_NAME "scm_read_expression"
1427 {
1428   while (1)
1429     {
1430       scm_t_wchar chr;
1431
1432       chr = scm_getc (port);
1433
1434       switch (chr)
1435         {
1436         case SCM_WHITE_SPACES:
1437         case SCM_LINE_INCREMENTORS:
1438           break;
1439         case ';':
1440           (void) scm_read_semicolon_comment (chr, port);
1441           break;
1442         case '[':
1443           if (!SCM_SQUARE_BRACKETS_P)
1444             return (scm_read_mixed_case_symbol (chr, port));
1445           /* otherwise fall through */
1446         case '(':
1447           return (scm_read_sexp (chr, port));
1448         case '"':
1449           return (scm_read_string (chr, port));
1450         case '\'':
1451         case '`':
1452         case ',':
1453           return (scm_read_quote (chr, port));
1454         case '#':
1455           {
1456             long line  = SCM_LINUM (port);
1457             int column = SCM_COL (port) - 1;
1458             SCM result = scm_read_sharp (chr, port, line, column);
1459             if (scm_is_eq (result, SCM_UNSPECIFIED))
1460               /* We read a comment or some such.  */
1461               break;
1462             else
1463               return result;
1464           }
1465         case ')':
1466           scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1467           break;
1468         case ']':
1469           if (SCM_SQUARE_BRACKETS_P)
1470             scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1471           /* otherwise fall through */
1472         case EOF:
1473           return SCM_EOF_VAL;
1474         case ':':
1475           if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1476             return scm_symbol_to_keyword (scm_read_expression (port));
1477           /* Fall through.  */
1478
1479         default:
1480           {
1481             if (((chr >= '0') && (chr <= '9'))
1482                 || (strchr ("+-.", chr)))
1483               return (scm_read_number (chr, port));
1484             else
1485               return (scm_read_mixed_case_symbol (chr, port));
1486           }
1487         }
1488     }
1489 }
1490 #undef FUNC_NAME
1491
1492 \f
1493 /* Actual reader.  */
1494
1495 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1496             (SCM port),
1497             "Read an s-expression from the input port @var{port}, or from\n"
1498             "the current input port if @var{port} is not specified.\n"
1499             "Any whitespace before the next token is discarded.")
1500 #define FUNC_NAME s_scm_read
1501 {
1502   int c;
1503
1504   if (SCM_UNBNDP (port))
1505     port = scm_current_input_port ();
1506   SCM_VALIDATE_OPINPORT (1, port);
1507
1508   c = flush_ws (port, (char *) NULL);
1509   if (EOF == c)
1510     return SCM_EOF_VAL;
1511   scm_ungetc (c, port);
1512
1513   return (scm_read_expression (port));
1514 }
1515 #undef FUNC_NAME
1516
1517
1518 \f
1519
1520 /* Manipulate the read-hash-procedures alist.  This could be written in
1521    Scheme, but maybe it will also be used by C code during initialisation.  */
1522 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1523             (SCM chr, SCM proc),
1524             "Install the procedure @var{proc} for reading expressions\n"
1525             "starting with the character sequence @code{#} and @var{chr}.\n"
1526             "@var{proc} will be called with two arguments:  the character\n"
1527             "@var{chr} and the port to read further data from. The object\n"
1528             "returned will be the return value of @code{read}. \n"
1529             "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1530             )
1531 #define FUNC_NAME s_scm_read_hash_extend
1532 {
1533   SCM this;
1534   SCM prev;
1535
1536   SCM_VALIDATE_CHAR (1, chr);
1537   SCM_ASSERT (scm_is_false (proc)
1538               || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1539               proc, SCM_ARG2, FUNC_NAME);
1540
1541   /* Check if chr is already in the alist.  */
1542   this = scm_i_read_hash_procedures_ref ();
1543   prev = SCM_BOOL_F;
1544   while (1)
1545     {
1546       if (scm_is_null (this))
1547         {
1548           /* not found, so add it to the beginning.  */
1549           if (scm_is_true (proc))
1550             {
1551               SCM new = scm_cons (scm_cons (chr, proc),
1552                                   scm_i_read_hash_procedures_ref ());
1553               scm_i_read_hash_procedures_set_x (new);
1554             }
1555           break;
1556         }
1557       if (scm_is_eq (chr, SCM_CAAR (this)))
1558         {
1559           /* already in the alist.  */
1560           if (scm_is_false (proc))
1561             {
1562               /* remove it.  */
1563               if (scm_is_false (prev))
1564                 {
1565                   SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1566                   scm_i_read_hash_procedures_set_x (rest);
1567                 }
1568               else
1569                 scm_set_cdr_x (prev, SCM_CDR (this));
1570             }
1571           else
1572             {
1573               /* replace it.  */
1574               scm_set_cdr_x (SCM_CAR (this), proc);
1575             }
1576           break;
1577         }
1578       prev = this;
1579       this = SCM_CDR (this);
1580     }
1581
1582   return SCM_UNSPECIFIED;
1583 }
1584 #undef FUNC_NAME
1585
1586 /* Recover the read-hash procedure corresponding to char c.  */
1587 static SCM
1588 scm_get_hash_procedure (int c)
1589 {
1590   SCM rest = scm_i_read_hash_procedures_ref ();
1591
1592   while (1)
1593     {
1594       if (scm_is_null (rest))
1595         return SCM_BOOL_F;
1596
1597       if (SCM_CHAR (SCM_CAAR (rest)) == c)
1598         return SCM_CDAR (rest);
1599
1600       rest = SCM_CDR (rest);
1601     }
1602 }
1603
1604 #define SCM_ENCODING_SEARCH_SIZE (500)
1605
1606 /* Search the first few hundred characters of a file for an Emacs-like coding
1607    declaration.  Returns either NULL or a string whose storage has been
1608    allocated with `scm_gc_malloc ()'.  */
1609 char *
1610 scm_i_scan_for_encoding (SCM port)
1611 {
1612   scm_t_port *pt;
1613   char header[SCM_ENCODING_SEARCH_SIZE+1];
1614   size_t bytes_read, encoding_length, i;
1615   char *encoding = NULL;
1616   int utf8_bom = 0;
1617   char *pos, *encoding_start;
1618   int in_comment;
1619
1620   pt = SCM_PTAB_ENTRY (port);
1621
1622   if (pt->rw_active == SCM_PORT_WRITE)
1623     scm_flush (port);
1624
1625   if (pt->rw_random)
1626     pt->rw_active = SCM_PORT_READ;
1627
1628   if (pt->read_pos == pt->read_end)
1629     {
1630       /* We can use the read buffer, and thus avoid a seek. */
1631       if (scm_fill_input (port) == EOF)
1632         return NULL;
1633
1634       bytes_read = pt->read_end - pt->read_pos;
1635       if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
1636         bytes_read = SCM_ENCODING_SEARCH_SIZE;
1637
1638       if (bytes_read <= 1)
1639         /* An unbuffered port -- don't scan.  */
1640         return NULL;
1641
1642       memcpy (header, pt->read_pos, bytes_read);
1643       header[bytes_read] = '\0';
1644     }
1645   else
1646     {
1647       /* Try to read some bytes and then seek back.  Not all ports
1648          support seeking back; and indeed some file ports (like
1649          /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
1650          check performed by SCM_FPORT_FDES---but fail to seek
1651          backwards.  Hence this block comes second.  We prefer to use
1652          the read buffer in-place.  */
1653       if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1654         return NULL;
1655
1656       bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1657       header[bytes_read] = '\0';
1658       scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1659     }
1660
1661   if (bytes_read > 3
1662       && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1663     utf8_bom = 1;
1664
1665   /* search past "coding[:=]" */
1666   pos = header;
1667   while (1)
1668     {
1669       if ((pos = strstr(pos, "coding")) == NULL)
1670         return NULL;
1671
1672       pos += strlen("coding");
1673       if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1674           (*pos == ':' || *pos == '='))
1675         {
1676           pos ++;
1677           break;
1678         }
1679     }
1680
1681   /* skip spaces */
1682   while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1683          (*pos == ' ' || *pos == '\t'))
1684     pos ++;
1685
1686   /* grab the next token */
1687   encoding_start = pos;
1688   i = 0;
1689   while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
1690          && encoding_start + i - header < bytes_read
1691          && (isalnum ((int) encoding_start[i])
1692              || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
1693     i++;
1694
1695   encoding_length = i;
1696   if (encoding_length == 0)
1697     return NULL;
1698
1699   encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
1700   for (i = 0; i < encoding_length; i++)
1701     encoding[i] = toupper ((int) encoding[i]);
1702
1703   /* push backwards to make sure we were in a comment */
1704   in_comment = 0;
1705   pos = encoding_start;
1706   while (pos >= header)
1707     {
1708       if (*pos == ';')
1709         {
1710           in_comment = 1;
1711           break;
1712         }
1713       else if (*pos == '\n' || pos == header)
1714         {
1715           /* This wasn't in a semicolon comment. Check for a
1716            hash-bang comment. */
1717           char *beg = strstr (header, "#!");
1718           char *end = strstr (header, "!#");
1719           if (beg < encoding_start && encoding_start + encoding_length <= end)
1720             in_comment = 1;
1721           break;
1722         }
1723       else
1724         {
1725           pos --;
1726           continue;
1727         }
1728     }
1729   if (!in_comment)
1730     /* This wasn't in a comment */
1731     return NULL;
1732
1733   if (utf8_bom && strcmp(encoding, "UTF-8"))
1734     scm_misc_error (NULL,
1735                     "the port input declares the encoding ~s but is encoded as UTF-8",
1736                     scm_list_1 (scm_from_locale_string (encoding)));
1737
1738   return encoding;
1739 }
1740
1741 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1742             (SCM port),
1743             "Scans the port for an Emacs-like character coding declaration\n"
1744             "near the top of the contents of a port with random-accessible contents.\n"
1745             "The coding declaration is of the form\n"
1746             "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1747             "\n"
1748             "Returns a string containing the character encoding of the file\n"
1749             "if a declaration was found, or @code{#f} otherwise.\n")
1750 #define FUNC_NAME s_scm_file_encoding
1751 {
1752   char *enc;
1753   SCM s_enc;
1754
1755   SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
1756
1757   enc = scm_i_scan_for_encoding (port);
1758   if (enc == NULL)
1759     return SCM_BOOL_F;
1760   else
1761     {
1762       s_enc = scm_from_locale_string (enc);
1763       return s_enc;
1764     }
1765
1766   return SCM_BOOL_F;
1767 }
1768 #undef FUNC_NAME
1769
1770 void
1771 scm_init_read ()
1772 {
1773   SCM read_hash_procs;
1774
1775   read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
1776
1777   scm_i_read_hash_procedures =
1778     SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
1779
1780   scm_init_opts (scm_read_options, scm_read_opts);
1781 #include "libguile/read.x"
1782 }
1783
1784 /*
1785   Local Variables:
1786   c-file-style: "gnu"
1787   End:
1788 */