libguile/read.c

   1 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
   2  *   2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public License
   6  * as published by the Free Software Foundation; either version 3 of
   7  * the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful, but
  10  * WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  17  * 02110-1301 USA
  18  */
  19
  20
  21 \f
  22
  23 #ifdef HAVE_CONFIG_H
  24 # include <config.h>
  25 #endif
  26
  27 #include <stdio.h>
  28 #include <ctype.h>
  29 #include <string.h>
  30 #include <unistd.h>
  31 #include <unicase.h>
  32 #include <unictype.h>
  33
  34 #include "libguile/_scm.h"
  35 #include "libguile/bytevectors.h"
  36 #include "libguile/chars.h"
  37 #include "libguile/eval.h"
  38 #include "libguile/arrays.h"
  39 #include "libguile/bitvectors.h"
  40 #include "libguile/keywords.h"
  41 #include "libguile/alist.h"
  42 #include "libguile/srcprop.h"
  43 #include "libguile/hashtab.h"
  44 #include "libguile/hash.h"
  45 #include "libguile/ports.h"
  46 #include "libguile/fports.h"
  47 #include "libguile/root.h"
  48 #include "libguile/strings.h"
  49 #include "libguile/strports.h"
  50 #include "libguile/vectors.h"
  51 #include "libguile/validate.h"
  52 #include "libguile/srfi-4.h"
  53 #include "libguile/srfi-13.h"
  54
  55 #include "libguile/read.h"
  56 #include "libguile/private-options.h"
  57
  58
  59 \f
  60
  61 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
  62 SCM_SYMBOL (scm_keyword_prefix, "prefix");
  63 SCM_SYMBOL (scm_keyword_postfix, "postfix");
  64 SCM_SYMBOL (sym_nil, "nil");
  65
  66 scm_t_option scm_read_opts[] = {
  67   { SCM_OPTION_BOOLEAN, "copy", 0,
  68     "Copy source code expressions." },
  69   { SCM_OPTION_BOOLEAN, "positions", 1,
  70     "Record positions of source code expressions." },
  71   { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
  72     "Convert symbols to lower case."},
  73   { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
  74     "Style of keyword recognition: #f, 'prefix or 'postfix."},
  75   { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
  76     "Use R6RS variable-length character and string hex escapes."},
  77   { SCM_OPTION_BOOLEAN, "square-brackets", 1,
  78     "Treat `[' and `]' as parentheses, for R6RS compatibility."},
  79   { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
  80     "In strings, consume leading whitespace after an escaped end-of-line."},
  81   { 0, },
  82 };
  83
  84 /* Internal read options structure.  This is initialized by 'scm_read'
  85    from the global and per-port read options, and a pointer is passed
  86    down to all helper functions. */
  87
  88 enum t_keyword_style
  89   {
  90     KEYWORD_STYLE_HASH_PREFIX,
  91     KEYWORD_STYLE_PREFIX,
  92     KEYWORD_STYLE_POSTFIX
  93   };
  94
  95 struct t_read_opts
  96 {
  97   enum t_keyword_style keyword_style;
  98   unsigned int copy_source_p        : 1;
  99   unsigned int record_positions_p   : 1;
 100   unsigned int case_insensitive_p   : 1;
 101   unsigned int r6rs_escapes_p       : 1;
 102   unsigned int square_brackets_p    : 1;
 103   unsigned int hungry_eol_escapes_p : 1;
 104 };
 105
 106 typedef struct t_read_opts scm_t_read_opts;
 107
 108
 109 /*
 110   Give meaningful error messages for errors
 111
 112   We use the format
 113
 114   FILE:LINE:COL: MESSAGE
 115   This happened in ....
 116
 117   This is not standard GNU format, but the test-suite likes the real
 118   message to be in front.
 119
 120  */
 121
 122
 123 void
 124 scm_i_input_error (char const *function,
 125                    SCM port, const char *message, SCM arg)
 126 {
 127   SCM fn = (scm_is_string (SCM_FILENAME(port))
 128             ? SCM_FILENAME(port)
 129             : scm_from_locale_string ("#<unknown port>"));
 130
 131   SCM string_port = scm_open_output_string ();
 132   SCM string = SCM_EOL;
 133   scm_simple_format (string_port,
 134                      scm_from_locale_string ("~A:~S:~S: ~A"),
 135                      scm_list_4 (fn,
 136                                  scm_from_long (SCM_LINUM (port) + 1),
 137                                  scm_from_int (SCM_COL (port) + 1),
 138                                  scm_from_locale_string (message)));
 139
 140   string = scm_get_output_string (string_port);
 141   scm_close_output_port (string_port);
 142   scm_error_scm (scm_from_latin1_symbol ("read-error"),
 143                  function? scm_from_locale_string (function) : SCM_BOOL_F,
 144                  string,
 145                  arg,
 146                  SCM_BOOL_F);
 147 }
 148
 149
 150 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
 151             (SCM setting),
 152             "Option interface for the read options. Instead of using\n"
 153             "this procedure directly, use the procedures @code{read-enable},\n"
 154             "@code{read-disable}, @code{read-set!} and @code{read-options}.")
 155 #define FUNC_NAME s_scm_read_options
 156 {
 157   SCM ans = scm_options (setting,
 158                          scm_read_opts,
 159                          FUNC_NAME);
 160   if (SCM_COPY_SOURCE_P)
 161     SCM_RECORD_POSITIONS_P = 1;
 162   return ans;
 163 }
 164 #undef FUNC_NAME
 165
 166 /* A fluid referring to an association list mapping extra hash
 167    characters to procedures.  */
 168 static SCM *scm_i_read_hash_procedures;
 169
 170 static SCM
 171 scm_i_read_hash_procedures_ref (void)
 172 {
 173   return scm_fluid_ref (*scm_i_read_hash_procedures);
 174 }
 175
 176 static void
 177 scm_i_read_hash_procedures_set_x (SCM value)
 178 {
 179   scm_fluid_set_x (*scm_i_read_hash_procedures, value);
 180 }
 181
 182 \f
 183 /* Token readers.  */
 184
 185
 186 /* Size of the C buffer used to read symbols and numbers.  */
 187 #define READER_BUFFER_SIZE            128
 188
 189 /* Number of 32-bit codepoints in the buffer used to read strings.  */
 190 #define READER_STRING_BUFFER_SIZE     128
 191
 192 /* The maximum size of Scheme character names.  */
 193 #define READER_CHAR_NAME_MAX_SIZE      50
 194
 195 /* The maximum size of reader directive names.  */
 196 #define READER_DIRECTIVE_NAME_MAX_SIZE 50
 197
 198
 199 /* `isblank' is only in C99.  */
 200 #define CHAR_IS_BLANK_(_chr)                                    \
 201   (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n')      \
 202    || ((_chr) == '\f') || ((_chr) == '\r'))
 203
 204 #ifdef MSDOS
 205 # define CHAR_IS_BLANK(_chr)                    \
 206   ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
 207 #else
 208 # define CHAR_IS_BLANK CHAR_IS_BLANK_
 209 #endif
 210
 211
 212 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
 213    structure'').  */
 214 #define CHAR_IS_R5RS_DELIMITER(c)                               \
 215   (CHAR_IS_BLANK (c)                                            \
 216    || (c) == ')' || (c) == '(' || (c) == ';' || (c) == '"')
 217
 218 #define CHAR_IS_DELIMITER(c)                                    \
 219   (CHAR_IS_R5RS_DELIMITER (c)                                   \
 220    || (((c) == ']' || (c) == '[') && opts->square_brackets_p))
 221
 222 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
 223    Structure''.  */
 224 #define CHAR_IS_EXPONENT_MARKER(_chr)                           \
 225   (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f')        \
 226    || ((_chr) == 'd') || ((_chr) == 'l'))
 227
 228 /* Read an SCSH block comment.  */
 229 static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
 230 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
 231 static SCM scm_read_commented_expression (scm_t_wchar, SCM, scm_t_read_opts *);
 232 static SCM scm_read_shebang (scm_t_wchar, SCM, scm_t_read_opts *);
 233 static SCM scm_get_hash_procedure (int);
 234
 235 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 236    result in the pre-allocated buffer BUF.  Return zero if the whole token has
 237    fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
 238    bytes actually read.  */
 239 static int
 240 read_token (SCM port, scm_t_read_opts *opts,
 241             char *buf, size_t buf_size, size_t *read)
 242 {
 243    *read = 0;
 244
 245    while (*read < buf_size)
 246      {
 247        int chr;
 248
 249        chr = scm_get_byte_or_eof (port);
 250
 251        if (chr == EOF)
 252         return 0;
 253       else if (CHAR_IS_DELIMITER (chr))
 254         {
 255           scm_unget_byte (chr, port);
 256           return 0;
 257         }
 258       else
 259         {
 260           *buf = (char) chr;
 261           buf++, (*read)++;
 262         }
 263      }
 264
 265    return 1;
 266  }
 267
 268 /* Like `read_token', but return either BUFFER, or a GC-allocated buffer
 269    if the token doesn't fit in BUFFER_SIZE bytes.  */
 270 static char *
 271 read_complete_token (SCM port, scm_t_read_opts *opts,
 272                      char *buffer, size_t buffer_size, size_t *read)
 273 {
 274   int overflow = 0;
 275   size_t bytes_read, overflow_size = 0;
 276   char *overflow_buffer = NULL;
 277
 278   do
 279     {
 280       overflow = read_token (port, opts, buffer, buffer_size, &bytes_read);
 281       if (bytes_read == 0)
 282         break;
 283       if (overflow || overflow_size != 0)
 284         {
 285           if (overflow_size == 0)
 286             {
 287               overflow_buffer = scm_gc_malloc_pointerless (bytes_read, "read");
 288               memcpy (overflow_buffer, buffer, bytes_read);
 289               overflow_size = bytes_read;
 290             }
 291           else
 292             {
 293               char *new_buf =
 294                 scm_gc_malloc_pointerless (overflow_size + bytes_read, "read");
 295
 296               memcpy (new_buf, overflow_buffer, overflow_size);
 297               memcpy (new_buf + overflow_size, buffer, bytes_read);
 298
 299               overflow_buffer = new_buf;
 300               overflow_size += bytes_read;
 301             }
 302         }
 303     }
 304   while (overflow);
 305
 306   if (overflow_size)
 307     *read = overflow_size;
 308   else
 309     *read = bytes_read;
 310
 311   return (overflow_size > 0 ? overflow_buffer : buffer);
 312 }
 313
 314 /* Skip whitespace from PORT and return the first non-whitespace character
 315    read.  Raise an error on end-of-file.  */
 316 static int
 317 flush_ws (SCM port, scm_t_read_opts *opts, const char *eoferr)
 318 {
 319   scm_t_wchar c;
 320   while (1)
 321     switch (c = scm_getc (port))
 322       {
 323       case EOF:
 324       goteof:
 325         if (eoferr)
 326           {
 327             scm_i_input_error (eoferr,
 328                                port,
 329                                "end of file",
 330                                SCM_EOL);
 331           }
 332         return c;
 333
 334       case ';':
 335       lp:
 336         switch (c = scm_getc (port))
 337           {
 338           case EOF:
 339             goto goteof;
 340           default:
 341             goto lp;
 342           case SCM_LINE_INCREMENTORS:
 343             break;
 344           }
 345         break;
 346
 347       case '#':
 348         switch (c = scm_getc (port))
 349           {
 350           case EOF:
 351             eoferr = "read_sharp";
 352             goto goteof;
 353           case '!':
 354             scm_read_shebang (c, port, opts);
 355             break;
 356           case ';':
 357             scm_read_commented_expression (c, port, opts);
 358             break;
 359           case '|':
 360             if (scm_is_false (scm_get_hash_procedure (c)))
 361               {
 362                 scm_read_r6rs_block_comment (c, port);
 363                 break;
 364               }
 365             /* fall through */
 366           default:
 367             scm_ungetc (c, port);
 368             return '#';
 369           }
 370         break;
 371
 372       case SCM_LINE_INCREMENTORS:
 373       case SCM_SINGLE_SPACES:
 374       case '\t':
 375         break;
 376
 377       default:
 378         return c;
 379       }
 380
 381   return 0;
 382 }
 383
 384
 385 \f
 386 /* Token readers.  */
 387
 388 static SCM scm_read_expression (SCM port, scm_t_read_opts *opts);
 389 static SCM scm_read_sharp (int chr, SCM port, scm_t_read_opts *opts,
 390                            long line, int column);
 391
 392
 393 static SCM
 394 maybe_annotate_source (SCM x, SCM port, scm_t_read_opts *opts,
 395                        long line, int column)
 396 {
 397   if (opts->record_positions_p)
 398     scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
 399   return x;
 400 }
 401
 402 static SCM
 403 scm_read_sexp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 404 #define FUNC_NAME "scm_i_lreadparen"
 405 {
 406   int c;
 407   SCM tmp, tl, ans = SCM_EOL;
 408   const int terminating_char = ((chr == '[') ? ']' : ')');
 409
 410   /* Need to capture line and column numbers here. */
 411   long line = SCM_LINUM (port);
 412   int column = SCM_COL (port) - 1;
 413
 414   c = flush_ws (port, opts, FUNC_NAME);
 415   if (terminating_char == c)
 416     return SCM_EOL;
 417
 418   scm_ungetc (c, port);
 419   tmp = scm_read_expression (port, opts);
 420
 421   /* Note that it is possible for scm_read_expression to return
 422      scm_sym_dot, but not as part of a dotted pair: as in #{.}#.  So
 423      check that it's a real dot by checking `c'.  */
 424   if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
 425     {
 426       ans = scm_read_expression (port, opts);
 427       if (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
 428         scm_i_input_error (FUNC_NAME, port, "missing close paren",
 429                            SCM_EOL);
 430       return ans;
 431     }
 432
 433   /* Build the head of the list structure. */
 434   ans = tl = scm_cons (tmp, SCM_EOL);
 435
 436   while (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
 437     {
 438       SCM new_tail;
 439
 440       if (c == ')' || (c == ']' && opts->square_brackets_p))
 441         scm_i_input_error (FUNC_NAME, port,
 442                            "in pair: mismatched close paren: ~A",
 443                            scm_list_1 (SCM_MAKE_CHAR (c)));
 444
 445       scm_ungetc (c, port);
 446       tmp = scm_read_expression (port, opts);
 447
 448       /* See above note about scm_sym_dot.  */
 449       if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
 450         {
 451           SCM_SETCDR (tl, scm_read_expression (port, opts));
 452
 453           c = flush_ws (port, opts, FUNC_NAME);
 454           if (terminating_char != c)
 455             scm_i_input_error (FUNC_NAME, port,
 456                                "in pair: missing close paren", SCM_EOL);
 457           goto exit;
 458         }
 459
 460       new_tail = scm_cons (tmp, SCM_EOL);
 461       SCM_SETCDR (tl, new_tail);
 462       tl = new_tail;
 463     }
 464
 465  exit:
 466   return maybe_annotate_source (ans, port, opts, line, column);
 467 }
 468 #undef FUNC_NAME
 469
 470
 471 /* Read a hexadecimal number NDIGITS in length.  Put its value into the variable
 472    C.  If TERMINATOR is non-null, terminate early if the TERMINATOR character is
 473    found.  */
 474 #define SCM_READ_HEX_ESCAPE(ndigits, terminator)                   \
 475   do                                                               \
 476     {                                                              \
 477       scm_t_wchar a;                                               \
 478       size_t i = 0;                                                \
 479       c = 0;                                                       \
 480       while (i < ndigits)                                          \
 481         {                                                          \
 482           a = scm_getc (port);                                     \
 483           if (a == EOF)                                            \
 484             goto str_eof;                                          \
 485           if (terminator                                           \
 486               && (a == (scm_t_wchar) terminator)                   \
 487               && (i > 0))                                          \
 488             break;                                                 \
 489           if ('0' <= a && a <= '9')                                \
 490             a -= '0';                                              \
 491           else if ('A' <= a && a <= 'F')                           \
 492             a = a - 'A' + 10;                                      \
 493           else if ('a' <= a && a <= 'f')                           \
 494             a = a - 'a' + 10;                                      \
 495           else                                                     \
 496             {                                                      \
 497               c = a;                                               \
 498               goto bad_escaped;                                    \
 499             }                                                      \
 500           c = c * 16 + a;                                          \
 501           i ++;                                                    \
 502         }                                                          \
 503     } while (0)
 504
 505 static void
 506 skip_intraline_whitespace (SCM port)
 507 {
 508   scm_t_wchar c;
 509
 510   do
 511     {
 512       c = scm_getc (port);
 513       if (c == EOF)
 514         return;
 515     }
 516   while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
 517
 518   scm_ungetc (c, port);
 519 }
 520
 521 static SCM
 522 scm_read_string (int chr, SCM port, scm_t_read_opts *opts)
 523 #define FUNC_NAME "scm_lreadr"
 524 {
 525   /* For strings smaller than C_STR, this function creates only one Scheme
 526      object (the string returned).  */
 527
 528   SCM str = SCM_EOL;
 529   size_t c_str_len = 0;
 530   scm_t_wchar c, c_str[READER_STRING_BUFFER_SIZE];
 531
 532   /* Need to capture line and column numbers here. */
 533   long line = SCM_LINUM (port);
 534   int column = SCM_COL (port) - 1;
 535
 536   while ('"' != (c = scm_getc (port)))
 537     {
 538       if (c == EOF)
 539         {
 540         str_eof:
 541           scm_i_input_error (FUNC_NAME, port,
 542                              "end of file in string constant", SCM_EOL);
 543         }
 544
 545       if (c_str_len + 1 >= READER_STRING_BUFFER_SIZE)
 546         {
 547           str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
 548           c_str_len = 0;
 549         }
 550
 551       if (c == '\\')
 552         {
 553           switch (c = scm_getc (port))
 554             {
 555             case EOF:
 556               goto str_eof;
 557             case '"':
 558             case '\\':
 559               break;
 560             case '\n':
 561               if (opts->hungry_eol_escapes_p)
 562                 skip_intraline_whitespace (port);
 563               continue;
 564             case '0':
 565               c = '\0';
 566               break;
 567             case 'f':
 568               c = '\f';
 569               break;
 570             case 'n':
 571               c = '\n';
 572               break;
 573             case 'r':
 574               c = '\r';
 575               break;
 576             case 't':
 577               c = '\t';
 578               break;
 579             case 'a':
 580               c = '\007';
 581               break;
 582             case 'v':
 583               c = '\v';
 584               break;
 585             case 'b':
 586               c = '\010';
 587               break;
 588             case 'x':
 589               if (opts->r6rs_escapes_p)
 590                 SCM_READ_HEX_ESCAPE (10, ';');
 591               else
 592                 SCM_READ_HEX_ESCAPE (2, '\0');
 593               break;
 594             case 'u':
 595               if (!opts->r6rs_escapes_p)
 596                 {
 597                   SCM_READ_HEX_ESCAPE (4, '\0');
 598                   break;
 599                 }
 600             case 'U':
 601               if (!opts->r6rs_escapes_p)
 602                 {
 603                   SCM_READ_HEX_ESCAPE (6, '\0');
 604                   break;
 605                 }
 606             default:
 607             bad_escaped:
 608               scm_i_input_error (FUNC_NAME, port,
 609                                  "illegal character in escape sequence: ~S",
 610                                  scm_list_1 (SCM_MAKE_CHAR (c)));
 611             }
 612         }
 613
 614       c_str[c_str_len++] = c;
 615     }
 616
 617   if (scm_is_null (str))
 618     /* Fast path: we got a string that fits in C_STR.  */
 619     str = scm_from_utf32_stringn (c_str, c_str_len);
 620   else
 621     {
 622       if (c_str_len > 0)
 623         str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
 624
 625       str = scm_string_concatenate_reverse (str, SCM_UNDEFINED, SCM_UNDEFINED);
 626     }
 627
 628   return maybe_annotate_source (str, port, opts, line, column);
 629 }
 630 #undef FUNC_NAME
 631
 632
 633 static SCM
 634 scm_read_number (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 635 {
 636   SCM result, str = SCM_EOL;
 637   char local_buffer[READER_BUFFER_SIZE], *buffer;
 638   size_t bytes_read;
 639   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 640
 641   /* Need to capture line and column numbers here. */
 642   long line = SCM_LINUM (port);
 643   int column = SCM_COL (port) - 1;
 644
 645   scm_ungetc (chr, port);
 646   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 647                                 &bytes_read);
 648
 649   str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
 650
 651   result = scm_string_to_number (str, SCM_UNDEFINED);
 652   if (scm_is_false (result))
 653     {
 654       /* Return a symbol instead of a number */
 655       if (opts->case_insensitive_p)
 656         str = scm_string_downcase_x (str);
 657       result = scm_string_to_symbol (str);
 658     }
 659   else if (SCM_NIMP (result))
 660     result = maybe_annotate_source (result, port, opts, line, column);
 661
 662   SCM_COL (port) += scm_i_string_length (str);
 663   return result;
 664 }
 665
 666 static SCM
 667 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 668 {
 669   SCM result;
 670   int ends_with_colon = 0;
 671   size_t bytes_read;
 672   int postfix = (opts->keyword_style == KEYWORD_STYLE_POSTFIX);
 673   char local_buffer[READER_BUFFER_SIZE], *buffer;
 674   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 675   SCM str;
 676
 677   scm_ungetc (chr, port);
 678   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 679                                 &bytes_read);
 680   if (bytes_read > 0)
 681     ends_with_colon = buffer[bytes_read - 1] == ':';
 682
 683   if (postfix && ends_with_colon && (bytes_read > 1))
 684     {
 685       str = scm_from_stringn (buffer, bytes_read - 1,
 686                               pt->encoding, pt->ilseq_handler);
 687
 688       if (opts->case_insensitive_p)
 689         str = scm_string_downcase_x (str);
 690       result = scm_symbol_to_keyword (scm_string_to_symbol (str));
 691     }
 692   else
 693     {
 694       str = scm_from_stringn (buffer, bytes_read,
 695                               pt->encoding, pt->ilseq_handler);
 696
 697       if (opts->case_insensitive_p)
 698         str = scm_string_downcase_x (str);
 699       result = scm_string_to_symbol (str);
 700     }
 701
 702   SCM_COL (port) += scm_i_string_length (str);
 703   return result;
 704 }
 705
 706 static SCM
 707 scm_read_number_and_radix (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 708 #define FUNC_NAME "scm_lreadr"
 709 {
 710   SCM result;
 711   size_t read;
 712   char local_buffer[READER_BUFFER_SIZE], *buffer;
 713   unsigned int radix;
 714   SCM str;
 715   scm_t_port *pt;
 716
 717   switch (chr)
 718     {
 719     case 'B':
 720     case 'b':
 721       radix = 2;
 722       break;
 723
 724     case 'o':
 725     case 'O':
 726       radix = 8;
 727       break;
 728
 729     case 'd':
 730     case 'D':
 731       radix = 10;
 732       break;
 733
 734     case 'x':
 735     case 'X':
 736       radix = 16;
 737       break;
 738
 739     default:
 740       scm_ungetc (chr, port);
 741       scm_ungetc ('#', port);
 742       radix = 10;
 743     }
 744
 745   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 746                                 &read);
 747
 748   pt = SCM_PTAB_ENTRY (port);
 749   str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
 750
 751   result = scm_string_to_number (str, scm_from_uint (radix));
 752
 753   SCM_COL (port) += scm_i_string_length (str);
 754
 755   if (scm_is_true (result))
 756     return result;
 757
 758   scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
 759
 760   return SCM_BOOL_F;
 761 }
 762 #undef FUNC_NAME
 763
 764 static SCM
 765 scm_read_quote (int chr, SCM port, scm_t_read_opts *opts)
 766 {
 767   SCM p;
 768   long line = SCM_LINUM (port);
 769   int column = SCM_COL (port) - 1;
 770
 771   switch (chr)
 772     {
 773     case '`':
 774       p = scm_sym_quasiquote;
 775       break;
 776
 777     case '\'':
 778       p = scm_sym_quote;
 779       break;
 780
 781     case ',':
 782       {
 783         scm_t_wchar c;
 784
 785         c = scm_getc (port);
 786         if ('@' == c)
 787           p = scm_sym_uq_splicing;
 788         else
 789           {
 790             scm_ungetc (c, port);
 791             p = scm_sym_unquote;
 792           }
 793         break;
 794       }
 795
 796     default:
 797       fprintf (stderr, "%s: unhandled quote character (%i)\n",
 798                "scm_read_quote", chr);
 799       abort ();
 800     }
 801
 802   p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
 803   return maybe_annotate_source (p, port, opts, line, column);
 804 }
 805
 806 SCM_SYMBOL (sym_syntax, "syntax");
 807 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
 808 SCM_SYMBOL (sym_unsyntax, "unsyntax");
 809 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
 810
 811 static SCM
 812 scm_read_syntax (int chr, SCM port, scm_t_read_opts *opts)
 813 {
 814   SCM p;
 815   long line = SCM_LINUM (port);
 816   int column = SCM_COL (port) - 1;
 817
 818   switch (chr)
 819     {
 820     case '`':
 821       p = sym_quasisyntax;
 822       break;
 823
 824     case '\'':
 825       p = sym_syntax;
 826       break;
 827
 828     case ',':
 829       {
 830         int c;
 831
 832         c = scm_getc (port);
 833         if ('@' == c)
 834           p = sym_unsyntax_splicing;
 835         else
 836           {
 837             scm_ungetc (c, port);
 838             p = sym_unsyntax;
 839           }
 840         break;
 841       }
 842
 843     default:
 844       fprintf (stderr, "%s: unhandled syntax character (%i)\n",
 845                "scm_read_syntax", chr);
 846       abort ();
 847     }
 848
 849   p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
 850   return maybe_annotate_source (p, port, opts, line, column);
 851 }
 852
 853 static SCM
 854 scm_read_nil (int chr, SCM port, scm_t_read_opts *opts)
 855 {
 856   SCM id = scm_read_mixed_case_symbol (chr, port, opts);
 857
 858   if (!scm_is_eq (id, sym_nil))
 859     scm_i_input_error ("scm_read_nil", port,
 860                        "unexpected input while reading #nil: ~a",
 861                        scm_list_1 (id));
 862
 863   return SCM_ELISP_NIL;
 864 }
 865
 866 static SCM
 867 scm_read_semicolon_comment (int chr, SCM port)
 868 {
 869   int c;
 870
 871   /* We use the get_byte here because there is no need to get the
 872      locale correct with comment input. This presumes that newline
 873      always represents itself no matter what the encoding is.  */
 874   for (c = scm_get_byte_or_eof (port);
 875        (c != EOF) && (c != '\n');
 876        c = scm_get_byte_or_eof (port));
 877
 878   return SCM_UNSPECIFIED;
 879 }
 880
 881 \f
 882 /* Sharp readers, i.e. readers called after a `#' sign has been read.  */
 883
 884 static SCM
 885 scm_read_boolean (int chr, SCM port)
 886 {
 887   switch (chr)
 888     {
 889     case 't':
 890     case 'T':
 891       return SCM_BOOL_T;
 892
 893     case 'f':
 894     case 'F':
 895       return SCM_BOOL_F;
 896     }
 897
 898   return SCM_UNSPECIFIED;
 899 }
 900
 901 static SCM
 902 scm_read_character (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 903 #define FUNC_NAME "scm_lreadr"
 904 {
 905   char buffer[READER_CHAR_NAME_MAX_SIZE];
 906   SCM charname;
 907   size_t charname_len, bytes_read;
 908   scm_t_wchar cp;
 909   int overflow;
 910   scm_t_port *pt;
 911
 912   overflow = read_token (port, opts, buffer, READER_CHAR_NAME_MAX_SIZE,
 913                          &bytes_read);
 914   if (overflow)
 915     scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
 916
 917   if (bytes_read == 0)
 918     {
 919       chr = scm_getc (port);
 920       if (chr == EOF)
 921         scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
 922                            "while reading character", SCM_EOL);
 923
 924       /* CHR must be a token delimiter, like a whitespace.  */
 925       return (SCM_MAKE_CHAR (chr));
 926     }
 927
 928   pt = SCM_PTAB_ENTRY (port);
 929
 930   /* Simple ASCII characters can be processed immediately.  Also, simple
 931      ISO-8859-1 characters can be processed immediately if the encoding for this
 932      port is ISO-8859-1.  */
 933   if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
 934     {
 935       SCM_COL (port) += 1;
 936       return SCM_MAKE_CHAR (buffer[0]);
 937     }
 938
 939   /* Otherwise, convert the buffer into a proper scheme string for
 940      processing.  */
 941   charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
 942                                pt->ilseq_handler);
 943   charname_len = scm_i_string_length (charname);
 944   SCM_COL (port) += charname_len;
 945   cp = scm_i_string_ref (charname, 0);
 946   if (charname_len == 1)
 947     return SCM_MAKE_CHAR (cp);
 948
 949   /* Ignore dotted circles, which may be used to keep combining characters from
 950      combining with the backslash in #\charname.  */
 951   if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
 952     return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
 953
 954   if (cp >= '0' && cp < '8')
 955     {
 956       /* Dirk:FIXME::  This type of character syntax is not R5RS
 957        * compliant.  Further, it should be verified that the constant
 958        * does only consist of octal digits.  */
 959       SCM p = scm_string_to_number (charname, scm_from_uint (8));
 960       if (SCM_I_INUMP (p))
 961         {
 962           scm_t_wchar c = scm_to_uint32 (p);
 963           if (SCM_IS_UNICODE_CHAR (c))
 964             return SCM_MAKE_CHAR (c);
 965           else
 966             scm_i_input_error (FUNC_NAME, port,
 967                                "out-of-range octal character escape: ~a",
 968                                scm_list_1 (charname));
 969         }
 970     }
 971
 972   if (cp == 'x' && (charname_len > 1))
 973     {
 974       SCM p;
 975
 976       /* Convert from hex, skipping the initial 'x' character in CHARNAME */
 977       p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
 978                                 scm_from_uint (16));
 979       if (SCM_I_INUMP (p))
 980         {
 981           scm_t_wchar c = scm_to_uint32 (p);
 982           if (SCM_IS_UNICODE_CHAR (c))
 983             return SCM_MAKE_CHAR (c);
 984           else
 985             scm_i_input_error (FUNC_NAME, port,
 986                                "out-of-range hex character escape: ~a",
 987                                scm_list_1 (charname));
 988         }
 989     }
 990
 991   /* The names of characters should never have non-Latin1
 992      characters.  */
 993   if (scm_i_is_narrow_string (charname)
 994       || scm_i_try_narrow_string (charname))
 995     { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
 996                                        charname_len);
 997       if (scm_is_true (ch))
 998         return ch;
 999     }
1000
1001   scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
1002                      scm_list_1 (charname));
1003
1004   return SCM_UNSPECIFIED;
1005 }
1006 #undef FUNC_NAME
1007
1008 static SCM
1009 scm_read_keyword (int chr, SCM port, scm_t_read_opts *opts)
1010 {
1011   SCM symbol;
1012
1013   /* Read the symbol that comprises the keyword.  Doing this instead of
1014      invoking a specific symbol reader function allows `scm_read_keyword ()'
1015      to adapt to the delimiters currently valid of symbols.
1016
1017      XXX: This implementation allows sloppy syntaxes like `#:  key'.  */
1018   symbol = scm_read_expression (port, opts);
1019   if (!scm_is_symbol (symbol))
1020     scm_i_input_error ("scm_read_keyword", port,
1021                        "keyword prefix `~a' not followed by a symbol: ~s",
1022                        scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1023
1024   return (scm_symbol_to_keyword (symbol));
1025 }
1026
1027 static SCM
1028 scm_read_vector (int chr, SCM port, scm_t_read_opts *opts,
1029                  long line, int column)
1030 {
1031   /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1032      guarantee that it's going to do what we want.  After all, this is an
1033      implementation detail of `scm_read_vector ()', not a desirable
1034      property.  */
1035   return maybe_annotate_source (scm_vector (scm_read_sexp (chr, port, opts)),
1036                                 port, opts, line, column);
1037 }
1038
1039 /* Helper used by scm_read_array */
1040 static int
1041 read_decimal_integer (SCM port, int c, ssize_t *resp)
1042 {
1043   ssize_t sign = 1;
1044   ssize_t res = 0;
1045   int got_it = 0;
1046
1047   if (c == '-')
1048     {
1049       sign = -1;
1050       c = scm_getc (port);
1051     }
1052
1053   while ('0' <= c && c <= '9')
1054     {
1055       res = 10*res + c-'0';
1056       got_it = 1;
1057       c = scm_getc (port);
1058     }
1059
1060   if (got_it)
1061     *resp = sign * res;
1062   return c;
1063 }
1064
1065 /* Read an array.  This function can also read vectors and uniform
1066    vectors.  Also, the conflict between '#f' and '#f32' and '#f64' is
1067    handled here.
1068
1069    C is the first character read after the '#'.
1070 */
1071 static SCM
1072 scm_read_array (int c, SCM port, scm_t_read_opts *opts, long line, int column)
1073 {
1074   ssize_t rank;
1075   scm_t_wchar tag_buf[8];
1076   int tag_len;
1077
1078   SCM tag, shape = SCM_BOOL_F, elements, array;
1079
1080   /* XXX - shortcut for ordinary vectors.  Shouldn't be necessary but
1081      the array code can not deal with zero-length dimensions yet, and
1082      we want to allow zero-length vectors, of course.
1083   */
1084   if (c == '(')
1085     return scm_read_vector (c, port, opts, line, column);
1086
1087   /* Disambiguate between '#f' and uniform floating point vectors.
1088    */
1089   if (c == 'f')
1090     {
1091       c = scm_getc (port);
1092       if (c != '3' && c != '6')
1093         {
1094           if (c != EOF)
1095             scm_ungetc (c, port);
1096           return SCM_BOOL_F;
1097         }
1098       rank = 1;
1099       tag_buf[0] = 'f';
1100       tag_len = 1;
1101       goto continue_reading_tag;
1102     }
1103
1104   /* Read rank. */
1105   rank = 1;
1106   c = read_decimal_integer (port, c, &rank);
1107   if (rank < 0)
1108     scm_i_input_error (NULL, port, "array rank must be non-negative",
1109                        SCM_EOL);
1110
1111   /* Read tag. */
1112   tag_len = 0;
1113  continue_reading_tag:
1114   while (c != EOF && c != '(' && c != '@' && c != ':'
1115          && tag_len < sizeof tag_buf / sizeof tag_buf[0])
1116     {
1117       tag_buf[tag_len++] = c;
1118       c = scm_getc (port);
1119     }
1120   if (tag_len == 0)
1121     tag = SCM_BOOL_T;
1122   else
1123     {
1124       tag = scm_string_to_symbol (scm_from_utf32_stringn (tag_buf, tag_len));
1125       if (tag_len == sizeof tag_buf / sizeof tag_buf[0])
1126         scm_i_input_error (NULL, port, "invalid array tag, starting with: ~a",
1127                            scm_list_1 (tag));
1128     }
1129
1130   /* Read shape. */
1131   if (c == '@' || c == ':')
1132     {
1133       shape = SCM_EOL;
1134
1135       do
1136         {
1137           ssize_t lbnd = 0, len = 0;
1138           SCM s;
1139
1140           if (c == '@')
1141             {
1142               c = scm_getc (port);
1143               c = read_decimal_integer (port, c, &lbnd);
1144             }
1145
1146           s = scm_from_ssize_t (lbnd);
1147
1148           if (c == ':')
1149             {
1150               c = scm_getc (port);
1151               c = read_decimal_integer (port, c, &len);
1152               if (len < 0)
1153                 scm_i_input_error (NULL, port,
1154                                    "array length must be non-negative",
1155                                    SCM_EOL);
1156
1157               s = scm_list_2 (s, scm_from_ssize_t (lbnd+len-1));
1158             }
1159
1160           shape = scm_cons (s, shape);
1161         } while (c == '@' || c == ':');
1162
1163       shape = scm_reverse_x (shape, SCM_EOL);
1164     }
1165
1166   /* Read nested lists of elements. */
1167   if (c != '(')
1168     scm_i_input_error (NULL, port,
1169                        "missing '(' in vector or array literal",
1170                        SCM_EOL);
1171   elements = scm_read_sexp (c, port, opts);
1172
1173   if (scm_is_false (shape))
1174     shape = scm_from_ssize_t (rank);
1175   else if (scm_ilength (shape) != rank)
1176     scm_i_input_error
1177       (NULL, port,
1178        "the number of shape specifications must match the array rank",
1179        SCM_EOL);
1180
1181   /* Handle special print syntax of rank zero arrays; see
1182      scm_i_print_array for a rationale. */
1183   if (rank == 0)
1184     {
1185       if (!scm_is_pair (elements))
1186         scm_i_input_error (NULL, port,
1187                            "too few elements in array literal, need 1",
1188                            SCM_EOL);
1189       if (!scm_is_null (SCM_CDR (elements)))
1190         scm_i_input_error (NULL, port,
1191                            "too many elements in array literal, want 1",
1192                            SCM_EOL);
1193       elements = SCM_CAR (elements);
1194     }
1195
1196   /* Construct array, annotate with source location, and return. */
1197   array = scm_list_to_typed_array (tag, shape, elements);
1198   return maybe_annotate_source (array, port, opts, line, column);
1199 }
1200
1201 static SCM
1202 scm_read_srfi4_vector (int chr, SCM port, scm_t_read_opts *opts,
1203                        long line, int column)
1204 {
1205   return scm_read_array (chr, port, opts, line, column);
1206 }
1207
1208 static SCM
1209 scm_read_bytevector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1210                      long line, int column)
1211 {
1212   chr = scm_getc (port);
1213   if (chr != 'u')
1214     goto syntax;
1215
1216   chr = scm_getc (port);
1217   if (chr != '8')
1218     goto syntax;
1219
1220   chr = scm_getc (port);
1221   if (chr != '(')
1222     goto syntax;
1223
1224   return maybe_annotate_source
1225     (scm_u8_list_to_bytevector (scm_read_sexp (chr, port, opts)),
1226      port, opts, line, column);
1227
1228  syntax:
1229   scm_i_input_error ("read_bytevector", port,
1230                      "invalid bytevector prefix",
1231                      SCM_MAKE_CHAR (chr));
1232   return SCM_UNSPECIFIED;
1233 }
1234
1235 static SCM
1236 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1237                            long line, int column)
1238 {
1239   /* Read the `#*10101'-style read syntax for bit vectors in Guile.  This is
1240      terribly inefficient but who cares?  */
1241   SCM s_bits = SCM_EOL;
1242
1243   for (chr = scm_getc (port);
1244        (chr != EOF) && ((chr == '0') || (chr == '1'));
1245        chr = scm_getc (port))
1246     {
1247       s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1248     }
1249
1250   if (chr != EOF)
1251     scm_ungetc (chr, port);
1252
1253   return maybe_annotate_source
1254     (scm_bitvector (scm_reverse_x (s_bits, SCM_EOL)),
1255      port, opts, line, column);
1256 }
1257
1258 static SCM
1259 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1260 {
1261   int bang_seen = 0;
1262
1263   for (;;)
1264     {
1265       int c = scm_getc (port);
1266
1267       if (c == EOF)
1268         scm_i_input_error ("skip_block_comment", port,
1269                            "unterminated `#! ... !#' comment", SCM_EOL);
1270
1271       if (c == '!')
1272         bang_seen = 1;
1273       else if (c == '#' && bang_seen)
1274         break;
1275       else
1276         bang_seen = 0;
1277     }
1278
1279   return SCM_UNSPECIFIED;
1280 }
1281
1282 static SCM
1283 scm_read_shebang (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
1284 {
1285   char name[READER_DIRECTIVE_NAME_MAX_SIZE + 1];
1286   int c;
1287   int i = 0;
1288
1289   while (i <= READER_DIRECTIVE_NAME_MAX_SIZE)
1290     {
1291       c = scm_getc (port);
1292       if (c == EOF)
1293         scm_i_input_error ("skip_block_comment", port,
1294                            "unterminated `#! ... !#' comment", SCM_EOL);
1295       else if (('a' <= c && c <= 'z') || ('0' <= c && c <= '9') || c == '-')
1296         name[i++] = c;
1297       else if (CHAR_IS_DELIMITER (c))
1298         {
1299           scm_ungetc (c, port);
1300           name[i] = '\0';
1301           if (0 == strcmp ("r6rs", name))
1302             ;  /* Silently ignore */
1303           else
1304             break;
1305
1306           return SCM_UNSPECIFIED;
1307         }
1308     }
1309   while (i > 0)
1310     scm_ungetc (name[--i], port);
1311   return scm_read_scsh_block_comment (chr, port);
1312 }
1313
1314 static SCM
1315 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1316 {
1317   /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1318      nested.  So care must be taken.  */
1319   int nesting_level = 1;
1320
1321   int a = scm_getc (port);
1322
1323   if (a == EOF)
1324     scm_i_input_error ("scm_read_r6rs_block_comment", port,
1325                        "unterminated `#| ... |#' comment", SCM_EOL);
1326
1327   while (nesting_level > 0)
1328     {
1329       int b = scm_getc (port);
1330
1331       if (b == EOF)
1332         scm_i_input_error ("scm_read_r6rs_block_comment", port,
1333                            "unterminated `#| ... |#' comment", SCM_EOL);
1334
1335       if (a == '|' && b == '#')
1336         {
1337           nesting_level--;
1338           b = EOF;
1339         }
1340       else if (a == '#' && b == '|')
1341         {
1342           nesting_level++;
1343           b = EOF;
1344         }
1345
1346       a = b;
1347     }
1348
1349   return SCM_UNSPECIFIED;
1350 }
1351
1352 static SCM
1353 scm_read_commented_expression (scm_t_wchar chr, SCM port,
1354                                scm_t_read_opts *opts)
1355 {
1356   scm_t_wchar c;
1357
1358   c = flush_ws (port, opts, (char *) NULL);
1359   if (EOF == c)
1360     scm_i_input_error ("read_commented_expression", port,
1361                        "no expression after #; comment", SCM_EOL);
1362   scm_ungetc (c, port);
1363   scm_read_expression (port, opts);
1364   return SCM_UNSPECIFIED;
1365 }
1366
1367 static SCM
1368 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1369 {
1370   /* Guile's extended symbol read syntax looks like this:
1371
1372        #{This is all a symbol name}#
1373
1374      So here, CHR is expected to be `{'.  */
1375   int saw_brace = 0;
1376   size_t len = 0;
1377   SCM buf = scm_i_make_string (1024, NULL, 0);
1378
1379   buf = scm_i_string_start_writing (buf);
1380
1381   while ((chr = scm_getc (port)) != EOF)
1382     {
1383       if (saw_brace)
1384         {
1385           if (chr == '#')
1386             {
1387               break;
1388             }
1389           else
1390             {
1391               saw_brace = 0;
1392               scm_i_string_set_x (buf, len++, '}');
1393             }
1394         }
1395
1396       if (chr == '}')
1397         saw_brace = 1;
1398       else if (chr == '\\')
1399         {
1400           /* It used to be that print.c would print extended-read-syntax
1401              symbols with backslashes before "non-standard" chars, but
1402              this routine wouldn't do anything with those escapes.
1403              Bummer.  What we've done is to change print.c to output
1404              R6RS hex escapes for those characters, relying on the fact
1405              that the extended read syntax would never put a `\' before
1406              an `x'.  For now, we just ignore other instances of
1407              backslash in the string.  */
1408           switch ((chr = scm_getc (port)))
1409             {
1410             case EOF:
1411               goto done;
1412             case 'x':
1413               {
1414                 scm_t_wchar c;
1415
1416                 SCM_READ_HEX_ESCAPE (10, ';');
1417                 scm_i_string_set_x (buf, len++, c);
1418                 break;
1419
1420               str_eof:
1421                 chr = EOF;
1422                 goto done;
1423
1424               bad_escaped:
1425                 scm_i_string_stop_writing ();
1426                 scm_i_input_error ("scm_read_extended_symbol", port,
1427                                    "illegal character in escape sequence: ~S",
1428                                    scm_list_1 (SCM_MAKE_CHAR (c)));
1429                 break;
1430               }
1431             default:
1432               scm_i_string_set_x (buf, len++, chr);
1433               break;
1434             }
1435         }
1436       else
1437         scm_i_string_set_x (buf, len++, chr);
1438
1439       if (len >= scm_i_string_length (buf) - 2)
1440         {
1441           SCM addy;
1442
1443           scm_i_string_stop_writing ();
1444           addy = scm_i_make_string (1024, NULL, 0);
1445           buf = scm_string_append (scm_list_2 (buf, addy));
1446           len = 0;
1447           buf = scm_i_string_start_writing (buf);
1448         }
1449     }
1450
1451  done:
1452   scm_i_string_stop_writing ();
1453   if (chr == EOF)
1454     scm_i_input_error ("scm_read_extended_symbol", port,
1455                        "end of file while reading symbol", SCM_EOL);
1456
1457   return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1458 }
1459
1460
1461 \f
1462 /* Top-level token readers, i.e., dispatchers.  */
1463
1464 static SCM
1465 scm_read_sharp_extension (int chr, SCM port, scm_t_read_opts *opts)
1466 {
1467   SCM proc;
1468
1469   proc = scm_get_hash_procedure (chr);
1470   if (scm_is_true (scm_procedure_p (proc)))
1471     {
1472       long line = SCM_LINUM (port);
1473       int column = SCM_COL (port) - 2;
1474       SCM got;
1475
1476       got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1477
1478       if (opts->record_positions_p && SCM_NIMP (got)
1479           && !scm_i_has_source_properties (got))
1480         scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1481
1482       return got;
1483     }
1484
1485   return SCM_UNSPECIFIED;
1486 }
1487
1488 /* The reader for the sharp `#' character.  It basically dispatches reads
1489    among the above token readers.   */
1490 static SCM
1491 scm_read_sharp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1492                 long line, int column)
1493 #define FUNC_NAME "scm_lreadr"
1494 {
1495   SCM result;
1496
1497   chr = scm_getc (port);
1498
1499   result = scm_read_sharp_extension (chr, port, opts);
1500   if (!scm_is_eq (result, SCM_UNSPECIFIED))
1501     return result;
1502
1503   switch (chr)
1504     {
1505     case '\\':
1506       return (scm_read_character (chr, port, opts));
1507     case '(':
1508       return (scm_read_vector (chr, port, opts, line, column));
1509     case 's':
1510     case 'u':
1511     case 'f':
1512     case 'c':
1513       /* This one may return either a boolean or an SRFI-4 vector.  */
1514       return (scm_read_srfi4_vector (chr, port, opts, line, column));
1515     case 'v':
1516       return (scm_read_bytevector (chr, port, opts, line, column));
1517     case '*':
1518       return (scm_read_guile_bit_vector (chr, port, opts, line, column));
1519     case 't':
1520     case 'T':
1521     case 'F':
1522       return (scm_read_boolean (chr, port));
1523     case ':':
1524       return (scm_read_keyword (chr, port, opts));
1525     case '0': case '1': case '2': case '3': case '4':
1526     case '5': case '6': case '7': case '8': case '9':
1527     case '@':
1528 #if SCM_ENABLE_DEPRECATED
1529       /* See below for 'i' and 'e'. */
1530     case 'a':
1531     case 'y':
1532     case 'h':
1533     case 'l':
1534 #endif
1535       return (scm_read_array (chr, port, opts, line, column));
1536
1537     case 'i':
1538     case 'e':
1539 #if SCM_ENABLE_DEPRECATED
1540       {
1541         /* When next char is '(', it really is an old-style
1542            uniform array. */
1543         scm_t_wchar next_c = scm_getc (port);
1544         if (next_c != EOF)
1545           scm_ungetc (next_c, port);
1546         if (next_c == '(')
1547           return scm_read_array (chr, port, opts, line, column);
1548         /* Fall through. */
1549       }
1550 #endif
1551     case 'b':
1552     case 'B':
1553     case 'o':
1554     case 'O':
1555     case 'd':
1556     case 'D':
1557     case 'x':
1558     case 'X':
1559     case 'I':
1560     case 'E':
1561       return (scm_read_number_and_radix (chr, port, opts));
1562     case '{':
1563       return (scm_read_extended_symbol (chr, port));
1564     case '!':
1565       return (scm_read_shebang (chr, port, opts));
1566     case ';':
1567       return (scm_read_commented_expression (chr, port, opts));
1568     case '`':
1569     case '\'':
1570     case ',':
1571       return (scm_read_syntax (chr, port, opts));
1572     case 'n':
1573       return (scm_read_nil (chr, port, opts));
1574     default:
1575       result = scm_read_sharp_extension (chr, port, opts);
1576       if (scm_is_eq (result, SCM_UNSPECIFIED))
1577         {
1578           /* To remain compatible with 1.8 and earlier, the following
1579              characters have lower precedence than `read-hash-extend'
1580              characters.  */
1581           switch (chr)
1582             {
1583             case '|':
1584               return scm_read_r6rs_block_comment (chr, port);
1585             default:
1586               scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1587                                  scm_list_1 (SCM_MAKE_CHAR (chr)));
1588             }
1589         }
1590       else
1591         return result;
1592     }
1593
1594   return SCM_UNSPECIFIED;
1595 }
1596 #undef FUNC_NAME
1597
1598 static SCM
1599 scm_read_expression (SCM port, scm_t_read_opts *opts)
1600 #define FUNC_NAME "scm_read_expression"
1601 {
1602   while (1)
1603     {
1604       scm_t_wchar chr;
1605
1606       chr = scm_getc (port);
1607
1608       switch (chr)
1609         {
1610         case SCM_WHITE_SPACES:
1611         case SCM_LINE_INCREMENTORS:
1612           break;
1613         case ';':
1614           (void) scm_read_semicolon_comment (chr, port);
1615           break;
1616         case '[':
1617           if (!opts->square_brackets_p)
1618             return (scm_read_mixed_case_symbol (chr, port, opts));
1619           /* otherwise fall through */
1620         case '(':
1621           return (scm_read_sexp (chr, port, opts));
1622         case '"':
1623           return (scm_read_string (chr, port, opts));
1624         case '\'':
1625         case '`':
1626         case ',':
1627           return (scm_read_quote (chr, port, opts));
1628         case '#':
1629           {
1630             long line  = SCM_LINUM (port);
1631             int column = SCM_COL (port) - 1;
1632             SCM result = scm_read_sharp (chr, port, opts, line, column);
1633             if (scm_is_eq (result, SCM_UNSPECIFIED))
1634               /* We read a comment or some such.  */
1635               break;
1636             else
1637               return result;
1638           }
1639         case ')':
1640           scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1641           break;
1642         case ']':
1643           if (opts->square_brackets_p)
1644             scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1645           /* otherwise fall through */
1646         case EOF:
1647           return SCM_EOF_VAL;
1648         case ':':
1649           if (opts->keyword_style == KEYWORD_STYLE_PREFIX)
1650             return scm_symbol_to_keyword (scm_read_expression (port, opts));
1651           /* Fall through.  */
1652
1653         default:
1654           {
1655             if (((chr >= '0') && (chr <= '9'))
1656                 || (strchr ("+-.", chr)))
1657               return (scm_read_number (chr, port, opts));
1658             else
1659               return (scm_read_mixed_case_symbol (chr, port, opts));
1660           }
1661         }
1662     }
1663 }
1664 #undef FUNC_NAME
1665
1666 \f
1667 /* Actual reader.  */
1668
1669 static void init_read_options (SCM port, scm_t_read_opts *opts);
1670
1671 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1672             (SCM port),
1673             "Read an s-expression from the input port @var{port}, or from\n"
1674             "the current input port if @var{port} is not specified.\n"
1675             "Any whitespace before the next token is discarded.")
1676 #define FUNC_NAME s_scm_read
1677 {
1678   scm_t_read_opts opts;
1679   int c;
1680
1681   if (SCM_UNBNDP (port))
1682     port = scm_current_input_port ();
1683   SCM_VALIDATE_OPINPORT (1, port);
1684
1685   init_read_options (port, &opts);
1686
1687   c = flush_ws (port, &opts, (char *) NULL);
1688   if (EOF == c)
1689     return SCM_EOF_VAL;
1690   scm_ungetc (c, port);
1691
1692   return (scm_read_expression (port, &opts));
1693 }
1694 #undef FUNC_NAME
1695
1696
1697 \f
1698
1699 /* Manipulate the read-hash-procedures alist.  This could be written in
1700    Scheme, but maybe it will also be used by C code during initialisation.  */
1701 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1702             (SCM chr, SCM proc),
1703             "Install the procedure @var{proc} for reading expressions\n"
1704             "starting with the character sequence @code{#} and @var{chr}.\n"
1705             "@var{proc} will be called with two arguments:  the character\n"
1706             "@var{chr} and the port to read further data from. The object\n"
1707             "returned will be the return value of @code{read}. \n"
1708             "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1709             )
1710 #define FUNC_NAME s_scm_read_hash_extend
1711 {
1712   SCM this;
1713   SCM prev;
1714
1715   SCM_VALIDATE_CHAR (1, chr);
1716   SCM_ASSERT (scm_is_false (proc)
1717               || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1718               proc, SCM_ARG2, FUNC_NAME);
1719
1720   /* Check if chr is already in the alist.  */
1721   this = scm_i_read_hash_procedures_ref ();
1722   prev = SCM_BOOL_F;
1723   while (1)
1724     {
1725       if (scm_is_null (this))
1726         {
1727           /* not found, so add it to the beginning.  */
1728           if (scm_is_true (proc))
1729             {
1730               SCM new = scm_cons (scm_cons (chr, proc),
1731                                   scm_i_read_hash_procedures_ref ());
1732               scm_i_read_hash_procedures_set_x (new);
1733             }
1734           break;
1735         }
1736       if (scm_is_eq (chr, SCM_CAAR (this)))
1737         {
1738           /* already in the alist.  */
1739           if (scm_is_false (proc))
1740             {
1741               /* remove it.  */
1742               if (scm_is_false (prev))
1743                 {
1744                   SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1745                   scm_i_read_hash_procedures_set_x (rest);
1746                 }
1747               else
1748                 scm_set_cdr_x (prev, SCM_CDR (this));
1749             }
1750           else
1751             {
1752               /* replace it.  */
1753               scm_set_cdr_x (SCM_CAR (this), proc);
1754             }
1755           break;
1756         }
1757       prev = this;
1758       this = SCM_CDR (this);
1759     }
1760
1761   return SCM_UNSPECIFIED;
1762 }
1763 #undef FUNC_NAME
1764
1765 /* Recover the read-hash procedure corresponding to char c.  */
1766 static SCM
1767 scm_get_hash_procedure (int c)
1768 {
1769   SCM rest = scm_i_read_hash_procedures_ref ();
1770
1771   while (1)
1772     {
1773       if (scm_is_null (rest))
1774         return SCM_BOOL_F;
1775
1776       if (SCM_CHAR (SCM_CAAR (rest)) == c)
1777         return SCM_CDAR (rest);
1778
1779       rest = SCM_CDR (rest);
1780     }
1781 }
1782
1783 #define SCM_ENCODING_SEARCH_SIZE (500)
1784
1785 /* Search the first few hundred characters of a file for an Emacs-like coding
1786    declaration.  Returns either NULL or a string whose storage has been
1787    allocated with `scm_gc_malloc ()'.  */
1788 char *
1789 scm_i_scan_for_encoding (SCM port)
1790 {
1791   scm_t_port *pt;
1792   char header[SCM_ENCODING_SEARCH_SIZE+1];
1793   size_t bytes_read, encoding_length, i;
1794   char *encoding = NULL;
1795   int utf8_bom = 0;
1796   char *pos, *encoding_start;
1797   int in_comment;
1798
1799   pt = SCM_PTAB_ENTRY (port);
1800
1801   if (pt->rw_active == SCM_PORT_WRITE)
1802     scm_flush (port);
1803
1804   if (pt->rw_random)
1805     pt->rw_active = SCM_PORT_READ;
1806
1807   if (pt->read_pos == pt->read_end)
1808     {
1809       /* We can use the read buffer, and thus avoid a seek. */
1810       if (scm_fill_input (port) == EOF)
1811         return NULL;
1812
1813       bytes_read = pt->read_end - pt->read_pos;
1814       if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
1815         bytes_read = SCM_ENCODING_SEARCH_SIZE;
1816
1817       if (bytes_read <= 1)
1818         /* An unbuffered port -- don't scan.  */
1819         return NULL;
1820
1821       memcpy (header, pt->read_pos, bytes_read);
1822       header[bytes_read] = '\0';
1823     }
1824   else
1825     {
1826       /* Try to read some bytes and then seek back.  Not all ports
1827          support seeking back; and indeed some file ports (like
1828          /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
1829          check performed by SCM_FPORT_FDES---but fail to seek
1830          backwards.  Hence this block comes second.  We prefer to use
1831          the read buffer in-place.  */
1832       if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1833         return NULL;
1834
1835       bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1836       header[bytes_read] = '\0';
1837       scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1838     }
1839
1840   if (bytes_read > 3
1841       && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1842     utf8_bom = 1;
1843
1844   /* search past "coding[:=]" */
1845   pos = header;
1846   while (1)
1847     {
1848       if ((pos = strstr(pos, "coding")) == NULL)
1849         return NULL;
1850
1851       pos += strlen("coding");
1852       if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1853           (*pos == ':' || *pos == '='))
1854         {
1855           pos ++;
1856           break;
1857         }
1858     }
1859
1860   /* skip spaces */
1861   while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1862          (*pos == ' ' || *pos == '\t'))
1863     pos ++;
1864
1865   /* grab the next token */
1866   encoding_start = pos;
1867   i = 0;
1868   while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
1869          && encoding_start + i - header < bytes_read
1870          && (isalnum ((int) encoding_start[i])
1871              || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
1872     i++;
1873
1874   encoding_length = i;
1875   if (encoding_length == 0)
1876     return NULL;
1877
1878   encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
1879   for (i = 0; i < encoding_length; i++)
1880     encoding[i] = toupper ((int) encoding[i]);
1881
1882   /* push backwards to make sure we were in a comment */
1883   in_comment = 0;
1884   pos = encoding_start;
1885   while (pos >= header)
1886     {
1887       if (*pos == ';')
1888         {
1889           in_comment = 1;
1890           break;
1891         }
1892       else if (*pos == '\n' || pos == header)
1893         {
1894           /* This wasn't in a semicolon comment. Check for a
1895            hash-bang comment. */
1896           char *beg = strstr (header, "#!");
1897           char *end = strstr (header, "!#");
1898           if (beg < encoding_start && encoding_start + encoding_length <= end)
1899             in_comment = 1;
1900           break;
1901         }
1902       else
1903         {
1904           pos --;
1905           continue;
1906         }
1907     }
1908   if (!in_comment)
1909     /* This wasn't in a comment */
1910     return NULL;
1911
1912   if (utf8_bom && strcmp(encoding, "UTF-8"))
1913     scm_misc_error (NULL,
1914                     "the port input declares the encoding ~s but is encoded as UTF-8",
1915                     scm_list_1 (scm_from_locale_string (encoding)));
1916
1917   return encoding;
1918 }
1919
1920 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1921             (SCM port),
1922             "Scans the port for an Emacs-like character coding declaration\n"
1923             "near the top of the contents of a port with random-accessible contents.\n"
1924             "The coding declaration is of the form\n"
1925             "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1926             "\n"
1927             "Returns a string containing the character encoding of the file\n"
1928             "if a declaration was found, or @code{#f} otherwise.\n")
1929 #define FUNC_NAME s_scm_file_encoding
1930 {
1931   char *enc;
1932   SCM s_enc;
1933
1934   SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
1935
1936   enc = scm_i_scan_for_encoding (port);
1937   if (enc == NULL)
1938     return SCM_BOOL_F;
1939   else
1940     {
1941       s_enc = scm_from_locale_string (enc);
1942       return s_enc;
1943     }
1944
1945   return SCM_BOOL_F;
1946 }
1947 #undef FUNC_NAME
1948
1949 \f
1950 /* Per-port read options.
1951
1952    We store per-port read options in the 'port-read-options' key of the
1953    port's alist, which is stored in 'scm_i_port_weak_hash'.  The value
1954    stored in the alist is a single integer that contains a two-bit field
1955    for each read option.
1956
1957    If a bit field contains READ_OPTION_INHERIT (3), that indicates that
1958    the applicable value should be inherited from the corresponding
1959    global read option.  Otherwise, the bit field contains the value of
1960    the read option.  For boolean read options that have been set
1961    per-port, the possible values are 0 or 1.  If the 'keyword_style'
1962    read option has been set per-port, its possible values are those in
1963    'enum t_keyword_style'. */
1964
1965 /* Key to read options in per-port alists. */
1966 SCM_SYMBOL (sym_port_read_options, "port-read-options");
1967
1968 /* Offsets of bit fields for each per-port override */
1969 #define READ_OPTION_COPY_SOURCE_P          0
1970 #define READ_OPTION_RECORD_POSITIONS_P     2
1971 #define READ_OPTION_CASE_INSENSITIVE_P     4
1972 #define READ_OPTION_KEYWORD_STYLE          6
1973 #define READ_OPTION_R6RS_ESCAPES_P         8
1974 #define READ_OPTION_SQUARE_BRACKETS_P     10
1975 #define READ_OPTION_HUNGRY_EOL_ESCAPES_P  12
1976
1977 #define READ_OPTIONS_NUM_BITS             14
1978
1979 #define READ_OPTIONS_INHERIT_ALL  ((1UL << READ_OPTIONS_NUM_BITS) - 1)
1980 #define READ_OPTIONS_MAX_VALUE    READ_OPTIONS_INHERIT_ALL
1981
1982 #define READ_OPTION_MASK     3
1983 #define READ_OPTION_INHERIT  3
1984
1985 static void
1986 set_port_read_option (SCM port, int option, int new_value)
1987 {
1988   SCM alist, scm_read_options;
1989   unsigned int read_options;
1990
1991   new_value &= READ_OPTION_MASK;
1992   scm_i_scm_pthread_mutex_lock (&scm_i_port_table_mutex);
1993   alist = scm_hashq_ref (scm_i_port_weak_hash, port, SCM_BOOL_F);
1994   scm_read_options = scm_assq_ref (alist, sym_port_read_options);
1995   if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
1996     read_options = scm_to_uint (scm_read_options);
1997   else
1998     read_options = READ_OPTIONS_INHERIT_ALL;
1999   read_options &= ~(READ_OPTION_MASK << option);
2000   read_options |= new_value << option;
2001   scm_read_options = scm_from_uint (read_options);
2002   alist = scm_assq_set_x (alist, sym_port_read_options, scm_read_options);
2003   scm_hashq_set_x (scm_i_port_weak_hash, port, alist);
2004   scm_i_pthread_mutex_unlock (&scm_i_port_table_mutex);
2005 }
2006
2007 /* Initialize OPTS based on PORT's read options and the global read
2008    options. */
2009 static void
2010 init_read_options (SCM port, scm_t_read_opts *opts)
2011 {
2012   SCM alist, val, scm_read_options;
2013   unsigned int read_options, x;
2014
2015   scm_i_scm_pthread_mutex_lock (&scm_i_port_table_mutex);
2016   alist = scm_hashq_ref (scm_i_port_weak_hash, port, SCM_BOOL_F);
2017   scm_read_options = scm_assq_ref (alist, sym_port_read_options);
2018   scm_i_pthread_mutex_unlock (&scm_i_port_table_mutex);
2019
2020   if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2021     read_options = scm_to_uint (scm_read_options);
2022   else
2023     read_options = READ_OPTIONS_INHERIT_ALL;
2024
2025   x = READ_OPTION_MASK & (read_options >> READ_OPTION_KEYWORD_STYLE);
2026   if (x == READ_OPTION_INHERIT)
2027     {
2028       val = SCM_PACK (SCM_KEYWORD_STYLE);
2029       if (scm_is_eq (val, scm_keyword_prefix))
2030         x = KEYWORD_STYLE_PREFIX;
2031       else if (scm_is_eq (val, scm_keyword_postfix))
2032         x = KEYWORD_STYLE_POSTFIX;
2033       else
2034         x = KEYWORD_STYLE_HASH_PREFIX;
2035     }
2036   opts->keyword_style = x;
2037
2038 #define RESOLVE_BOOLEAN_OPTION(NAME, name)                              \
2039   do                                                                    \
2040     {                                                                   \
2041       x = READ_OPTION_MASK & (read_options >> READ_OPTION_ ## NAME);    \
2042       if (x == READ_OPTION_INHERIT)                                     \
2043         x = !!SCM_ ## NAME;                                             \
2044           opts->name = x;                                               \
2045     }                                                                   \
2046   while (0)
2047
2048   RESOLVE_BOOLEAN_OPTION (COPY_SOURCE_P,        copy_source_p);
2049   RESOLVE_BOOLEAN_OPTION (RECORD_POSITIONS_P,   record_positions_p);
2050   RESOLVE_BOOLEAN_OPTION (CASE_INSENSITIVE_P,   case_insensitive_p);
2051   RESOLVE_BOOLEAN_OPTION (R6RS_ESCAPES_P,       r6rs_escapes_p);
2052   RESOLVE_BOOLEAN_OPTION (SQUARE_BRACKETS_P,    square_brackets_p);
2053   RESOLVE_BOOLEAN_OPTION (HUNGRY_EOL_ESCAPES_P, hungry_eol_escapes_p);
2054
2055 #undef RESOLVE_BOOLEAN_OPTION
2056 }
2057
2058 void
2059 scm_init_read ()
2060 {
2061   SCM read_hash_procs;
2062
2063   read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
2064
2065   scm_i_read_hash_procedures =
2066     SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
2067
2068   scm_init_opts (scm_read_options, scm_read_opts);
2069 #include "libguile/read.x"
2070 }
2071
2072 /*
2073   Local Variables:
2074   c-file-style: "gnu"
2075   End:
2076 */