libguile/read.c

   1 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
   2  *   2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public License
   6  * as published by the Free Software Foundation; either version 3 of
   7  * the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful, but
  10  * WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  17  * 02110-1301 USA
  18  */
  19
  20
  21 \f
  22
  23 #ifdef HAVE_CONFIG_H
  24 # include <config.h>
  25 #endif
  26
  27 #include <stdio.h>
  28 #include <ctype.h>
  29 #include <string.h>
  30 #include <unistd.h>
  31 #include <unicase.h>
  32 #include <unictype.h>
  33
  34 #include "libguile/_scm.h"
  35 #include "libguile/bytevectors.h"
  36 #include "libguile/chars.h"
  37 #include "libguile/eval.h"
  38 #include "libguile/arrays.h"
  39 #include "libguile/bitvectors.h"
  40 #include "libguile/keywords.h"
  41 #include "libguile/alist.h"
  42 #include "libguile/srcprop.h"
  43 #include "libguile/hashtab.h"
  44 #include "libguile/hash.h"
  45 #include "libguile/ports.h"
  46 #include "libguile/fports.h"
  47 #include "libguile/root.h"
  48 #include "libguile/strings.h"
  49 #include "libguile/strports.h"
  50 #include "libguile/vectors.h"
  51 #include "libguile/validate.h"
  52 #include "libguile/srfi-4.h"
  53 #include "libguile/srfi-13.h"
  54
  55 #include "libguile/read.h"
  56 #include "libguile/private-options.h"
  57
  58
  59 \f
  60
  61 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
  62 SCM_SYMBOL (scm_keyword_prefix, "prefix");
  63 SCM_SYMBOL (scm_keyword_postfix, "postfix");
  64 SCM_SYMBOL (sym_nil, "nil");
  65
  66 scm_t_option scm_read_opts[] = {
  67   { SCM_OPTION_BOOLEAN, "copy", 0,
  68     "Copy source code expressions." },
  69   { SCM_OPTION_BOOLEAN, "positions", 1,
  70     "Record positions of source code expressions." },
  71   { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
  72     "Convert symbols to lower case."},
  73   { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
  74     "Style of keyword recognition: #f, 'prefix or 'postfix."},
  75   { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
  76     "Use R6RS variable-length character and string hex escapes."},
  77   { SCM_OPTION_BOOLEAN, "square-brackets", 1,
  78     "Treat `[' and `]' as parentheses, for R6RS compatibility."},
  79   { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
  80     "In strings, consume leading whitespace after an escaped end-of-line."},
  81   { 0, },
  82 };
  83
  84 /* Internal read options structure.  This is initialized by 'scm_read'
  85    from the global and per-port read options, and a pointer is passed
  86    down to all helper functions. */
  87
  88 enum t_keyword_style
  89   {
  90     KEYWORD_STYLE_HASH_PREFIX,
  91     KEYWORD_STYLE_PREFIX,
  92     KEYWORD_STYLE_POSTFIX
  93   };
  94
  95 struct t_read_opts
  96 {
  97   enum t_keyword_style keyword_style;
  98   unsigned int copy_source_p        : 1;
  99   unsigned int record_positions_p   : 1;
 100   unsigned int case_insensitive_p   : 1;
 101   unsigned int r6rs_escapes_p       : 1;
 102   unsigned int square_brackets_p    : 1;
 103   unsigned int hungry_eol_escapes_p : 1;
 104 };
 105
 106 typedef struct t_read_opts scm_t_read_opts;
 107
 108
 109 /*
 110   Give meaningful error messages for errors
 111
 112   We use the format
 113
 114   FILE:LINE:COL: MESSAGE
 115   This happened in ....
 116
 117   This is not standard GNU format, but the test-suite likes the real
 118   message to be in front.
 119
 120  */
 121
 122
 123 void
 124 scm_i_input_error (char const *function,
 125                    SCM port, const char *message, SCM arg)
 126 {
 127   SCM fn = (scm_is_string (SCM_FILENAME(port))
 128             ? SCM_FILENAME(port)
 129             : scm_from_locale_string ("#<unknown port>"));
 130
 131   SCM string_port = scm_open_output_string ();
 132   SCM string = SCM_EOL;
 133   scm_simple_format (string_port,
 134                      scm_from_locale_string ("~A:~S:~S: ~A"),
 135                      scm_list_4 (fn,
 136                                  scm_from_long (SCM_LINUM (port) + 1),
 137                                  scm_from_int (SCM_COL (port) + 1),
 138                                  scm_from_locale_string (message)));
 139
 140   string = scm_get_output_string (string_port);
 141   scm_close_output_port (string_port);
 142   scm_error_scm (scm_from_latin1_symbol ("read-error"),
 143                  function? scm_from_locale_string (function) : SCM_BOOL_F,
 144                  string,
 145                  arg,
 146                  SCM_BOOL_F);
 147 }
 148
 149
 150 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
 151             (SCM setting),
 152             "Option interface for the read options. Instead of using\n"
 153             "this procedure directly, use the procedures @code{read-enable},\n"
 154             "@code{read-disable}, @code{read-set!} and @code{read-options}.")
 155 #define FUNC_NAME s_scm_read_options
 156 {
 157   SCM ans = scm_options (setting,
 158                          scm_read_opts,
 159                          FUNC_NAME);
 160   if (SCM_COPY_SOURCE_P)
 161     SCM_RECORD_POSITIONS_P = 1;
 162   return ans;
 163 }
 164 #undef FUNC_NAME
 165
 166 /* A fluid referring to an association list mapping extra hash
 167    characters to procedures.  */
 168 static SCM *scm_i_read_hash_procedures;
 169
 170 static SCM
 171 scm_i_read_hash_procedures_ref (void)
 172 {
 173   return scm_fluid_ref (*scm_i_read_hash_procedures);
 174 }
 175
 176 static void
 177 scm_i_read_hash_procedures_set_x (SCM value)
 178 {
 179   scm_fluid_set_x (*scm_i_read_hash_procedures, value);
 180 }
 181
 182 \f
 183 /* Token readers.  */
 184
 185
 186 /* Size of the C buffer used to read symbols and numbers.  */
 187 #define READER_BUFFER_SIZE            128
 188
 189 /* Number of 32-bit codepoints in the buffer used to read strings.  */
 190 #define READER_STRING_BUFFER_SIZE     128
 191
 192 /* The maximum size of Scheme character names.  */
 193 #define READER_CHAR_NAME_MAX_SIZE      50
 194
 195 /* The maximum size of reader directive names.  */
 196 #define READER_DIRECTIVE_NAME_MAX_SIZE 50
 197
 198
 199 /* `isblank' is only in C99.  */
 200 #define CHAR_IS_BLANK_(_chr)                                    \
 201   (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n')      \
 202    || ((_chr) == '\f') || ((_chr) == '\r'))
 203
 204 #ifdef MSDOS
 205 # define CHAR_IS_BLANK(_chr)                    \
 206   ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
 207 #else
 208 # define CHAR_IS_BLANK CHAR_IS_BLANK_
 209 #endif
 210
 211
 212 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
 213    structure'').  */
 214 #define CHAR_IS_R5RS_DELIMITER(c)                               \
 215   (CHAR_IS_BLANK (c)                                            \
 216    || (c) == ')' || (c) == '(' || (c) == ';' || (c) == '"')
 217
 218 #define CHAR_IS_DELIMITER(c)                                    \
 219   (CHAR_IS_R5RS_DELIMITER (c)                                   \
 220    || (((c) == ']' || (c) == '[') && opts->square_brackets_p))
 221
 222 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
 223    Structure''.  */
 224 #define CHAR_IS_EXPONENT_MARKER(_chr)                           \
 225   (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f')        \
 226    || ((_chr) == 'd') || ((_chr) == 'l'))
 227
 228 /* Read an SCSH block comment.  */
 229 static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
 230 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
 231 static SCM scm_read_commented_expression (scm_t_wchar, SCM, scm_t_read_opts *);
 232 static SCM scm_read_shebang (scm_t_wchar, SCM, scm_t_read_opts *);
 233 static SCM scm_get_hash_procedure (int);
 234
 235 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 236    result in the pre-allocated buffer BUF.  Return zero if the whole token has
 237    fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
 238    bytes actually read.  */
 239 static int
 240 read_token (SCM port, scm_t_read_opts *opts,
 241             char *buf, size_t buf_size, size_t *read)
 242 {
 243    *read = 0;
 244
 245    while (*read < buf_size)
 246      {
 247        int chr;
 248
 249        chr = scm_get_byte_or_eof (port);
 250
 251        if (chr == EOF)
 252         return 0;
 253       else if (CHAR_IS_DELIMITER (chr))
 254         {
 255           scm_unget_byte (chr, port);
 256           return 0;
 257         }
 258       else
 259         {
 260           *buf = (char) chr;
 261           buf++, (*read)++;
 262         }
 263      }
 264
 265    return 1;
 266  }
 267
 268 /* Like `read_token', but return either BUFFER, or a GC-allocated buffer
 269    if the token doesn't fit in BUFFER_SIZE bytes.  */
 270 static char *
 271 read_complete_token (SCM port, scm_t_read_opts *opts,
 272                      char *buffer, size_t buffer_size, size_t *read)
 273 {
 274   int overflow = 0;
 275   size_t bytes_read, overflow_size = 0;
 276   char *overflow_buffer = NULL;
 277
 278   do
 279     {
 280       overflow = read_token (port, opts, buffer, buffer_size, &bytes_read);
 281       if (bytes_read == 0)
 282         break;
 283       if (overflow || overflow_size != 0)
 284         {
 285           if (overflow_size == 0)
 286             {
 287               overflow_buffer = scm_gc_malloc_pointerless (bytes_read, "read");
 288               memcpy (overflow_buffer, buffer, bytes_read);
 289               overflow_size = bytes_read;
 290             }
 291           else
 292             {
 293               char *new_buf =
 294                 scm_gc_malloc_pointerless (overflow_size + bytes_read, "read");
 295
 296               memcpy (new_buf, overflow_buffer, overflow_size);
 297               memcpy (new_buf + overflow_size, buffer, bytes_read);
 298
 299               overflow_buffer = new_buf;
 300               overflow_size += bytes_read;
 301             }
 302         }
 303     }
 304   while (overflow);
 305
 306   if (overflow_size)
 307     *read = overflow_size;
 308   else
 309     *read = bytes_read;
 310
 311   return (overflow_size > 0 ? overflow_buffer : buffer);
 312 }
 313
 314 /* Skip whitespace from PORT and return the first non-whitespace character
 315    read.  Raise an error on end-of-file.  */
 316 static int
 317 flush_ws (SCM port, scm_t_read_opts *opts, const char *eoferr)
 318 {
 319   scm_t_wchar c;
 320   while (1)
 321     switch (c = scm_getc (port))
 322       {
 323       case EOF:
 324       goteof:
 325         if (eoferr)
 326           {
 327             scm_i_input_error (eoferr,
 328                                port,
 329                                "end of file",
 330                                SCM_EOL);
 331           }
 332         return c;
 333
 334       case ';':
 335       lp:
 336         switch (c = scm_getc (port))
 337           {
 338           case EOF:
 339             goto goteof;
 340           default:
 341             goto lp;
 342           case SCM_LINE_INCREMENTORS:
 343             break;
 344           }
 345         break;
 346
 347       case '#':
 348         switch (c = scm_getc (port))
 349           {
 350           case EOF:
 351             eoferr = "read_sharp";
 352             goto goteof;
 353           case '!':
 354             scm_read_shebang (c, port, opts);
 355             break;
 356           case ';':
 357             scm_read_commented_expression (c, port, opts);
 358             break;
 359           case '|':
 360             if (scm_is_false (scm_get_hash_procedure (c)))
 361               {
 362                 scm_read_r6rs_block_comment (c, port);
 363                 break;
 364               }
 365             /* fall through */
 366           default:
 367             scm_ungetc (c, port);
 368             return '#';
 369           }
 370         break;
 371
 372       case SCM_LINE_INCREMENTORS:
 373       case SCM_SINGLE_SPACES:
 374       case '\t':
 375         break;
 376
 377       default:
 378         return c;
 379       }
 380
 381   return 0;
 382 }
 383
 384
 385 \f
 386 /* Token readers.  */
 387
 388 static SCM scm_read_expression (SCM port, scm_t_read_opts *opts);
 389 static SCM scm_read_sharp (int chr, SCM port, scm_t_read_opts *opts,
 390                            long line, int column);
 391
 392
 393 static SCM
 394 maybe_annotate_source (SCM x, SCM port, scm_t_read_opts *opts,
 395                        long line, int column)
 396 {
 397   if (opts->record_positions_p)
 398     scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
 399   return x;
 400 }
 401
 402 static SCM
 403 scm_read_sexp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 404 #define FUNC_NAME "scm_i_lreadparen"
 405 {
 406   int c;
 407   SCM tmp, tl, ans = SCM_EOL;
 408   const int terminating_char = ((chr == '[') ? ']' : ')');
 409
 410   /* Need to capture line and column numbers here. */
 411   long line = SCM_LINUM (port);
 412   int column = SCM_COL (port) - 1;
 413
 414   c = flush_ws (port, opts, FUNC_NAME);
 415   if (terminating_char == c)
 416     return SCM_EOL;
 417
 418   scm_ungetc (c, port);
 419   tmp = scm_read_expression (port, opts);
 420
 421   /* Note that it is possible for scm_read_expression to return
 422      scm_sym_dot, but not as part of a dotted pair: as in #{.}#.  So
 423      check that it's a real dot by checking `c'.  */
 424   if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
 425     {
 426       ans = scm_read_expression (port, opts);
 427       if (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
 428         scm_i_input_error (FUNC_NAME, port, "missing close paren",
 429                            SCM_EOL);
 430       return ans;
 431     }
 432
 433   /* Build the head of the list structure. */
 434   ans = tl = scm_cons (tmp, SCM_EOL);
 435
 436   while (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
 437     {
 438       SCM new_tail;
 439
 440       if (c == ')' || (c == ']' && opts->square_brackets_p))
 441         scm_i_input_error (FUNC_NAME, port,
 442                            "in pair: mismatched close paren: ~A",
 443                            scm_list_1 (SCM_MAKE_CHAR (c)));
 444
 445       scm_ungetc (c, port);
 446       tmp = scm_read_expression (port, opts);
 447
 448       /* See above note about scm_sym_dot.  */
 449       if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
 450         {
 451           SCM_SETCDR (tl, scm_read_expression (port, opts));
 452
 453           c = flush_ws (port, opts, FUNC_NAME);
 454           if (terminating_char != c)
 455             scm_i_input_error (FUNC_NAME, port,
 456                                "in pair: missing close paren", SCM_EOL);
 457           goto exit;
 458         }
 459
 460       new_tail = scm_cons (tmp, SCM_EOL);
 461       SCM_SETCDR (tl, new_tail);
 462       tl = new_tail;
 463     }
 464
 465  exit:
 466   return maybe_annotate_source (ans, port, opts, line, column);
 467 }
 468 #undef FUNC_NAME
 469
 470
 471 /* Read a hexadecimal number NDIGITS in length.  Put its value into the variable
 472    C.  If TERMINATOR is non-null, terminate early if the TERMINATOR character is
 473    found.  */
 474 #define SCM_READ_HEX_ESCAPE(ndigits, terminator)                   \
 475   do                                                               \
 476     {                                                              \
 477       scm_t_wchar a;                                               \
 478       size_t i = 0;                                                \
 479       c = 0;                                                       \
 480       while (i < ndigits)                                          \
 481         {                                                          \
 482           a = scm_getc (port);                                     \
 483           if (a == EOF)                                            \
 484             goto str_eof;                                          \
 485           if (terminator                                           \
 486               && (a == (scm_t_wchar) terminator)                   \
 487               && (i > 0))                                          \
 488             break;                                                 \
 489           if ('0' <= a && a <= '9')                                \
 490             a -= '0';                                              \
 491           else if ('A' <= a && a <= 'F')                           \
 492             a = a - 'A' + 10;                                      \
 493           else if ('a' <= a && a <= 'f')                           \
 494             a = a - 'a' + 10;                                      \
 495           else                                                     \
 496             {                                                      \
 497               c = a;                                               \
 498               goto bad_escaped;                                    \
 499             }                                                      \
 500           c = c * 16 + a;                                          \
 501           i ++;                                                    \
 502         }                                                          \
 503     } while (0)
 504
 505 static void
 506 skip_intraline_whitespace (SCM port)
 507 {
 508   scm_t_wchar c;
 509
 510   do
 511     {
 512       c = scm_getc (port);
 513       if (c == EOF)
 514         return;
 515     }
 516   while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
 517
 518   scm_ungetc (c, port);
 519 }
 520
 521 static SCM
 522 scm_read_string (int chr, SCM port, scm_t_read_opts *opts)
 523 #define FUNC_NAME "scm_lreadr"
 524 {
 525   /* For strings smaller than C_STR, this function creates only one Scheme
 526      object (the string returned).  */
 527
 528   SCM str = SCM_EOL;
 529   size_t c_str_len = 0;
 530   scm_t_wchar c, c_str[READER_STRING_BUFFER_SIZE];
 531
 532   /* Need to capture line and column numbers here. */
 533   long line = SCM_LINUM (port);
 534   int column = SCM_COL (port) - 1;
 535
 536   while ('"' != (c = scm_getc (port)))
 537     {
 538       if (c == EOF)
 539         {
 540         str_eof:
 541           scm_i_input_error (FUNC_NAME, port,
 542                              "end of file in string constant", SCM_EOL);
 543         }
 544
 545       if (c_str_len + 1 >= READER_STRING_BUFFER_SIZE)
 546         {
 547           str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
 548           c_str_len = 0;
 549         }
 550
 551       if (c == '\\')
 552         {
 553           switch (c = scm_getc (port))
 554             {
 555             case EOF:
 556               goto str_eof;
 557             case '"':
 558             case '\\':
 559               break;
 560             case '\n':
 561               if (opts->hungry_eol_escapes_p)
 562                 skip_intraline_whitespace (port);
 563               continue;
 564             case '0':
 565               c = '\0';
 566               break;
 567             case 'f':
 568               c = '\f';
 569               break;
 570             case 'n':
 571               c = '\n';
 572               break;
 573             case 'r':
 574               c = '\r';
 575               break;
 576             case 't':
 577               c = '\t';
 578               break;
 579             case 'a':
 580               c = '\007';
 581               break;
 582             case 'v':
 583               c = '\v';
 584               break;
 585             case 'b':
 586               c = '\010';
 587               break;
 588             case 'x':
 589               if (opts->r6rs_escapes_p)
 590                 SCM_READ_HEX_ESCAPE (10, ';');
 591               else
 592                 SCM_READ_HEX_ESCAPE (2, '\0');
 593               break;
 594             case 'u':
 595               if (!opts->r6rs_escapes_p)
 596                 {
 597                   SCM_READ_HEX_ESCAPE (4, '\0');
 598                   break;
 599                 }
 600             case 'U':
 601               if (!opts->r6rs_escapes_p)
 602                 {
 603                   SCM_READ_HEX_ESCAPE (6, '\0');
 604                   break;
 605                 }
 606             default:
 607             bad_escaped:
 608               scm_i_input_error (FUNC_NAME, port,
 609                                  "illegal character in escape sequence: ~S",
 610                                  scm_list_1 (SCM_MAKE_CHAR (c)));
 611             }
 612         }
 613
 614       c_str[c_str_len++] = c;
 615     }
 616
 617   if (scm_is_null (str))
 618     /* Fast path: we got a string that fits in C_STR.  */
 619     str = scm_from_utf32_stringn (c_str, c_str_len);
 620   else
 621     {
 622       if (c_str_len > 0)
 623         str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
 624
 625       str = scm_string_concatenate_reverse (str, SCM_UNDEFINED, SCM_UNDEFINED);
 626     }
 627
 628   return maybe_annotate_source (str, port, opts, line, column);
 629 }
 630 #undef FUNC_NAME
 631
 632
 633 static SCM
 634 scm_read_number (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 635 {
 636   SCM result, str = SCM_EOL;
 637   char local_buffer[READER_BUFFER_SIZE], *buffer;
 638   size_t bytes_read;
 639   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 640
 641   /* Need to capture line and column numbers here. */
 642   long line = SCM_LINUM (port);
 643   int column = SCM_COL (port) - 1;
 644
 645   scm_ungetc (chr, port);
 646   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 647                                 &bytes_read);
 648
 649   str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
 650
 651   result = scm_string_to_number (str, SCM_UNDEFINED);
 652   if (scm_is_false (result))
 653     {
 654       /* Return a symbol instead of a number */
 655       if (opts->case_insensitive_p)
 656         str = scm_string_downcase_x (str);
 657       result = scm_string_to_symbol (str);
 658     }
 659   else if (SCM_NIMP (result))
 660     result = maybe_annotate_source (result, port, opts, line, column);
 661
 662   SCM_COL (port) += scm_i_string_length (str);
 663   return result;
 664 }
 665
 666 static SCM
 667 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 668 {
 669   SCM result;
 670   int ends_with_colon = 0;
 671   size_t bytes_read;
 672   int postfix = (opts->keyword_style == KEYWORD_STYLE_POSTFIX);
 673   char local_buffer[READER_BUFFER_SIZE], *buffer;
 674   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 675   SCM str;
 676
 677   scm_ungetc (chr, port);
 678   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 679                                 &bytes_read);
 680   if (bytes_read > 0)
 681     ends_with_colon = buffer[bytes_read - 1] == ':';
 682
 683   if (postfix && ends_with_colon && (bytes_read > 1))
 684     {
 685       str = scm_from_stringn (buffer, bytes_read - 1,
 686                               pt->encoding, pt->ilseq_handler);
 687
 688       if (opts->case_insensitive_p)
 689         str = scm_string_downcase_x (str);
 690       result = scm_symbol_to_keyword (scm_string_to_symbol (str));
 691     }
 692   else
 693     {
 694       str = scm_from_stringn (buffer, bytes_read,
 695                               pt->encoding, pt->ilseq_handler);
 696
 697       if (opts->case_insensitive_p)
 698         str = scm_string_downcase_x (str);
 699       result = scm_string_to_symbol (str);
 700     }
 701
 702   SCM_COL (port) += scm_i_string_length (str);
 703   return result;
 704 }
 705
 706 static SCM
 707 scm_read_number_and_radix (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 708 #define FUNC_NAME "scm_lreadr"
 709 {
 710   SCM result;
 711   size_t read;
 712   char local_buffer[READER_BUFFER_SIZE], *buffer;
 713   unsigned int radix;
 714   SCM str;
 715   scm_t_port *pt;
 716
 717   switch (chr)
 718     {
 719     case 'B':
 720     case 'b':
 721       radix = 2;
 722       break;
 723
 724     case 'o':
 725     case 'O':
 726       radix = 8;
 727       break;
 728
 729     case 'd':
 730     case 'D':
 731       radix = 10;
 732       break;
 733
 734     case 'x':
 735     case 'X':
 736       radix = 16;
 737       break;
 738
 739     default:
 740       scm_ungetc (chr, port);
 741       scm_ungetc ('#', port);
 742       radix = 10;
 743     }
 744
 745   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 746                                 &read);
 747
 748   pt = SCM_PTAB_ENTRY (port);
 749   str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
 750
 751   result = scm_string_to_number (str, scm_from_uint (radix));
 752
 753   SCM_COL (port) += scm_i_string_length (str);
 754
 755   if (scm_is_true (result))
 756     return result;
 757
 758   scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
 759
 760   return SCM_BOOL_F;
 761 }
 762 #undef FUNC_NAME
 763
 764 static SCM
 765 scm_read_quote (int chr, SCM port, scm_t_read_opts *opts)
 766 {
 767   SCM p;
 768   long line = SCM_LINUM (port);
 769   int column = SCM_COL (port) - 1;
 770
 771   switch (chr)
 772     {
 773     case '`':
 774       p = scm_sym_quasiquote;
 775       break;
 776
 777     case '\'':
 778       p = scm_sym_quote;
 779       break;
 780
 781     case ',':
 782       {
 783         scm_t_wchar c;
 784
 785         c = scm_getc (port);
 786         if ('@' == c)
 787           p = scm_sym_uq_splicing;
 788         else
 789           {
 790             scm_ungetc (c, port);
 791             p = scm_sym_unquote;
 792           }
 793         break;
 794       }
 795
 796     default:
 797       fprintf (stderr, "%s: unhandled quote character (%i)\n",
 798                "scm_read_quote", chr);
 799       abort ();
 800     }
 801
 802   p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
 803   return maybe_annotate_source (p, port, opts, line, column);
 804 }
 805
 806 SCM_SYMBOL (sym_syntax, "syntax");
 807 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
 808 SCM_SYMBOL (sym_unsyntax, "unsyntax");
 809 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
 810
 811 static SCM
 812 scm_read_syntax (int chr, SCM port, scm_t_read_opts *opts)
 813 {
 814   SCM p;
 815   long line = SCM_LINUM (port);
 816   int column = SCM_COL (port) - 1;
 817
 818   switch (chr)
 819     {
 820     case '`':
 821       p = sym_quasisyntax;
 822       break;
 823
 824     case '\'':
 825       p = sym_syntax;
 826       break;
 827
 828     case ',':
 829       {
 830         int c;
 831
 832         c = scm_getc (port);
 833         if ('@' == c)
 834           p = sym_unsyntax_splicing;
 835         else
 836           {
 837             scm_ungetc (c, port);
 838             p = sym_unsyntax;
 839           }
 840         break;
 841       }
 842
 843     default:
 844       fprintf (stderr, "%s: unhandled syntax character (%i)\n",
 845                "scm_read_syntax", chr);
 846       abort ();
 847     }
 848
 849   p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
 850   return maybe_annotate_source (p, port, opts, line, column);
 851 }
 852
 853 static SCM
 854 scm_read_nil (int chr, SCM port, scm_t_read_opts *opts)
 855 {
 856   SCM id = scm_read_mixed_case_symbol (chr, port, opts);
 857
 858   if (!scm_is_eq (id, sym_nil))
 859     scm_i_input_error ("scm_read_nil", port,
 860                        "unexpected input while reading #nil: ~a",
 861                        scm_list_1 (id));
 862
 863   return SCM_ELISP_NIL;
 864 }
 865
 866 static SCM
 867 scm_read_semicolon_comment (int chr, SCM port)
 868 {
 869   int c;
 870
 871   /* We use the get_byte here because there is no need to get the
 872      locale correct with comment input. This presumes that newline
 873      always represents itself no matter what the encoding is.  */
 874   for (c = scm_get_byte_or_eof (port);
 875        (c != EOF) && (c != '\n');
 876        c = scm_get_byte_or_eof (port));
 877
 878   return SCM_UNSPECIFIED;
 879 }
 880
 881 \f
 882 /* Sharp readers, i.e. readers called after a `#' sign has been read.  */
 883
 884 static SCM
 885 scm_read_boolean (int chr, SCM port)
 886 {
 887   switch (chr)
 888     {
 889     case 't':
 890     case 'T':
 891       return SCM_BOOL_T;
 892
 893     case 'f':
 894     case 'F':
 895       return SCM_BOOL_F;
 896     }
 897
 898   return SCM_UNSPECIFIED;
 899 }
 900
 901 static SCM
 902 scm_read_character (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 903 #define FUNC_NAME "scm_lreadr"
 904 {
 905   char buffer[READER_CHAR_NAME_MAX_SIZE];
 906   SCM charname;
 907   size_t charname_len, bytes_read;
 908   scm_t_wchar cp;
 909   int overflow;
 910   scm_t_port *pt;
 911
 912   overflow = read_token (port, opts, buffer, READER_CHAR_NAME_MAX_SIZE,
 913                          &bytes_read);
 914   if (overflow)
 915     scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
 916
 917   if (bytes_read == 0)
 918     {
 919       chr = scm_getc (port);
 920       if (chr == EOF)
 921         scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
 922                            "while reading character", SCM_EOL);
 923
 924       /* CHR must be a token delimiter, like a whitespace.  */
 925       return (SCM_MAKE_CHAR (chr));
 926     }
 927
 928   pt = SCM_PTAB_ENTRY (port);
 929
 930   /* Simple ASCII characters can be processed immediately.  Also, simple
 931      ISO-8859-1 characters can be processed immediately if the encoding for this
 932      port is ISO-8859-1.  */
 933   if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
 934     {
 935       SCM_COL (port) += 1;
 936       return SCM_MAKE_CHAR (buffer[0]);
 937     }
 938
 939   /* Otherwise, convert the buffer into a proper scheme string for
 940      processing.  */
 941   charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
 942                                pt->ilseq_handler);
 943   charname_len = scm_i_string_length (charname);
 944   SCM_COL (port) += charname_len;
 945   cp = scm_i_string_ref (charname, 0);
 946   if (charname_len == 1)
 947     return SCM_MAKE_CHAR (cp);
 948
 949   /* Ignore dotted circles, which may be used to keep combining characters from
 950      combining with the backslash in #\charname.  */
 951   if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
 952     return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
 953
 954   if (cp >= '0' && cp < '8')
 955     {
 956       /* Dirk:FIXME::  This type of character syntax is not R5RS
 957        * compliant.  Further, it should be verified that the constant
 958        * does only consist of octal digits.  */
 959       SCM p = scm_string_to_number (charname, scm_from_uint (8));
 960       if (SCM_I_INUMP (p))
 961         {
 962           scm_t_wchar c = scm_to_uint32 (p);
 963           if (SCM_IS_UNICODE_CHAR (c))
 964             return SCM_MAKE_CHAR (c);
 965           else
 966             scm_i_input_error (FUNC_NAME, port,
 967                                "out-of-range octal character escape: ~a",
 968                                scm_list_1 (charname));
 969         }
 970     }
 971
 972   if (cp == 'x' && (charname_len > 1))
 973     {
 974       SCM p;
 975
 976       /* Convert from hex, skipping the initial 'x' character in CHARNAME */
 977       p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
 978                                 scm_from_uint (16));
 979       if (SCM_I_INUMP (p))
 980         {
 981           scm_t_wchar c = scm_to_uint32 (p);
 982           if (SCM_IS_UNICODE_CHAR (c))
 983             return SCM_MAKE_CHAR (c);
 984           else
 985             scm_i_input_error (FUNC_NAME, port,
 986                                "out-of-range hex character escape: ~a",
 987                                scm_list_1 (charname));
 988         }
 989     }
 990
 991   /* The names of characters should never have non-Latin1
 992      characters.  */
 993   if (scm_i_is_narrow_string (charname)
 994       || scm_i_try_narrow_string (charname))
 995     { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
 996                                        charname_len);
 997       if (scm_is_true (ch))
 998         return ch;
 999     }
1000
1001   scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
1002                      scm_list_1 (charname));
1003
1004   return SCM_UNSPECIFIED;
1005 }
1006 #undef FUNC_NAME
1007
1008 static SCM
1009 scm_read_keyword (int chr, SCM port, scm_t_read_opts *opts)
1010 {
1011   SCM symbol;
1012
1013   /* Read the symbol that comprises the keyword.  Doing this instead of
1014      invoking a specific symbol reader function allows `scm_read_keyword ()'
1015      to adapt to the delimiters currently valid of symbols.
1016
1017      XXX: This implementation allows sloppy syntaxes like `#:  key'.  */
1018   symbol = scm_read_expression (port, opts);
1019   if (!scm_is_symbol (symbol))
1020     scm_i_input_error ("scm_read_keyword", port,
1021                        "keyword prefix `~a' not followed by a symbol: ~s",
1022                        scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1023
1024   return (scm_symbol_to_keyword (symbol));
1025 }
1026
1027 static SCM
1028 scm_read_vector (int chr, SCM port, scm_t_read_opts *opts,
1029                  long line, int column)
1030 {
1031   /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1032      guarantee that it's going to do what we want.  After all, this is an
1033      implementation detail of `scm_read_vector ()', not a desirable
1034      property.  */
1035   return maybe_annotate_source (scm_vector (scm_read_sexp (chr, port, opts)),
1036                                 port, opts, line, column);
1037 }
1038
1039 /* Helper used by scm_read_array */
1040 static int
1041 read_decimal_integer (SCM port, int c, ssize_t *resp)
1042 {
1043   ssize_t sign = 1;
1044   ssize_t res = 0;
1045   int got_it = 0;
1046
1047   if (c == '-')
1048     {
1049       sign = -1;
1050       c = scm_getc (port);
1051     }
1052
1053   while ('0' <= c && c <= '9')
1054     {
1055       res = 10*res + c-'0';
1056       got_it = 1;
1057       c = scm_getc (port);
1058     }
1059
1060   if (got_it)
1061     *resp = sign * res;
1062   return c;
1063 }
1064
1065 /* Read an array.  This function can also read vectors and uniform
1066    vectors.  Also, the conflict between '#f' and '#f32' and '#f64' is
1067    handled here.
1068
1069    C is the first character read after the '#'.
1070 */
1071 static SCM
1072 scm_read_array (int c, SCM port, scm_t_read_opts *opts, long line, int column)
1073 {
1074   ssize_t rank;
1075   scm_t_wchar tag_buf[8];
1076   int tag_len;
1077
1078   SCM tag, shape = SCM_BOOL_F, elements, array;
1079
1080   /* XXX - shortcut for ordinary vectors.  Shouldn't be necessary but
1081      the array code can not deal with zero-length dimensions yet, and
1082      we want to allow zero-length vectors, of course.
1083   */
1084   if (c == '(')
1085     return scm_read_vector (c, port, opts, line, column);
1086
1087   /* Disambiguate between '#f' and uniform floating point vectors.
1088    */
1089   if (c == 'f')
1090     {
1091       c = scm_getc (port);
1092       if (c != '3' && c != '6')
1093         {
1094           if (c != EOF)
1095             scm_ungetc (c, port);
1096           return SCM_BOOL_F;
1097         }
1098       rank = 1;
1099       tag_buf[0] = 'f';
1100       tag_len = 1;
1101       goto continue_reading_tag;
1102     }
1103
1104   /* Read rank. */
1105   rank = 1;
1106   c = read_decimal_integer (port, c, &rank);
1107   if (rank < 0)
1108     scm_i_input_error (NULL, port, "array rank must be non-negative",
1109                        SCM_EOL);
1110
1111   /* Read tag. */
1112   tag_len = 0;
1113  continue_reading_tag:
1114   while (c != EOF && c != '(' && c != '@' && c != ':'
1115          && tag_len < sizeof tag_buf / sizeof tag_buf[0])
1116     {
1117       tag_buf[tag_len++] = c;
1118       c = scm_getc (port);
1119     }
1120   if (tag_len == 0)
1121     tag = SCM_BOOL_T;
1122   else
1123     {
1124       tag = scm_string_to_symbol (scm_from_utf32_stringn (tag_buf, tag_len));
1125       if (tag_len == sizeof tag_buf / sizeof tag_buf[0])
1126         scm_i_input_error (NULL, port, "invalid array tag, starting with: ~a",
1127                            scm_list_1 (tag));
1128     }
1129
1130   /* Read shape. */
1131   if (c == '@' || c == ':')
1132     {
1133       shape = SCM_EOL;
1134
1135       do
1136         {
1137           ssize_t lbnd = 0, len = 0;
1138           SCM s;
1139
1140           if (c == '@')
1141             {
1142               c = scm_getc (port);
1143               c = read_decimal_integer (port, c, &lbnd);
1144             }
1145
1146           s = scm_from_ssize_t (lbnd);
1147
1148           if (c == ':')
1149             {
1150               c = scm_getc (port);
1151               c = read_decimal_integer (port, c, &len);
1152               if (len < 0)
1153                 scm_i_input_error (NULL, port,
1154                                    "array length must be non-negative",
1155                                    SCM_EOL);
1156
1157               s = scm_list_2 (s, scm_from_ssize_t (lbnd+len-1));
1158             }
1159
1160           shape = scm_cons (s, shape);
1161         } while (c == '@' || c == ':');
1162
1163       shape = scm_reverse_x (shape, SCM_EOL);
1164     }
1165
1166   /* Read nested lists of elements. */
1167   if (c != '(')
1168     scm_i_input_error (NULL, port,
1169                        "missing '(' in vector or array literal",
1170                        SCM_EOL);
1171   elements = scm_read_sexp (c, port, opts);
1172
1173   if (scm_is_false (shape))
1174     shape = scm_from_ssize_t (rank);
1175   else if (scm_ilength (shape) != rank)
1176     scm_i_input_error
1177       (NULL, port,
1178        "the number of shape specifications must match the array rank",
1179        SCM_EOL);
1180
1181   /* Handle special print syntax of rank zero arrays; see
1182      scm_i_print_array for a rationale. */
1183   if (rank == 0)
1184     {
1185       if (!scm_is_pair (elements))
1186         scm_i_input_error (NULL, port,
1187                            "too few elements in array literal, need 1",
1188                            SCM_EOL);
1189       if (!scm_is_null (SCM_CDR (elements)))
1190         scm_i_input_error (NULL, port,
1191                            "too many elements in array literal, want 1",
1192                            SCM_EOL);
1193       elements = SCM_CAR (elements);
1194     }
1195
1196   /* Construct array, annotate with source location, and return. */
1197   array = scm_list_to_typed_array (tag, shape, elements);
1198   return maybe_annotate_source (array, port, opts, line, column);
1199 }
1200
1201 static SCM
1202 scm_read_srfi4_vector (int chr, SCM port, scm_t_read_opts *opts,
1203                        long line, int column)
1204 {
1205   return scm_read_array (chr, port, opts, line, column);
1206 }
1207
1208 static SCM
1209 scm_read_bytevector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1210                      long line, int column)
1211 {
1212   chr = scm_getc (port);
1213   if (chr != 'u')
1214     goto syntax;
1215
1216   chr = scm_getc (port);
1217   if (chr != '8')
1218     goto syntax;
1219
1220   chr = scm_getc (port);
1221   if (chr != '(')
1222     goto syntax;
1223
1224   return maybe_annotate_source
1225     (scm_u8_list_to_bytevector (scm_read_sexp (chr, port, opts)),
1226      port, opts, line, column);
1227
1228  syntax:
1229   scm_i_input_error ("read_bytevector", port,
1230                      "invalid bytevector prefix",
1231                      SCM_MAKE_CHAR (chr));
1232   return SCM_UNSPECIFIED;
1233 }
1234
1235 static SCM
1236 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1237                            long line, int column)
1238 {
1239   /* Read the `#*10101'-style read syntax for bit vectors in Guile.  This is
1240      terribly inefficient but who cares?  */
1241   SCM s_bits = SCM_EOL;
1242
1243   for (chr = scm_getc (port);
1244        (chr != EOF) && ((chr == '0') || (chr == '1'));
1245        chr = scm_getc (port))
1246     {
1247       s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1248     }
1249
1250   if (chr != EOF)
1251     scm_ungetc (chr, port);
1252
1253   return maybe_annotate_source
1254     (scm_bitvector (scm_reverse_x (s_bits, SCM_EOL)),
1255      port, opts, line, column);
1256 }
1257
1258 static SCM
1259 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1260 {
1261   int bang_seen = 0;
1262
1263   for (;;)
1264     {
1265       int c = scm_getc (port);
1266
1267       if (c == EOF)
1268         scm_i_input_error ("skip_block_comment", port,
1269                            "unterminated `#! ... !#' comment", SCM_EOL);
1270
1271       if (c == '!')
1272         bang_seen = 1;
1273       else if (c == '#' && bang_seen)
1274         break;
1275       else
1276         bang_seen = 0;
1277     }
1278
1279   return SCM_UNSPECIFIED;
1280 }
1281
1282 static void set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts,
1283                                          int value);
1284
1285 static SCM
1286 scm_read_shebang (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
1287 {
1288   char name[READER_DIRECTIVE_NAME_MAX_SIZE + 1];
1289   int c;
1290   int i = 0;
1291
1292   while (i <= READER_DIRECTIVE_NAME_MAX_SIZE)
1293     {
1294       c = scm_getc (port);
1295       if (c == EOF)
1296         scm_i_input_error ("skip_block_comment", port,
1297                            "unterminated `#! ... !#' comment", SCM_EOL);
1298       else if (('a' <= c && c <= 'z') || ('0' <= c && c <= '9') || c == '-')
1299         name[i++] = c;
1300       else if (CHAR_IS_DELIMITER (c))
1301         {
1302           scm_ungetc (c, port);
1303           name[i] = '\0';
1304           if (0 == strcmp ("r6rs", name))
1305             ;  /* Silently ignore */
1306           else if (0 == strcmp ("fold-case", name))
1307             set_port_case_insensitive_p (port, opts, 1);
1308           else if (0 == strcmp ("no-fold-case", name))
1309             set_port_case_insensitive_p (port, opts, 0);
1310           else
1311             break;
1312
1313           return SCM_UNSPECIFIED;
1314         }
1315     }
1316   while (i > 0)
1317     scm_ungetc (name[--i], port);
1318   return scm_read_scsh_block_comment (chr, port);
1319 }
1320
1321 static SCM
1322 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1323 {
1324   /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1325      nested.  So care must be taken.  */
1326   int nesting_level = 1;
1327
1328   int a = scm_getc (port);
1329
1330   if (a == EOF)
1331     scm_i_input_error ("scm_read_r6rs_block_comment", port,
1332                        "unterminated `#| ... |#' comment", SCM_EOL);
1333
1334   while (nesting_level > 0)
1335     {
1336       int b = scm_getc (port);
1337
1338       if (b == EOF)
1339         scm_i_input_error ("scm_read_r6rs_block_comment", port,
1340                            "unterminated `#| ... |#' comment", SCM_EOL);
1341
1342       if (a == '|' && b == '#')
1343         {
1344           nesting_level--;
1345           b = EOF;
1346         }
1347       else if (a == '#' && b == '|')
1348         {
1349           nesting_level++;
1350           b = EOF;
1351         }
1352
1353       a = b;
1354     }
1355
1356   return SCM_UNSPECIFIED;
1357 }
1358
1359 static SCM
1360 scm_read_commented_expression (scm_t_wchar chr, SCM port,
1361                                scm_t_read_opts *opts)
1362 {
1363   scm_t_wchar c;
1364
1365   c = flush_ws (port, opts, (char *) NULL);
1366   if (EOF == c)
1367     scm_i_input_error ("read_commented_expression", port,
1368                        "no expression after #; comment", SCM_EOL);
1369   scm_ungetc (c, port);
1370   scm_read_expression (port, opts);
1371   return SCM_UNSPECIFIED;
1372 }
1373
1374 static SCM
1375 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1376 {
1377   /* Guile's extended symbol read syntax looks like this:
1378
1379        #{This is all a symbol name}#
1380
1381      So here, CHR is expected to be `{'.  */
1382   int saw_brace = 0;
1383   size_t len = 0;
1384   SCM buf = scm_i_make_string (1024, NULL, 0);
1385
1386   buf = scm_i_string_start_writing (buf);
1387
1388   while ((chr = scm_getc (port)) != EOF)
1389     {
1390       if (saw_brace)
1391         {
1392           if (chr == '#')
1393             {
1394               break;
1395             }
1396           else
1397             {
1398               saw_brace = 0;
1399               scm_i_string_set_x (buf, len++, '}');
1400             }
1401         }
1402
1403       if (chr == '}')
1404         saw_brace = 1;
1405       else if (chr == '\\')
1406         {
1407           /* It used to be that print.c would print extended-read-syntax
1408              symbols with backslashes before "non-standard" chars, but
1409              this routine wouldn't do anything with those escapes.
1410              Bummer.  What we've done is to change print.c to output
1411              R6RS hex escapes for those characters, relying on the fact
1412              that the extended read syntax would never put a `\' before
1413              an `x'.  For now, we just ignore other instances of
1414              backslash in the string.  */
1415           switch ((chr = scm_getc (port)))
1416             {
1417             case EOF:
1418               goto done;
1419             case 'x':
1420               {
1421                 scm_t_wchar c;
1422
1423                 SCM_READ_HEX_ESCAPE (10, ';');
1424                 scm_i_string_set_x (buf, len++, c);
1425                 break;
1426
1427               str_eof:
1428                 chr = EOF;
1429                 goto done;
1430
1431               bad_escaped:
1432                 scm_i_string_stop_writing ();
1433                 scm_i_input_error ("scm_read_extended_symbol", port,
1434                                    "illegal character in escape sequence: ~S",
1435                                    scm_list_1 (SCM_MAKE_CHAR (c)));
1436                 break;
1437               }
1438             default:
1439               scm_i_string_set_x (buf, len++, chr);
1440               break;
1441             }
1442         }
1443       else
1444         scm_i_string_set_x (buf, len++, chr);
1445
1446       if (len >= scm_i_string_length (buf) - 2)
1447         {
1448           SCM addy;
1449
1450           scm_i_string_stop_writing ();
1451           addy = scm_i_make_string (1024, NULL, 0);
1452           buf = scm_string_append (scm_list_2 (buf, addy));
1453           len = 0;
1454           buf = scm_i_string_start_writing (buf);
1455         }
1456     }
1457
1458  done:
1459   scm_i_string_stop_writing ();
1460   if (chr == EOF)
1461     scm_i_input_error ("scm_read_extended_symbol", port,
1462                        "end of file while reading symbol", SCM_EOL);
1463
1464   return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1465 }
1466
1467
1468 \f
1469 /* Top-level token readers, i.e., dispatchers.  */
1470
1471 static SCM
1472 scm_read_sharp_extension (int chr, SCM port, scm_t_read_opts *opts)
1473 {
1474   SCM proc;
1475
1476   proc = scm_get_hash_procedure (chr);
1477   if (scm_is_true (scm_procedure_p (proc)))
1478     {
1479       long line = SCM_LINUM (port);
1480       int column = SCM_COL (port) - 2;
1481       SCM got;
1482
1483       got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1484
1485       if (opts->record_positions_p && SCM_NIMP (got)
1486           && !scm_i_has_source_properties (got))
1487         scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1488
1489       return got;
1490     }
1491
1492   return SCM_UNSPECIFIED;
1493 }
1494
1495 /* The reader for the sharp `#' character.  It basically dispatches reads
1496    among the above token readers.   */
1497 static SCM
1498 scm_read_sharp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1499                 long line, int column)
1500 #define FUNC_NAME "scm_lreadr"
1501 {
1502   SCM result;
1503
1504   chr = scm_getc (port);
1505
1506   result = scm_read_sharp_extension (chr, port, opts);
1507   if (!scm_is_eq (result, SCM_UNSPECIFIED))
1508     return result;
1509
1510   switch (chr)
1511     {
1512     case '\\':
1513       return (scm_read_character (chr, port, opts));
1514     case '(':
1515       return (scm_read_vector (chr, port, opts, line, column));
1516     case 's':
1517     case 'u':
1518     case 'f':
1519     case 'c':
1520       /* This one may return either a boolean or an SRFI-4 vector.  */
1521       return (scm_read_srfi4_vector (chr, port, opts, line, column));
1522     case 'v':
1523       return (scm_read_bytevector (chr, port, opts, line, column));
1524     case '*':
1525       return (scm_read_guile_bit_vector (chr, port, opts, line, column));
1526     case 't':
1527     case 'T':
1528     case 'F':
1529       return (scm_read_boolean (chr, port));
1530     case ':':
1531       return (scm_read_keyword (chr, port, opts));
1532     case '0': case '1': case '2': case '3': case '4':
1533     case '5': case '6': case '7': case '8': case '9':
1534     case '@':
1535 #if SCM_ENABLE_DEPRECATED
1536       /* See below for 'i' and 'e'. */
1537     case 'a':
1538     case 'y':
1539     case 'h':
1540     case 'l':
1541 #endif
1542       return (scm_read_array (chr, port, opts, line, column));
1543
1544     case 'i':
1545     case 'e':
1546 #if SCM_ENABLE_DEPRECATED
1547       {
1548         /* When next char is '(', it really is an old-style
1549            uniform array. */
1550         scm_t_wchar next_c = scm_getc (port);
1551         if (next_c != EOF)
1552           scm_ungetc (next_c, port);
1553         if (next_c == '(')
1554           return scm_read_array (chr, port, opts, line, column);
1555         /* Fall through. */
1556       }
1557 #endif
1558     case 'b':
1559     case 'B':
1560     case 'o':
1561     case 'O':
1562     case 'd':
1563     case 'D':
1564     case 'x':
1565     case 'X':
1566     case 'I':
1567     case 'E':
1568       return (scm_read_number_and_radix (chr, port, opts));
1569     case '{':
1570       return (scm_read_extended_symbol (chr, port));
1571     case '!':
1572       return (scm_read_shebang (chr, port, opts));
1573     case ';':
1574       return (scm_read_commented_expression (chr, port, opts));
1575     case '`':
1576     case '\'':
1577     case ',':
1578       return (scm_read_syntax (chr, port, opts));
1579     case 'n':
1580       return (scm_read_nil (chr, port, opts));
1581     default:
1582       result = scm_read_sharp_extension (chr, port, opts);
1583       if (scm_is_eq (result, SCM_UNSPECIFIED))
1584         {
1585           /* To remain compatible with 1.8 and earlier, the following
1586              characters have lower precedence than `read-hash-extend'
1587              characters.  */
1588           switch (chr)
1589             {
1590             case '|':
1591               return scm_read_r6rs_block_comment (chr, port);
1592             default:
1593               scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1594                                  scm_list_1 (SCM_MAKE_CHAR (chr)));
1595             }
1596         }
1597       else
1598         return result;
1599     }
1600
1601   return SCM_UNSPECIFIED;
1602 }
1603 #undef FUNC_NAME
1604
1605 static SCM
1606 scm_read_expression (SCM port, scm_t_read_opts *opts)
1607 #define FUNC_NAME "scm_read_expression"
1608 {
1609   while (1)
1610     {
1611       scm_t_wchar chr;
1612
1613       chr = scm_getc (port);
1614
1615       switch (chr)
1616         {
1617         case SCM_WHITE_SPACES:
1618         case SCM_LINE_INCREMENTORS:
1619           break;
1620         case ';':
1621           (void) scm_read_semicolon_comment (chr, port);
1622           break;
1623         case '[':
1624           if (!opts->square_brackets_p)
1625             return (scm_read_mixed_case_symbol (chr, port, opts));
1626           /* otherwise fall through */
1627         case '(':
1628           return (scm_read_sexp (chr, port, opts));
1629         case '"':
1630           return (scm_read_string (chr, port, opts));
1631         case '\'':
1632         case '`':
1633         case ',':
1634           return (scm_read_quote (chr, port, opts));
1635         case '#':
1636           {
1637             long line  = SCM_LINUM (port);
1638             int column = SCM_COL (port) - 1;
1639             SCM result = scm_read_sharp (chr, port, opts, line, column);
1640             if (scm_is_eq (result, SCM_UNSPECIFIED))
1641               /* We read a comment or some such.  */
1642               break;
1643             else
1644               return result;
1645           }
1646         case ')':
1647           scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1648           break;
1649         case ']':
1650           if (opts->square_brackets_p)
1651             scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1652           /* otherwise fall through */
1653         case EOF:
1654           return SCM_EOF_VAL;
1655         case ':':
1656           if (opts->keyword_style == KEYWORD_STYLE_PREFIX)
1657             return scm_symbol_to_keyword (scm_read_expression (port, opts));
1658           /* Fall through.  */
1659
1660         default:
1661           {
1662             if (((chr >= '0') && (chr <= '9'))
1663                 || (strchr ("+-.", chr)))
1664               return (scm_read_number (chr, port, opts));
1665             else
1666               return (scm_read_mixed_case_symbol (chr, port, opts));
1667           }
1668         }
1669     }
1670 }
1671 #undef FUNC_NAME
1672
1673 \f
1674 /* Actual reader.  */
1675
1676 static void init_read_options (SCM port, scm_t_read_opts *opts);
1677
1678 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1679             (SCM port),
1680             "Read an s-expression from the input port @var{port}, or from\n"
1681             "the current input port if @var{port} is not specified.\n"
1682             "Any whitespace before the next token is discarded.")
1683 #define FUNC_NAME s_scm_read
1684 {
1685   scm_t_read_opts opts;
1686   int c;
1687
1688   if (SCM_UNBNDP (port))
1689     port = scm_current_input_port ();
1690   SCM_VALIDATE_OPINPORT (1, port);
1691
1692   init_read_options (port, &opts);
1693
1694   c = flush_ws (port, &opts, (char *) NULL);
1695   if (EOF == c)
1696     return SCM_EOF_VAL;
1697   scm_ungetc (c, port);
1698
1699   return (scm_read_expression (port, &opts));
1700 }
1701 #undef FUNC_NAME
1702
1703
1704 \f
1705
1706 /* Manipulate the read-hash-procedures alist.  This could be written in
1707    Scheme, but maybe it will also be used by C code during initialisation.  */
1708 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1709             (SCM chr, SCM proc),
1710             "Install the procedure @var{proc} for reading expressions\n"
1711             "starting with the character sequence @code{#} and @var{chr}.\n"
1712             "@var{proc} will be called with two arguments:  the character\n"
1713             "@var{chr} and the port to read further data from. The object\n"
1714             "returned will be the return value of @code{read}. \n"
1715             "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1716             )
1717 #define FUNC_NAME s_scm_read_hash_extend
1718 {
1719   SCM this;
1720   SCM prev;
1721
1722   SCM_VALIDATE_CHAR (1, chr);
1723   SCM_ASSERT (scm_is_false (proc)
1724               || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1725               proc, SCM_ARG2, FUNC_NAME);
1726
1727   /* Check if chr is already in the alist.  */
1728   this = scm_i_read_hash_procedures_ref ();
1729   prev = SCM_BOOL_F;
1730   while (1)
1731     {
1732       if (scm_is_null (this))
1733         {
1734           /* not found, so add it to the beginning.  */
1735           if (scm_is_true (proc))
1736             {
1737               SCM new = scm_cons (scm_cons (chr, proc),
1738                                   scm_i_read_hash_procedures_ref ());
1739               scm_i_read_hash_procedures_set_x (new);
1740             }
1741           break;
1742         }
1743       if (scm_is_eq (chr, SCM_CAAR (this)))
1744         {
1745           /* already in the alist.  */
1746           if (scm_is_false (proc))
1747             {
1748               /* remove it.  */
1749               if (scm_is_false (prev))
1750                 {
1751                   SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1752                   scm_i_read_hash_procedures_set_x (rest);
1753                 }
1754               else
1755                 scm_set_cdr_x (prev, SCM_CDR (this));
1756             }
1757           else
1758             {
1759               /* replace it.  */
1760               scm_set_cdr_x (SCM_CAR (this), proc);
1761             }
1762           break;
1763         }
1764       prev = this;
1765       this = SCM_CDR (this);
1766     }
1767
1768   return SCM_UNSPECIFIED;
1769 }
1770 #undef FUNC_NAME
1771
1772 /* Recover the read-hash procedure corresponding to char c.  */
1773 static SCM
1774 scm_get_hash_procedure (int c)
1775 {
1776   SCM rest = scm_i_read_hash_procedures_ref ();
1777
1778   while (1)
1779     {
1780       if (scm_is_null (rest))
1781         return SCM_BOOL_F;
1782
1783       if (SCM_CHAR (SCM_CAAR (rest)) == c)
1784         return SCM_CDAR (rest);
1785
1786       rest = SCM_CDR (rest);
1787     }
1788 }
1789
1790 #define SCM_ENCODING_SEARCH_SIZE (500)
1791
1792 /* Search the first few hundred characters of a file for an Emacs-like coding
1793    declaration.  Returns either NULL or a string whose storage has been
1794    allocated with `scm_gc_malloc ()'.  */
1795 char *
1796 scm_i_scan_for_encoding (SCM port)
1797 {
1798   scm_t_port *pt;
1799   char header[SCM_ENCODING_SEARCH_SIZE+1];
1800   size_t bytes_read, encoding_length, i;
1801   char *encoding = NULL;
1802   int utf8_bom = 0;
1803   char *pos, *encoding_start;
1804   int in_comment;
1805
1806   pt = SCM_PTAB_ENTRY (port);
1807
1808   if (pt->rw_active == SCM_PORT_WRITE)
1809     scm_flush (port);
1810
1811   if (pt->rw_random)
1812     pt->rw_active = SCM_PORT_READ;
1813
1814   if (pt->read_pos == pt->read_end)
1815     {
1816       /* We can use the read buffer, and thus avoid a seek. */
1817       if (scm_fill_input (port) == EOF)
1818         return NULL;
1819
1820       bytes_read = pt->read_end - pt->read_pos;
1821       if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
1822         bytes_read = SCM_ENCODING_SEARCH_SIZE;
1823
1824       if (bytes_read <= 1)
1825         /* An unbuffered port -- don't scan.  */
1826         return NULL;
1827
1828       memcpy (header, pt->read_pos, bytes_read);
1829       header[bytes_read] = '\0';
1830     }
1831   else
1832     {
1833       /* Try to read some bytes and then seek back.  Not all ports
1834          support seeking back; and indeed some file ports (like
1835          /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
1836          check performed by SCM_FPORT_FDES---but fail to seek
1837          backwards.  Hence this block comes second.  We prefer to use
1838          the read buffer in-place.  */
1839       if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1840         return NULL;
1841
1842       bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1843       header[bytes_read] = '\0';
1844       scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1845     }
1846
1847   if (bytes_read > 3
1848       && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1849     utf8_bom = 1;
1850
1851   /* search past "coding[:=]" */
1852   pos = header;
1853   while (1)
1854     {
1855       if ((pos = strstr(pos, "coding")) == NULL)
1856         return NULL;
1857
1858       pos += strlen("coding");
1859       if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1860           (*pos == ':' || *pos == '='))
1861         {
1862           pos ++;
1863           break;
1864         }
1865     }
1866
1867   /* skip spaces */
1868   while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1869          (*pos == ' ' || *pos == '\t'))
1870     pos ++;
1871
1872   /* grab the next token */
1873   encoding_start = pos;
1874   i = 0;
1875   while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
1876          && encoding_start + i - header < bytes_read
1877          && (isalnum ((int) encoding_start[i])
1878              || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
1879     i++;
1880
1881   encoding_length = i;
1882   if (encoding_length == 0)
1883     return NULL;
1884
1885   encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
1886   for (i = 0; i < encoding_length; i++)
1887     encoding[i] = toupper ((int) encoding[i]);
1888
1889   /* push backwards to make sure we were in a comment */
1890   in_comment = 0;
1891   pos = encoding_start;
1892   while (pos >= header)
1893     {
1894       if (*pos == ';')
1895         {
1896           in_comment = 1;
1897           break;
1898         }
1899       else if (*pos == '\n' || pos == header)
1900         {
1901           /* This wasn't in a semicolon comment. Check for a
1902            hash-bang comment. */
1903           char *beg = strstr (header, "#!");
1904           char *end = strstr (header, "!#");
1905           if (beg < encoding_start && encoding_start + encoding_length <= end)
1906             in_comment = 1;
1907           break;
1908         }
1909       else
1910         {
1911           pos --;
1912           continue;
1913         }
1914     }
1915   if (!in_comment)
1916     /* This wasn't in a comment */
1917     return NULL;
1918
1919   if (utf8_bom && strcmp(encoding, "UTF-8"))
1920     scm_misc_error (NULL,
1921                     "the port input declares the encoding ~s but is encoded as UTF-8",
1922                     scm_list_1 (scm_from_locale_string (encoding)));
1923
1924   return encoding;
1925 }
1926
1927 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1928             (SCM port),
1929             "Scans the port for an Emacs-like character coding declaration\n"
1930             "near the top of the contents of a port with random-accessible contents.\n"
1931             "The coding declaration is of the form\n"
1932             "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1933             "\n"
1934             "Returns a string containing the character encoding of the file\n"
1935             "if a declaration was found, or @code{#f} otherwise.\n")
1936 #define FUNC_NAME s_scm_file_encoding
1937 {
1938   char *enc;
1939   SCM s_enc;
1940
1941   SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
1942
1943   enc = scm_i_scan_for_encoding (port);
1944   if (enc == NULL)
1945     return SCM_BOOL_F;
1946   else
1947     {
1948       s_enc = scm_from_locale_string (enc);
1949       return s_enc;
1950     }
1951
1952   return SCM_BOOL_F;
1953 }
1954 #undef FUNC_NAME
1955
1956 \f
1957 /* Per-port read options.
1958
1959    We store per-port read options in the 'port-read-options' key of the
1960    port's alist, which is stored in 'scm_i_port_weak_hash'.  The value
1961    stored in the alist is a single integer that contains a two-bit field
1962    for each read option.
1963
1964    If a bit field contains READ_OPTION_INHERIT (3), that indicates that
1965    the applicable value should be inherited from the corresponding
1966    global read option.  Otherwise, the bit field contains the value of
1967    the read option.  For boolean read options that have been set
1968    per-port, the possible values are 0 or 1.  If the 'keyword_style'
1969    read option has been set per-port, its possible values are those in
1970    'enum t_keyword_style'. */
1971
1972 /* Key to read options in per-port alists. */
1973 SCM_SYMBOL (sym_port_read_options, "port-read-options");
1974
1975 /* Offsets of bit fields for each per-port override */
1976 #define READ_OPTION_COPY_SOURCE_P          0
1977 #define READ_OPTION_RECORD_POSITIONS_P     2
1978 #define READ_OPTION_CASE_INSENSITIVE_P     4
1979 #define READ_OPTION_KEYWORD_STYLE          6
1980 #define READ_OPTION_R6RS_ESCAPES_P         8
1981 #define READ_OPTION_SQUARE_BRACKETS_P     10
1982 #define READ_OPTION_HUNGRY_EOL_ESCAPES_P  12
1983
1984 #define READ_OPTIONS_NUM_BITS             14
1985
1986 #define READ_OPTIONS_INHERIT_ALL  ((1UL << READ_OPTIONS_NUM_BITS) - 1)
1987 #define READ_OPTIONS_MAX_VALUE    READ_OPTIONS_INHERIT_ALL
1988
1989 #define READ_OPTION_MASK     3
1990 #define READ_OPTION_INHERIT  3
1991
1992 static void
1993 set_port_read_option (SCM port, int option, int new_value)
1994 {
1995   SCM alist, scm_read_options;
1996   unsigned int read_options;
1997
1998   new_value &= READ_OPTION_MASK;
1999   scm_i_scm_pthread_mutex_lock (&scm_i_port_table_mutex);
2000   alist = scm_hashq_ref (scm_i_port_weak_hash, port, SCM_BOOL_F);
2001   scm_read_options = scm_assq_ref (alist, sym_port_read_options);
2002   if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2003     read_options = scm_to_uint (scm_read_options);
2004   else
2005     read_options = READ_OPTIONS_INHERIT_ALL;
2006   read_options &= ~(READ_OPTION_MASK << option);
2007   read_options |= new_value << option;
2008   scm_read_options = scm_from_uint (read_options);
2009   alist = scm_assq_set_x (alist, sym_port_read_options, scm_read_options);
2010   scm_hashq_set_x (scm_i_port_weak_hash, port, alist);
2011   scm_i_pthread_mutex_unlock (&scm_i_port_table_mutex);
2012 }
2013
2014 /* Set OPTS and PORT's case-insensitivity according to VALUE. */
2015 static void
2016 set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts, int value)
2017 {
2018   value = !!value;
2019   opts->case_insensitive_p = value;
2020   set_port_read_option (port, READ_OPTION_CASE_INSENSITIVE_P, value);
2021 }
2022
2023 /* Initialize OPTS based on PORT's read options and the global read
2024    options. */
2025 static void
2026 init_read_options (SCM port, scm_t_read_opts *opts)
2027 {
2028   SCM alist, val, scm_read_options;
2029   unsigned int read_options, x;
2030
2031   scm_i_scm_pthread_mutex_lock (&scm_i_port_table_mutex);
2032   alist = scm_hashq_ref (scm_i_port_weak_hash, port, SCM_BOOL_F);
2033   scm_read_options = scm_assq_ref (alist, sym_port_read_options);
2034   scm_i_pthread_mutex_unlock (&scm_i_port_table_mutex);
2035
2036   if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2037     read_options = scm_to_uint (scm_read_options);
2038   else
2039     read_options = READ_OPTIONS_INHERIT_ALL;
2040
2041   x = READ_OPTION_MASK & (read_options >> READ_OPTION_KEYWORD_STYLE);
2042   if (x == READ_OPTION_INHERIT)
2043     {
2044       val = SCM_PACK (SCM_KEYWORD_STYLE);
2045       if (scm_is_eq (val, scm_keyword_prefix))
2046         x = KEYWORD_STYLE_PREFIX;
2047       else if (scm_is_eq (val, scm_keyword_postfix))
2048         x = KEYWORD_STYLE_POSTFIX;
2049       else
2050         x = KEYWORD_STYLE_HASH_PREFIX;
2051     }
2052   opts->keyword_style = x;
2053
2054 #define RESOLVE_BOOLEAN_OPTION(NAME, name)                              \
2055   do                                                                    \
2056     {                                                                   \
2057       x = READ_OPTION_MASK & (read_options >> READ_OPTION_ ## NAME);    \
2058       if (x == READ_OPTION_INHERIT)                                     \
2059         x = !!SCM_ ## NAME;                                             \
2060           opts->name = x;                                               \
2061     }                                                                   \
2062   while (0)
2063
2064   RESOLVE_BOOLEAN_OPTION (COPY_SOURCE_P,        copy_source_p);
2065   RESOLVE_BOOLEAN_OPTION (RECORD_POSITIONS_P,   record_positions_p);
2066   RESOLVE_BOOLEAN_OPTION (CASE_INSENSITIVE_P,   case_insensitive_p);
2067   RESOLVE_BOOLEAN_OPTION (R6RS_ESCAPES_P,       r6rs_escapes_p);
2068   RESOLVE_BOOLEAN_OPTION (SQUARE_BRACKETS_P,    square_brackets_p);
2069   RESOLVE_BOOLEAN_OPTION (HUNGRY_EOL_ESCAPES_P, hungry_eol_escapes_p);
2070
2071 #undef RESOLVE_BOOLEAN_OPTION
2072 }
2073
2074 void
2075 scm_init_read ()
2076 {
2077   SCM read_hash_procs;
2078
2079   read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
2080
2081   scm_i_read_hash_procedures =
2082     SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
2083
2084   scm_init_opts (scm_read_options, scm_read_opts);
2085 #include "libguile/read.x"
2086 }
2087
2088 /*
2089   Local Variables:
2090   c-file-style: "gnu"
2091   End:
2092 */