libguile/read.c

   1 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
   2  *   2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public License
   6  * as published by the Free Software Foundation; either version 3 of
   7  * the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful, but
  10  * WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  17  * 02110-1301 USA
  18  */
  19
  20
  21 \f
  22
  23 #ifdef HAVE_CONFIG_H
  24 # include <config.h>
  25 #endif
  26
  27 #include <stdio.h>
  28 #include <ctype.h>
  29 #include <string.h>
  30 #include <unistd.h>
  31 #include <unicase.h>
  32 #include <unictype.h>
  33
  34 #include "libguile/_scm.h"
  35 #include "libguile/bytevectors.h"
  36 #include "libguile/chars.h"
  37 #include "libguile/eval.h"
  38 #include "libguile/arrays.h"
  39 #include "libguile/bitvectors.h"
  40 #include "libguile/keywords.h"
  41 #include "libguile/alist.h"
  42 #include "libguile/srcprop.h"
  43 #include "libguile/hashtab.h"
  44 #include "libguile/hash.h"
  45 #include "libguile/ports.h"
  46 #include "libguile/fports.h"
  47 #include "libguile/root.h"
  48 #include "libguile/strings.h"
  49 #include "libguile/strports.h"
  50 #include "libguile/vectors.h"
  51 #include "libguile/validate.h"
  52 #include "libguile/srfi-4.h"
  53 #include "libguile/srfi-13.h"
  54
  55 #include "libguile/read.h"
  56 #include "libguile/private-options.h"
  57
  58
  59 \f
  60
  61 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
  62 SCM_SYMBOL (scm_keyword_prefix, "prefix");
  63 SCM_SYMBOL (scm_keyword_postfix, "postfix");
  64 SCM_SYMBOL (sym_nil, "nil");
  65
  66 scm_t_option scm_read_opts[] = {
  67   { SCM_OPTION_BOOLEAN, "copy", 0,
  68     "Copy source code expressions." },
  69   { SCM_OPTION_BOOLEAN, "positions", 1,
  70     "Record positions of source code expressions." },
  71   { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
  72     "Convert symbols to lower case."},
  73   { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
  74     "Style of keyword recognition: #f, 'prefix or 'postfix."},
  75   { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
  76     "Use R6RS variable-length character and string hex escapes."},
  77   { SCM_OPTION_BOOLEAN, "square-brackets", 1,
  78     "Treat `[' and `]' as parentheses, for R6RS compatibility."},
  79   { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
  80     "In strings, consume leading whitespace after an escaped end-of-line."},
  81   { 0, },
  82 };
  83
  84 /* Internal read options structure.  This is initialized by 'scm_read'
  85    from the global read options, and a pointer is passed down to all
  86    helper functions. */
  87 enum t_keyword_style {
  88   KEYWORD_STYLE_HASH_PREFIX,
  89   KEYWORD_STYLE_PREFIX,
  90   KEYWORD_STYLE_POSTFIX
  91 };
  92
  93 struct t_read_opts {
  94   enum t_keyword_style keyword_style;
  95   unsigned int copy_source_p        : 1;
  96   unsigned int record_positions_p   : 1;
  97   unsigned int case_insensitive_p   : 1;
  98   unsigned int r6rs_escapes_p       : 1;
  99   unsigned int square_brackets_p    : 1;
 100   unsigned int hungry_eol_escapes_p : 1;
 101 };
 102
 103 typedef struct t_read_opts scm_t_read_opts;
 104
 105 /* Initialize OPTS from the global read options. */
 106 static void
 107 init_read_options (scm_t_read_opts *opts)
 108 {
 109   SCM val;
 110   int x;
 111
 112   val = SCM_PACK (SCM_KEYWORD_STYLE);
 113   if (scm_is_eq (val, scm_keyword_prefix))
 114     x = KEYWORD_STYLE_PREFIX;
 115   else if (scm_is_eq (val, scm_keyword_postfix))
 116     x = KEYWORD_STYLE_POSTFIX;
 117   else
 118     x = KEYWORD_STYLE_HASH_PREFIX;
 119   opts->keyword_style = x;
 120
 121 #define RESOLVE_BOOLEAN_OPTION(NAME, name)      \
 122   (opts->name = !!SCM_ ## NAME)
 123
 124   RESOLVE_BOOLEAN_OPTION (COPY_SOURCE_P,        copy_source_p);
 125   RESOLVE_BOOLEAN_OPTION (RECORD_POSITIONS_P,   record_positions_p);
 126   RESOLVE_BOOLEAN_OPTION (CASE_INSENSITIVE_P,   case_insensitive_p);
 127   RESOLVE_BOOLEAN_OPTION (R6RS_ESCAPES_P,       r6rs_escapes_p);
 128   RESOLVE_BOOLEAN_OPTION (SQUARE_BRACKETS_P,    square_brackets_p);
 129   RESOLVE_BOOLEAN_OPTION (HUNGRY_EOL_ESCAPES_P, hungry_eol_escapes_p);
 130
 131 #undef RESOLVE_BOOLEAN_OPTION
 132 }
 133
 134
 135 /*
 136   Give meaningful error messages for errors
 137
 138   We use the format
 139
 140   FILE:LINE:COL: MESSAGE
 141   This happened in ....
 142
 143   This is not standard GNU format, but the test-suite likes the real
 144   message to be in front.
 145
 146  */
 147
 148
 149 void
 150 scm_i_input_error (char const *function,
 151                    SCM port, const char *message, SCM arg)
 152 {
 153   SCM fn = (scm_is_string (SCM_FILENAME(port))
 154             ? SCM_FILENAME(port)
 155             : scm_from_locale_string ("#<unknown port>"));
 156
 157   SCM string_port = scm_open_output_string ();
 158   SCM string = SCM_EOL;
 159   scm_simple_format (string_port,
 160                      scm_from_locale_string ("~A:~S:~S: ~A"),
 161                      scm_list_4 (fn,
 162                                  scm_from_long (SCM_LINUM (port) + 1),
 163                                  scm_from_int (SCM_COL (port) + 1),
 164                                  scm_from_locale_string (message)));
 165
 166   string = scm_get_output_string (string_port);
 167   scm_close_output_port (string_port);
 168   scm_error_scm (scm_from_latin1_symbol ("read-error"),
 169                  function? scm_from_locale_string (function) : SCM_BOOL_F,
 170                  string,
 171                  arg,
 172                  SCM_BOOL_F);
 173 }
 174
 175
 176 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
 177             (SCM setting),
 178             "Option interface for the read options. Instead of using\n"
 179             "this procedure directly, use the procedures @code{read-enable},\n"
 180             "@code{read-disable}, @code{read-set!} and @code{read-options}.")
 181 #define FUNC_NAME s_scm_read_options
 182 {
 183   SCM ans = scm_options (setting,
 184                          scm_read_opts,
 185                          FUNC_NAME);
 186   if (SCM_COPY_SOURCE_P)
 187     SCM_RECORD_POSITIONS_P = 1;
 188   return ans;
 189 }
 190 #undef FUNC_NAME
 191
 192 /* A fluid referring to an association list mapping extra hash
 193    characters to procedures.  */
 194 static SCM *scm_i_read_hash_procedures;
 195
 196 static SCM
 197 scm_i_read_hash_procedures_ref (void)
 198 {
 199   return scm_fluid_ref (*scm_i_read_hash_procedures);
 200 }
 201
 202 static void
 203 scm_i_read_hash_procedures_set_x (SCM value)
 204 {
 205   scm_fluid_set_x (*scm_i_read_hash_procedures, value);
 206 }
 207
 208 \f
 209 /* Token readers.  */
 210
 211
 212 /* Size of the C buffer used to read symbols and numbers.  */
 213 #define READER_BUFFER_SIZE            128
 214
 215 /* Number of 32-bit codepoints in the buffer used to read strings.  */
 216 #define READER_STRING_BUFFER_SIZE     128
 217
 218 /* The maximum size of Scheme character names.  */
 219 #define READER_CHAR_NAME_MAX_SIZE      50
 220
 221 /* The maximum size of reader directive names.  */
 222 #define READER_DIRECTIVE_NAME_MAX_SIZE 50
 223
 224
 225 /* `isblank' is only in C99.  */
 226 #define CHAR_IS_BLANK_(_chr)                                    \
 227   (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n')      \
 228    || ((_chr) == '\f') || ((_chr) == '\r'))
 229
 230 #ifdef MSDOS
 231 # define CHAR_IS_BLANK(_chr)                    \
 232   ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
 233 #else
 234 # define CHAR_IS_BLANK CHAR_IS_BLANK_
 235 #endif
 236
 237
 238 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
 239    structure'').  */
 240 #define CHAR_IS_R5RS_DELIMITER(c)                               \
 241   (CHAR_IS_BLANK (c)                                            \
 242    || (c) == ')' || (c) == '(' || (c) == ';' || (c) == '"')
 243
 244 #define CHAR_IS_DELIMITER(c)                                    \
 245   (CHAR_IS_R5RS_DELIMITER (c)                                   \
 246    || (((c) == ']' || (c) == '[') && opts->square_brackets_p))
 247
 248 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
 249    Structure''.  */
 250 #define CHAR_IS_EXPONENT_MARKER(_chr)                           \
 251   (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f')        \
 252    || ((_chr) == 'd') || ((_chr) == 'l'))
 253
 254 /* Read an SCSH block comment.  */
 255 static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
 256 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
 257 static SCM scm_read_commented_expression (scm_t_wchar, SCM, scm_t_read_opts *);
 258 static SCM scm_read_shebang (scm_t_wchar, SCM, scm_t_read_opts *);
 259 static SCM scm_get_hash_procedure (int);
 260
 261 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 262    result in the pre-allocated buffer BUF.  Return zero if the whole token has
 263    fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
 264    bytes actually read.  */
 265 static int
 266 read_token (SCM port, scm_t_read_opts *opts,
 267             char *buf, size_t buf_size, size_t *read)
 268 {
 269    *read = 0;
 270
 271    while (*read < buf_size)
 272      {
 273        int chr;
 274
 275        chr = scm_get_byte_or_eof (port);
 276
 277        if (chr == EOF)
 278         return 0;
 279       else if (CHAR_IS_DELIMITER (chr))
 280         {
 281           scm_unget_byte (chr, port);
 282           return 0;
 283         }
 284       else
 285         {
 286           *buf = (char) chr;
 287           buf++, (*read)++;
 288         }
 289      }
 290
 291    return 1;
 292  }
 293
 294 /* Like `read_token', but return either BUFFER, or a GC-allocated buffer
 295    if the token doesn't fit in BUFFER_SIZE bytes.  */
 296 static char *
 297 read_complete_token (SCM port, scm_t_read_opts *opts,
 298                      char *buffer, size_t buffer_size, size_t *read)
 299 {
 300   int overflow = 0;
 301   size_t bytes_read, overflow_size = 0;
 302   char *overflow_buffer = NULL;
 303
 304   do
 305     {
 306       overflow = read_token (port, opts, buffer, buffer_size, &bytes_read);
 307       if (bytes_read == 0)
 308         break;
 309       if (overflow || overflow_size != 0)
 310         {
 311           if (overflow_size == 0)
 312             {
 313               overflow_buffer = scm_gc_malloc_pointerless (bytes_read, "read");
 314               memcpy (overflow_buffer, buffer, bytes_read);
 315               overflow_size = bytes_read;
 316             }
 317           else
 318             {
 319               char *new_buf =
 320                 scm_gc_malloc_pointerless (overflow_size + bytes_read, "read");
 321
 322               memcpy (new_buf, overflow_buffer, overflow_size);
 323               memcpy (new_buf + overflow_size, buffer, bytes_read);
 324
 325               overflow_buffer = new_buf;
 326               overflow_size += bytes_read;
 327             }
 328         }
 329     }
 330   while (overflow);
 331
 332   if (overflow_size)
 333     *read = overflow_size;
 334   else
 335     *read = bytes_read;
 336
 337   return (overflow_size > 0 ? overflow_buffer : buffer);
 338 }
 339
 340 /* Skip whitespace from PORT and return the first non-whitespace character
 341    read.  Raise an error on end-of-file.  */
 342 static int
 343 flush_ws (SCM port, scm_t_read_opts *opts, const char *eoferr)
 344 {
 345   scm_t_wchar c;
 346   while (1)
 347     switch (c = scm_getc (port))
 348       {
 349       case EOF:
 350       goteof:
 351         if (eoferr)
 352           {
 353             scm_i_input_error (eoferr,
 354                                port,
 355                                "end of file",
 356                                SCM_EOL);
 357           }
 358         return c;
 359
 360       case ';':
 361       lp:
 362         switch (c = scm_getc (port))
 363           {
 364           case EOF:
 365             goto goteof;
 366           default:
 367             goto lp;
 368           case SCM_LINE_INCREMENTORS:
 369             break;
 370           }
 371         break;
 372
 373       case '#':
 374         switch (c = scm_getc (port))
 375           {
 376           case EOF:
 377             eoferr = "read_sharp";
 378             goto goteof;
 379           case '!':
 380             scm_read_shebang (c, port, opts);
 381             break;
 382           case ';':
 383             scm_read_commented_expression (c, port, opts);
 384             break;
 385           case '|':
 386             if (scm_is_false (scm_get_hash_procedure (c)))
 387               {
 388                 scm_read_r6rs_block_comment (c, port);
 389                 break;
 390               }
 391             /* fall through */
 392           default:
 393             scm_ungetc (c, port);
 394             return '#';
 395           }
 396         break;
 397
 398       case SCM_LINE_INCREMENTORS:
 399       case SCM_SINGLE_SPACES:
 400       case '\t':
 401         break;
 402
 403       default:
 404         return c;
 405       }
 406
 407   return 0;
 408 }
 409
 410
 411 \f
 412 /* Token readers.  */
 413
 414 static SCM scm_read_expression (SCM port, scm_t_read_opts *opts);
 415 static SCM scm_read_sharp (int chr, SCM port, scm_t_read_opts *opts,
 416                            long line, int column);
 417
 418
 419 static SCM
 420 maybe_annotate_source (SCM x, SCM port, scm_t_read_opts *opts,
 421                        long line, int column)
 422 {
 423   if (opts->record_positions_p)
 424     scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
 425   return x;
 426 }
 427
 428 static SCM
 429 scm_read_sexp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 430 #define FUNC_NAME "scm_i_lreadparen"
 431 {
 432   int c;
 433   SCM tmp, tl, ans = SCM_EOL;
 434   const int terminating_char = ((chr == '[') ? ']' : ')');
 435
 436   /* Need to capture line and column numbers here. */
 437   long line = SCM_LINUM (port);
 438   int column = SCM_COL (port) - 1;
 439
 440   c = flush_ws (port, opts, FUNC_NAME);
 441   if (terminating_char == c)
 442     return SCM_EOL;
 443
 444   scm_ungetc (c, port);
 445   tmp = scm_read_expression (port, opts);
 446
 447   /* Note that it is possible for scm_read_expression to return
 448      scm_sym_dot, but not as part of a dotted pair: as in #{.}#.  So
 449      check that it's a real dot by checking `c'.  */
 450   if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
 451     {
 452       ans = scm_read_expression (port, opts);
 453       if (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
 454         scm_i_input_error (FUNC_NAME, port, "missing close paren",
 455                            SCM_EOL);
 456       return ans;
 457     }
 458
 459   /* Build the head of the list structure. */
 460   ans = tl = scm_cons (tmp, SCM_EOL);
 461
 462   while (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
 463     {
 464       SCM new_tail;
 465
 466       if (c == ')' || (c == ']' && opts->square_brackets_p))
 467         scm_i_input_error (FUNC_NAME, port,
 468                            "in pair: mismatched close paren: ~A",
 469                            scm_list_1 (SCM_MAKE_CHAR (c)));
 470
 471       scm_ungetc (c, port);
 472       tmp = scm_read_expression (port, opts);
 473
 474       /* See above note about scm_sym_dot.  */
 475       if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
 476         {
 477           SCM_SETCDR (tl, scm_read_expression (port, opts));
 478
 479           c = flush_ws (port, opts, FUNC_NAME);
 480           if (terminating_char != c)
 481             scm_i_input_error (FUNC_NAME, port,
 482                                "in pair: missing close paren", SCM_EOL);
 483           goto exit;
 484         }
 485
 486       new_tail = scm_cons (tmp, SCM_EOL);
 487       SCM_SETCDR (tl, new_tail);
 488       tl = new_tail;
 489     }
 490
 491  exit:
 492   return maybe_annotate_source (ans, port, opts, line, column);
 493 }
 494 #undef FUNC_NAME
 495
 496
 497 /* Read a hexadecimal number NDIGITS in length.  Put its value into the variable
 498    C.  If TERMINATOR is non-null, terminate early if the TERMINATOR character is
 499    found.  */
 500 #define SCM_READ_HEX_ESCAPE(ndigits, terminator)                   \
 501   do                                                               \
 502     {                                                              \
 503       scm_t_wchar a;                                               \
 504       size_t i = 0;                                                \
 505       c = 0;                                                       \
 506       while (i < ndigits)                                          \
 507         {                                                          \
 508           a = scm_getc (port);                                     \
 509           if (a == EOF)                                            \
 510             goto str_eof;                                          \
 511           if (terminator                                           \
 512               && (a == (scm_t_wchar) terminator)                   \
 513               && (i > 0))                                          \
 514             break;                                                 \
 515           if ('0' <= a && a <= '9')                                \
 516             a -= '0';                                              \
 517           else if ('A' <= a && a <= 'F')                           \
 518             a = a - 'A' + 10;                                      \
 519           else if ('a' <= a && a <= 'f')                           \
 520             a = a - 'a' + 10;                                      \
 521           else                                                     \
 522             {                                                      \
 523               c = a;                                               \
 524               goto bad_escaped;                                    \
 525             }                                                      \
 526           c = c * 16 + a;                                          \
 527           i ++;                                                    \
 528         }                                                          \
 529     } while (0)
 530
 531 static void
 532 skip_intraline_whitespace (SCM port)
 533 {
 534   scm_t_wchar c;
 535
 536   do
 537     {
 538       c = scm_getc (port);
 539       if (c == EOF)
 540         return;
 541     }
 542   while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
 543
 544   scm_ungetc (c, port);
 545 }
 546
 547 static SCM
 548 scm_read_string (int chr, SCM port, scm_t_read_opts *opts)
 549 #define FUNC_NAME "scm_lreadr"
 550 {
 551   /* For strings smaller than C_STR, this function creates only one Scheme
 552      object (the string returned).  */
 553
 554   SCM str = SCM_EOL;
 555   size_t c_str_len = 0;
 556   scm_t_wchar c, c_str[READER_STRING_BUFFER_SIZE];
 557
 558   /* Need to capture line and column numbers here. */
 559   long line = SCM_LINUM (port);
 560   int column = SCM_COL (port) - 1;
 561
 562   while ('"' != (c = scm_getc (port)))
 563     {
 564       if (c == EOF)
 565         {
 566         str_eof:
 567           scm_i_input_error (FUNC_NAME, port,
 568                              "end of file in string constant", SCM_EOL);
 569         }
 570
 571       if (c_str_len + 1 >= READER_STRING_BUFFER_SIZE)
 572         {
 573           str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
 574           c_str_len = 0;
 575         }
 576
 577       if (c == '\\')
 578         {
 579           switch (c = scm_getc (port))
 580             {
 581             case EOF:
 582               goto str_eof;
 583             case '"':
 584             case '\\':
 585               break;
 586             case '\n':
 587               if (opts->hungry_eol_escapes_p)
 588                 skip_intraline_whitespace (port);
 589               continue;
 590             case '0':
 591               c = '\0';
 592               break;
 593             case 'f':
 594               c = '\f';
 595               break;
 596             case 'n':
 597               c = '\n';
 598               break;
 599             case 'r':
 600               c = '\r';
 601               break;
 602             case 't':
 603               c = '\t';
 604               break;
 605             case 'a':
 606               c = '\007';
 607               break;
 608             case 'v':
 609               c = '\v';
 610               break;
 611             case 'b':
 612               c = '\010';
 613               break;
 614             case 'x':
 615               if (opts->r6rs_escapes_p)
 616                 SCM_READ_HEX_ESCAPE (10, ';');
 617               else
 618                 SCM_READ_HEX_ESCAPE (2, '\0');
 619               break;
 620             case 'u':
 621               if (!opts->r6rs_escapes_p)
 622                 {
 623                   SCM_READ_HEX_ESCAPE (4, '\0');
 624                   break;
 625                 }
 626             case 'U':
 627               if (!opts->r6rs_escapes_p)
 628                 {
 629                   SCM_READ_HEX_ESCAPE (6, '\0');
 630                   break;
 631                 }
 632             default:
 633             bad_escaped:
 634               scm_i_input_error (FUNC_NAME, port,
 635                                  "illegal character in escape sequence: ~S",
 636                                  scm_list_1 (SCM_MAKE_CHAR (c)));
 637             }
 638         }
 639
 640       c_str[c_str_len++] = c;
 641     }
 642
 643   if (scm_is_null (str))
 644     /* Fast path: we got a string that fits in C_STR.  */
 645     str = scm_from_utf32_stringn (c_str, c_str_len);
 646   else
 647     {
 648       if (c_str_len > 0)
 649         str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
 650
 651       str = scm_string_concatenate_reverse (str, SCM_UNDEFINED, SCM_UNDEFINED);
 652     }
 653
 654   return maybe_annotate_source (str, port, opts, line, column);
 655 }
 656 #undef FUNC_NAME
 657
 658
 659 static SCM
 660 scm_read_number (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 661 {
 662   SCM result, str = SCM_EOL;
 663   char local_buffer[READER_BUFFER_SIZE], *buffer;
 664   size_t bytes_read;
 665   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 666
 667   /* Need to capture line and column numbers here. */
 668   long line = SCM_LINUM (port);
 669   int column = SCM_COL (port) - 1;
 670
 671   scm_ungetc (chr, port);
 672   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 673                                 &bytes_read);
 674
 675   str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
 676
 677   result = scm_string_to_number (str, SCM_UNDEFINED);
 678   if (scm_is_false (result))
 679     {
 680       /* Return a symbol instead of a number */
 681       if (opts->case_insensitive_p)
 682         str = scm_string_downcase_x (str);
 683       result = scm_string_to_symbol (str);
 684     }
 685   else if (SCM_NIMP (result))
 686     result = maybe_annotate_source (result, port, opts, line, column);
 687
 688   SCM_COL (port) += scm_i_string_length (str);
 689   return result;
 690 }
 691
 692 static SCM
 693 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 694 {
 695   SCM result;
 696   int ends_with_colon = 0;
 697   size_t bytes_read;
 698   int postfix = (opts->keyword_style == KEYWORD_STYLE_POSTFIX);
 699   char local_buffer[READER_BUFFER_SIZE], *buffer;
 700   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 701   SCM str;
 702
 703   scm_ungetc (chr, port);
 704   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 705                                 &bytes_read);
 706   if (bytes_read > 0)
 707     ends_with_colon = buffer[bytes_read - 1] == ':';
 708
 709   if (postfix && ends_with_colon && (bytes_read > 1))
 710     {
 711       str = scm_from_stringn (buffer, bytes_read - 1,
 712                               pt->encoding, pt->ilseq_handler);
 713
 714       if (opts->case_insensitive_p)
 715         str = scm_string_downcase_x (str);
 716       result = scm_symbol_to_keyword (scm_string_to_symbol (str));
 717     }
 718   else
 719     {
 720       str = scm_from_stringn (buffer, bytes_read,
 721                               pt->encoding, pt->ilseq_handler);
 722
 723       if (opts->case_insensitive_p)
 724         str = scm_string_downcase_x (str);
 725       result = scm_string_to_symbol (str);
 726     }
 727
 728   SCM_COL (port) += scm_i_string_length (str);
 729   return result;
 730 }
 731
 732 static SCM
 733 scm_read_number_and_radix (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 734 #define FUNC_NAME "scm_lreadr"
 735 {
 736   SCM result;
 737   size_t read;
 738   char local_buffer[READER_BUFFER_SIZE], *buffer;
 739   unsigned int radix;
 740   SCM str;
 741   scm_t_port *pt;
 742
 743   switch (chr)
 744     {
 745     case 'B':
 746     case 'b':
 747       radix = 2;
 748       break;
 749
 750     case 'o':
 751     case 'O':
 752       radix = 8;
 753       break;
 754
 755     case 'd':
 756     case 'D':
 757       radix = 10;
 758       break;
 759
 760     case 'x':
 761     case 'X':
 762       radix = 16;
 763       break;
 764
 765     default:
 766       scm_ungetc (chr, port);
 767       scm_ungetc ('#', port);
 768       radix = 10;
 769     }
 770
 771   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 772                                 &read);
 773
 774   pt = SCM_PTAB_ENTRY (port);
 775   str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
 776
 777   result = scm_string_to_number (str, scm_from_uint (radix));
 778
 779   SCM_COL (port) += scm_i_string_length (str);
 780
 781   if (scm_is_true (result))
 782     return result;
 783
 784   scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
 785
 786   return SCM_BOOL_F;
 787 }
 788 #undef FUNC_NAME
 789
 790 static SCM
 791 scm_read_quote (int chr, SCM port, scm_t_read_opts *opts)
 792 {
 793   SCM p;
 794   long line = SCM_LINUM (port);
 795   int column = SCM_COL (port) - 1;
 796
 797   switch (chr)
 798     {
 799     case '`':
 800       p = scm_sym_quasiquote;
 801       break;
 802
 803     case '\'':
 804       p = scm_sym_quote;
 805       break;
 806
 807     case ',':
 808       {
 809         scm_t_wchar c;
 810
 811         c = scm_getc (port);
 812         if ('@' == c)
 813           p = scm_sym_uq_splicing;
 814         else
 815           {
 816             scm_ungetc (c, port);
 817             p = scm_sym_unquote;
 818           }
 819         break;
 820       }
 821
 822     default:
 823       fprintf (stderr, "%s: unhandled quote character (%i)\n",
 824                "scm_read_quote", chr);
 825       abort ();
 826     }
 827
 828   p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
 829   return maybe_annotate_source (p, port, opts, line, column);
 830 }
 831
 832 SCM_SYMBOL (sym_syntax, "syntax");
 833 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
 834 SCM_SYMBOL (sym_unsyntax, "unsyntax");
 835 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
 836
 837 static SCM
 838 scm_read_syntax (int chr, SCM port, scm_t_read_opts *opts)
 839 {
 840   SCM p;
 841   long line = SCM_LINUM (port);
 842   int column = SCM_COL (port) - 1;
 843
 844   switch (chr)
 845     {
 846     case '`':
 847       p = sym_quasisyntax;
 848       break;
 849
 850     case '\'':
 851       p = sym_syntax;
 852       break;
 853
 854     case ',':
 855       {
 856         int c;
 857
 858         c = scm_getc (port);
 859         if ('@' == c)
 860           p = sym_unsyntax_splicing;
 861         else
 862           {
 863             scm_ungetc (c, port);
 864             p = sym_unsyntax;
 865           }
 866         break;
 867       }
 868
 869     default:
 870       fprintf (stderr, "%s: unhandled syntax character (%i)\n",
 871                "scm_read_syntax", chr);
 872       abort ();
 873     }
 874
 875   p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
 876   return maybe_annotate_source (p, port, opts, line, column);
 877 }
 878
 879 static SCM
 880 scm_read_nil (int chr, SCM port, scm_t_read_opts *opts)
 881 {
 882   SCM id = scm_read_mixed_case_symbol (chr, port, opts);
 883
 884   if (!scm_is_eq (id, sym_nil))
 885     scm_i_input_error ("scm_read_nil", port,
 886                        "unexpected input while reading #nil: ~a",
 887                        scm_list_1 (id));
 888
 889   return SCM_ELISP_NIL;
 890 }
 891
 892 static SCM
 893 scm_read_semicolon_comment (int chr, SCM port)
 894 {
 895   int c;
 896
 897   /* We use the get_byte here because there is no need to get the
 898      locale correct with comment input. This presumes that newline
 899      always represents itself no matter what the encoding is.  */
 900   for (c = scm_get_byte_or_eof (port);
 901        (c != EOF) && (c != '\n');
 902        c = scm_get_byte_or_eof (port));
 903
 904   return SCM_UNSPECIFIED;
 905 }
 906
 907 \f
 908 /* Sharp readers, i.e. readers called after a `#' sign has been read.  */
 909
 910 static SCM
 911 scm_read_boolean (int chr, SCM port)
 912 {
 913   switch (chr)
 914     {
 915     case 't':
 916     case 'T':
 917       return SCM_BOOL_T;
 918
 919     case 'f':
 920     case 'F':
 921       return SCM_BOOL_F;
 922     }
 923
 924   return SCM_UNSPECIFIED;
 925 }
 926
 927 static SCM
 928 scm_read_character (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 929 #define FUNC_NAME "scm_lreadr"
 930 {
 931   char buffer[READER_CHAR_NAME_MAX_SIZE];
 932   SCM charname;
 933   size_t charname_len, bytes_read;
 934   scm_t_wchar cp;
 935   int overflow;
 936   scm_t_port *pt;
 937
 938   overflow = read_token (port, opts, buffer, READER_CHAR_NAME_MAX_SIZE,
 939                          &bytes_read);
 940   if (overflow)
 941     scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
 942
 943   if (bytes_read == 0)
 944     {
 945       chr = scm_getc (port);
 946       if (chr == EOF)
 947         scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
 948                            "while reading character", SCM_EOL);
 949
 950       /* CHR must be a token delimiter, like a whitespace.  */
 951       return (SCM_MAKE_CHAR (chr));
 952     }
 953
 954   pt = SCM_PTAB_ENTRY (port);
 955
 956   /* Simple ASCII characters can be processed immediately.  Also, simple
 957      ISO-8859-1 characters can be processed immediately if the encoding for this
 958      port is ISO-8859-1.  */
 959   if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
 960     {
 961       SCM_COL (port) += 1;
 962       return SCM_MAKE_CHAR (buffer[0]);
 963     }
 964
 965   /* Otherwise, convert the buffer into a proper scheme string for
 966      processing.  */
 967   charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
 968                                pt->ilseq_handler);
 969   charname_len = scm_i_string_length (charname);
 970   SCM_COL (port) += charname_len;
 971   cp = scm_i_string_ref (charname, 0);
 972   if (charname_len == 1)
 973     return SCM_MAKE_CHAR (cp);
 974
 975   /* Ignore dotted circles, which may be used to keep combining characters from
 976      combining with the backslash in #\charname.  */
 977   if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
 978     return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
 979
 980   if (cp >= '0' && cp < '8')
 981     {
 982       /* Dirk:FIXME::  This type of character syntax is not R5RS
 983        * compliant.  Further, it should be verified that the constant
 984        * does only consist of octal digits.  */
 985       SCM p = scm_string_to_number (charname, scm_from_uint (8));
 986       if (SCM_I_INUMP (p))
 987         {
 988           scm_t_wchar c = scm_to_uint32 (p);
 989           if (SCM_IS_UNICODE_CHAR (c))
 990             return SCM_MAKE_CHAR (c);
 991           else
 992             scm_i_input_error (FUNC_NAME, port,
 993                                "out-of-range octal character escape: ~a",
 994                                scm_list_1 (charname));
 995         }
 996     }
 997
 998   if (cp == 'x' && (charname_len > 1))
 999     {
1000       SCM p;
1001
1002       /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1003       p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
1004                                 scm_from_uint (16));
1005       if (SCM_I_INUMP (p))
1006         {
1007           scm_t_wchar c = scm_to_uint32 (p);
1008           if (SCM_IS_UNICODE_CHAR (c))
1009             return SCM_MAKE_CHAR (c);
1010           else
1011             scm_i_input_error (FUNC_NAME, port,
1012                                "out-of-range hex character escape: ~a",
1013                                scm_list_1 (charname));
1014         }
1015     }
1016
1017   /* The names of characters should never have non-Latin1
1018      characters.  */
1019   if (scm_i_is_narrow_string (charname)
1020       || scm_i_try_narrow_string (charname))
1021     { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
1022                                        charname_len);
1023       if (scm_is_true (ch))
1024         return ch;
1025     }
1026
1027   scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
1028                      scm_list_1 (charname));
1029
1030   return SCM_UNSPECIFIED;
1031 }
1032 #undef FUNC_NAME
1033
1034 static SCM
1035 scm_read_keyword (int chr, SCM port, scm_t_read_opts *opts)
1036 {
1037   SCM symbol;
1038
1039   /* Read the symbol that comprises the keyword.  Doing this instead of
1040      invoking a specific symbol reader function allows `scm_read_keyword ()'
1041      to adapt to the delimiters currently valid of symbols.
1042
1043      XXX: This implementation allows sloppy syntaxes like `#:  key'.  */
1044   symbol = scm_read_expression (port, opts);
1045   if (!scm_is_symbol (symbol))
1046     scm_i_input_error ("scm_read_keyword", port,
1047                        "keyword prefix `~a' not followed by a symbol: ~s",
1048                        scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1049
1050   return (scm_symbol_to_keyword (symbol));
1051 }
1052
1053 static SCM
1054 scm_read_vector (int chr, SCM port, scm_t_read_opts *opts,
1055                  long line, int column)
1056 {
1057   /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1058      guarantee that it's going to do what we want.  After all, this is an
1059      implementation detail of `scm_read_vector ()', not a desirable
1060      property.  */
1061   return maybe_annotate_source (scm_vector (scm_read_sexp (chr, port, opts)),
1062                                 port, opts, line, column);
1063 }
1064
1065 /* Helper used by scm_read_array */
1066 static int
1067 read_decimal_integer (SCM port, int c, ssize_t *resp)
1068 {
1069   ssize_t sign = 1;
1070   ssize_t res = 0;
1071   int got_it = 0;
1072
1073   if (c == '-')
1074     {
1075       sign = -1;
1076       c = scm_getc (port);
1077     }
1078
1079   while ('0' <= c && c <= '9')
1080     {
1081       res = 10*res + c-'0';
1082       got_it = 1;
1083       c = scm_getc (port);
1084     }
1085
1086   if (got_it)
1087     *resp = sign * res;
1088   return c;
1089 }
1090
1091 /* Read an array.  This function can also read vectors and uniform
1092    vectors.  Also, the conflict between '#f' and '#f32' and '#f64' is
1093    handled here.
1094
1095    C is the first character read after the '#'.
1096 */
1097 static SCM
1098 scm_read_array (int c, SCM port, scm_t_read_opts *opts, long line, int column)
1099 {
1100   ssize_t rank;
1101   scm_t_wchar tag_buf[8];
1102   int tag_len;
1103
1104   SCM tag, shape = SCM_BOOL_F, elements, array;
1105
1106   /* XXX - shortcut for ordinary vectors.  Shouldn't be necessary but
1107      the array code can not deal with zero-length dimensions yet, and
1108      we want to allow zero-length vectors, of course.
1109   */
1110   if (c == '(')
1111     return scm_read_vector (c, port, opts, line, column);
1112
1113   /* Disambiguate between '#f' and uniform floating point vectors.
1114    */
1115   if (c == 'f')
1116     {
1117       c = scm_getc (port);
1118       if (c != '3' && c != '6')
1119         {
1120           if (c != EOF)
1121             scm_ungetc (c, port);
1122           return SCM_BOOL_F;
1123         }
1124       rank = 1;
1125       tag_buf[0] = 'f';
1126       tag_len = 1;
1127       goto continue_reading_tag;
1128     }
1129
1130   /* Read rank. */
1131   rank = 1;
1132   c = read_decimal_integer (port, c, &rank);
1133   if (rank < 0)
1134     scm_i_input_error (NULL, port, "array rank must be non-negative",
1135                        SCM_EOL);
1136
1137   /* Read tag. */
1138   tag_len = 0;
1139  continue_reading_tag:
1140   while (c != EOF && c != '(' && c != '@' && c != ':'
1141          && tag_len < sizeof tag_buf / sizeof tag_buf[0])
1142     {
1143       tag_buf[tag_len++] = c;
1144       c = scm_getc (port);
1145     }
1146   if (tag_len == 0)
1147     tag = SCM_BOOL_T;
1148   else
1149     {
1150       tag = scm_string_to_symbol (scm_from_utf32_stringn (tag_buf, tag_len));
1151       if (tag_len == sizeof tag_buf / sizeof tag_buf[0])
1152         scm_i_input_error (NULL, port, "invalid array tag, starting with: ~a",
1153                            scm_list_1 (tag));
1154     }
1155
1156   /* Read shape. */
1157   if (c == '@' || c == ':')
1158     {
1159       shape = SCM_EOL;
1160
1161       do
1162         {
1163           ssize_t lbnd = 0, len = 0;
1164           SCM s;
1165
1166           if (c == '@')
1167             {
1168               c = scm_getc (port);
1169               c = read_decimal_integer (port, c, &lbnd);
1170             }
1171
1172           s = scm_from_ssize_t (lbnd);
1173
1174           if (c == ':')
1175             {
1176               c = scm_getc (port);
1177               c = read_decimal_integer (port, c, &len);
1178               if (len < 0)
1179                 scm_i_input_error (NULL, port,
1180                                    "array length must be non-negative",
1181                                    SCM_EOL);
1182
1183               s = scm_list_2 (s, scm_from_ssize_t (lbnd+len-1));
1184             }
1185
1186           shape = scm_cons (s, shape);
1187         } while (c == '@' || c == ':');
1188
1189       shape = scm_reverse_x (shape, SCM_EOL);
1190     }
1191
1192   /* Read nested lists of elements. */
1193   if (c != '(')
1194     scm_i_input_error (NULL, port,
1195                        "missing '(' in vector or array literal",
1196                        SCM_EOL);
1197   elements = scm_read_sexp (c, port, opts);
1198
1199   if (scm_is_false (shape))
1200     shape = scm_from_ssize_t (rank);
1201   else if (scm_ilength (shape) != rank)
1202     scm_i_input_error
1203       (NULL, port,
1204        "the number of shape specifications must match the array rank",
1205        SCM_EOL);
1206
1207   /* Handle special print syntax of rank zero arrays; see
1208      scm_i_print_array for a rationale. */
1209   if (rank == 0)
1210     {
1211       if (!scm_is_pair (elements))
1212         scm_i_input_error (NULL, port,
1213                            "too few elements in array literal, need 1",
1214                            SCM_EOL);
1215       if (!scm_is_null (SCM_CDR (elements)))
1216         scm_i_input_error (NULL, port,
1217                            "too many elements in array literal, want 1",
1218                            SCM_EOL);
1219       elements = SCM_CAR (elements);
1220     }
1221
1222   /* Construct array, annotate with source location, and return. */
1223   array = scm_list_to_typed_array (tag, shape, elements);
1224   return maybe_annotate_source (array, port, opts, line, column);
1225 }
1226
1227 static SCM
1228 scm_read_srfi4_vector (int chr, SCM port, scm_t_read_opts *opts,
1229                        long line, int column)
1230 {
1231   return scm_read_array (chr, port, opts, line, column);
1232 }
1233
1234 static SCM
1235 scm_read_bytevector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1236                      long line, int column)
1237 {
1238   chr = scm_getc (port);
1239   if (chr != 'u')
1240     goto syntax;
1241
1242   chr = scm_getc (port);
1243   if (chr != '8')
1244     goto syntax;
1245
1246   chr = scm_getc (port);
1247   if (chr != '(')
1248     goto syntax;
1249
1250   return maybe_annotate_source
1251     (scm_u8_list_to_bytevector (scm_read_sexp (chr, port, opts)),
1252      port, opts, line, column);
1253
1254  syntax:
1255   scm_i_input_error ("read_bytevector", port,
1256                      "invalid bytevector prefix",
1257                      SCM_MAKE_CHAR (chr));
1258   return SCM_UNSPECIFIED;
1259 }
1260
1261 static SCM
1262 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1263                            long line, int column)
1264 {
1265   /* Read the `#*10101'-style read syntax for bit vectors in Guile.  This is
1266      terribly inefficient but who cares?  */
1267   SCM s_bits = SCM_EOL;
1268
1269   for (chr = scm_getc (port);
1270        (chr != EOF) && ((chr == '0') || (chr == '1'));
1271        chr = scm_getc (port))
1272     {
1273       s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1274     }
1275
1276   if (chr != EOF)
1277     scm_ungetc (chr, port);
1278
1279   return maybe_annotate_source
1280     (scm_bitvector (scm_reverse_x (s_bits, SCM_EOL)),
1281      port, opts, line, column);
1282 }
1283
1284 static SCM
1285 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1286 {
1287   int bang_seen = 0;
1288
1289   for (;;)
1290     {
1291       int c = scm_getc (port);
1292
1293       if (c == EOF)
1294         scm_i_input_error ("skip_block_comment", port,
1295                            "unterminated `#! ... !#' comment", SCM_EOL);
1296
1297       if (c == '!')
1298         bang_seen = 1;
1299       else if (c == '#' && bang_seen)
1300         break;
1301       else
1302         bang_seen = 0;
1303     }
1304
1305   return SCM_UNSPECIFIED;
1306 }
1307
1308 static SCM
1309 scm_read_shebang (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
1310 {
1311   char name[READER_DIRECTIVE_NAME_MAX_SIZE + 1];
1312   int c;
1313   int i = 0;
1314
1315   while (i <= READER_DIRECTIVE_NAME_MAX_SIZE)
1316     {
1317       c = scm_getc (port);
1318       if (c == EOF)
1319         scm_i_input_error ("skip_block_comment", port,
1320                            "unterminated `#! ... !#' comment", SCM_EOL);
1321       else if (('a' <= c && c <= 'z') || ('0' <= c && c <= '9') || c == '-')
1322         name[i++] = c;
1323       else if (CHAR_IS_DELIMITER (c))
1324         {
1325           scm_ungetc (c, port);
1326           name[i] = '\0';
1327           if (0 == strcmp ("r6rs", name))
1328             ;  /* Silently ignore */
1329           else
1330             break;
1331
1332           return SCM_UNSPECIFIED;
1333         }
1334     }
1335   while (i > 0)
1336     scm_ungetc (name[--i], port);
1337   return scm_read_scsh_block_comment (chr, port);
1338 }
1339
1340 static SCM
1341 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1342 {
1343   /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1344      nested.  So care must be taken.  */
1345   int nesting_level = 1;
1346
1347   int a = scm_getc (port);
1348
1349   if (a == EOF)
1350     scm_i_input_error ("scm_read_r6rs_block_comment", port,
1351                        "unterminated `#| ... |#' comment", SCM_EOL);
1352
1353   while (nesting_level > 0)
1354     {
1355       int b = scm_getc (port);
1356
1357       if (b == EOF)
1358         scm_i_input_error ("scm_read_r6rs_block_comment", port,
1359                            "unterminated `#| ... |#' comment", SCM_EOL);
1360
1361       if (a == '|' && b == '#')
1362         {
1363           nesting_level--;
1364           b = EOF;
1365         }
1366       else if (a == '#' && b == '|')
1367         {
1368           nesting_level++;
1369           b = EOF;
1370         }
1371
1372       a = b;
1373     }
1374
1375   return SCM_UNSPECIFIED;
1376 }
1377
1378 static SCM
1379 scm_read_commented_expression (scm_t_wchar chr, SCM port,
1380                                scm_t_read_opts *opts)
1381 {
1382   scm_t_wchar c;
1383
1384   c = flush_ws (port, opts, (char *) NULL);
1385   if (EOF == c)
1386     scm_i_input_error ("read_commented_expression", port,
1387                        "no expression after #; comment", SCM_EOL);
1388   scm_ungetc (c, port);
1389   scm_read_expression (port, opts);
1390   return SCM_UNSPECIFIED;
1391 }
1392
1393 static SCM
1394 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1395 {
1396   /* Guile's extended symbol read syntax looks like this:
1397
1398        #{This is all a symbol name}#
1399
1400      So here, CHR is expected to be `{'.  */
1401   int saw_brace = 0;
1402   size_t len = 0;
1403   SCM buf = scm_i_make_string (1024, NULL, 0);
1404
1405   buf = scm_i_string_start_writing (buf);
1406
1407   while ((chr = scm_getc (port)) != EOF)
1408     {
1409       if (saw_brace)
1410         {
1411           if (chr == '#')
1412             {
1413               break;
1414             }
1415           else
1416             {
1417               saw_brace = 0;
1418               scm_i_string_set_x (buf, len++, '}');
1419             }
1420         }
1421
1422       if (chr == '}')
1423         saw_brace = 1;
1424       else if (chr == '\\')
1425         {
1426           /* It used to be that print.c would print extended-read-syntax
1427              symbols with backslashes before "non-standard" chars, but
1428              this routine wouldn't do anything with those escapes.
1429              Bummer.  What we've done is to change print.c to output
1430              R6RS hex escapes for those characters, relying on the fact
1431              that the extended read syntax would never put a `\' before
1432              an `x'.  For now, we just ignore other instances of
1433              backslash in the string.  */
1434           switch ((chr = scm_getc (port)))
1435             {
1436             case EOF:
1437               goto done;
1438             case 'x':
1439               {
1440                 scm_t_wchar c;
1441
1442                 SCM_READ_HEX_ESCAPE (10, ';');
1443                 scm_i_string_set_x (buf, len++, c);
1444                 break;
1445
1446               str_eof:
1447                 chr = EOF;
1448                 goto done;
1449
1450               bad_escaped:
1451                 scm_i_string_stop_writing ();
1452                 scm_i_input_error ("scm_read_extended_symbol", port,
1453                                    "illegal character in escape sequence: ~S",
1454                                    scm_list_1 (SCM_MAKE_CHAR (c)));
1455                 break;
1456               }
1457             default:
1458               scm_i_string_set_x (buf, len++, chr);
1459               break;
1460             }
1461         }
1462       else
1463         scm_i_string_set_x (buf, len++, chr);
1464
1465       if (len >= scm_i_string_length (buf) - 2)
1466         {
1467           SCM addy;
1468
1469           scm_i_string_stop_writing ();
1470           addy = scm_i_make_string (1024, NULL, 0);
1471           buf = scm_string_append (scm_list_2 (buf, addy));
1472           len = 0;
1473           buf = scm_i_string_start_writing (buf);
1474         }
1475     }
1476
1477  done:
1478   scm_i_string_stop_writing ();
1479   if (chr == EOF)
1480     scm_i_input_error ("scm_read_extended_symbol", port,
1481                        "end of file while reading symbol", SCM_EOL);
1482
1483   return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1484 }
1485
1486
1487 \f
1488 /* Top-level token readers, i.e., dispatchers.  */
1489
1490 static SCM
1491 scm_read_sharp_extension (int chr, SCM port, scm_t_read_opts *opts)
1492 {
1493   SCM proc;
1494
1495   proc = scm_get_hash_procedure (chr);
1496   if (scm_is_true (scm_procedure_p (proc)))
1497     {
1498       long line = SCM_LINUM (port);
1499       int column = SCM_COL (port) - 2;
1500       SCM got;
1501
1502       got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1503
1504       if (opts->record_positions_p && SCM_NIMP (got)
1505           && !scm_i_has_source_properties (got))
1506         scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1507
1508       return got;
1509     }
1510
1511   return SCM_UNSPECIFIED;
1512 }
1513
1514 /* The reader for the sharp `#' character.  It basically dispatches reads
1515    among the above token readers.   */
1516 static SCM
1517 scm_read_sharp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1518                 long line, int column)
1519 #define FUNC_NAME "scm_lreadr"
1520 {
1521   SCM result;
1522
1523   chr = scm_getc (port);
1524
1525   result = scm_read_sharp_extension (chr, port, opts);
1526   if (!scm_is_eq (result, SCM_UNSPECIFIED))
1527     return result;
1528
1529   switch (chr)
1530     {
1531     case '\\':
1532       return (scm_read_character (chr, port, opts));
1533     case '(':
1534       return (scm_read_vector (chr, port, opts, line, column));
1535     case 's':
1536     case 'u':
1537     case 'f':
1538     case 'c':
1539       /* This one may return either a boolean or an SRFI-4 vector.  */
1540       return (scm_read_srfi4_vector (chr, port, opts, line, column));
1541     case 'v':
1542       return (scm_read_bytevector (chr, port, opts, line, column));
1543     case '*':
1544       return (scm_read_guile_bit_vector (chr, port, opts, line, column));
1545     case 't':
1546     case 'T':
1547     case 'F':
1548       return (scm_read_boolean (chr, port));
1549     case ':':
1550       return (scm_read_keyword (chr, port, opts));
1551     case '0': case '1': case '2': case '3': case '4':
1552     case '5': case '6': case '7': case '8': case '9':
1553     case '@':
1554 #if SCM_ENABLE_DEPRECATED
1555       /* See below for 'i' and 'e'. */
1556     case 'a':
1557     case 'y':
1558     case 'h':
1559     case 'l':
1560 #endif
1561       return (scm_read_array (chr, port, opts, line, column));
1562
1563     case 'i':
1564     case 'e':
1565 #if SCM_ENABLE_DEPRECATED
1566       {
1567         /* When next char is '(', it really is an old-style
1568            uniform array. */
1569         scm_t_wchar next_c = scm_getc (port);
1570         if (next_c != EOF)
1571           scm_ungetc (next_c, port);
1572         if (next_c == '(')
1573           return scm_read_array (chr, port, opts, line, column);
1574         /* Fall through. */
1575       }
1576 #endif
1577     case 'b':
1578     case 'B':
1579     case 'o':
1580     case 'O':
1581     case 'd':
1582     case 'D':
1583     case 'x':
1584     case 'X':
1585     case 'I':
1586     case 'E':
1587       return (scm_read_number_and_radix (chr, port, opts));
1588     case '{':
1589       return (scm_read_extended_symbol (chr, port));
1590     case '!':
1591       return (scm_read_shebang (chr, port, opts));
1592     case ';':
1593       return (scm_read_commented_expression (chr, port, opts));
1594     case '`':
1595     case '\'':
1596     case ',':
1597       return (scm_read_syntax (chr, port, opts));
1598     case 'n':
1599       return (scm_read_nil (chr, port, opts));
1600     default:
1601       result = scm_read_sharp_extension (chr, port, opts);
1602       if (scm_is_eq (result, SCM_UNSPECIFIED))
1603         {
1604           /* To remain compatible with 1.8 and earlier, the following
1605              characters have lower precedence than `read-hash-extend'
1606              characters.  */
1607           switch (chr)
1608             {
1609             case '|':
1610               return scm_read_r6rs_block_comment (chr, port);
1611             default:
1612               scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1613                                  scm_list_1 (SCM_MAKE_CHAR (chr)));
1614             }
1615         }
1616       else
1617         return result;
1618     }
1619
1620   return SCM_UNSPECIFIED;
1621 }
1622 #undef FUNC_NAME
1623
1624 static SCM
1625 scm_read_expression (SCM port, scm_t_read_opts *opts)
1626 #define FUNC_NAME "scm_read_expression"
1627 {
1628   while (1)
1629     {
1630       scm_t_wchar chr;
1631
1632       chr = scm_getc (port);
1633
1634       switch (chr)
1635         {
1636         case SCM_WHITE_SPACES:
1637         case SCM_LINE_INCREMENTORS:
1638           break;
1639         case ';':
1640           (void) scm_read_semicolon_comment (chr, port);
1641           break;
1642         case '[':
1643           if (!opts->square_brackets_p)
1644             return (scm_read_mixed_case_symbol (chr, port, opts));
1645           /* otherwise fall through */
1646         case '(':
1647           return (scm_read_sexp (chr, port, opts));
1648         case '"':
1649           return (scm_read_string (chr, port, opts));
1650         case '\'':
1651         case '`':
1652         case ',':
1653           return (scm_read_quote (chr, port, opts));
1654         case '#':
1655           {
1656             long line  = SCM_LINUM (port);
1657             int column = SCM_COL (port) - 1;
1658             SCM result = scm_read_sharp (chr, port, opts, line, column);
1659             if (scm_is_eq (result, SCM_UNSPECIFIED))
1660               /* We read a comment or some such.  */
1661               break;
1662             else
1663               return result;
1664           }
1665         case ')':
1666           scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1667           break;
1668         case ']':
1669           if (opts->square_brackets_p)
1670             scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1671           /* otherwise fall through */
1672         case EOF:
1673           return SCM_EOF_VAL;
1674         case ':':
1675           if (opts->keyword_style == KEYWORD_STYLE_PREFIX)
1676             return scm_symbol_to_keyword (scm_read_expression (port, opts));
1677           /* Fall through.  */
1678
1679         default:
1680           {
1681             if (((chr >= '0') && (chr <= '9'))
1682                 || (strchr ("+-.", chr)))
1683               return (scm_read_number (chr, port, opts));
1684             else
1685               return (scm_read_mixed_case_symbol (chr, port, opts));
1686           }
1687         }
1688     }
1689 }
1690 #undef FUNC_NAME
1691
1692 \f
1693 /* Actual reader.  */
1694
1695 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1696             (SCM port),
1697             "Read an s-expression from the input port @var{port}, or from\n"
1698             "the current input port if @var{port} is not specified.\n"
1699             "Any whitespace before the next token is discarded.")
1700 #define FUNC_NAME s_scm_read
1701 {
1702   scm_t_read_opts opts;
1703   int c;
1704
1705   if (SCM_UNBNDP (port))
1706     port = scm_current_input_port ();
1707   SCM_VALIDATE_OPINPORT (1, port);
1708
1709   init_read_options (&opts);
1710
1711   c = flush_ws (port, &opts, (char *) NULL);
1712   if (EOF == c)
1713     return SCM_EOF_VAL;
1714   scm_ungetc (c, port);
1715
1716   return (scm_read_expression (port, &opts));
1717 }
1718 #undef FUNC_NAME
1719
1720
1721 \f
1722
1723 /* Manipulate the read-hash-procedures alist.  This could be written in
1724    Scheme, but maybe it will also be used by C code during initialisation.  */
1725 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1726             (SCM chr, SCM proc),
1727             "Install the procedure @var{proc} for reading expressions\n"
1728             "starting with the character sequence @code{#} and @var{chr}.\n"
1729             "@var{proc} will be called with two arguments:  the character\n"
1730             "@var{chr} and the port to read further data from. The object\n"
1731             "returned will be the return value of @code{read}. \n"
1732             "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1733             )
1734 #define FUNC_NAME s_scm_read_hash_extend
1735 {
1736   SCM this;
1737   SCM prev;
1738
1739   SCM_VALIDATE_CHAR (1, chr);
1740   SCM_ASSERT (scm_is_false (proc)
1741               || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1742               proc, SCM_ARG2, FUNC_NAME);
1743
1744   /* Check if chr is already in the alist.  */
1745   this = scm_i_read_hash_procedures_ref ();
1746   prev = SCM_BOOL_F;
1747   while (1)
1748     {
1749       if (scm_is_null (this))
1750         {
1751           /* not found, so add it to the beginning.  */
1752           if (scm_is_true (proc))
1753             {
1754               SCM new = scm_cons (scm_cons (chr, proc),
1755                                   scm_i_read_hash_procedures_ref ());
1756               scm_i_read_hash_procedures_set_x (new);
1757             }
1758           break;
1759         }
1760       if (scm_is_eq (chr, SCM_CAAR (this)))
1761         {
1762           /* already in the alist.  */
1763           if (scm_is_false (proc))
1764             {
1765               /* remove it.  */
1766               if (scm_is_false (prev))
1767                 {
1768                   SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1769                   scm_i_read_hash_procedures_set_x (rest);
1770                 }
1771               else
1772                 scm_set_cdr_x (prev, SCM_CDR (this));
1773             }
1774           else
1775             {
1776               /* replace it.  */
1777               scm_set_cdr_x (SCM_CAR (this), proc);
1778             }
1779           break;
1780         }
1781       prev = this;
1782       this = SCM_CDR (this);
1783     }
1784
1785   return SCM_UNSPECIFIED;
1786 }
1787 #undef FUNC_NAME
1788
1789 /* Recover the read-hash procedure corresponding to char c.  */
1790 static SCM
1791 scm_get_hash_procedure (int c)
1792 {
1793   SCM rest = scm_i_read_hash_procedures_ref ();
1794
1795   while (1)
1796     {
1797       if (scm_is_null (rest))
1798         return SCM_BOOL_F;
1799
1800       if (SCM_CHAR (SCM_CAAR (rest)) == c)
1801         return SCM_CDAR (rest);
1802
1803       rest = SCM_CDR (rest);
1804     }
1805 }
1806
1807 #define SCM_ENCODING_SEARCH_SIZE (500)
1808
1809 /* Search the first few hundred characters of a file for an Emacs-like coding
1810    declaration.  Returns either NULL or a string whose storage has been
1811    allocated with `scm_gc_malloc ()'.  */
1812 char *
1813 scm_i_scan_for_encoding (SCM port)
1814 {
1815   scm_t_port *pt;
1816   char header[SCM_ENCODING_SEARCH_SIZE+1];
1817   size_t bytes_read, encoding_length, i;
1818   char *encoding = NULL;
1819   int utf8_bom = 0;
1820   char *pos, *encoding_start;
1821   int in_comment;
1822
1823   pt = SCM_PTAB_ENTRY (port);
1824
1825   if (pt->rw_active == SCM_PORT_WRITE)
1826     scm_flush (port);
1827
1828   if (pt->rw_random)
1829     pt->rw_active = SCM_PORT_READ;
1830
1831   if (pt->read_pos == pt->read_end)
1832     {
1833       /* We can use the read buffer, and thus avoid a seek. */
1834       if (scm_fill_input (port) == EOF)
1835         return NULL;
1836
1837       bytes_read = pt->read_end - pt->read_pos;
1838       if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
1839         bytes_read = SCM_ENCODING_SEARCH_SIZE;
1840
1841       if (bytes_read <= 1)
1842         /* An unbuffered port -- don't scan.  */
1843         return NULL;
1844
1845       memcpy (header, pt->read_pos, bytes_read);
1846       header[bytes_read] = '\0';
1847     }
1848   else
1849     {
1850       /* Try to read some bytes and then seek back.  Not all ports
1851          support seeking back; and indeed some file ports (like
1852          /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
1853          check performed by SCM_FPORT_FDES---but fail to seek
1854          backwards.  Hence this block comes second.  We prefer to use
1855          the read buffer in-place.  */
1856       if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1857         return NULL;
1858
1859       bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1860       header[bytes_read] = '\0';
1861       scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1862     }
1863
1864   if (bytes_read > 3
1865       && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1866     utf8_bom = 1;
1867
1868   /* search past "coding[:=]" */
1869   pos = header;
1870   while (1)
1871     {
1872       if ((pos = strstr(pos, "coding")) == NULL)
1873         return NULL;
1874
1875       pos += strlen("coding");
1876       if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1877           (*pos == ':' || *pos == '='))
1878         {
1879           pos ++;
1880           break;
1881         }
1882     }
1883
1884   /* skip spaces */
1885   while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1886          (*pos == ' ' || *pos == '\t'))
1887     pos ++;
1888
1889   /* grab the next token */
1890   encoding_start = pos;
1891   i = 0;
1892   while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
1893          && encoding_start + i - header < bytes_read
1894          && (isalnum ((int) encoding_start[i])
1895              || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
1896     i++;
1897
1898   encoding_length = i;
1899   if (encoding_length == 0)
1900     return NULL;
1901
1902   encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
1903   for (i = 0; i < encoding_length; i++)
1904     encoding[i] = toupper ((int) encoding[i]);
1905
1906   /* push backwards to make sure we were in a comment */
1907   in_comment = 0;
1908   pos = encoding_start;
1909   while (pos >= header)
1910     {
1911       if (*pos == ';')
1912         {
1913           in_comment = 1;
1914           break;
1915         }
1916       else if (*pos == '\n' || pos == header)
1917         {
1918           /* This wasn't in a semicolon comment. Check for a
1919            hash-bang comment. */
1920           char *beg = strstr (header, "#!");
1921           char *end = strstr (header, "!#");
1922           if (beg < encoding_start && encoding_start + encoding_length <= end)
1923             in_comment = 1;
1924           break;
1925         }
1926       else
1927         {
1928           pos --;
1929           continue;
1930         }
1931     }
1932   if (!in_comment)
1933     /* This wasn't in a comment */
1934     return NULL;
1935
1936   if (utf8_bom && strcmp(encoding, "UTF-8"))
1937     scm_misc_error (NULL,
1938                     "the port input declares the encoding ~s but is encoded as UTF-8",
1939                     scm_list_1 (scm_from_locale_string (encoding)));
1940
1941   return encoding;
1942 }
1943
1944 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1945             (SCM port),
1946             "Scans the port for an Emacs-like character coding declaration\n"
1947             "near the top of the contents of a port with random-accessible contents.\n"
1948             "The coding declaration is of the form\n"
1949             "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1950             "\n"
1951             "Returns a string containing the character encoding of the file\n"
1952             "if a declaration was found, or @code{#f} otherwise.\n")
1953 #define FUNC_NAME s_scm_file_encoding
1954 {
1955   char *enc;
1956   SCM s_enc;
1957
1958   SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
1959
1960   enc = scm_i_scan_for_encoding (port);
1961   if (enc == NULL)
1962     return SCM_BOOL_F;
1963   else
1964     {
1965       s_enc = scm_from_locale_string (enc);
1966       return s_enc;
1967     }
1968
1969   return SCM_BOOL_F;
1970 }
1971 #undef FUNC_NAME
1972
1973 void
1974 scm_init_read ()
1975 {
1976   SCM read_hash_procs;
1977
1978   read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
1979
1980   scm_i_read_hash_procedures =
1981     SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
1982
1983   scm_init_opts (scm_read_options, scm_read_opts);
1984 #include "libguile/read.x"
1985 }
1986
1987 /*
1988   Local Variables:
1989   c-file-style: "gnu"
1990   End:
1991 */