libguile/read.c

   1 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
   2  *   2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public License
   6  * as published by the Free Software Foundation; either version 3 of
   7  * the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful, but
  10  * WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  17  * 02110-1301 USA
  18  */
  19
  20
  21 \f
  22
  23 #ifdef HAVE_CONFIG_H
  24 # include <config.h>
  25 #endif
  26
  27 #include <stdio.h>
  28 #include <ctype.h>
  29 #include <string.h>
  30 #include <unistd.h>
  31 #include <unicase.h>
  32 #include <unictype.h>
  33
  34 #include "libguile/_scm.h"
  35 #include "libguile/bytevectors.h"
  36 #include "libguile/chars.h"
  37 #include "libguile/eval.h"
  38 #include "libguile/arrays.h"
  39 #include "libguile/bitvectors.h"
  40 #include "libguile/keywords.h"
  41 #include "libguile/alist.h"
  42 #include "libguile/srcprop.h"
  43 #include "libguile/hashtab.h"
  44 #include "libguile/hash.h"
  45 #include "libguile/ports.h"
  46 #include "libguile/fports.h"
  47 #include "libguile/root.h"
  48 #include "libguile/strings.h"
  49 #include "libguile/strports.h"
  50 #include "libguile/vectors.h"
  51 #include "libguile/validate.h"
  52 #include "libguile/srfi-4.h"
  53 #include "libguile/srfi-13.h"
  54
  55 #include "libguile/read.h"
  56 #include "libguile/private-options.h"
  57
  58
  59 \f
  60
  61 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
  62 SCM_SYMBOL (scm_keyword_prefix, "prefix");
  63 SCM_SYMBOL (scm_keyword_postfix, "postfix");
  64 SCM_SYMBOL (sym_nil, "nil");
  65
  66 scm_t_option scm_read_opts[] = {
  67   { SCM_OPTION_BOOLEAN, "copy", 0,
  68     "Copy source code expressions." },
  69   { SCM_OPTION_BOOLEAN, "positions", 1,
  70     "Record positions of source code expressions." },
  71   { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
  72     "Convert symbols to lower case."},
  73   { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
  74     "Style of keyword recognition: #f, 'prefix or 'postfix."},
  75   { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
  76     "Use R6RS variable-length character and string hex escapes."},
  77   { SCM_OPTION_BOOLEAN, "square-brackets", 1,
  78     "Treat `[' and `]' as parentheses, for R6RS compatibility."},
  79   { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
  80     "In strings, consume leading whitespace after an escaped end-of-line."},
  81   { 0, },
  82 };
  83
  84 /* Internal read options structure.  This is initialized by 'scm_read'
  85    from the global read options, and a pointer is passed down to all
  86    helper functions. */
  87 enum t_keyword_style {
  88   KEYWORD_STYLE_HASH_PREFIX,
  89   KEYWORD_STYLE_PREFIX,
  90   KEYWORD_STYLE_POSTFIX
  91 };
  92
  93 struct t_read_opts {
  94   enum t_keyword_style keyword_style;
  95   unsigned int copy_source_p        : 1;
  96   unsigned int record_positions_p   : 1;
  97   unsigned int case_insensitive_p   : 1;
  98   unsigned int r6rs_escapes_p       : 1;
  99   unsigned int square_brackets_p    : 1;
 100   unsigned int hungry_eol_escapes_p : 1;
 101 };
 102
 103 typedef struct t_read_opts scm_t_read_opts;
 104
 105 /* Initialize OPTS from the global read options. */
 106 static void
 107 init_read_options (scm_t_read_opts *opts)
 108 {
 109   SCM val;
 110   int x;
 111
 112   val = SCM_PACK (SCM_KEYWORD_STYLE);
 113   if (scm_is_eq (val, scm_keyword_prefix))
 114     x = KEYWORD_STYLE_PREFIX;
 115   else if (scm_is_eq (val, scm_keyword_postfix))
 116     x = KEYWORD_STYLE_POSTFIX;
 117   else
 118     x = KEYWORD_STYLE_HASH_PREFIX;
 119   opts->keyword_style = x;
 120
 121 #define RESOLVE_BOOLEAN_OPTION(NAME, name)      \
 122   (opts->name = !!SCM_ ## NAME)
 123
 124   RESOLVE_BOOLEAN_OPTION (COPY_SOURCE_P,        copy_source_p);
 125   RESOLVE_BOOLEAN_OPTION (RECORD_POSITIONS_P,   record_positions_p);
 126   RESOLVE_BOOLEAN_OPTION (CASE_INSENSITIVE_P,   case_insensitive_p);
 127   RESOLVE_BOOLEAN_OPTION (R6RS_ESCAPES_P,       r6rs_escapes_p);
 128   RESOLVE_BOOLEAN_OPTION (SQUARE_BRACKETS_P,    square_brackets_p);
 129   RESOLVE_BOOLEAN_OPTION (HUNGRY_EOL_ESCAPES_P, hungry_eol_escapes_p);
 130
 131 #undef RESOLVE_BOOLEAN_OPTION
 132 }
 133
 134
 135 /*
 136   Give meaningful error messages for errors
 137
 138   We use the format
 139
 140   FILE:LINE:COL: MESSAGE
 141   This happened in ....
 142
 143   This is not standard GNU format, but the test-suite likes the real
 144   message to be in front.
 145
 146  */
 147
 148
 149 void
 150 scm_i_input_error (char const *function,
 151                    SCM port, const char *message, SCM arg)
 152 {
 153   SCM fn = (scm_is_string (SCM_FILENAME(port))
 154             ? SCM_FILENAME(port)
 155             : scm_from_locale_string ("#<unknown port>"));
 156
 157   SCM string_port = scm_open_output_string ();
 158   SCM string = SCM_EOL;
 159   scm_simple_format (string_port,
 160                      scm_from_locale_string ("~A:~S:~S: ~A"),
 161                      scm_list_4 (fn,
 162                                  scm_from_long (SCM_LINUM (port) + 1),
 163                                  scm_from_int (SCM_COL (port) + 1),
 164                                  scm_from_locale_string (message)));
 165
 166   string = scm_get_output_string (string_port);
 167   scm_close_output_port (string_port);
 168   scm_error_scm (scm_from_latin1_symbol ("read-error"),
 169                  function? scm_from_locale_string (function) : SCM_BOOL_F,
 170                  string,
 171                  arg,
 172                  SCM_BOOL_F);
 173 }
 174
 175
 176 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
 177             (SCM setting),
 178             "Option interface for the read options. Instead of using\n"
 179             "this procedure directly, use the procedures @code{read-enable},\n"
 180             "@code{read-disable}, @code{read-set!} and @code{read-options}.")
 181 #define FUNC_NAME s_scm_read_options
 182 {
 183   SCM ans = scm_options (setting,
 184                          scm_read_opts,
 185                          FUNC_NAME);
 186   if (SCM_COPY_SOURCE_P)
 187     SCM_RECORD_POSITIONS_P = 1;
 188   return ans;
 189 }
 190 #undef FUNC_NAME
 191
 192 /* A fluid referring to an association list mapping extra hash
 193    characters to procedures.  */
 194 static SCM *scm_i_read_hash_procedures;
 195
 196 static SCM
 197 scm_i_read_hash_procedures_ref (void)
 198 {
 199   return scm_fluid_ref (*scm_i_read_hash_procedures);
 200 }
 201
 202 static void
 203 scm_i_read_hash_procedures_set_x (SCM value)
 204 {
 205   scm_fluid_set_x (*scm_i_read_hash_procedures, value);
 206 }
 207
 208 \f
 209 /* Token readers.  */
 210
 211
 212 /* Size of the C buffer used to read symbols and numbers.  */
 213 #define READER_BUFFER_SIZE            128
 214
 215 /* Number of 32-bit codepoints in the buffer used to read strings.  */
 216 #define READER_STRING_BUFFER_SIZE     128
 217
 218 /* The maximum size of Scheme character names.  */
 219 #define READER_CHAR_NAME_MAX_SIZE      50
 220
 221
 222 /* `isblank' is only in C99.  */
 223 #define CHAR_IS_BLANK_(_chr)                                    \
 224   (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n')      \
 225    || ((_chr) == '\f') || ((_chr) == '\r'))
 226
 227 #ifdef MSDOS
 228 # define CHAR_IS_BLANK(_chr)                    \
 229   ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
 230 #else
 231 # define CHAR_IS_BLANK CHAR_IS_BLANK_
 232 #endif
 233
 234
 235 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
 236    structure'').  */
 237 #define CHAR_IS_R5RS_DELIMITER(c)                               \
 238   (CHAR_IS_BLANK (c)                                            \
 239    || (c) == ')' || (c) == '(' || (c) == ';' || (c) == '"')
 240
 241 #define CHAR_IS_DELIMITER(c)                                    \
 242   (CHAR_IS_R5RS_DELIMITER (c)                                   \
 243    || (((c) == ']' || (c) == '[') && opts->square_brackets_p))
 244
 245 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
 246    Structure''.  */
 247 #define CHAR_IS_EXPONENT_MARKER(_chr)                           \
 248   (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f')        \
 249    || ((_chr) == 'd') || ((_chr) == 'l'))
 250
 251 /* Read an SCSH block comment.  */
 252 static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
 253 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
 254 static SCM scm_read_commented_expression (scm_t_wchar, SCM, scm_t_read_opts *);
 255 static SCM scm_read_shebang (scm_t_wchar, SCM, scm_t_read_opts *);
 256 static SCM scm_get_hash_procedure (int);
 257
 258 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 259    result in the pre-allocated buffer BUF.  Return zero if the whole token has
 260    fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
 261    bytes actually read.  */
 262 static int
 263 read_token (SCM port, scm_t_read_opts *opts,
 264             char *buf, size_t buf_size, size_t *read)
 265 {
 266    *read = 0;
 267
 268    while (*read < buf_size)
 269      {
 270        int chr;
 271
 272        chr = scm_get_byte_or_eof (port);
 273
 274        if (chr == EOF)
 275         return 0;
 276       else if (CHAR_IS_DELIMITER (chr))
 277         {
 278           scm_unget_byte (chr, port);
 279           return 0;
 280         }
 281       else
 282         {
 283           *buf = (char) chr;
 284           buf++, (*read)++;
 285         }
 286      }
 287
 288    return 1;
 289  }
 290
 291 /* Like `read_token', but return either BUFFER, or a GC-allocated buffer
 292    if the token doesn't fit in BUFFER_SIZE bytes.  */
 293 static char *
 294 read_complete_token (SCM port, scm_t_read_opts *opts,
 295                      char *buffer, size_t buffer_size, size_t *read)
 296 {
 297   int overflow = 0;
 298   size_t bytes_read, overflow_size = 0;
 299   char *overflow_buffer = NULL;
 300
 301   do
 302     {
 303       overflow = read_token (port, opts, buffer, buffer_size, &bytes_read);
 304       if (bytes_read == 0)
 305         break;
 306       if (overflow || overflow_size != 0)
 307         {
 308           if (overflow_size == 0)
 309             {
 310               overflow_buffer = scm_gc_malloc_pointerless (bytes_read, "read");
 311               memcpy (overflow_buffer, buffer, bytes_read);
 312               overflow_size = bytes_read;
 313             }
 314           else
 315             {
 316               char *new_buf =
 317                 scm_gc_malloc_pointerless (overflow_size + bytes_read, "read");
 318
 319               memcpy (new_buf, overflow_buffer, overflow_size);
 320               memcpy (new_buf + overflow_size, buffer, bytes_read);
 321
 322               overflow_buffer = new_buf;
 323               overflow_size += bytes_read;
 324             }
 325         }
 326     }
 327   while (overflow);
 328
 329   if (overflow_size)
 330     *read = overflow_size;
 331   else
 332     *read = bytes_read;
 333
 334   return (overflow_size > 0 ? overflow_buffer : buffer);
 335 }
 336
 337 /* Skip whitespace from PORT and return the first non-whitespace character
 338    read.  Raise an error on end-of-file.  */
 339 static int
 340 flush_ws (SCM port, scm_t_read_opts *opts, const char *eoferr)
 341 {
 342   scm_t_wchar c;
 343   while (1)
 344     switch (c = scm_getc (port))
 345       {
 346       case EOF:
 347       goteof:
 348         if (eoferr)
 349           {
 350             scm_i_input_error (eoferr,
 351                                port,
 352                                "end of file",
 353                                SCM_EOL);
 354           }
 355         return c;
 356
 357       case ';':
 358       lp:
 359         switch (c = scm_getc (port))
 360           {
 361           case EOF:
 362             goto goteof;
 363           default:
 364             goto lp;
 365           case SCM_LINE_INCREMENTORS:
 366             break;
 367           }
 368         break;
 369
 370       case '#':
 371         switch (c = scm_getc (port))
 372           {
 373           case EOF:
 374             eoferr = "read_sharp";
 375             goto goteof;
 376           case '!':
 377             scm_read_shebang (c, port, opts);
 378             break;
 379           case ';':
 380             scm_read_commented_expression (c, port, opts);
 381             break;
 382           case '|':
 383             if (scm_is_false (scm_get_hash_procedure (c)))
 384               {
 385                 scm_read_r6rs_block_comment (c, port);
 386                 break;
 387               }
 388             /* fall through */
 389           default:
 390             scm_ungetc (c, port);
 391             return '#';
 392           }
 393         break;
 394
 395       case SCM_LINE_INCREMENTORS:
 396       case SCM_SINGLE_SPACES:
 397       case '\t':
 398         break;
 399
 400       default:
 401         return c;
 402       }
 403
 404   return 0;
 405 }
 406
 407
 408 \f
 409 /* Token readers.  */
 410
 411 static SCM scm_read_expression (SCM port, scm_t_read_opts *opts);
 412 static SCM scm_read_sharp (int chr, SCM port, scm_t_read_opts *opts,
 413                            long line, int column);
 414
 415
 416 static SCM
 417 maybe_annotate_source (SCM x, SCM port, scm_t_read_opts *opts,
 418                        long line, int column)
 419 {
 420   if (opts->record_positions_p)
 421     scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
 422   return x;
 423 }
 424
 425 static SCM
 426 scm_read_sexp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 427 #define FUNC_NAME "scm_i_lreadparen"
 428 {
 429   int c;
 430   SCM tmp, tl, ans = SCM_EOL;
 431   const int terminating_char = ((chr == '[') ? ']' : ')');
 432
 433   /* Need to capture line and column numbers here. */
 434   long line = SCM_LINUM (port);
 435   int column = SCM_COL (port) - 1;
 436
 437   c = flush_ws (port, opts, FUNC_NAME);
 438   if (terminating_char == c)
 439     return SCM_EOL;
 440
 441   scm_ungetc (c, port);
 442   tmp = scm_read_expression (port, opts);
 443
 444   /* Note that it is possible for scm_read_expression to return
 445      scm_sym_dot, but not as part of a dotted pair: as in #{.}#.  So
 446      check that it's a real dot by checking `c'.  */
 447   if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
 448     {
 449       ans = scm_read_expression (port, opts);
 450       if (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
 451         scm_i_input_error (FUNC_NAME, port, "missing close paren",
 452                            SCM_EOL);
 453       return ans;
 454     }
 455
 456   /* Build the head of the list structure. */
 457   ans = tl = scm_cons (tmp, SCM_EOL);
 458
 459   while (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
 460     {
 461       SCM new_tail;
 462
 463       if (c == ')' || (c == ']' && opts->square_brackets_p))
 464         scm_i_input_error (FUNC_NAME, port,
 465                            "in pair: mismatched close paren: ~A",
 466                            scm_list_1 (SCM_MAKE_CHAR (c)));
 467
 468       scm_ungetc (c, port);
 469       tmp = scm_read_expression (port, opts);
 470
 471       /* See above note about scm_sym_dot.  */
 472       if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
 473         {
 474           SCM_SETCDR (tl, scm_read_expression (port, opts));
 475
 476           c = flush_ws (port, opts, FUNC_NAME);
 477           if (terminating_char != c)
 478             scm_i_input_error (FUNC_NAME, port,
 479                                "in pair: missing close paren", SCM_EOL);
 480           goto exit;
 481         }
 482
 483       new_tail = scm_cons (tmp, SCM_EOL);
 484       SCM_SETCDR (tl, new_tail);
 485       tl = new_tail;
 486     }
 487
 488  exit:
 489   return maybe_annotate_source (ans, port, opts, line, column);
 490 }
 491 #undef FUNC_NAME
 492
 493
 494 /* Read a hexadecimal number NDIGITS in length.  Put its value into the variable
 495    C.  If TERMINATOR is non-null, terminate early if the TERMINATOR character is
 496    found.  */
 497 #define SCM_READ_HEX_ESCAPE(ndigits, terminator)                   \
 498   do                                                               \
 499     {                                                              \
 500       scm_t_wchar a;                                               \
 501       size_t i = 0;                                                \
 502       c = 0;                                                       \
 503       while (i < ndigits)                                          \
 504         {                                                          \
 505           a = scm_getc (port);                                     \
 506           if (a == EOF)                                            \
 507             goto str_eof;                                          \
 508           if (terminator                                           \
 509               && (a == (scm_t_wchar) terminator)                   \
 510               && (i > 0))                                          \
 511             break;                                                 \
 512           if ('0' <= a && a <= '9')                                \
 513             a -= '0';                                              \
 514           else if ('A' <= a && a <= 'F')                           \
 515             a = a - 'A' + 10;                                      \
 516           else if ('a' <= a && a <= 'f')                           \
 517             a = a - 'a' + 10;                                      \
 518           else                                                     \
 519             {                                                      \
 520               c = a;                                               \
 521               goto bad_escaped;                                    \
 522             }                                                      \
 523           c = c * 16 + a;                                          \
 524           i ++;                                                    \
 525         }                                                          \
 526     } while (0)
 527
 528 static void
 529 skip_intraline_whitespace (SCM port)
 530 {
 531   scm_t_wchar c;
 532
 533   do
 534     {
 535       c = scm_getc (port);
 536       if (c == EOF)
 537         return;
 538     }
 539   while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
 540
 541   scm_ungetc (c, port);
 542 }
 543
 544 static SCM
 545 scm_read_string (int chr, SCM port, scm_t_read_opts *opts)
 546 #define FUNC_NAME "scm_lreadr"
 547 {
 548   /* For strings smaller than C_STR, this function creates only one Scheme
 549      object (the string returned).  */
 550
 551   SCM str = SCM_EOL;
 552   size_t c_str_len = 0;
 553   scm_t_wchar c, c_str[READER_STRING_BUFFER_SIZE];
 554
 555   /* Need to capture line and column numbers here. */
 556   long line = SCM_LINUM (port);
 557   int column = SCM_COL (port) - 1;
 558
 559   while ('"' != (c = scm_getc (port)))
 560     {
 561       if (c == EOF)
 562         {
 563         str_eof:
 564           scm_i_input_error (FUNC_NAME, port,
 565                              "end of file in string constant", SCM_EOL);
 566         }
 567
 568       if (c_str_len + 1 >= READER_STRING_BUFFER_SIZE)
 569         {
 570           str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
 571           c_str_len = 0;
 572         }
 573
 574       if (c == '\\')
 575         {
 576           switch (c = scm_getc (port))
 577             {
 578             case EOF:
 579               goto str_eof;
 580             case '"':
 581             case '\\':
 582               break;
 583             case '\n':
 584               if (opts->hungry_eol_escapes_p)
 585                 skip_intraline_whitespace (port);
 586               continue;
 587             case '0':
 588               c = '\0';
 589               break;
 590             case 'f':
 591               c = '\f';
 592               break;
 593             case 'n':
 594               c = '\n';
 595               break;
 596             case 'r':
 597               c = '\r';
 598               break;
 599             case 't':
 600               c = '\t';
 601               break;
 602             case 'a':
 603               c = '\007';
 604               break;
 605             case 'v':
 606               c = '\v';
 607               break;
 608             case 'b':
 609               c = '\010';
 610               break;
 611             case 'x':
 612               if (opts->r6rs_escapes_p)
 613                 SCM_READ_HEX_ESCAPE (10, ';');
 614               else
 615                 SCM_READ_HEX_ESCAPE (2, '\0');
 616               break;
 617             case 'u':
 618               if (!opts->r6rs_escapes_p)
 619                 {
 620                   SCM_READ_HEX_ESCAPE (4, '\0');
 621                   break;
 622                 }
 623             case 'U':
 624               if (!opts->r6rs_escapes_p)
 625                 {
 626                   SCM_READ_HEX_ESCAPE (6, '\0');
 627                   break;
 628                 }
 629             default:
 630             bad_escaped:
 631               scm_i_input_error (FUNC_NAME, port,
 632                                  "illegal character in escape sequence: ~S",
 633                                  scm_list_1 (SCM_MAKE_CHAR (c)));
 634             }
 635         }
 636
 637       c_str[c_str_len++] = c;
 638     }
 639
 640   if (scm_is_null (str))
 641     /* Fast path: we got a string that fits in C_STR.  */
 642     str = scm_from_utf32_stringn (c_str, c_str_len);
 643   else
 644     {
 645       if (c_str_len > 0)
 646         str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
 647
 648       str = scm_string_concatenate_reverse (str, SCM_UNDEFINED, SCM_UNDEFINED);
 649     }
 650
 651   return maybe_annotate_source (str, port, opts, line, column);
 652 }
 653 #undef FUNC_NAME
 654
 655
 656 static SCM
 657 scm_read_number (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 658 {
 659   SCM result, str = SCM_EOL;
 660   char local_buffer[READER_BUFFER_SIZE], *buffer;
 661   size_t bytes_read;
 662   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 663
 664   /* Need to capture line and column numbers here. */
 665   long line = SCM_LINUM (port);
 666   int column = SCM_COL (port) - 1;
 667
 668   scm_ungetc (chr, port);
 669   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 670                                 &bytes_read);
 671
 672   str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
 673
 674   result = scm_string_to_number (str, SCM_UNDEFINED);
 675   if (scm_is_false (result))
 676     {
 677       /* Return a symbol instead of a number */
 678       if (opts->case_insensitive_p)
 679         str = scm_string_downcase_x (str);
 680       result = scm_string_to_symbol (str);
 681     }
 682   else if (SCM_NIMP (result))
 683     result = maybe_annotate_source (result, port, opts, line, column);
 684
 685   SCM_COL (port) += scm_i_string_length (str);
 686   return result;
 687 }
 688
 689 static SCM
 690 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 691 {
 692   SCM result;
 693   int ends_with_colon = 0;
 694   size_t bytes_read;
 695   int postfix = (opts->keyword_style == KEYWORD_STYLE_POSTFIX);
 696   char local_buffer[READER_BUFFER_SIZE], *buffer;
 697   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 698   SCM str;
 699
 700   scm_ungetc (chr, port);
 701   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 702                                 &bytes_read);
 703   if (bytes_read > 0)
 704     ends_with_colon = buffer[bytes_read - 1] == ':';
 705
 706   if (postfix && ends_with_colon && (bytes_read > 1))
 707     {
 708       str = scm_from_stringn (buffer, bytes_read - 1,
 709                               pt->encoding, pt->ilseq_handler);
 710
 711       if (opts->case_insensitive_p)
 712         str = scm_string_downcase_x (str);
 713       result = scm_symbol_to_keyword (scm_string_to_symbol (str));
 714     }
 715   else
 716     {
 717       str = scm_from_stringn (buffer, bytes_read,
 718                               pt->encoding, pt->ilseq_handler);
 719
 720       if (opts->case_insensitive_p)
 721         str = scm_string_downcase_x (str);
 722       result = scm_string_to_symbol (str);
 723     }
 724
 725   SCM_COL (port) += scm_i_string_length (str);
 726   return result;
 727 }
 728
 729 static SCM
 730 scm_read_number_and_radix (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 731 #define FUNC_NAME "scm_lreadr"
 732 {
 733   SCM result;
 734   size_t read;
 735   char local_buffer[READER_BUFFER_SIZE], *buffer;
 736   unsigned int radix;
 737   SCM str;
 738   scm_t_port *pt;
 739
 740   switch (chr)
 741     {
 742     case 'B':
 743     case 'b':
 744       radix = 2;
 745       break;
 746
 747     case 'o':
 748     case 'O':
 749       radix = 8;
 750       break;
 751
 752     case 'd':
 753     case 'D':
 754       radix = 10;
 755       break;
 756
 757     case 'x':
 758     case 'X':
 759       radix = 16;
 760       break;
 761
 762     default:
 763       scm_ungetc (chr, port);
 764       scm_ungetc ('#', port);
 765       radix = 10;
 766     }
 767
 768   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 769                                 &read);
 770
 771   pt = SCM_PTAB_ENTRY (port);
 772   str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
 773
 774   result = scm_string_to_number (str, scm_from_uint (radix));
 775
 776   SCM_COL (port) += scm_i_string_length (str);
 777
 778   if (scm_is_true (result))
 779     return result;
 780
 781   scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
 782
 783   return SCM_BOOL_F;
 784 }
 785 #undef FUNC_NAME
 786
 787 static SCM
 788 scm_read_quote (int chr, SCM port, scm_t_read_opts *opts)
 789 {
 790   SCM p;
 791   long line = SCM_LINUM (port);
 792   int column = SCM_COL (port) - 1;
 793
 794   switch (chr)
 795     {
 796     case '`':
 797       p = scm_sym_quasiquote;
 798       break;
 799
 800     case '\'':
 801       p = scm_sym_quote;
 802       break;
 803
 804     case ',':
 805       {
 806         scm_t_wchar c;
 807
 808         c = scm_getc (port);
 809         if ('@' == c)
 810           p = scm_sym_uq_splicing;
 811         else
 812           {
 813             scm_ungetc (c, port);
 814             p = scm_sym_unquote;
 815           }
 816         break;
 817       }
 818
 819     default:
 820       fprintf (stderr, "%s: unhandled quote character (%i)\n",
 821                "scm_read_quote", chr);
 822       abort ();
 823     }
 824
 825   p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
 826   return maybe_annotate_source (p, port, opts, line, column);
 827 }
 828
 829 SCM_SYMBOL (sym_syntax, "syntax");
 830 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
 831 SCM_SYMBOL (sym_unsyntax, "unsyntax");
 832 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
 833
 834 static SCM
 835 scm_read_syntax (int chr, SCM port, scm_t_read_opts *opts)
 836 {
 837   SCM p;
 838   long line = SCM_LINUM (port);
 839   int column = SCM_COL (port) - 1;
 840
 841   switch (chr)
 842     {
 843     case '`':
 844       p = sym_quasisyntax;
 845       break;
 846
 847     case '\'':
 848       p = sym_syntax;
 849       break;
 850
 851     case ',':
 852       {
 853         int c;
 854
 855         c = scm_getc (port);
 856         if ('@' == c)
 857           p = sym_unsyntax_splicing;
 858         else
 859           {
 860             scm_ungetc (c, port);
 861             p = sym_unsyntax;
 862           }
 863         break;
 864       }
 865
 866     default:
 867       fprintf (stderr, "%s: unhandled syntax character (%i)\n",
 868                "scm_read_syntax", chr);
 869       abort ();
 870     }
 871
 872   p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
 873   return maybe_annotate_source (p, port, opts, line, column);
 874 }
 875
 876 static SCM
 877 scm_read_nil (int chr, SCM port, scm_t_read_opts *opts)
 878 {
 879   SCM id = scm_read_mixed_case_symbol (chr, port, opts);
 880
 881   if (!scm_is_eq (id, sym_nil))
 882     scm_i_input_error ("scm_read_nil", port,
 883                        "unexpected input while reading #nil: ~a",
 884                        scm_list_1 (id));
 885
 886   return SCM_ELISP_NIL;
 887 }
 888
 889 static SCM
 890 scm_read_semicolon_comment (int chr, SCM port)
 891 {
 892   int c;
 893
 894   /* We use the get_byte here because there is no need to get the
 895      locale correct with comment input. This presumes that newline
 896      always represents itself no matter what the encoding is.  */
 897   for (c = scm_get_byte_or_eof (port);
 898        (c != EOF) && (c != '\n');
 899        c = scm_get_byte_or_eof (port));
 900
 901   return SCM_UNSPECIFIED;
 902 }
 903
 904 \f
 905 /* Sharp readers, i.e. readers called after a `#' sign has been read.  */
 906
 907 static SCM
 908 scm_read_boolean (int chr, SCM port)
 909 {
 910   switch (chr)
 911     {
 912     case 't':
 913     case 'T':
 914       return SCM_BOOL_T;
 915
 916     case 'f':
 917     case 'F':
 918       return SCM_BOOL_F;
 919     }
 920
 921   return SCM_UNSPECIFIED;
 922 }
 923
 924 static SCM
 925 scm_read_character (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 926 #define FUNC_NAME "scm_lreadr"
 927 {
 928   char buffer[READER_CHAR_NAME_MAX_SIZE];
 929   SCM charname;
 930   size_t charname_len, bytes_read;
 931   scm_t_wchar cp;
 932   int overflow;
 933   scm_t_port *pt;
 934
 935   overflow = read_token (port, opts, buffer, READER_CHAR_NAME_MAX_SIZE,
 936                          &bytes_read);
 937   if (overflow)
 938     scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
 939
 940   if (bytes_read == 0)
 941     {
 942       chr = scm_getc (port);
 943       if (chr == EOF)
 944         scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
 945                            "while reading character", SCM_EOL);
 946
 947       /* CHR must be a token delimiter, like a whitespace.  */
 948       return (SCM_MAKE_CHAR (chr));
 949     }
 950
 951   pt = SCM_PTAB_ENTRY (port);
 952
 953   /* Simple ASCII characters can be processed immediately.  Also, simple
 954      ISO-8859-1 characters can be processed immediately if the encoding for this
 955      port is ISO-8859-1.  */
 956   if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
 957     {
 958       SCM_COL (port) += 1;
 959       return SCM_MAKE_CHAR (buffer[0]);
 960     }
 961
 962   /* Otherwise, convert the buffer into a proper scheme string for
 963      processing.  */
 964   charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
 965                                pt->ilseq_handler);
 966   charname_len = scm_i_string_length (charname);
 967   SCM_COL (port) += charname_len;
 968   cp = scm_i_string_ref (charname, 0);
 969   if (charname_len == 1)
 970     return SCM_MAKE_CHAR (cp);
 971
 972   /* Ignore dotted circles, which may be used to keep combining characters from
 973      combining with the backslash in #\charname.  */
 974   if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
 975     return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
 976
 977   if (cp >= '0' && cp < '8')
 978     {
 979       /* Dirk:FIXME::  This type of character syntax is not R5RS
 980        * compliant.  Further, it should be verified that the constant
 981        * does only consist of octal digits.  */
 982       SCM p = scm_string_to_number (charname, scm_from_uint (8));
 983       if (SCM_I_INUMP (p))
 984         {
 985           scm_t_wchar c = scm_to_uint32 (p);
 986           if (SCM_IS_UNICODE_CHAR (c))
 987             return SCM_MAKE_CHAR (c);
 988           else
 989             scm_i_input_error (FUNC_NAME, port,
 990                                "out-of-range octal character escape: ~a",
 991                                scm_list_1 (charname));
 992         }
 993     }
 994
 995   if (cp == 'x' && (charname_len > 1))
 996     {
 997       SCM p;
 998
 999       /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1000       p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
1001                                 scm_from_uint (16));
1002       if (SCM_I_INUMP (p))
1003         {
1004           scm_t_wchar c = scm_to_uint32 (p);
1005           if (SCM_IS_UNICODE_CHAR (c))
1006             return SCM_MAKE_CHAR (c);
1007           else
1008             scm_i_input_error (FUNC_NAME, port,
1009                                "out-of-range hex character escape: ~a",
1010                                scm_list_1 (charname));
1011         }
1012     }
1013
1014   /* The names of characters should never have non-Latin1
1015      characters.  */
1016   if (scm_i_is_narrow_string (charname)
1017       || scm_i_try_narrow_string (charname))
1018     { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
1019                                        charname_len);
1020       if (scm_is_true (ch))
1021         return ch;
1022     }
1023
1024   scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
1025                      scm_list_1 (charname));
1026
1027   return SCM_UNSPECIFIED;
1028 }
1029 #undef FUNC_NAME
1030
1031 static SCM
1032 scm_read_keyword (int chr, SCM port, scm_t_read_opts *opts)
1033 {
1034   SCM symbol;
1035
1036   /* Read the symbol that comprises the keyword.  Doing this instead of
1037      invoking a specific symbol reader function allows `scm_read_keyword ()'
1038      to adapt to the delimiters currently valid of symbols.
1039
1040      XXX: This implementation allows sloppy syntaxes like `#:  key'.  */
1041   symbol = scm_read_expression (port, opts);
1042   if (!scm_is_symbol (symbol))
1043     scm_i_input_error ("scm_read_keyword", port,
1044                        "keyword prefix `~a' not followed by a symbol: ~s",
1045                        scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1046
1047   return (scm_symbol_to_keyword (symbol));
1048 }
1049
1050 static SCM
1051 scm_read_vector (int chr, SCM port, scm_t_read_opts *opts,
1052                  long line, int column)
1053 {
1054   /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1055      guarantee that it's going to do what we want.  After all, this is an
1056      implementation detail of `scm_read_vector ()', not a desirable
1057      property.  */
1058   return maybe_annotate_source (scm_vector (scm_read_sexp (chr, port, opts)),
1059                                 port, opts, line, column);
1060 }
1061
1062 /* Helper used by scm_read_array */
1063 static int
1064 read_decimal_integer (SCM port, int c, ssize_t *resp)
1065 {
1066   ssize_t sign = 1;
1067   ssize_t res = 0;
1068   int got_it = 0;
1069
1070   if (c == '-')
1071     {
1072       sign = -1;
1073       c = scm_getc (port);
1074     }
1075
1076   while ('0' <= c && c <= '9')
1077     {
1078       res = 10*res + c-'0';
1079       got_it = 1;
1080       c = scm_getc (port);
1081     }
1082
1083   if (got_it)
1084     *resp = sign * res;
1085   return c;
1086 }
1087
1088 /* Read an array.  This function can also read vectors and uniform
1089    vectors.  Also, the conflict between '#f' and '#f32' and '#f64' is
1090    handled here.
1091
1092    C is the first character read after the '#'.
1093 */
1094 static SCM
1095 scm_read_array (int c, SCM port, scm_t_read_opts *opts, long line, int column)
1096 {
1097   ssize_t rank;
1098   scm_t_wchar tag_buf[8];
1099   int tag_len;
1100
1101   SCM tag, shape = SCM_BOOL_F, elements, array;
1102
1103   /* XXX - shortcut for ordinary vectors.  Shouldn't be necessary but
1104      the array code can not deal with zero-length dimensions yet, and
1105      we want to allow zero-length vectors, of course.
1106   */
1107   if (c == '(')
1108     return scm_read_vector (c, port, opts, line, column);
1109
1110   /* Disambiguate between '#f' and uniform floating point vectors.
1111    */
1112   if (c == 'f')
1113     {
1114       c = scm_getc (port);
1115       if (c != '3' && c != '6')
1116         {
1117           if (c != EOF)
1118             scm_ungetc (c, port);
1119           return SCM_BOOL_F;
1120         }
1121       rank = 1;
1122       tag_buf[0] = 'f';
1123       tag_len = 1;
1124       goto continue_reading_tag;
1125     }
1126
1127   /* Read rank. */
1128   rank = 1;
1129   c = read_decimal_integer (port, c, &rank);
1130   if (rank < 0)
1131     scm_i_input_error (NULL, port, "array rank must be non-negative",
1132                        SCM_EOL);
1133
1134   /* Read tag. */
1135   tag_len = 0;
1136  continue_reading_tag:
1137   while (c != EOF && c != '(' && c != '@' && c != ':'
1138          && tag_len < sizeof tag_buf / sizeof tag_buf[0])
1139     {
1140       tag_buf[tag_len++] = c;
1141       c = scm_getc (port);
1142     }
1143   if (tag_len == 0)
1144     tag = SCM_BOOL_T;
1145   else
1146     {
1147       tag = scm_string_to_symbol (scm_from_utf32_stringn (tag_buf, tag_len));
1148       if (tag_len == sizeof tag_buf / sizeof tag_buf[0])
1149         scm_i_input_error (NULL, port, "invalid array tag, starting with: ~a",
1150                            scm_list_1 (tag));
1151     }
1152
1153   /* Read shape. */
1154   if (c == '@' || c == ':')
1155     {
1156       shape = SCM_EOL;
1157
1158       do
1159         {
1160           ssize_t lbnd = 0, len = 0;
1161           SCM s;
1162
1163           if (c == '@')
1164             {
1165               c = scm_getc (port);
1166               c = read_decimal_integer (port, c, &lbnd);
1167             }
1168
1169           s = scm_from_ssize_t (lbnd);
1170
1171           if (c == ':')
1172             {
1173               c = scm_getc (port);
1174               c = read_decimal_integer (port, c, &len);
1175               if (len < 0)
1176                 scm_i_input_error (NULL, port,
1177                                    "array length must be non-negative",
1178                                    SCM_EOL);
1179
1180               s = scm_list_2 (s, scm_from_ssize_t (lbnd+len-1));
1181             }
1182
1183           shape = scm_cons (s, shape);
1184         } while (c == '@' || c == ':');
1185
1186       shape = scm_reverse_x (shape, SCM_EOL);
1187     }
1188
1189   /* Read nested lists of elements. */
1190   if (c != '(')
1191     scm_i_input_error (NULL, port,
1192                        "missing '(' in vector or array literal",
1193                        SCM_EOL);
1194   elements = scm_read_sexp (c, port, opts);
1195
1196   if (scm_is_false (shape))
1197     shape = scm_from_ssize_t (rank);
1198   else if (scm_ilength (shape) != rank)
1199     scm_i_input_error
1200       (NULL, port,
1201        "the number of shape specifications must match the array rank",
1202        SCM_EOL);
1203
1204   /* Handle special print syntax of rank zero arrays; see
1205      scm_i_print_array for a rationale. */
1206   if (rank == 0)
1207     {
1208       if (!scm_is_pair (elements))
1209         scm_i_input_error (NULL, port,
1210                            "too few elements in array literal, need 1",
1211                            SCM_EOL);
1212       if (!scm_is_null (SCM_CDR (elements)))
1213         scm_i_input_error (NULL, port,
1214                            "too many elements in array literal, want 1",
1215                            SCM_EOL);
1216       elements = SCM_CAR (elements);
1217     }
1218
1219   /* Construct array, annotate with source location, and return. */
1220   array = scm_list_to_typed_array (tag, shape, elements);
1221   return maybe_annotate_source (array, port, opts, line, column);
1222 }
1223
1224 static SCM
1225 scm_read_srfi4_vector (int chr, SCM port, scm_t_read_opts *opts,
1226                        long line, int column)
1227 {
1228   return scm_read_array (chr, port, opts, line, column);
1229 }
1230
1231 static SCM
1232 scm_read_bytevector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1233                      long line, int column)
1234 {
1235   chr = scm_getc (port);
1236   if (chr != 'u')
1237     goto syntax;
1238
1239   chr = scm_getc (port);
1240   if (chr != '8')
1241     goto syntax;
1242
1243   chr = scm_getc (port);
1244   if (chr != '(')
1245     goto syntax;
1246
1247   return maybe_annotate_source
1248     (scm_u8_list_to_bytevector (scm_read_sexp (chr, port, opts)),
1249      port, opts, line, column);
1250
1251  syntax:
1252   scm_i_input_error ("read_bytevector", port,
1253                      "invalid bytevector prefix",
1254                      SCM_MAKE_CHAR (chr));
1255   return SCM_UNSPECIFIED;
1256 }
1257
1258 static SCM
1259 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1260                            long line, int column)
1261 {
1262   /* Read the `#*10101'-style read syntax for bit vectors in Guile.  This is
1263      terribly inefficient but who cares?  */
1264   SCM s_bits = SCM_EOL;
1265
1266   for (chr = scm_getc (port);
1267        (chr != EOF) && ((chr == '0') || (chr == '1'));
1268        chr = scm_getc (port))
1269     {
1270       s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1271     }
1272
1273   if (chr != EOF)
1274     scm_ungetc (chr, port);
1275
1276   return maybe_annotate_source
1277     (scm_bitvector (scm_reverse_x (s_bits, SCM_EOL)),
1278      port, opts, line, column);
1279 }
1280
1281 static SCM
1282 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1283 {
1284   int bang_seen = 0;
1285
1286   for (;;)
1287     {
1288       int c = scm_getc (port);
1289
1290       if (c == EOF)
1291         scm_i_input_error ("skip_block_comment", port,
1292                            "unterminated `#! ... !#' comment", SCM_EOL);
1293
1294       if (c == '!')
1295         bang_seen = 1;
1296       else if (c == '#' && bang_seen)
1297         break;
1298       else
1299         bang_seen = 0;
1300     }
1301
1302   return SCM_UNSPECIFIED;
1303 }
1304
1305 static SCM
1306 scm_read_shebang (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
1307 {
1308   int c = 0;
1309   if ((c = scm_get_byte_or_eof (port)) != 'r')
1310     {
1311       scm_ungetc (c, port);
1312       return scm_read_scsh_block_comment (chr, port);
1313     }
1314   if ((c = scm_get_byte_or_eof (port)) != '6')
1315     {
1316       scm_ungetc (c, port);
1317       scm_ungetc ('r', port);
1318       return scm_read_scsh_block_comment (chr, port);
1319     }
1320   if ((c = scm_get_byte_or_eof (port)) != 'r')
1321     {
1322       scm_ungetc (c, port);
1323       scm_ungetc ('6', port);
1324       scm_ungetc ('r', port);
1325       return scm_read_scsh_block_comment (chr, port);
1326     }
1327   if ((c = scm_get_byte_or_eof (port)) != 's')
1328     {
1329       scm_ungetc (c, port);
1330       scm_ungetc ('r', port);
1331       scm_ungetc ('6', port);
1332       scm_ungetc ('r', port);
1333       return scm_read_scsh_block_comment (chr, port);
1334     }
1335
1336   return SCM_UNSPECIFIED;
1337 }
1338
1339 static SCM
1340 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1341 {
1342   /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1343      nested.  So care must be taken.  */
1344   int nesting_level = 1;
1345
1346   int a = scm_getc (port);
1347
1348   if (a == EOF)
1349     scm_i_input_error ("scm_read_r6rs_block_comment", port,
1350                        "unterminated `#| ... |#' comment", SCM_EOL);
1351
1352   while (nesting_level > 0)
1353     {
1354       int b = scm_getc (port);
1355
1356       if (b == EOF)
1357         scm_i_input_error ("scm_read_r6rs_block_comment", port,
1358                            "unterminated `#| ... |#' comment", SCM_EOL);
1359
1360       if (a == '|' && b == '#')
1361         {
1362           nesting_level--;
1363           b = EOF;
1364         }
1365       else if (a == '#' && b == '|')
1366         {
1367           nesting_level++;
1368           b = EOF;
1369         }
1370
1371       a = b;
1372     }
1373
1374   return SCM_UNSPECIFIED;
1375 }
1376
1377 static SCM
1378 scm_read_commented_expression (scm_t_wchar chr, SCM port,
1379                                scm_t_read_opts *opts)
1380 {
1381   scm_t_wchar c;
1382
1383   c = flush_ws (port, opts, (char *) NULL);
1384   if (EOF == c)
1385     scm_i_input_error ("read_commented_expression", port,
1386                        "no expression after #; comment", SCM_EOL);
1387   scm_ungetc (c, port);
1388   scm_read_expression (port, opts);
1389   return SCM_UNSPECIFIED;
1390 }
1391
1392 static SCM
1393 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1394 {
1395   /* Guile's extended symbol read syntax looks like this:
1396
1397        #{This is all a symbol name}#
1398
1399      So here, CHR is expected to be `{'.  */
1400   int saw_brace = 0;
1401   size_t len = 0;
1402   SCM buf = scm_i_make_string (1024, NULL, 0);
1403
1404   buf = scm_i_string_start_writing (buf);
1405
1406   while ((chr = scm_getc (port)) != EOF)
1407     {
1408       if (saw_brace)
1409         {
1410           if (chr == '#')
1411             {
1412               break;
1413             }
1414           else
1415             {
1416               saw_brace = 0;
1417               scm_i_string_set_x (buf, len++, '}');
1418             }
1419         }
1420
1421       if (chr == '}')
1422         saw_brace = 1;
1423       else if (chr == '\\')
1424         {
1425           /* It used to be that print.c would print extended-read-syntax
1426              symbols with backslashes before "non-standard" chars, but
1427              this routine wouldn't do anything with those escapes.
1428              Bummer.  What we've done is to change print.c to output
1429              R6RS hex escapes for those characters, relying on the fact
1430              that the extended read syntax would never put a `\' before
1431              an `x'.  For now, we just ignore other instances of
1432              backslash in the string.  */
1433           switch ((chr = scm_getc (port)))
1434             {
1435             case EOF:
1436               goto done;
1437             case 'x':
1438               {
1439                 scm_t_wchar c;
1440
1441                 SCM_READ_HEX_ESCAPE (10, ';');
1442                 scm_i_string_set_x (buf, len++, c);
1443                 break;
1444
1445               str_eof:
1446                 chr = EOF;
1447                 goto done;
1448
1449               bad_escaped:
1450                 scm_i_string_stop_writing ();
1451                 scm_i_input_error ("scm_read_extended_symbol", port,
1452                                    "illegal character in escape sequence: ~S",
1453                                    scm_list_1 (SCM_MAKE_CHAR (c)));
1454                 break;
1455               }
1456             default:
1457               scm_i_string_set_x (buf, len++, chr);
1458               break;
1459             }
1460         }
1461       else
1462         scm_i_string_set_x (buf, len++, chr);
1463
1464       if (len >= scm_i_string_length (buf) - 2)
1465         {
1466           SCM addy;
1467
1468           scm_i_string_stop_writing ();
1469           addy = scm_i_make_string (1024, NULL, 0);
1470           buf = scm_string_append (scm_list_2 (buf, addy));
1471           len = 0;
1472           buf = scm_i_string_start_writing (buf);
1473         }
1474     }
1475
1476  done:
1477   scm_i_string_stop_writing ();
1478   if (chr == EOF)
1479     scm_i_input_error ("scm_read_extended_symbol", port,
1480                        "end of file while reading symbol", SCM_EOL);
1481
1482   return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1483 }
1484
1485
1486 \f
1487 /* Top-level token readers, i.e., dispatchers.  */
1488
1489 static SCM
1490 scm_read_sharp_extension (int chr, SCM port, scm_t_read_opts *opts)
1491 {
1492   SCM proc;
1493
1494   proc = scm_get_hash_procedure (chr);
1495   if (scm_is_true (scm_procedure_p (proc)))
1496     {
1497       long line = SCM_LINUM (port);
1498       int column = SCM_COL (port) - 2;
1499       SCM got;
1500
1501       got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1502
1503       if (opts->record_positions_p && SCM_NIMP (got)
1504           && !scm_i_has_source_properties (got))
1505         scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1506
1507       return got;
1508     }
1509
1510   return SCM_UNSPECIFIED;
1511 }
1512
1513 /* The reader for the sharp `#' character.  It basically dispatches reads
1514    among the above token readers.   */
1515 static SCM
1516 scm_read_sharp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1517                 long line, int column)
1518 #define FUNC_NAME "scm_lreadr"
1519 {
1520   SCM result;
1521
1522   chr = scm_getc (port);
1523
1524   result = scm_read_sharp_extension (chr, port, opts);
1525   if (!scm_is_eq (result, SCM_UNSPECIFIED))
1526     return result;
1527
1528   switch (chr)
1529     {
1530     case '\\':
1531       return (scm_read_character (chr, port, opts));
1532     case '(':
1533       return (scm_read_vector (chr, port, opts, line, column));
1534     case 's':
1535     case 'u':
1536     case 'f':
1537     case 'c':
1538       /* This one may return either a boolean or an SRFI-4 vector.  */
1539       return (scm_read_srfi4_vector (chr, port, opts, line, column));
1540     case 'v':
1541       return (scm_read_bytevector (chr, port, opts, line, column));
1542     case '*':
1543       return (scm_read_guile_bit_vector (chr, port, opts, line, column));
1544     case 't':
1545     case 'T':
1546     case 'F':
1547       return (scm_read_boolean (chr, port));
1548     case ':':
1549       return (scm_read_keyword (chr, port, opts));
1550     case '0': case '1': case '2': case '3': case '4':
1551     case '5': case '6': case '7': case '8': case '9':
1552     case '@':
1553 #if SCM_ENABLE_DEPRECATED
1554       /* See below for 'i' and 'e'. */
1555     case 'a':
1556     case 'y':
1557     case 'h':
1558     case 'l':
1559 #endif
1560       return (scm_read_array (chr, port, opts, line, column));
1561
1562     case 'i':
1563     case 'e':
1564 #if SCM_ENABLE_DEPRECATED
1565       {
1566         /* When next char is '(', it really is an old-style
1567            uniform array. */
1568         scm_t_wchar next_c = scm_getc (port);
1569         if (next_c != EOF)
1570           scm_ungetc (next_c, port);
1571         if (next_c == '(')
1572           return scm_read_array (chr, port, opts, line, column);
1573         /* Fall through. */
1574       }
1575 #endif
1576     case 'b':
1577     case 'B':
1578     case 'o':
1579     case 'O':
1580     case 'd':
1581     case 'D':
1582     case 'x':
1583     case 'X':
1584     case 'I':
1585     case 'E':
1586       return (scm_read_number_and_radix (chr, port, opts));
1587     case '{':
1588       return (scm_read_extended_symbol (chr, port));
1589     case '!':
1590       return (scm_read_shebang (chr, port, opts));
1591     case ';':
1592       return (scm_read_commented_expression (chr, port, opts));
1593     case '`':
1594     case '\'':
1595     case ',':
1596       return (scm_read_syntax (chr, port, opts));
1597     case 'n':
1598       return (scm_read_nil (chr, port, opts));
1599     default:
1600       result = scm_read_sharp_extension (chr, port, opts);
1601       if (scm_is_eq (result, SCM_UNSPECIFIED))
1602         {
1603           /* To remain compatible with 1.8 and earlier, the following
1604              characters have lower precedence than `read-hash-extend'
1605              characters.  */
1606           switch (chr)
1607             {
1608             case '|':
1609               return scm_read_r6rs_block_comment (chr, port);
1610             default:
1611               scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1612                                  scm_list_1 (SCM_MAKE_CHAR (chr)));
1613             }
1614         }
1615       else
1616         return result;
1617     }
1618
1619   return SCM_UNSPECIFIED;
1620 }
1621 #undef FUNC_NAME
1622
1623 static SCM
1624 scm_read_expression (SCM port, scm_t_read_opts *opts)
1625 #define FUNC_NAME "scm_read_expression"
1626 {
1627   while (1)
1628     {
1629       scm_t_wchar chr;
1630
1631       chr = scm_getc (port);
1632
1633       switch (chr)
1634         {
1635         case SCM_WHITE_SPACES:
1636         case SCM_LINE_INCREMENTORS:
1637           break;
1638         case ';':
1639           (void) scm_read_semicolon_comment (chr, port);
1640           break;
1641         case '[':
1642           if (!opts->square_brackets_p)
1643             return (scm_read_mixed_case_symbol (chr, port, opts));
1644           /* otherwise fall through */
1645         case '(':
1646           return (scm_read_sexp (chr, port, opts));
1647         case '"':
1648           return (scm_read_string (chr, port, opts));
1649         case '\'':
1650         case '`':
1651         case ',':
1652           return (scm_read_quote (chr, port, opts));
1653         case '#':
1654           {
1655             long line  = SCM_LINUM (port);
1656             int column = SCM_COL (port) - 1;
1657             SCM result = scm_read_sharp (chr, port, opts, line, column);
1658             if (scm_is_eq (result, SCM_UNSPECIFIED))
1659               /* We read a comment or some such.  */
1660               break;
1661             else
1662               return result;
1663           }
1664         case ')':
1665           scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1666           break;
1667         case ']':
1668           if (opts->square_brackets_p)
1669             scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1670           /* otherwise fall through */
1671         case EOF:
1672           return SCM_EOF_VAL;
1673         case ':':
1674           if (opts->keyword_style == KEYWORD_STYLE_PREFIX)
1675             return scm_symbol_to_keyword (scm_read_expression (port, opts));
1676           /* Fall through.  */
1677
1678         default:
1679           {
1680             if (((chr >= '0') && (chr <= '9'))
1681                 || (strchr ("+-.", chr)))
1682               return (scm_read_number (chr, port, opts));
1683             else
1684               return (scm_read_mixed_case_symbol (chr, port, opts));
1685           }
1686         }
1687     }
1688 }
1689 #undef FUNC_NAME
1690
1691 \f
1692 /* Actual reader.  */
1693
1694 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1695             (SCM port),
1696             "Read an s-expression from the input port @var{port}, or from\n"
1697             "the current input port if @var{port} is not specified.\n"
1698             "Any whitespace before the next token is discarded.")
1699 #define FUNC_NAME s_scm_read
1700 {
1701   scm_t_read_opts opts;
1702   int c;
1703
1704   if (SCM_UNBNDP (port))
1705     port = scm_current_input_port ();
1706   SCM_VALIDATE_OPINPORT (1, port);
1707
1708   init_read_options (&opts);
1709
1710   c = flush_ws (port, &opts, (char *) NULL);
1711   if (EOF == c)
1712     return SCM_EOF_VAL;
1713   scm_ungetc (c, port);
1714
1715   return (scm_read_expression (port, &opts));
1716 }
1717 #undef FUNC_NAME
1718
1719
1720 \f
1721
1722 /* Manipulate the read-hash-procedures alist.  This could be written in
1723    Scheme, but maybe it will also be used by C code during initialisation.  */
1724 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1725             (SCM chr, SCM proc),
1726             "Install the procedure @var{proc} for reading expressions\n"
1727             "starting with the character sequence @code{#} and @var{chr}.\n"
1728             "@var{proc} will be called with two arguments:  the character\n"
1729             "@var{chr} and the port to read further data from. The object\n"
1730             "returned will be the return value of @code{read}. \n"
1731             "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1732             )
1733 #define FUNC_NAME s_scm_read_hash_extend
1734 {
1735   SCM this;
1736   SCM prev;
1737
1738   SCM_VALIDATE_CHAR (1, chr);
1739   SCM_ASSERT (scm_is_false (proc)
1740               || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1741               proc, SCM_ARG2, FUNC_NAME);
1742
1743   /* Check if chr is already in the alist.  */
1744   this = scm_i_read_hash_procedures_ref ();
1745   prev = SCM_BOOL_F;
1746   while (1)
1747     {
1748       if (scm_is_null (this))
1749         {
1750           /* not found, so add it to the beginning.  */
1751           if (scm_is_true (proc))
1752             {
1753               SCM new = scm_cons (scm_cons (chr, proc),
1754                                   scm_i_read_hash_procedures_ref ());
1755               scm_i_read_hash_procedures_set_x (new);
1756             }
1757           break;
1758         }
1759       if (scm_is_eq (chr, SCM_CAAR (this)))
1760         {
1761           /* already in the alist.  */
1762           if (scm_is_false (proc))
1763             {
1764               /* remove it.  */
1765               if (scm_is_false (prev))
1766                 {
1767                   SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1768                   scm_i_read_hash_procedures_set_x (rest);
1769                 }
1770               else
1771                 scm_set_cdr_x (prev, SCM_CDR (this));
1772             }
1773           else
1774             {
1775               /* replace it.  */
1776               scm_set_cdr_x (SCM_CAR (this), proc);
1777             }
1778           break;
1779         }
1780       prev = this;
1781       this = SCM_CDR (this);
1782     }
1783
1784   return SCM_UNSPECIFIED;
1785 }
1786 #undef FUNC_NAME
1787
1788 /* Recover the read-hash procedure corresponding to char c.  */
1789 static SCM
1790 scm_get_hash_procedure (int c)
1791 {
1792   SCM rest = scm_i_read_hash_procedures_ref ();
1793
1794   while (1)
1795     {
1796       if (scm_is_null (rest))
1797         return SCM_BOOL_F;
1798
1799       if (SCM_CHAR (SCM_CAAR (rest)) == c)
1800         return SCM_CDAR (rest);
1801
1802       rest = SCM_CDR (rest);
1803     }
1804 }
1805
1806 #define SCM_ENCODING_SEARCH_SIZE (500)
1807
1808 /* Search the first few hundred characters of a file for an Emacs-like coding
1809    declaration.  Returns either NULL or a string whose storage has been
1810    allocated with `scm_gc_malloc ()'.  */
1811 char *
1812 scm_i_scan_for_encoding (SCM port)
1813 {
1814   scm_t_port *pt;
1815   char header[SCM_ENCODING_SEARCH_SIZE+1];
1816   size_t bytes_read, encoding_length, i;
1817   char *encoding = NULL;
1818   int utf8_bom = 0;
1819   char *pos, *encoding_start;
1820   int in_comment;
1821
1822   pt = SCM_PTAB_ENTRY (port);
1823
1824   if (pt->rw_active == SCM_PORT_WRITE)
1825     scm_flush (port);
1826
1827   if (pt->rw_random)
1828     pt->rw_active = SCM_PORT_READ;
1829
1830   if (pt->read_pos == pt->read_end)
1831     {
1832       /* We can use the read buffer, and thus avoid a seek. */
1833       if (scm_fill_input (port) == EOF)
1834         return NULL;
1835
1836       bytes_read = pt->read_end - pt->read_pos;
1837       if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
1838         bytes_read = SCM_ENCODING_SEARCH_SIZE;
1839
1840       if (bytes_read <= 1)
1841         /* An unbuffered port -- don't scan.  */
1842         return NULL;
1843
1844       memcpy (header, pt->read_pos, bytes_read);
1845       header[bytes_read] = '\0';
1846     }
1847   else
1848     {
1849       /* Try to read some bytes and then seek back.  Not all ports
1850          support seeking back; and indeed some file ports (like
1851          /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
1852          check performed by SCM_FPORT_FDES---but fail to seek
1853          backwards.  Hence this block comes second.  We prefer to use
1854          the read buffer in-place.  */
1855       if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1856         return NULL;
1857
1858       bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1859       header[bytes_read] = '\0';
1860       scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1861     }
1862
1863   if (bytes_read > 3
1864       && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1865     utf8_bom = 1;
1866
1867   /* search past "coding[:=]" */
1868   pos = header;
1869   while (1)
1870     {
1871       if ((pos = strstr(pos, "coding")) == NULL)
1872         return NULL;
1873
1874       pos += strlen("coding");
1875       if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1876           (*pos == ':' || *pos == '='))
1877         {
1878           pos ++;
1879           break;
1880         }
1881     }
1882
1883   /* skip spaces */
1884   while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1885          (*pos == ' ' || *pos == '\t'))
1886     pos ++;
1887
1888   /* grab the next token */
1889   encoding_start = pos;
1890   i = 0;
1891   while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
1892          && encoding_start + i - header < bytes_read
1893          && (isalnum ((int) encoding_start[i])
1894              || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
1895     i++;
1896
1897   encoding_length = i;
1898   if (encoding_length == 0)
1899     return NULL;
1900
1901   encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
1902   for (i = 0; i < encoding_length; i++)
1903     encoding[i] = toupper ((int) encoding[i]);
1904
1905   /* push backwards to make sure we were in a comment */
1906   in_comment = 0;
1907   pos = encoding_start;
1908   while (pos >= header)
1909     {
1910       if (*pos == ';')
1911         {
1912           in_comment = 1;
1913           break;
1914         }
1915       else if (*pos == '\n' || pos == header)
1916         {
1917           /* This wasn't in a semicolon comment. Check for a
1918            hash-bang comment. */
1919           char *beg = strstr (header, "#!");
1920           char *end = strstr (header, "!#");
1921           if (beg < encoding_start && encoding_start + encoding_length <= end)
1922             in_comment = 1;
1923           break;
1924         }
1925       else
1926         {
1927           pos --;
1928           continue;
1929         }
1930     }
1931   if (!in_comment)
1932     /* This wasn't in a comment */
1933     return NULL;
1934
1935   if (utf8_bom && strcmp(encoding, "UTF-8"))
1936     scm_misc_error (NULL,
1937                     "the port input declares the encoding ~s but is encoded as UTF-8",
1938                     scm_list_1 (scm_from_locale_string (encoding)));
1939
1940   return encoding;
1941 }
1942
1943 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1944             (SCM port),
1945             "Scans the port for an Emacs-like character coding declaration\n"
1946             "near the top of the contents of a port with random-accessible contents.\n"
1947             "The coding declaration is of the form\n"
1948             "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1949             "\n"
1950             "Returns a string containing the character encoding of the file\n"
1951             "if a declaration was found, or @code{#f} otherwise.\n")
1952 #define FUNC_NAME s_scm_file_encoding
1953 {
1954   char *enc;
1955   SCM s_enc;
1956
1957   SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
1958
1959   enc = scm_i_scan_for_encoding (port);
1960   if (enc == NULL)
1961     return SCM_BOOL_F;
1962   else
1963     {
1964       s_enc = scm_from_locale_string (enc);
1965       return s_enc;
1966     }
1967
1968   return SCM_BOOL_F;
1969 }
1970 #undef FUNC_NAME
1971
1972 void
1973 scm_init_read ()
1974 {
1975   SCM read_hash_procs;
1976
1977   read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
1978
1979   scm_i_read_hash_procedures =
1980     SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
1981
1982   scm_init_opts (scm_read_options, scm_read_opts);
1983 #include "libguile/read.x"
1984 }
1985
1986 /*
1987   Local Variables:
1988   c-file-style: "gnu"
1989   End:
1990 */