libguile/read.c

   1 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
   2  *   2007, 2008, 2009, 2010, 2011, 2012, 2014 Free Software Foundation, Inc.
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public License
   6  * as published by the Free Software Foundation; either version 3 of
   7  * the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful, but
  10  * WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  17  * 02110-1301 USA
  18  */
  19
  20
  21 \f
  22
  23 #ifdef HAVE_CONFIG_H
  24 # include <config.h>
  25 #endif
  26
  27 #include <stdio.h>
  28 #include <ctype.h>
  29 #include <string.h>
  30 #include <unistd.h>
  31 #include <unicase.h>
  32 #include <unictype.h>
  33 #include <c-strcase.h>
  34 #include <c-ctype.h>
  35
  36 #include "libguile/_scm.h"
  37 #include "libguile/bytevectors.h"
  38 #include "libguile/chars.h"
  39 #include "libguile/eval.h"
  40 #include "libguile/arrays.h"
  41 #include "libguile/bitvectors.h"
  42 #include "libguile/keywords.h"
  43 #include "libguile/alist.h"
  44 #include "libguile/srcprop.h"
  45 #include "libguile/hashtab.h"
  46 #include "libguile/hash.h"
  47 #include "libguile/ports.h"
  48 #include "libguile/fports.h"
  49 #include "libguile/root.h"
  50 #include "libguile/strings.h"
  51 #include "libguile/strports.h"
  52 #include "libguile/vectors.h"
  53 #include "libguile/validate.h"
  54 #include "libguile/srfi-4.h"
  55 #include "libguile/srfi-13.h"
  56
  57 #include "libguile/read.h"
  58 #include "libguile/private-options.h"
  59
  60
  61 \f
  62
  63 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
  64 SCM_SYMBOL (scm_keyword_prefix, "prefix");
  65 SCM_SYMBOL (scm_keyword_postfix, "postfix");
  66 SCM_SYMBOL (sym_nil, "nil");
  67
  68 /* SRFI-105 curly infix expression support */
  69 SCM_SYMBOL (sym_nfx, "$nfx$");
  70 SCM_SYMBOL (sym_bracket_list, "$bracket-list$");
  71 SCM_SYMBOL (sym_bracket_apply, "$bracket-apply$");
  72
  73 scm_t_option scm_read_opts[] =
  74   {
  75     { SCM_OPTION_BOOLEAN, "copy", 0,
  76       "Copy source code expressions." },
  77     { SCM_OPTION_BOOLEAN, "positions", 1,
  78       "Record positions of source code expressions." },
  79     { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
  80       "Convert symbols to lower case."},
  81     { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
  82       "Style of keyword recognition: #f, 'prefix or 'postfix."},
  83     { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
  84       "Use R6RS variable-length character and string hex escapes."},
  85     { SCM_OPTION_BOOLEAN, "square-brackets", 1,
  86       "Treat `[' and `]' as parentheses, for R6RS compatibility."},
  87     { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
  88       "In strings, consume leading whitespace after an escaped end-of-line."},
  89     { SCM_OPTION_BOOLEAN, "curly-infix", 0,
  90       "Support SRFI-105 curly infix expressions."},
  91     { SCM_OPTION_BOOLEAN, "r7rs-symbols", 0,
  92       "Support R7RS |...| symbol notation."},
  93     { 0, },
  94   };
  95
  96 /* Internal read options structure.  This is initialized by 'scm_read'
  97    from the global and per-port read options, and a pointer is passed
  98    down to all helper functions. */
  99
 100 enum t_keyword_style
 101   {
 102     KEYWORD_STYLE_HASH_PREFIX,
 103     KEYWORD_STYLE_PREFIX,
 104     KEYWORD_STYLE_POSTFIX
 105   };
 106
 107 struct t_read_opts
 108 {
 109   enum t_keyword_style keyword_style;
 110   unsigned int copy_source_p        : 1;
 111   unsigned int record_positions_p   : 1;
 112   unsigned int case_insensitive_p   : 1;
 113   unsigned int r6rs_escapes_p       : 1;
 114   unsigned int square_brackets_p    : 1;
 115   unsigned int hungry_eol_escapes_p : 1;
 116   unsigned int curly_infix_p        : 1;
 117   unsigned int neoteric_p           : 1;
 118   unsigned int r7rs_symbols_p       : 1;
 119 };
 120
 121 typedef struct t_read_opts scm_t_read_opts;
 122
 123
 124 /*
 125   Give meaningful error messages for errors
 126
 127   We use the format
 128
 129   FILE:LINE:COL: MESSAGE
 130   This happened in ....
 131
 132   This is not standard GNU format, but the test-suite likes the real
 133   message to be in front.
 134
 135  */
 136
 137
 138 void
 139 scm_i_input_error (char const *function,
 140                    SCM port, const char *message, SCM arg)
 141 {
 142   SCM fn = (scm_is_string (SCM_FILENAME(port))
 143             ? SCM_FILENAME(port)
 144             : scm_from_locale_string ("#<unknown port>"));
 145
 146   SCM string_port = scm_open_output_string ();
 147   SCM string = SCM_EOL;
 148   scm_simple_format (string_port,
 149                      scm_from_locale_string ("~A:~S:~S: ~A"),
 150                      scm_list_4 (fn,
 151                                  scm_from_long (SCM_LINUM (port) + 1),
 152                                  scm_from_int (SCM_COL (port) + 1),
 153                                  scm_from_locale_string (message)));
 154
 155   string = scm_get_output_string (string_port);
 156   scm_close_output_port (string_port);
 157   scm_error_scm (scm_from_latin1_symbol ("read-error"),
 158                  function? scm_from_locale_string (function) : SCM_BOOL_F,
 159                  string,
 160                  arg,
 161                  SCM_BOOL_F);
 162 }
 163
 164
 165 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
 166             (SCM setting),
 167             "Option interface for the read options. Instead of using\n"
 168             "this procedure directly, use the procedures @code{read-enable},\n"
 169             "@code{read-disable}, @code{read-set!} and @code{read-options}.")
 170 #define FUNC_NAME s_scm_read_options
 171 {
 172   SCM ans = scm_options (setting,
 173                          scm_read_opts,
 174                          FUNC_NAME);
 175   if (SCM_COPY_SOURCE_P)
 176     SCM_RECORD_POSITIONS_P = 1;
 177   return ans;
 178 }
 179 #undef FUNC_NAME
 180
 181 /* A fluid referring to an association list mapping extra hash
 182    characters to procedures.  */
 183 static SCM *scm_i_read_hash_procedures;
 184
 185 static SCM
 186 scm_i_read_hash_procedures_ref (void)
 187 {
 188   return scm_fluid_ref (*scm_i_read_hash_procedures);
 189 }
 190
 191 static void
 192 scm_i_read_hash_procedures_set_x (SCM value)
 193 {
 194   scm_fluid_set_x (*scm_i_read_hash_procedures, value);
 195 }
 196
 197 \f
 198 /* Token readers.  */
 199
 200
 201 /* Size of the C buffer used to read symbols and numbers.  */
 202 #define READER_BUFFER_SIZE            128
 203
 204 /* Number of 32-bit codepoints in the buffer used to read strings.  */
 205 #define READER_STRING_BUFFER_SIZE     128
 206
 207 /* The maximum size of Scheme character names.  */
 208 #define READER_CHAR_NAME_MAX_SIZE      50
 209
 210 /* The maximum size of reader directive names.  */
 211 #define READER_DIRECTIVE_NAME_MAX_SIZE 50
 212
 213
 214 /* `isblank' is only in C99.  */
 215 #define CHAR_IS_BLANK_(_chr)                                    \
 216   (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n')      \
 217    || ((_chr) == '\f') || ((_chr) == '\r'))
 218
 219 #ifdef MSDOS
 220 # define CHAR_IS_BLANK(_chr)                    \
 221   ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
 222 #else
 223 # define CHAR_IS_BLANK CHAR_IS_BLANK_
 224 #endif
 225
 226
 227 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
 228    structure'').  */
 229 #define CHAR_IS_R5RS_DELIMITER(c)                               \
 230   (CHAR_IS_BLANK (c)                                            \
 231    || (c) == ')' || (c) == '(' || (c) == ';' || (c) == '"')
 232
 233 #define CHAR_IS_DELIMITER(c)                                    \
 234   (CHAR_IS_R5RS_DELIMITER (c)                                   \
 235    || (((c) == ']' || (c) == '[') && (opts->square_brackets_p   \
 236                                       || opts->curly_infix_p))  \
 237    || (((c) == '}' || (c) == '{') && opts->curly_infix_p))
 238
 239 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
 240    Structure''.  */
 241 #define CHAR_IS_EXPONENT_MARKER(_chr)                           \
 242   (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f')        \
 243    || ((_chr) == 'd') || ((_chr) == 'l'))
 244
 245 /* Read an SCSH block comment.  */
 246 static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
 247 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
 248 static SCM scm_read_commented_expression (scm_t_wchar, SCM, scm_t_read_opts *);
 249 static SCM scm_read_shebang (scm_t_wchar, SCM, scm_t_read_opts *);
 250 static SCM scm_get_hash_procedure (int);
 251
 252 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 253    result in the pre-allocated buffer BUF.  Return zero if the whole token has
 254    fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
 255    bytes actually read.  */
 256 static int
 257 read_token (SCM port, scm_t_read_opts *opts,
 258             char *buf, size_t buf_size, size_t *read)
 259 {
 260    *read = 0;
 261
 262    while (*read < buf_size)
 263      {
 264        int chr;
 265
 266        chr = scm_get_byte_or_eof (port);
 267
 268        if (chr == EOF)
 269         return 0;
 270       else if (CHAR_IS_DELIMITER (chr))
 271         {
 272           scm_unget_byte (chr, port);
 273           return 0;
 274         }
 275       else
 276         {
 277           *buf = (char) chr;
 278           buf++, (*read)++;
 279         }
 280      }
 281
 282    return 1;
 283  }
 284
 285 /* Like `read_token', but return either BUFFER, or a GC-allocated buffer
 286    if the token doesn't fit in BUFFER_SIZE bytes.  */
 287 static char *
 288 read_complete_token (SCM port, scm_t_read_opts *opts,
 289                      char *buffer, size_t buffer_size, size_t *read)
 290 {
 291   int overflow = 0;
 292   size_t bytes_read, overflow_size = 0;
 293   char *overflow_buffer = NULL;
 294
 295   do
 296     {
 297       overflow = read_token (port, opts, buffer, buffer_size, &bytes_read);
 298       if (bytes_read == 0)
 299         break;
 300       if (overflow || overflow_size != 0)
 301         {
 302           if (overflow_size == 0)
 303             {
 304               overflow_buffer = scm_gc_malloc_pointerless (bytes_read, "read");
 305               memcpy (overflow_buffer, buffer, bytes_read);
 306               overflow_size = bytes_read;
 307             }
 308           else
 309             {
 310               char *new_buf =
 311                 scm_gc_malloc_pointerless (overflow_size + bytes_read, "read");
 312
 313               memcpy (new_buf, overflow_buffer, overflow_size);
 314               memcpy (new_buf + overflow_size, buffer, bytes_read);
 315
 316               overflow_buffer = new_buf;
 317               overflow_size += bytes_read;
 318             }
 319         }
 320     }
 321   while (overflow);
 322
 323   if (overflow_size)
 324     *read = overflow_size;
 325   else
 326     *read = bytes_read;
 327
 328   return (overflow_size > 0 ? overflow_buffer : buffer);
 329 }
 330
 331 /* Skip whitespace from PORT and return the first non-whitespace character
 332    read.  Raise an error on end-of-file.  */
 333 static int
 334 flush_ws (SCM port, scm_t_read_opts *opts, const char *eoferr)
 335 {
 336   scm_t_wchar c;
 337   while (1)
 338     switch (c = scm_getc (port))
 339       {
 340       case EOF:
 341       goteof:
 342         if (eoferr)
 343           {
 344             scm_i_input_error (eoferr,
 345                                port,
 346                                "end of file",
 347                                SCM_EOL);
 348           }
 349         return c;
 350
 351       case ';':
 352       lp:
 353         switch (c = scm_getc (port))
 354           {
 355           case EOF:
 356             goto goteof;
 357           default:
 358             goto lp;
 359           case SCM_LINE_INCREMENTORS:
 360             break;
 361           }
 362         break;
 363
 364       case '#':
 365         switch (c = scm_getc (port))
 366           {
 367           case EOF:
 368             eoferr = "read_sharp";
 369             goto goteof;
 370           case '!':
 371             scm_read_shebang (c, port, opts);
 372             break;
 373           case ';':
 374             scm_read_commented_expression (c, port, opts);
 375             break;
 376           case '|':
 377             if (scm_is_false (scm_get_hash_procedure (c)))
 378               {
 379                 scm_read_r6rs_block_comment (c, port);
 380                 break;
 381               }
 382             /* fall through */
 383           default:
 384             scm_ungetc (c, port);
 385             return '#';
 386           }
 387         break;
 388
 389       case SCM_LINE_INCREMENTORS:
 390       case SCM_SINGLE_SPACES:
 391       case '\t':
 392         break;
 393
 394       default:
 395         return c;
 396       }
 397
 398   return 0;
 399 }
 400
 401
 402 \f
 403 /* Token readers.  */
 404
 405 static SCM scm_read_expression (SCM port, scm_t_read_opts *opts);
 406 static SCM scm_read_sharp (int chr, SCM port, scm_t_read_opts *opts,
 407                            long line, int column);
 408
 409
 410 static SCM
 411 maybe_annotate_source (SCM x, SCM port, scm_t_read_opts *opts,
 412                        long line, int column)
 413 {
 414   if (opts->record_positions_p)
 415     scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
 416   return x;
 417 }
 418
 419 static SCM
 420 scm_read_sexp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 421 #define FUNC_NAME "scm_i_lreadparen"
 422 {
 423   int c;
 424   SCM tmp, tl, ans = SCM_EOL;
 425   const int curly_list_p = (chr == '{') && opts->curly_infix_p;
 426   const int terminating_char = ((chr == '{') ? '}'
 427                                 : ((chr == '[') ? ']'
 428                                    : ')'));
 429
 430   /* Need to capture line and column numbers here. */
 431   long line = SCM_LINUM (port);
 432   int column = SCM_COL (port) - 1;
 433
 434   c = flush_ws (port, opts, FUNC_NAME);
 435   if (terminating_char == c)
 436     return SCM_EOL;
 437
 438   scm_ungetc (c, port);
 439   tmp = scm_read_expression (port, opts);
 440
 441   /* Note that it is possible for scm_read_expression to return
 442      scm_sym_dot, but not as part of a dotted pair: as in #{.}#.  So
 443      check that it's a real dot by checking `c'.  */
 444   if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
 445     {
 446       ans = scm_read_expression (port, opts);
 447       if (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
 448         scm_i_input_error (FUNC_NAME, port, "missing close paren",
 449                            SCM_EOL);
 450       return ans;
 451     }
 452
 453   /* Build the head of the list structure. */
 454   ans = tl = scm_cons (tmp, SCM_EOL);
 455
 456   while (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
 457     {
 458       SCM new_tail;
 459
 460       if (c == ')' || (c == ']' && opts->square_brackets_p)
 461           || ((c == '}' || c == ']') && opts->curly_infix_p))
 462         scm_i_input_error (FUNC_NAME, port,
 463                            "in pair: mismatched close paren: ~A",
 464                            scm_list_1 (SCM_MAKE_CHAR (c)));
 465
 466       scm_ungetc (c, port);
 467       tmp = scm_read_expression (port, opts);
 468
 469       /* See above note about scm_sym_dot.  */
 470       if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
 471         {
 472           SCM_SETCDR (tl, scm_read_expression (port, opts));
 473
 474           c = flush_ws (port, opts, FUNC_NAME);
 475           if (terminating_char != c)
 476             scm_i_input_error (FUNC_NAME, port,
 477                                "in pair: missing close paren", SCM_EOL);
 478           break;
 479         }
 480
 481       new_tail = scm_cons (tmp, SCM_EOL);
 482       SCM_SETCDR (tl, new_tail);
 483       tl = new_tail;
 484     }
 485
 486   if (curly_list_p)
 487     {
 488       /* In addition to finding the length, 'scm_ilength' checks for
 489          improper or circular lists, in which case it returns -1. */
 490       int len = scm_ilength (ans);
 491
 492       /* The (len == 0) case is handled above */
 493       if (len == 1)
 494         /* Return directly to avoid re-annotating the element's source
 495            location with the position of the outer brace.  Also, it
 496            might not be possible to annotate the element. */
 497         return scm_car (ans);  /* {e} => e */
 498       else if (len == 2)
 499         ;  /* Leave the list unchanged: {e1 e2} => (e1 e2) */
 500       else if (len >= 3 && (len & 1))
 501         {
 502           /* It's a proper list whose length is odd and at least 3.  If
 503              the elements at odd indices (the infix operator positions)
 504              are all 'equal?', then it's a simple curly-infix list.
 505              Otherwise it's a mixed curly-infix list. */
 506           SCM op = scm_cadr (ans);
 507
 508           /* Check to see if the elements at odd indices are 'equal?' */
 509           for (tl = scm_cdddr (ans); ; tl = scm_cddr (tl))
 510             {
 511               if (scm_is_null (tl))
 512                 {
 513                   /* Convert simple curly-infix list to prefix:
 514                      {a <op> b <op> ...} => (<op> a b ...) */
 515                   tl = ans;
 516                   while (scm_is_pair (scm_cdr (tl)))
 517                     {
 518                       tmp = scm_cddr (tl);
 519                       SCM_SETCDR (tl, tmp);
 520                       tl = tmp;
 521                     }
 522                   ans = scm_cons (op, ans);
 523                   break;
 524                 }
 525               else if (scm_is_false (scm_equal_p (op, scm_car (tl))))
 526                 {
 527                   /* Mixed curly-infix list: {e ...} => ($nfx$ e ...) */
 528                   ans = scm_cons (sym_nfx, ans);
 529                   break;
 530                 }
 531             }
 532         }
 533       else
 534         /* Mixed curly-infix (possibly improper) list:
 535            {e . tail} => ($nfx$ e . tail) */
 536         ans = scm_cons (sym_nfx, ans);
 537     }
 538
 539   return maybe_annotate_source (ans, port, opts, line, column);
 540 }
 541 #undef FUNC_NAME
 542
 543
 544 /* Read a hexadecimal number NDIGITS in length.  Put its value into the variable
 545    C.  If TERMINATOR is non-null, terminate early if the TERMINATOR character is
 546    found.  */
 547 #define SCM_READ_HEX_ESCAPE(ndigits, terminator)                   \
 548   do                                                               \
 549     {                                                              \
 550       scm_t_wchar a;                                               \
 551       size_t i = 0;                                                \
 552       c = 0;                                                       \
 553       while (i < ndigits)                                          \
 554         {                                                          \
 555           a = scm_getc (port);                                     \
 556           if (a == EOF)                                            \
 557             goto str_eof;                                          \
 558           if (terminator                                           \
 559               && (a == (scm_t_wchar) terminator)                   \
 560               && (i > 0))                                          \
 561             break;                                                 \
 562           if ('0' <= a && a <= '9')                                \
 563             a -= '0';                                              \
 564           else if ('A' <= a && a <= 'F')                           \
 565             a = a - 'A' + 10;                                      \
 566           else if ('a' <= a && a <= 'f')                           \
 567             a = a - 'a' + 10;                                      \
 568           else                                                     \
 569             {                                                      \
 570               c = a;                                               \
 571               goto bad_escaped;                                    \
 572             }                                                      \
 573           c = c * 16 + a;                                          \
 574           i ++;                                                    \
 575         }                                                          \
 576     } while (0)
 577
 578 static void
 579 skip_intraline_whitespace (SCM port)
 580 {
 581   scm_t_wchar c;
 582
 583   do
 584     {
 585       c = scm_getc (port);
 586       if (c == EOF)
 587         return;
 588     }
 589   while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
 590
 591   scm_ungetc (c, port);
 592 }
 593
 594 /* Read either a double-quoted string or an R7RS-style symbol delimited
 595    by vertical lines, depending on the value of 'chr' ('"' or '|').
 596    Regardless, the result is always returned as a string.  */
 597 static SCM
 598 scm_read_string_like_syntax (int chr, SCM port, scm_t_read_opts *opts)
 599 #define FUNC_NAME "scm_lreadr"
 600 {
 601   /* For strings smaller than C_STR, this function creates only one Scheme
 602      object (the string returned).  */
 603
 604   SCM str = SCM_EOL;
 605   size_t c_str_len = 0;
 606   scm_t_wchar c, c_str[READER_STRING_BUFFER_SIZE];
 607
 608   /* Need to capture line and column numbers here. */
 609   long line = SCM_LINUM (port);
 610   int column = SCM_COL (port) - 1;
 611
 612   while (chr != (c = scm_getc (port)))
 613     {
 614       if (c == EOF)
 615         {
 616         str_eof:
 617           scm_i_input_error (FUNC_NAME, port,
 618                              (chr == '|'
 619                               ? "end of file in symbol"
 620                               : "end of file in string constant"),
 621                              SCM_EOL);
 622         }
 623
 624       if (c_str_len + 1 >= READER_STRING_BUFFER_SIZE)
 625         {
 626           str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
 627           c_str_len = 0;
 628         }
 629
 630       if (c == '\\')
 631         {
 632           switch (c = scm_getc (port))
 633             {
 634             case EOF:
 635               goto str_eof;
 636             case '|':
 637             case '\\':
 638               break;
 639             case '\n':
 640               if (opts->hungry_eol_escapes_p)
 641                 skip_intraline_whitespace (port);
 642               continue;
 643             case '0':
 644               c = '\0';
 645               break;
 646             case 'f':
 647               c = '\f';
 648               break;
 649             case 'n':
 650               c = '\n';
 651               break;
 652             case 'r':
 653               c = '\r';
 654               break;
 655             case 't':
 656               c = '\t';
 657               break;
 658             case 'a':
 659               c = '\007';
 660               break;
 661             case 'v':
 662               c = '\v';
 663               break;
 664             case 'b':
 665               c = '\010';
 666               break;
 667             case 'x':
 668               if (opts->r6rs_escapes_p || chr == '|')
 669                 SCM_READ_HEX_ESCAPE (10, ';');
 670               else
 671                 SCM_READ_HEX_ESCAPE (2, '\0');
 672               break;
 673             case 'u':
 674               if (!opts->r6rs_escapes_p)
 675                 {
 676                   SCM_READ_HEX_ESCAPE (4, '\0');
 677                   break;
 678                 }
 679             case 'U':
 680               if (!opts->r6rs_escapes_p)
 681                 {
 682                   SCM_READ_HEX_ESCAPE (6, '\0');
 683                   break;
 684                 }
 685             default:
 686               if (c == chr)
 687                 break;
 688             bad_escaped:
 689               scm_i_input_error (FUNC_NAME, port,
 690                                  "illegal character in escape sequence: ~S",
 691                                  scm_list_1 (SCM_MAKE_CHAR (c)));
 692             }
 693         }
 694
 695       c_str[c_str_len++] = c;
 696     }
 697
 698   if (scm_is_null (str))
 699     /* Fast path: we got a string that fits in C_STR.  */
 700     str = scm_from_utf32_stringn (c_str, c_str_len);
 701   else
 702     {
 703       if (c_str_len > 0)
 704         str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
 705
 706       str = scm_string_concatenate_reverse (str, SCM_UNDEFINED, SCM_UNDEFINED);
 707     }
 708
 709   return maybe_annotate_source (str, port, opts, line, column);
 710 }
 711 #undef FUNC_NAME
 712
 713 static SCM
 714 scm_read_string (int chr, SCM port, scm_t_read_opts *opts)
 715 {
 716   return scm_read_string_like_syntax (chr, port, opts);
 717 }
 718
 719 static SCM
 720 scm_read_r7rs_symbol (int chr, SCM port, scm_t_read_opts *opts)
 721 {
 722   return scm_string_to_symbol (scm_read_string_like_syntax (chr, port, opts));
 723 }
 724
 725 static SCM
 726 scm_read_number (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 727 {
 728   SCM result, str = SCM_EOL;
 729   char local_buffer[READER_BUFFER_SIZE], *buffer;
 730   size_t bytes_read;
 731   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 732
 733   /* Need to capture line and column numbers here. */
 734   long line = SCM_LINUM (port);
 735   int column = SCM_COL (port) - 1;
 736
 737   scm_ungetc (chr, port);
 738   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 739                                 &bytes_read);
 740
 741   str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
 742
 743   result = scm_string_to_number (str, SCM_UNDEFINED);
 744   if (scm_is_false (result))
 745     {
 746       /* Return a symbol instead of a number */
 747       if (opts->case_insensitive_p)
 748         str = scm_string_downcase_x (str);
 749       result = scm_string_to_symbol (str);
 750     }
 751   else if (SCM_NIMP (result))
 752     result = maybe_annotate_source (result, port, opts, line, column);
 753
 754   SCM_COL (port) += scm_i_string_length (str);
 755   return result;
 756 }
 757
 758 static SCM
 759 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 760 {
 761   SCM result;
 762   int ends_with_colon = 0;
 763   size_t bytes_read;
 764   int postfix = (opts->keyword_style == KEYWORD_STYLE_POSTFIX);
 765   char local_buffer[READER_BUFFER_SIZE], *buffer;
 766   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 767   SCM str;
 768
 769   scm_ungetc (chr, port);
 770   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 771                                 &bytes_read);
 772   if (bytes_read > 0)
 773     ends_with_colon = buffer[bytes_read - 1] == ':';
 774
 775   if (postfix && ends_with_colon && (bytes_read > 1))
 776     {
 777       str = scm_from_stringn (buffer, bytes_read - 1,
 778                               pt->encoding, pt->ilseq_handler);
 779
 780       if (opts->case_insensitive_p)
 781         str = scm_string_downcase_x (str);
 782       result = scm_symbol_to_keyword (scm_string_to_symbol (str));
 783     }
 784   else
 785     {
 786       str = scm_from_stringn (buffer, bytes_read,
 787                               pt->encoding, pt->ilseq_handler);
 788
 789       if (opts->case_insensitive_p)
 790         str = scm_string_downcase_x (str);
 791       result = scm_string_to_symbol (str);
 792     }
 793
 794   SCM_COL (port) += scm_i_string_length (str);
 795   return result;
 796 }
 797
 798 static SCM
 799 scm_read_number_and_radix (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 800 #define FUNC_NAME "scm_lreadr"
 801 {
 802   SCM result;
 803   size_t read;
 804   char local_buffer[READER_BUFFER_SIZE], *buffer;
 805   unsigned int radix;
 806   SCM str;
 807   scm_t_port *pt;
 808
 809   switch (chr)
 810     {
 811     case 'B':
 812     case 'b':
 813       radix = 2;
 814       break;
 815
 816     case 'o':
 817     case 'O':
 818       radix = 8;
 819       break;
 820
 821     case 'd':
 822     case 'D':
 823       radix = 10;
 824       break;
 825
 826     case 'x':
 827     case 'X':
 828       radix = 16;
 829       break;
 830
 831     default:
 832       scm_ungetc (chr, port);
 833       scm_ungetc ('#', port);
 834       radix = 10;
 835     }
 836
 837   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 838                                 &read);
 839
 840   pt = SCM_PTAB_ENTRY (port);
 841   str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
 842
 843   result = scm_string_to_number (str, scm_from_uint (radix));
 844
 845   SCM_COL (port) += scm_i_string_length (str);
 846
 847   if (scm_is_true (result))
 848     return result;
 849
 850   scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
 851
 852   return SCM_BOOL_F;
 853 }
 854 #undef FUNC_NAME
 855
 856 static SCM
 857 scm_read_quote (int chr, SCM port, scm_t_read_opts *opts)
 858 {
 859   SCM p;
 860   long line = SCM_LINUM (port);
 861   int column = SCM_COL (port) - 1;
 862
 863   switch (chr)
 864     {
 865     case '`':
 866       p = scm_sym_quasiquote;
 867       break;
 868
 869     case '\'':
 870       p = scm_sym_quote;
 871       break;
 872
 873     case ',':
 874       {
 875         scm_t_wchar c;
 876
 877         c = scm_getc (port);
 878         if ('@' == c)
 879           p = scm_sym_uq_splicing;
 880         else
 881           {
 882             scm_ungetc (c, port);
 883             p = scm_sym_unquote;
 884           }
 885         break;
 886       }
 887
 888     default:
 889       fprintf (stderr, "%s: unhandled quote character (%i)\n",
 890                "scm_read_quote", chr);
 891       abort ();
 892     }
 893
 894   p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
 895   return maybe_annotate_source (p, port, opts, line, column);
 896 }
 897
 898 SCM_SYMBOL (sym_syntax, "syntax");
 899 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
 900 SCM_SYMBOL (sym_unsyntax, "unsyntax");
 901 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
 902
 903 static SCM
 904 scm_read_syntax (int chr, SCM port, scm_t_read_opts *opts)
 905 {
 906   SCM p;
 907   long line = SCM_LINUM (port);
 908   int column = SCM_COL (port) - 1;
 909
 910   switch (chr)
 911     {
 912     case '`':
 913       p = sym_quasisyntax;
 914       break;
 915
 916     case '\'':
 917       p = sym_syntax;
 918       break;
 919
 920     case ',':
 921       {
 922         int c;
 923
 924         c = scm_getc (port);
 925         if ('@' == c)
 926           p = sym_unsyntax_splicing;
 927         else
 928           {
 929             scm_ungetc (c, port);
 930             p = sym_unsyntax;
 931           }
 932         break;
 933       }
 934
 935     default:
 936       fprintf (stderr, "%s: unhandled syntax character (%i)\n",
 937                "scm_read_syntax", chr);
 938       abort ();
 939     }
 940
 941   p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
 942   return maybe_annotate_source (p, port, opts, line, column);
 943 }
 944
 945 static SCM
 946 scm_read_nil (int chr, SCM port, scm_t_read_opts *opts)
 947 {
 948   SCM id = scm_read_mixed_case_symbol (chr, port, opts);
 949
 950   if (!scm_is_eq (id, sym_nil))
 951     scm_i_input_error ("scm_read_nil", port,
 952                        "unexpected input while reading #nil: ~a",
 953                        scm_list_1 (id));
 954
 955   return SCM_ELISP_NIL;
 956 }
 957
 958 static SCM
 959 scm_read_semicolon_comment (int chr, SCM port)
 960 {
 961   int c;
 962
 963   /* We use the get_byte here because there is no need to get the
 964      locale correct with comment input. This presumes that newline
 965      always represents itself no matter what the encoding is.  */
 966   for (c = scm_get_byte_or_eof (port);
 967        (c != EOF) && (c != '\n');
 968        c = scm_get_byte_or_eof (port));
 969
 970   return SCM_UNSPECIFIED;
 971 }
 972
 973 /* If the EXPECTED_CHARS are the next ones available from PORT, then
 974    consume them and return 1.  Otherwise leave the port position where
 975    it was and return 0.  EXPECTED_CHARS should be all lowercase, and
 976    will be matched case-insensitively against the characters read from
 977    PORT. */
 978 static int
 979 try_read_ci_chars (SCM port, const char *expected_chars)
 980 {
 981   int num_chars_wanted = strlen (expected_chars);
 982   int num_chars_read = 0;
 983   char *chars_read = alloca (num_chars_wanted);
 984   int c;
 985
 986   while (num_chars_read < num_chars_wanted)
 987     {
 988       c = scm_getc (port);
 989       if (c == EOF)
 990         break;
 991       else if (c_tolower (c) != expected_chars[num_chars_read])
 992         {
 993           scm_ungetc (c, port);
 994           break;
 995         }
 996       else
 997         chars_read[num_chars_read++] = c;
 998     }
 999
1000   if (num_chars_read == num_chars_wanted)
1001     return 1;
1002   else
1003     {
1004       while (num_chars_read > 0)
1005         scm_ungetc (chars_read[--num_chars_read], port);
1006       return 0;
1007     }
1008 }
1009
1010 \f
1011 /* Sharp readers, i.e. readers called after a `#' sign has been read.  */
1012
1013 static SCM
1014 scm_read_boolean (int chr, SCM port)
1015 {
1016   switch (chr)
1017     {
1018     case 't':
1019     case 'T':
1020       try_read_ci_chars (port, "rue");
1021       return SCM_BOOL_T;
1022
1023     case 'f':
1024     case 'F':
1025       try_read_ci_chars (port, "alse");
1026       return SCM_BOOL_F;
1027     }
1028
1029   return SCM_UNSPECIFIED;
1030 }
1031
1032 static SCM
1033 scm_read_character (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
1034 #define FUNC_NAME "scm_lreadr"
1035 {
1036   char buffer[READER_CHAR_NAME_MAX_SIZE];
1037   SCM charname;
1038   size_t charname_len, bytes_read;
1039   scm_t_wchar cp;
1040   int overflow;
1041   scm_t_port *pt;
1042
1043   overflow = read_token (port, opts, buffer, READER_CHAR_NAME_MAX_SIZE,
1044                          &bytes_read);
1045   if (overflow)
1046     scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
1047
1048   if (bytes_read == 0)
1049     {
1050       chr = scm_getc (port);
1051       if (chr == EOF)
1052         scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
1053                            "while reading character", SCM_EOL);
1054
1055       /* CHR must be a token delimiter, like a whitespace.  */
1056       return (SCM_MAKE_CHAR (chr));
1057     }
1058
1059   pt = SCM_PTAB_ENTRY (port);
1060
1061   /* Simple ASCII characters can be processed immediately.  Also, simple
1062      ISO-8859-1 characters can be processed immediately if the encoding for this
1063      port is ISO-8859-1.  */
1064   if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
1065     {
1066       SCM_COL (port) += 1;
1067       return SCM_MAKE_CHAR (buffer[0]);
1068     }
1069
1070   /* Otherwise, convert the buffer into a proper scheme string for
1071      processing.  */
1072   charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
1073                                pt->ilseq_handler);
1074   charname_len = scm_i_string_length (charname);
1075   SCM_COL (port) += charname_len;
1076   cp = scm_i_string_ref (charname, 0);
1077   if (charname_len == 1)
1078     return SCM_MAKE_CHAR (cp);
1079
1080   /* Ignore dotted circles, which may be used to keep combining characters from
1081      combining with the backslash in #\charname.  */
1082   if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
1083     return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
1084
1085   if (cp >= '0' && cp < '8')
1086     {
1087       /* Dirk:FIXME::  This type of character syntax is not R5RS
1088        * compliant.  Further, it should be verified that the constant
1089        * does only consist of octal digits.  */
1090       SCM p = scm_string_to_number (charname, scm_from_uint (8));
1091       if (SCM_I_INUMP (p))
1092         {
1093           scm_t_wchar c = scm_to_uint32 (p);
1094           if (SCM_IS_UNICODE_CHAR (c))
1095             return SCM_MAKE_CHAR (c);
1096           else
1097             scm_i_input_error (FUNC_NAME, port,
1098                                "out-of-range octal character escape: ~a",
1099                                scm_list_1 (charname));
1100         }
1101     }
1102
1103   if (cp == 'x' && (charname_len > 1))
1104     {
1105       SCM p;
1106
1107       /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1108       p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
1109                                 scm_from_uint (16));
1110       if (SCM_I_INUMP (p))
1111         {
1112           scm_t_wchar c = scm_to_uint32 (p);
1113           if (SCM_IS_UNICODE_CHAR (c))
1114             return SCM_MAKE_CHAR (c);
1115           else
1116             scm_i_input_error (FUNC_NAME, port,
1117                                "out-of-range hex character escape: ~a",
1118                                scm_list_1 (charname));
1119         }
1120     }
1121
1122   /* The names of characters should never have non-Latin1
1123      characters.  */
1124   if (scm_i_is_narrow_string (charname)
1125       || scm_i_try_narrow_string (charname))
1126     { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
1127                                        charname_len);
1128       if (scm_is_true (ch))
1129         return ch;
1130     }
1131
1132   scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
1133                      scm_list_1 (charname));
1134
1135   return SCM_UNSPECIFIED;
1136 }
1137 #undef FUNC_NAME
1138
1139 static SCM
1140 scm_read_keyword (int chr, SCM port, scm_t_read_opts *opts)
1141 {
1142   SCM symbol;
1143
1144   /* Read the symbol that comprises the keyword.  Doing this instead of
1145      invoking a specific symbol reader function allows `scm_read_keyword ()'
1146      to adapt to the delimiters currently valid of symbols.
1147
1148      XXX: This implementation allows sloppy syntaxes like `#:  key'.  */
1149   symbol = scm_read_expression (port, opts);
1150   if (!scm_is_symbol (symbol))
1151     scm_i_input_error ("scm_read_keyword", port,
1152                        "keyword prefix `~a' not followed by a symbol: ~s",
1153                        scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1154
1155   return (scm_symbol_to_keyword (symbol));
1156 }
1157
1158 static SCM
1159 scm_read_vector (int chr, SCM port, scm_t_read_opts *opts,
1160                  long line, int column)
1161 {
1162   /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1163      guarantee that it's going to do what we want.  After all, this is an
1164      implementation detail of `scm_read_vector ()', not a desirable
1165      property.  */
1166   return maybe_annotate_source (scm_vector (scm_read_sexp (chr, port, opts)),
1167                                 port, opts, line, column);
1168 }
1169
1170 /* Helper used by scm_read_array */
1171 static int
1172 read_decimal_integer (SCM port, int c, ssize_t *resp)
1173 {
1174   ssize_t sign = 1;
1175   ssize_t res = 0;
1176   int got_it = 0;
1177
1178   if (c == '-')
1179     {
1180       sign = -1;
1181       c = scm_getc (port);
1182     }
1183
1184   while ('0' <= c && c <= '9')
1185     {
1186       if (((SSIZE_MAX - (c-'0')) / 10) <= res)
1187         scm_i_input_error ("read_decimal_integer", port,
1188                            "number too large", SCM_EOL);
1189       res = 10*res + c-'0';
1190       got_it = 1;
1191       c = scm_getc (port);
1192     }
1193
1194   if (got_it)
1195     *resp = sign * res;
1196   return c;
1197 }
1198
1199 /* Read an array.  This function can also read vectors and uniform
1200    vectors.  Also, the conflict between '#f' and '#f32' and '#f64' is
1201    handled here.
1202
1203    C is the first character read after the '#'. */
1204 static SCM
1205 scm_read_array (int c, SCM port, scm_t_read_opts *opts, long line, int column)
1206 {
1207   ssize_t rank;
1208   scm_t_wchar tag_buf[8];
1209   int tag_len;
1210
1211   SCM tag, shape = SCM_BOOL_F, elements, array;
1212
1213   /* XXX - shortcut for ordinary vectors.  Shouldn't be necessary but
1214      the array code can not deal with zero-length dimensions yet, and
1215      we want to allow zero-length vectors, of course. */
1216   if (c == '(')
1217     return scm_read_vector (c, port, opts, line, column);
1218
1219   /* Disambiguate between '#f' and uniform floating point vectors. */
1220   if (c == 'f')
1221     {
1222       c = scm_getc (port);
1223       if (c != '3' && c != '6')
1224         {
1225           if (c == 'a' && try_read_ci_chars (port, "lse"))
1226             return SCM_BOOL_F;
1227           else if (c != EOF)
1228             scm_ungetc (c, port);
1229           return SCM_BOOL_F;
1230         }
1231       rank = 1;
1232       tag_buf[0] = 'f';
1233       tag_len = 1;
1234       goto continue_reading_tag;
1235     }
1236
1237   /* Read rank. */
1238   rank = 1;
1239   c = read_decimal_integer (port, c, &rank);
1240   if (rank < 0)
1241     scm_i_input_error (NULL, port, "array rank must be non-negative",
1242                        SCM_EOL);
1243
1244   /* Read tag. */
1245   tag_len = 0;
1246  continue_reading_tag:
1247   while (c != EOF && c != '(' && c != '@' && c != ':'
1248          && tag_len < sizeof tag_buf / sizeof tag_buf[0])
1249     {
1250       tag_buf[tag_len++] = c;
1251       c = scm_getc (port);
1252     }
1253   if (tag_len == 0)
1254     tag = SCM_BOOL_T;
1255   else
1256     {
1257       tag = scm_string_to_symbol (scm_from_utf32_stringn (tag_buf, tag_len));
1258       if (tag_len == sizeof tag_buf / sizeof tag_buf[0])
1259         scm_i_input_error (NULL, port, "invalid array tag, starting with: ~a",
1260                            scm_list_1 (tag));
1261     }
1262
1263   /* Read shape. */
1264   if (c == '@' || c == ':')
1265     {
1266       shape = SCM_EOL;
1267
1268       do
1269         {
1270           ssize_t lbnd = 0, len = 0;
1271           SCM s;
1272
1273           if (c == '@')
1274             {
1275               c = scm_getc (port);
1276               c = read_decimal_integer (port, c, &lbnd);
1277             }
1278
1279           s = scm_from_ssize_t (lbnd);
1280
1281           if (c == ':')
1282             {
1283               c = scm_getc (port);
1284               c = read_decimal_integer (port, c, &len);
1285               if (len < 0)
1286                 scm_i_input_error (NULL, port,
1287                                    "array length must be non-negative",
1288                                    SCM_EOL);
1289
1290               s = scm_list_2 (s, scm_from_ssize_t (lbnd+len-1));
1291             }
1292
1293           shape = scm_cons (s, shape);
1294         } while (c == '@' || c == ':');
1295
1296       shape = scm_reverse_x (shape, SCM_EOL);
1297     }
1298
1299   /* Read nested lists of elements. */
1300   if (c != '(')
1301     scm_i_input_error (NULL, port,
1302                        "missing '(' in vector or array literal",
1303                        SCM_EOL);
1304   elements = scm_read_sexp (c, port, opts);
1305
1306   if (scm_is_false (shape))
1307     shape = scm_from_ssize_t (rank);
1308   else if (scm_ilength (shape) != rank)
1309     scm_i_input_error
1310       (NULL, port,
1311        "the number of shape specifications must match the array rank",
1312        SCM_EOL);
1313
1314   /* Handle special print syntax of rank zero arrays; see
1315      scm_i_print_array for a rationale. */
1316   if (rank == 0)
1317     {
1318       if (!scm_is_pair (elements))
1319         scm_i_input_error (NULL, port,
1320                            "too few elements in array literal, need 1",
1321                            SCM_EOL);
1322       if (!scm_is_null (SCM_CDR (elements)))
1323         scm_i_input_error (NULL, port,
1324                            "too many elements in array literal, want 1",
1325                            SCM_EOL);
1326       elements = SCM_CAR (elements);
1327     }
1328
1329   /* Construct array, annotate with source location, and return. */
1330   array = scm_list_to_typed_array (tag, shape, elements);
1331   return maybe_annotate_source (array, port, opts, line, column);
1332 }
1333
1334 static SCM
1335 scm_read_srfi4_vector (int chr, SCM port, scm_t_read_opts *opts,
1336                        long line, int column)
1337 {
1338   return scm_read_array (chr, port, opts, line, column);
1339 }
1340
1341 static SCM
1342 scm_read_bytevector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1343                      long line, int column)
1344 {
1345   chr = scm_getc (port);
1346   if (chr != 'u')
1347     goto syntax;
1348
1349   chr = scm_getc (port);
1350   if (chr != '8')
1351     goto syntax;
1352
1353   chr = scm_getc (port);
1354   if (chr != '(')
1355     goto syntax;
1356
1357   return maybe_annotate_source
1358     (scm_u8_list_to_bytevector (scm_read_sexp (chr, port, opts)),
1359      port, opts, line, column);
1360
1361  syntax:
1362   scm_i_input_error ("read_bytevector", port,
1363                      "invalid bytevector prefix",
1364                      SCM_MAKE_CHAR (chr));
1365   return SCM_UNSPECIFIED;
1366 }
1367
1368 static SCM
1369 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1370                            long line, int column)
1371 {
1372   /* Read the `#*10101'-style read syntax for bit vectors in Guile.  This is
1373      terribly inefficient but who cares?  */
1374   SCM s_bits = SCM_EOL;
1375
1376   for (chr = scm_getc (port);
1377        (chr != EOF) && ((chr == '0') || (chr == '1'));
1378        chr = scm_getc (port))
1379     {
1380       s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1381     }
1382
1383   if (chr != EOF)
1384     scm_ungetc (chr, port);
1385
1386   return maybe_annotate_source
1387     (scm_bitvector (scm_reverse_x (s_bits, SCM_EOL)),
1388      port, opts, line, column);
1389 }
1390
1391 static SCM
1392 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1393 {
1394   int bang_seen = 0;
1395
1396   for (;;)
1397     {
1398       int c = scm_getc (port);
1399
1400       if (c == EOF)
1401         scm_i_input_error ("skip_block_comment", port,
1402                            "unterminated `#! ... !#' comment", SCM_EOL);
1403
1404       if (c == '!')
1405         bang_seen = 1;
1406       else if (c == '#' && bang_seen)
1407         break;
1408       else
1409         bang_seen = 0;
1410     }
1411
1412   return SCM_UNSPECIFIED;
1413 }
1414
1415 static void set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts,
1416                                          int value);
1417 static void set_port_square_brackets_p (SCM port, scm_t_read_opts *opts,
1418                                         int value);
1419 static void set_port_curly_infix_p (SCM port, scm_t_read_opts *opts,
1420                                     int value);
1421
1422 static SCM
1423 scm_read_shebang (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
1424 {
1425   char name[READER_DIRECTIVE_NAME_MAX_SIZE + 1];
1426   int c;
1427   int i = 0;
1428
1429   while (i <= READER_DIRECTIVE_NAME_MAX_SIZE)
1430     {
1431       c = scm_getc (port);
1432       if (c == EOF)
1433         scm_i_input_error ("skip_block_comment", port,
1434                            "unterminated `#! ... !#' comment", SCM_EOL);
1435       else if (('a' <= c && c <= 'z') || ('0' <= c && c <= '9') || c == '-')
1436         name[i++] = c;
1437       else if (CHAR_IS_DELIMITER (c))
1438         {
1439           scm_ungetc (c, port);
1440           name[i] = '\0';
1441           if (0 == strcmp ("r6rs", name))
1442             ;  /* Silently ignore */
1443           else if (0 == strcmp ("fold-case", name))
1444             set_port_case_insensitive_p (port, opts, 1);
1445           else if (0 == strcmp ("no-fold-case", name))
1446             set_port_case_insensitive_p (port, opts, 0);
1447           else if (0 == strcmp ("curly-infix", name))
1448             set_port_curly_infix_p (port, opts, 1);
1449           else if (0 == strcmp ("curly-infix-and-bracket-lists", name))
1450             {
1451               set_port_curly_infix_p (port, opts, 1);
1452               set_port_square_brackets_p (port, opts, 0);
1453             }
1454           else
1455             break;
1456
1457           return SCM_UNSPECIFIED;
1458         }
1459       else
1460         {
1461           scm_ungetc (c, port);
1462           break;
1463         }
1464     }
1465   while (i > 0)
1466     scm_ungetc (name[--i], port);
1467   return scm_read_scsh_block_comment (chr, port);
1468 }
1469
1470 static SCM
1471 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1472 {
1473   /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1474      nested.  So care must be taken.  */
1475   int nesting_level = 1;
1476
1477   int a = scm_getc (port);
1478
1479   if (a == EOF)
1480     scm_i_input_error ("scm_read_r6rs_block_comment", port,
1481                        "unterminated `#| ... |#' comment", SCM_EOL);
1482
1483   while (nesting_level > 0)
1484     {
1485       int b = scm_getc (port);
1486
1487       if (b == EOF)
1488         scm_i_input_error ("scm_read_r6rs_block_comment", port,
1489                            "unterminated `#| ... |#' comment", SCM_EOL);
1490
1491       if (a == '|' && b == '#')
1492         {
1493           nesting_level--;
1494           b = EOF;
1495         }
1496       else if (a == '#' && b == '|')
1497         {
1498           nesting_level++;
1499           b = EOF;
1500         }
1501
1502       a = b;
1503     }
1504
1505   return SCM_UNSPECIFIED;
1506 }
1507
1508 static SCM
1509 scm_read_commented_expression (scm_t_wchar chr, SCM port,
1510                                scm_t_read_opts *opts)
1511 {
1512   scm_t_wchar c;
1513
1514   c = flush_ws (port, opts, (char *) NULL);
1515   if (EOF == c)
1516     scm_i_input_error ("read_commented_expression", port,
1517                        "no expression after #; comment", SCM_EOL);
1518   scm_ungetc (c, port);
1519   scm_read_expression (port, opts);
1520   return SCM_UNSPECIFIED;
1521 }
1522
1523 static SCM
1524 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1525 {
1526   /* Guile's extended symbol read syntax looks like this:
1527
1528        #{This is all a symbol name}#
1529
1530      So here, CHR is expected to be `{'.  */
1531   int saw_brace = 0;
1532   size_t len = 0;
1533   SCM buf = scm_i_make_string (1024, NULL, 0);
1534
1535   buf = scm_i_string_start_writing (buf);
1536
1537   while ((chr = scm_getc (port)) != EOF)
1538     {
1539       if (saw_brace)
1540         {
1541           if (chr == '#')
1542             {
1543               break;
1544             }
1545           else
1546             {
1547               saw_brace = 0;
1548               scm_i_string_set_x (buf, len++, '}');
1549             }
1550         }
1551
1552       if (chr == '}')
1553         saw_brace = 1;
1554       else if (chr == '\\')
1555         {
1556           /* It used to be that print.c would print extended-read-syntax
1557              symbols with backslashes before "non-standard" chars, but
1558              this routine wouldn't do anything with those escapes.
1559              Bummer.  What we've done is to change print.c to output
1560              R6RS hex escapes for those characters, relying on the fact
1561              that the extended read syntax would never put a `\' before
1562              an `x'.  For now, we just ignore other instances of
1563              backslash in the string.  */
1564           switch ((chr = scm_getc (port)))
1565             {
1566             case EOF:
1567               goto done;
1568             case 'x':
1569               {
1570                 scm_t_wchar c;
1571
1572                 SCM_READ_HEX_ESCAPE (10, ';');
1573                 scm_i_string_set_x (buf, len++, c);
1574                 break;
1575
1576               str_eof:
1577                 chr = EOF;
1578                 goto done;
1579
1580               bad_escaped:
1581                 scm_i_string_stop_writing ();
1582                 scm_i_input_error ("scm_read_extended_symbol", port,
1583                                    "illegal character in escape sequence: ~S",
1584                                    scm_list_1 (SCM_MAKE_CHAR (c)));
1585                 break;
1586               }
1587             default:
1588               scm_i_string_set_x (buf, len++, chr);
1589               break;
1590             }
1591         }
1592       else
1593         scm_i_string_set_x (buf, len++, chr);
1594
1595       if (len >= scm_i_string_length (buf) - 2)
1596         {
1597           SCM addy;
1598
1599           scm_i_string_stop_writing ();
1600           addy = scm_i_make_string (1024, NULL, 0);
1601           buf = scm_string_append (scm_list_2 (buf, addy));
1602           len = 0;
1603           buf = scm_i_string_start_writing (buf);
1604         }
1605     }
1606
1607  done:
1608   scm_i_string_stop_writing ();
1609   if (chr == EOF)
1610     scm_i_input_error ("scm_read_extended_symbol", port,
1611                        "end of file while reading symbol", SCM_EOL);
1612
1613   return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1614 }
1615
1616
1617 \f
1618 /* Top-level token readers, i.e., dispatchers.  */
1619
1620 static SCM
1621 scm_read_sharp_extension (int chr, SCM port, scm_t_read_opts *opts)
1622 {
1623   SCM proc;
1624
1625   proc = scm_get_hash_procedure (chr);
1626   if (scm_is_true (scm_procedure_p (proc)))
1627     {
1628       long line = SCM_LINUM (port);
1629       int column = SCM_COL (port) - 2;
1630       SCM got;
1631
1632       got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1633
1634       if (opts->record_positions_p && SCM_NIMP (got)
1635           && !scm_i_has_source_properties (got))
1636         scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1637
1638       return got;
1639     }
1640
1641   return SCM_UNSPECIFIED;
1642 }
1643
1644 /* The reader for the sharp `#' character.  It basically dispatches reads
1645    among the above token readers.   */
1646 static SCM
1647 scm_read_sharp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1648                 long line, int column)
1649 #define FUNC_NAME "scm_lreadr"
1650 {
1651   SCM result;
1652
1653   chr = scm_getc (port);
1654
1655   result = scm_read_sharp_extension (chr, port, opts);
1656   if (!scm_is_eq (result, SCM_UNSPECIFIED))
1657     return result;
1658
1659   switch (chr)
1660     {
1661     case '\\':
1662       return (scm_read_character (chr, port, opts));
1663     case '(':
1664       return (scm_read_vector (chr, port, opts, line, column));
1665     case 's':
1666     case 'u':
1667     case 'f':
1668     case 'c':
1669       /* This one may return either a boolean or an SRFI-4 vector.  */
1670       return (scm_read_srfi4_vector (chr, port, opts, line, column));
1671     case 'v':
1672       return (scm_read_bytevector (chr, port, opts, line, column));
1673     case '*':
1674       return (scm_read_guile_bit_vector (chr, port, opts, line, column));
1675     case 't':
1676     case 'T':
1677     case 'F':
1678       return (scm_read_boolean (chr, port));
1679     case ':':
1680       return (scm_read_keyword (chr, port, opts));
1681     case '0': case '1': case '2': case '3': case '4':
1682     case '5': case '6': case '7': case '8': case '9':
1683     case '@':
1684 #if SCM_ENABLE_DEPRECATED
1685       /* See below for 'i' and 'e'. */
1686     case 'a':
1687     case 'y':
1688     case 'h':
1689     case 'l':
1690 #endif
1691       return (scm_read_array (chr, port, opts, line, column));
1692
1693     case 'i':
1694     case 'e':
1695 #if SCM_ENABLE_DEPRECATED
1696       {
1697         /* When next char is '(', it really is an old-style
1698            uniform array. */
1699         scm_t_wchar next_c = scm_getc (port);
1700         if (next_c != EOF)
1701           scm_ungetc (next_c, port);
1702         if (next_c == '(')
1703           return scm_read_array (chr, port, opts, line, column);
1704         /* Fall through. */
1705       }
1706 #endif
1707     case 'b':
1708     case 'B':
1709     case 'o':
1710     case 'O':
1711     case 'd':
1712     case 'D':
1713     case 'x':
1714     case 'X':
1715     case 'I':
1716     case 'E':
1717       return (scm_read_number_and_radix (chr, port, opts));
1718     case '{':
1719       return (scm_read_extended_symbol (chr, port));
1720     case '!':
1721       return (scm_read_shebang (chr, port, opts));
1722     case ';':
1723       return (scm_read_commented_expression (chr, port, opts));
1724     case '`':
1725     case '\'':
1726     case ',':
1727       return (scm_read_syntax (chr, port, opts));
1728     case 'n':
1729       return (scm_read_nil (chr, port, opts));
1730     default:
1731       result = scm_read_sharp_extension (chr, port, opts);
1732       if (scm_is_eq (result, SCM_UNSPECIFIED))
1733         {
1734           /* To remain compatible with 1.8 and earlier, the following
1735              characters have lower precedence than `read-hash-extend'
1736              characters.  */
1737           switch (chr)
1738             {
1739             case '|':
1740               return scm_read_r6rs_block_comment (chr, port);
1741             default:
1742               scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1743                                  scm_list_1 (SCM_MAKE_CHAR (chr)));
1744             }
1745         }
1746       else
1747         return result;
1748     }
1749
1750   return SCM_UNSPECIFIED;
1751 }
1752 #undef FUNC_NAME
1753
1754 static SCM
1755 read_inner_expression (SCM port, scm_t_read_opts *opts)
1756 #define FUNC_NAME "read_inner_expression"
1757 {
1758   while (1)
1759     {
1760       scm_t_wchar chr;
1761
1762       chr = scm_getc (port);
1763
1764       switch (chr)
1765         {
1766         case SCM_WHITE_SPACES:
1767         case SCM_LINE_INCREMENTORS:
1768           break;
1769         case ';':
1770           (void) scm_read_semicolon_comment (chr, port);
1771           break;
1772         case '{':
1773           if (opts->curly_infix_p)
1774             {
1775               if (opts->neoteric_p)
1776                 return scm_read_sexp (chr, port, opts);
1777               else
1778                 {
1779                   SCM expr;
1780
1781                   /* Enable neoteric expressions within curly braces */
1782                   opts->neoteric_p = 1;
1783                   expr = scm_read_sexp (chr, port, opts);
1784                   opts->neoteric_p = 0;
1785                   return expr;
1786                 }
1787             }
1788           else
1789             return scm_read_mixed_case_symbol (chr, port, opts);
1790         case '[':
1791           if (opts->square_brackets_p)
1792             return scm_read_sexp (chr, port, opts);
1793           else if (opts->curly_infix_p)
1794             {
1795               /* The syntax of neoteric expressions requires that '[' be
1796                  a delimiter when curly-infix is enabled, so it cannot
1797                  be part of an unescaped symbol.  We might as well do
1798                  something useful with it, so we adopt Kawa's convention:
1799                  [...] => ($bracket-list$ ...) */
1800               long line = SCM_LINUM (port);
1801               int column = SCM_COL (port) - 1;
1802               return maybe_annotate_source
1803                 (scm_cons (sym_bracket_list, scm_read_sexp (chr, port, opts)),
1804                  port, opts, line, column);
1805             }
1806           else
1807             return scm_read_mixed_case_symbol (chr, port, opts);
1808         case '(':
1809           return (scm_read_sexp (chr, port, opts));
1810         case '"':
1811           return (scm_read_string (chr, port, opts));
1812         case '|':
1813           if (opts->r7rs_symbols_p)
1814             return scm_read_r7rs_symbol (chr, port, opts);
1815           else
1816             return scm_read_mixed_case_symbol (chr, port, opts);
1817         case '\'':
1818         case '`':
1819         case ',':
1820           return (scm_read_quote (chr, port, opts));
1821         case '#':
1822           {
1823             long line  = SCM_LINUM (port);
1824             int column = SCM_COL (port) - 1;
1825             SCM result = scm_read_sharp (chr, port, opts, line, column);
1826             if (scm_is_eq (result, SCM_UNSPECIFIED))
1827               /* We read a comment or some such.  */
1828               break;
1829             else
1830               return result;
1831           }
1832         case ')':
1833           scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1834           break;
1835         case '}':
1836           if (opts->curly_infix_p)
1837             scm_i_input_error (FUNC_NAME, port, "unexpected \"}\"", SCM_EOL);
1838           else
1839             return scm_read_mixed_case_symbol (chr, port, opts);
1840         case ']':
1841           if (opts->square_brackets_p)
1842             scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1843           /* otherwise fall through */
1844         case EOF:
1845           return SCM_EOF_VAL;
1846         case ':':
1847           if (opts->keyword_style == KEYWORD_STYLE_PREFIX)
1848             return scm_symbol_to_keyword (scm_read_expression (port, opts));
1849           /* Fall through.  */
1850
1851         default:
1852           {
1853             if (((chr >= '0') && (chr <= '9'))
1854                 || (strchr ("+-.", chr)))
1855               return (scm_read_number (chr, port, opts));
1856             else
1857               return (scm_read_mixed_case_symbol (chr, port, opts));
1858           }
1859         }
1860     }
1861 }
1862 #undef FUNC_NAME
1863
1864 static SCM
1865 scm_read_expression (SCM port, scm_t_read_opts *opts)
1866 #define FUNC_NAME "scm_read_expression"
1867 {
1868   if (!opts->neoteric_p)
1869     return read_inner_expression (port, opts);
1870   else
1871     {
1872       long line = 0;
1873       int column = 0;
1874       SCM expr;
1875
1876       if (opts->record_positions_p)
1877         {
1878           /* We need to get the position of the first non-whitespace
1879              character in order to correctly annotate neoteric
1880              expressions.  For example, for the expression 'f(x)', the
1881              first call to 'read_inner_expression' reads the 'f' (which
1882              cannot be annotated), and then we later read the '(x)' and
1883              use it to construct the new list (f x). */
1884           int c = flush_ws (port, opts, (char *) NULL);
1885           if (c == EOF)
1886             return SCM_EOF_VAL;
1887           scm_ungetc (c, port);
1888           line = SCM_LINUM (port);
1889           column = SCM_COL (port);
1890         }
1891
1892       expr = read_inner_expression (port, opts);
1893
1894       /* 'expr' is the first component of the neoteric expression.  Now
1895          we loop, and as long as the next character is '(', '[', or '{',
1896          (without any intervening whitespace), we use it to construct a
1897          new expression.  For example, f{n - 1}(x) => ((f (- n 1)) x). */
1898       for (;;)
1899         {
1900           int chr = scm_getc (port);
1901
1902           if (chr == '(')
1903             /* e(...) => (e ...) */
1904             expr = scm_cons (expr, scm_read_sexp (chr, port, opts));
1905           else if (chr == '[')
1906             /* e[...] => ($bracket-apply$ e ...) */
1907             expr = scm_cons (sym_bracket_apply,
1908                              scm_cons (expr,
1909                                        scm_read_sexp (chr, port, opts)));
1910           else if (chr == '{')
1911             {
1912               SCM arg = scm_read_sexp (chr, port, opts);
1913
1914               if (scm_is_null (arg))
1915                 expr = scm_list_1 (expr);       /* e{} => (e) */
1916               else
1917                 expr = scm_list_2 (expr, arg);  /* e{...} => (e {...}) */
1918             }
1919           else
1920             {
1921               if (chr != EOF)
1922                 scm_ungetc (chr, port);
1923               break;
1924             }
1925           maybe_annotate_source (expr, port, opts, line, column);
1926         }
1927       return expr;
1928     }
1929 }
1930 #undef FUNC_NAME
1931
1932 \f
1933 /* Actual reader.  */
1934
1935 static void init_read_options (SCM port, scm_t_read_opts *opts);
1936
1937 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1938             (SCM port),
1939             "Read an s-expression from the input port @var{port}, or from\n"
1940             "the current input port if @var{port} is not specified.\n"
1941             "Any whitespace before the next token is discarded.")
1942 #define FUNC_NAME s_scm_read
1943 {
1944   scm_t_read_opts opts;
1945   int c;
1946
1947   if (SCM_UNBNDP (port))
1948     port = scm_current_input_port ();
1949   SCM_VALIDATE_OPINPORT (1, port);
1950
1951   init_read_options (port, &opts);
1952
1953   c = flush_ws (port, &opts, (char *) NULL);
1954   if (EOF == c)
1955     return SCM_EOF_VAL;
1956   scm_ungetc (c, port);
1957
1958   return (scm_read_expression (port, &opts));
1959 }
1960 #undef FUNC_NAME
1961
1962
1963 \f
1964
1965 /* Manipulate the read-hash-procedures alist.  This could be written in
1966    Scheme, but maybe it will also be used by C code during initialisation.  */
1967 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1968             (SCM chr, SCM proc),
1969             "Install the procedure @var{proc} for reading expressions\n"
1970             "starting with the character sequence @code{#} and @var{chr}.\n"
1971             "@var{proc} will be called with two arguments:  the character\n"
1972             "@var{chr} and the port to read further data from. The object\n"
1973             "returned will be the return value of @code{read}. \n"
1974             "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1975             )
1976 #define FUNC_NAME s_scm_read_hash_extend
1977 {
1978   SCM this;
1979   SCM prev;
1980
1981   SCM_VALIDATE_CHAR (1, chr);
1982   SCM_ASSERT (scm_is_false (proc)
1983               || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1984               proc, SCM_ARG2, FUNC_NAME);
1985
1986   /* Check if chr is already in the alist.  */
1987   this = scm_i_read_hash_procedures_ref ();
1988   prev = SCM_BOOL_F;
1989   while (1)
1990     {
1991       if (scm_is_null (this))
1992         {
1993           /* not found, so add it to the beginning.  */
1994           if (scm_is_true (proc))
1995             {
1996               SCM new = scm_cons (scm_cons (chr, proc),
1997                                   scm_i_read_hash_procedures_ref ());
1998               scm_i_read_hash_procedures_set_x (new);
1999             }
2000           break;
2001         }
2002       if (scm_is_eq (chr, SCM_CAAR (this)))
2003         {
2004           /* already in the alist.  */
2005           if (scm_is_false (proc))
2006             {
2007               /* remove it.  */
2008               if (scm_is_false (prev))
2009                 {
2010                   SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
2011                   scm_i_read_hash_procedures_set_x (rest);
2012                 }
2013               else
2014                 scm_set_cdr_x (prev, SCM_CDR (this));
2015             }
2016           else
2017             {
2018               /* replace it.  */
2019               scm_set_cdr_x (SCM_CAR (this), proc);
2020             }
2021           break;
2022         }
2023       prev = this;
2024       this = SCM_CDR (this);
2025     }
2026
2027   return SCM_UNSPECIFIED;
2028 }
2029 #undef FUNC_NAME
2030
2031 /* Recover the read-hash procedure corresponding to char c.  */
2032 static SCM
2033 scm_get_hash_procedure (int c)
2034 {
2035   SCM rest = scm_i_read_hash_procedures_ref ();
2036
2037   while (1)
2038     {
2039       if (scm_is_null (rest))
2040         return SCM_BOOL_F;
2041
2042       if (SCM_CHAR (SCM_CAAR (rest)) == c)
2043         return SCM_CDAR (rest);
2044
2045       rest = SCM_CDR (rest);
2046     }
2047 }
2048
2049 /* Maximum size of an encoding name.  This is a bit more than the
2050    longest name listed at
2051    <http://www.iana.org/assignments/character-sets> ("ISO-2022-JP-2", 13
2052    characters.)  */
2053 #define ENCODING_NAME_MAX_SIZE 20
2054
2055 /* Number of bytes at the beginning or end of a file that are scanned
2056    for a "coding:" declaration.  */
2057 #define SCM_ENCODING_SEARCH_SIZE (500 + ENCODING_NAME_MAX_SIZE)
2058
2059
2060 /* Search the SCM_ENCODING_SEARCH_SIZE bytes of a file for an Emacs-like
2061    coding declaration.  Returns either NULL or a string whose storage
2062    has been allocated with `scm_gc_malloc'.  */
2063 char *
2064 scm_i_scan_for_encoding (SCM port)
2065 {
2066   scm_t_port *pt;
2067   char header[SCM_ENCODING_SEARCH_SIZE+1];
2068   size_t bytes_read, encoding_length, i;
2069   char *encoding = NULL;
2070   char *pos, *encoding_start;
2071   int in_comment;
2072
2073   pt = SCM_PTAB_ENTRY (port);
2074
2075   if (pt->rw_active == SCM_PORT_WRITE)
2076     scm_flush (port);
2077
2078   if (pt->rw_random)
2079     pt->rw_active = SCM_PORT_READ;
2080
2081   if (pt->read_pos == pt->read_end)
2082     {
2083       /* We can use the read buffer, and thus avoid a seek. */
2084       if (scm_fill_input (port) == EOF)
2085         return NULL;
2086
2087       bytes_read = pt->read_end - pt->read_pos;
2088       if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
2089         bytes_read = SCM_ENCODING_SEARCH_SIZE;
2090
2091       if (bytes_read <= 1)
2092         /* An unbuffered port -- don't scan.  */
2093         return NULL;
2094
2095       memcpy (header, pt->read_pos, bytes_read);
2096       header[bytes_read] = '\0';
2097     }
2098   else
2099     {
2100       /* Try to read some bytes and then seek back.  Not all ports
2101          support seeking back; and indeed some file ports (like
2102          /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
2103          check performed by SCM_FPORT_FDES---but fail to seek
2104          backwards.  Hence this block comes second.  We prefer to use
2105          the read buffer in-place.  */
2106       if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
2107         return NULL;
2108
2109       bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
2110       header[bytes_read] = '\0';
2111       scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
2112     }
2113
2114   /* search past "coding[:=]" */
2115   pos = header;
2116   while (1)
2117     {
2118       if ((pos = strstr(pos, "coding")) == NULL)
2119         return NULL;
2120
2121       pos += strlen ("coding");
2122       if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
2123           (*pos == ':' || *pos == '='))
2124         {
2125           pos ++;
2126           break;
2127         }
2128     }
2129
2130   /* skip spaces */
2131   while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
2132          (*pos == ' ' || *pos == '\t'))
2133     pos ++;
2134
2135   if (pos - header >= SCM_ENCODING_SEARCH_SIZE - ENCODING_NAME_MAX_SIZE)
2136     /* We found the "coding:" string, but there is probably not enough
2137        room to store an encoding name in its entirety, so ignore it.
2138        This makes sure we do not end up returning a truncated encoding
2139        name.  */
2140     return NULL;
2141
2142   /* grab the next token */
2143   encoding_start = pos;
2144   i = 0;
2145   while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
2146          && encoding_start + i - header < bytes_read
2147          && (isalnum ((int) encoding_start[i])
2148              || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
2149     i++;
2150
2151   encoding_length = i;
2152   if (encoding_length == 0)
2153     return NULL;
2154
2155   encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
2156   for (i = 0; i < encoding_length; i++)
2157     encoding[i] = toupper ((int) encoding[i]);
2158
2159   /* push backwards to make sure we were in a comment */
2160   in_comment = 0;
2161   pos = encoding_start;
2162   while (pos >= header)
2163     {
2164       if (*pos == ';')
2165         {
2166           in_comment = 1;
2167           break;
2168         }
2169       else if (*pos == '\n' || pos == header)
2170         {
2171           /* This wasn't in a semicolon comment. Check for a
2172            hash-bang comment. */
2173           char *beg = strstr (header, "#!");
2174           char *end = strstr (header, "!#");
2175           if (beg < encoding_start && encoding_start + encoding_length <= end)
2176             in_comment = 1;
2177           break;
2178         }
2179       else
2180         {
2181           pos --;
2182           continue;
2183         }
2184     }
2185   if (!in_comment)
2186     /* This wasn't in a comment */
2187     return NULL;
2188
2189   return encoding;
2190 }
2191
2192 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
2193             (SCM port),
2194             "Scans the port for an Emacs-like character coding declaration\n"
2195             "near the top of the contents of a port with random-accessible contents.\n"
2196             "The coding declaration is of the form\n"
2197             "@code{coding: XXXXX} and must appear in a scheme comment.\n"
2198             "\n"
2199             "Returns a string containing the character encoding of the file\n"
2200             "if a declaration was found, or @code{#f} otherwise.\n")
2201 #define FUNC_NAME s_scm_file_encoding
2202 {
2203   char *enc;
2204   SCM s_enc;
2205
2206   SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
2207
2208   enc = scm_i_scan_for_encoding (port);
2209   if (enc == NULL)
2210     return SCM_BOOL_F;
2211   else
2212     {
2213       s_enc = scm_from_locale_string (enc);
2214       return s_enc;
2215     }
2216
2217   return SCM_BOOL_F;
2218 }
2219 #undef FUNC_NAME
2220
2221 \f
2222 /* Per-port read options.
2223
2224    We store per-port read options in the 'port-read-options' port
2225    property, which is stored in the internal port structure.  The value
2226    stored is a single integer that contains a two-bit field for each
2227    read option.
2228
2229    If a bit field contains READ_OPTION_INHERIT (3), that indicates that
2230    the applicable value should be inherited from the corresponding
2231    global read option.  Otherwise, the bit field contains the value of
2232    the read option.  For boolean read options that have been set
2233    per-port, the possible values are 0 or 1.  If the 'keyword_style'
2234    read option has been set per-port, its possible values are those in
2235    'enum t_keyword_style'. */
2236
2237 /* Key to read options in port properties. */
2238 SCM_SYMBOL (sym_port_read_options, "port-read-options");
2239
2240 /* Offsets of bit fields for each per-port override */
2241 #define READ_OPTION_COPY_SOURCE_P          0
2242 #define READ_OPTION_RECORD_POSITIONS_P     2
2243 #define READ_OPTION_CASE_INSENSITIVE_P     4
2244 #define READ_OPTION_KEYWORD_STYLE          6
2245 #define READ_OPTION_R6RS_ESCAPES_P         8
2246 #define READ_OPTION_SQUARE_BRACKETS_P     10
2247 #define READ_OPTION_HUNGRY_EOL_ESCAPES_P  12
2248 #define READ_OPTION_CURLY_INFIX_P         14
2249 #define READ_OPTION_R7RS_SYMBOLS_P        16
2250
2251 /* The total width in bits of the per-port overrides */
2252 #define READ_OPTIONS_NUM_BITS             18
2253
2254 #define READ_OPTIONS_INHERIT_ALL  ((1UL << READ_OPTIONS_NUM_BITS) - 1)
2255 #define READ_OPTIONS_MAX_VALUE    READ_OPTIONS_INHERIT_ALL
2256
2257 #define READ_OPTION_MASK     3
2258 #define READ_OPTION_INHERIT  3
2259
2260 static void
2261 set_port_read_option (SCM port, int option, int new_value)
2262 {
2263   SCM scm_read_options;
2264   unsigned int read_options;
2265
2266   new_value &= READ_OPTION_MASK;
2267   scm_read_options = scm_i_port_property (port, sym_port_read_options);
2268   if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2269     read_options = scm_to_uint (scm_read_options);
2270   else
2271     read_options = READ_OPTIONS_INHERIT_ALL;
2272   read_options &= ~(READ_OPTION_MASK << option);
2273   read_options |= new_value << option;
2274   scm_read_options = scm_from_uint (read_options);
2275   scm_i_set_port_property_x (port, sym_port_read_options, scm_read_options);
2276 }
2277
2278 /* Set OPTS and PORT's case-insensitivity according to VALUE. */
2279 static void
2280 set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts, int value)
2281 {
2282   value = !!value;
2283   opts->case_insensitive_p = value;
2284   set_port_read_option (port, READ_OPTION_CASE_INSENSITIVE_P, value);
2285 }
2286
2287 /* Set OPTS and PORT's square_brackets_p option according to VALUE. */
2288 static void
2289 set_port_square_brackets_p (SCM port, scm_t_read_opts *opts, int value)
2290 {
2291   value = !!value;
2292   opts->square_brackets_p = value;
2293   set_port_read_option (port, READ_OPTION_SQUARE_BRACKETS_P, value);
2294 }
2295
2296 /* Set OPTS and PORT's curly_infix_p option according to VALUE. */
2297 static void
2298 set_port_curly_infix_p (SCM port, scm_t_read_opts *opts, int value)
2299 {
2300   value = !!value;
2301   opts->curly_infix_p = value;
2302   set_port_read_option (port, READ_OPTION_CURLY_INFIX_P, value);
2303 }
2304
2305 /* Initialize OPTS based on PORT's read options and the global read
2306    options. */
2307 static void
2308 init_read_options (SCM port, scm_t_read_opts *opts)
2309 {
2310   SCM val, scm_read_options;
2311   unsigned int read_options, x;
2312
2313   scm_read_options = scm_i_port_property (port, sym_port_read_options);
2314
2315   if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2316     read_options = scm_to_uint (scm_read_options);
2317   else
2318     read_options = READ_OPTIONS_INHERIT_ALL;
2319
2320   x = READ_OPTION_MASK & (read_options >> READ_OPTION_KEYWORD_STYLE);
2321   if (x == READ_OPTION_INHERIT)
2322     {
2323       val = SCM_PACK (SCM_KEYWORD_STYLE);
2324       if (scm_is_eq (val, scm_keyword_prefix))
2325         x = KEYWORD_STYLE_PREFIX;
2326       else if (scm_is_eq (val, scm_keyword_postfix))
2327         x = KEYWORD_STYLE_POSTFIX;
2328       else
2329         x = KEYWORD_STYLE_HASH_PREFIX;
2330     }
2331   opts->keyword_style = x;
2332
2333 #define RESOLVE_BOOLEAN_OPTION(NAME, name)                              \
2334   do                                                                    \
2335     {                                                                   \
2336       x = READ_OPTION_MASK & (read_options >> READ_OPTION_ ## NAME);    \
2337       if (x == READ_OPTION_INHERIT)                                     \
2338         x = !!SCM_ ## NAME;                                             \
2339           opts->name = x;                                               \
2340     }                                                                   \
2341   while (0)
2342
2343   RESOLVE_BOOLEAN_OPTION (COPY_SOURCE_P,        copy_source_p);
2344   RESOLVE_BOOLEAN_OPTION (RECORD_POSITIONS_P,   record_positions_p);
2345   RESOLVE_BOOLEAN_OPTION (CASE_INSENSITIVE_P,   case_insensitive_p);
2346   RESOLVE_BOOLEAN_OPTION (R6RS_ESCAPES_P,       r6rs_escapes_p);
2347   RESOLVE_BOOLEAN_OPTION (SQUARE_BRACKETS_P,    square_brackets_p);
2348   RESOLVE_BOOLEAN_OPTION (HUNGRY_EOL_ESCAPES_P, hungry_eol_escapes_p);
2349   RESOLVE_BOOLEAN_OPTION (CURLY_INFIX_P,        curly_infix_p);
2350   RESOLVE_BOOLEAN_OPTION (R7RS_SYMBOLS_P,       r7rs_symbols_p);
2351
2352 #undef RESOLVE_BOOLEAN_OPTION
2353
2354   opts->neoteric_p = 0;
2355 }
2356
2357 void
2358 scm_init_read ()
2359 {
2360   SCM read_hash_procs;
2361
2362   read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
2363
2364   scm_i_read_hash_procedures =
2365     SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
2366
2367   scm_init_opts (scm_read_options, scm_read_opts);
2368 #include "libguile/read.x"
2369 }
2370
2371 /*
2372   Local Variables:
2373   c-file-style: "gnu"
2374   End:
2375 */