libguile/read.c

   1 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
   2  *   2007, 2008, 2009, 2010, 2011, 2012, 2014 Free Software Foundation, Inc.
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public License
   6  * as published by the Free Software Foundation; either version 3 of
   7  * the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful, but
  10  * WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  17  * 02110-1301 USA
  18  */
  19
  20
  21 \f
  22
  23 #ifdef HAVE_CONFIG_H
  24 # include <config.h>
  25 #endif
  26
  27 #include <stdio.h>
  28 #include <string.h>
  29 #include <unistd.h>
  30 #include <unicase.h>
  31 #include <unictype.h>
  32 #include <c-strcase.h>
  33 #include <c-ctype.h>
  34
  35 #include "libguile/_scm.h"
  36 #include "libguile/bytevectors.h"
  37 #include "libguile/chars.h"
  38 #include "libguile/eval.h"
  39 #include "libguile/arrays.h"
  40 #include "libguile/bitvectors.h"
  41 #include "libguile/keywords.h"
  42 #include "libguile/alist.h"
  43 #include "libguile/srcprop.h"
  44 #include "libguile/hashtab.h"
  45 #include "libguile/hash.h"
  46 #include "libguile/ports.h"
  47 #include "libguile/ports-internal.h"
  48 #include "libguile/fports.h"
  49 #include "libguile/root.h"
  50 #include "libguile/strings.h"
  51 #include "libguile/strports.h"
  52 #include "libguile/vectors.h"
  53 #include "libguile/validate.h"
  54 #include "libguile/srfi-4.h"
  55 #include "libguile/srfi-13.h"
  56
  57 #include "libguile/read.h"
  58 #include "libguile/private-options.h"
  59
  60
  61 \f
  62
  63 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
  64 SCM_SYMBOL (scm_keyword_prefix, "prefix");
  65 SCM_SYMBOL (scm_keyword_postfix, "postfix");
  66 SCM_SYMBOL (sym_nil, "nil");
  67
  68 /* SRFI-105 curly infix expression support */
  69 SCM_SYMBOL (sym_nfx, "$nfx$");
  70 SCM_SYMBOL (sym_bracket_list, "$bracket-list$");
  71 SCM_SYMBOL (sym_bracket_apply, "$bracket-apply$");
  72
  73 scm_t_option scm_read_opts[] =
  74   {
  75     { SCM_OPTION_BOOLEAN, "copy", 0,
  76       "Copy source code expressions." },
  77     { SCM_OPTION_BOOLEAN, "positions", 1,
  78       "Record positions of source code expressions." },
  79     { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
  80       "Convert symbols to lower case."},
  81     { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
  82       "Style of keyword recognition: #f, 'prefix or 'postfix."},
  83     { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
  84       "Use R6RS variable-length character and string hex escapes."},
  85     { SCM_OPTION_BOOLEAN, "square-brackets", 1,
  86       "Treat `[' and `]' as parentheses, for R6RS compatibility."},
  87     { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
  88       "In strings, consume leading whitespace after an escaped end-of-line."},
  89     { SCM_OPTION_BOOLEAN, "curly-infix", 0,
  90       "Support SRFI-105 curly infix expressions."},
  91     { SCM_OPTION_BOOLEAN, "r7rs-symbols", 0,
  92       "Support R7RS |...| symbol notation."},
  93     { 0, },
  94   };
  95
  96 /* Internal read options structure.  This is initialized by 'scm_read'
  97    from the global and per-port read options, and a pointer is passed
  98    down to all helper functions. */
  99
 100 enum t_keyword_style
 101   {
 102     KEYWORD_STYLE_HASH_PREFIX,
 103     KEYWORD_STYLE_PREFIX,
 104     KEYWORD_STYLE_POSTFIX
 105   };
 106
 107 struct t_read_opts
 108 {
 109   enum t_keyword_style keyword_style;
 110   unsigned int copy_source_p        : 1;
 111   unsigned int record_positions_p   : 1;
 112   unsigned int case_insensitive_p   : 1;
 113   unsigned int r6rs_escapes_p       : 1;
 114   unsigned int square_brackets_p    : 1;
 115   unsigned int hungry_eol_escapes_p : 1;
 116   unsigned int curly_infix_p        : 1;
 117   unsigned int neoteric_p           : 1;
 118   unsigned int r7rs_symbols_p       : 1;
 119 };
 120
 121 typedef struct t_read_opts scm_t_read_opts;
 122
 123
 124 /*
 125   Give meaningful error messages for errors
 126
 127   We use the format
 128
 129   FILE:LINE:COL: MESSAGE
 130   This happened in ....
 131
 132   This is not standard GNU format, but the test-suite likes the real
 133   message to be in front.
 134
 135  */
 136
 137
 138 void
 139 scm_i_input_error (char const *function,
 140                    SCM port, const char *message, SCM arg)
 141 {
 142   SCM fn = (scm_is_string (SCM_FILENAME(port))
 143             ? SCM_FILENAME(port)
 144             : scm_from_locale_string ("#<unknown port>"));
 145
 146   SCM string_port = scm_open_output_string ();
 147   SCM string = SCM_EOL;
 148   scm_simple_format (string_port,
 149                      scm_from_locale_string ("~A:~S:~S: ~A"),
 150                      scm_list_4 (fn,
 151                                  scm_from_long (SCM_LINUM (port) + 1),
 152                                  scm_from_int (SCM_COL (port) + 1),
 153                                  scm_from_locale_string (message)));
 154
 155   string = scm_get_output_string (string_port);
 156   scm_close_output_port (string_port);
 157   scm_error_scm (scm_from_latin1_symbol ("read-error"),
 158                  function? scm_from_locale_string (function) : SCM_BOOL_F,
 159                  string,
 160                  arg,
 161                  SCM_BOOL_F);
 162 }
 163
 164
 165 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
 166             (SCM setting),
 167             "Option interface for the read options. Instead of using\n"
 168             "this procedure directly, use the procedures @code{read-enable},\n"
 169             "@code{read-disable}, @code{read-set!} and @code{read-options}.")
 170 #define FUNC_NAME s_scm_read_options
 171 {
 172   SCM ans = scm_options (setting,
 173                          scm_read_opts,
 174                          FUNC_NAME);
 175   if (SCM_COPY_SOURCE_P)
 176     SCM_RECORD_POSITIONS_P = 1;
 177   return ans;
 178 }
 179 #undef FUNC_NAME
 180
 181 /* A fluid referring to an association list mapping extra hash
 182    characters to procedures.  */
 183 static SCM *scm_i_read_hash_procedures;
 184
 185 static SCM
 186 scm_i_read_hash_procedures_ref (void)
 187 {
 188   return scm_fluid_ref (*scm_i_read_hash_procedures);
 189 }
 190
 191 static void
 192 scm_i_read_hash_procedures_set_x (SCM value)
 193 {
 194   scm_fluid_set_x (*scm_i_read_hash_procedures, value);
 195 }
 196
 197 \f
 198 /* Token readers.  */
 199
 200
 201 /* Size of the C buffer used to read symbols and numbers.  */
 202 #define READER_BUFFER_SIZE            128
 203
 204 /* Number of 32-bit codepoints in the buffer used to read strings.  */
 205 #define READER_STRING_BUFFER_SIZE     128
 206
 207 /* The maximum size of Scheme character names.  */
 208 #define READER_CHAR_NAME_MAX_SIZE      50
 209
 210 /* The maximum size of reader directive names.  */
 211 #define READER_DIRECTIVE_NAME_MAX_SIZE 50
 212
 213
 214 /* `isblank' is only in C99.  */
 215 #define CHAR_IS_BLANK_(_chr)                                    \
 216   (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n')      \
 217    || ((_chr) == '\f') || ((_chr) == '\r'))
 218
 219 #ifdef MSDOS
 220 # define CHAR_IS_BLANK(_chr)                    \
 221   ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
 222 #else
 223 # define CHAR_IS_BLANK CHAR_IS_BLANK_
 224 #endif
 225
 226
 227 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
 228    structure'').  */
 229 #define CHAR_IS_R5RS_DELIMITER(c)                               \
 230   (CHAR_IS_BLANK (c)                                            \
 231    || (c) == ')' || (c) == '(' || (c) == ';' || (c) == '"')
 232
 233 #define CHAR_IS_DELIMITER(c)                                    \
 234   (CHAR_IS_R5RS_DELIMITER (c)                                   \
 235    || (((c) == ']' || (c) == '[') && (opts->square_brackets_p   \
 236                                       || opts->curly_infix_p))  \
 237    || (((c) == '}' || (c) == '{') && opts->curly_infix_p))
 238
 239 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
 240    Structure''.  */
 241 #define CHAR_IS_EXPONENT_MARKER(_chr)                           \
 242   (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f')        \
 243    || ((_chr) == 'd') || ((_chr) == 'l'))
 244
 245 /* Read an SCSH block comment.  */
 246 static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
 247 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
 248 static SCM scm_read_commented_expression (scm_t_wchar, SCM, scm_t_read_opts *);
 249 static SCM scm_read_shebang (scm_t_wchar, SCM, scm_t_read_opts *);
 250 static SCM scm_get_hash_procedure (int);
 251
 252 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 253    result in the pre-allocated buffer BUF.  Return zero if the whole token has
 254    fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
 255    bytes actually read.  */
 256 static int
 257 read_token (SCM port, scm_t_read_opts *opts,
 258             char *buf, size_t buf_size, size_t *read)
 259 {
 260    *read = 0;
 261
 262    while (*read < buf_size)
 263      {
 264        int chr;
 265
 266        chr = scm_get_byte_or_eof_unlocked (port);
 267
 268        if (chr == EOF)
 269         return 0;
 270       else if (CHAR_IS_DELIMITER (chr))
 271         {
 272           scm_unget_byte_unlocked (chr, port);
 273           return 0;
 274         }
 275       else
 276         {
 277           *buf = (char) chr;
 278           buf++, (*read)++;
 279         }
 280      }
 281
 282    return 1;
 283  }
 284
 285 /* Like `read_token', but return either BUFFER, or a GC-allocated buffer
 286    if the token doesn't fit in BUFFER_SIZE bytes.  */
 287 static char *
 288 read_complete_token (SCM port, scm_t_read_opts *opts,
 289                      char *buffer, size_t buffer_size, size_t *read)
 290 {
 291   int overflow = 0;
 292   size_t bytes_read, overflow_size = 0;
 293   char *overflow_buffer = NULL;
 294
 295   do
 296     {
 297       overflow = read_token (port, opts, buffer, buffer_size, &bytes_read);
 298       if (bytes_read == 0)
 299         break;
 300       if (overflow || overflow_size != 0)
 301         {
 302           if (overflow_size == 0)
 303             {
 304               overflow_buffer = scm_gc_malloc_pointerless (bytes_read, "read");
 305               memcpy (overflow_buffer, buffer, bytes_read);
 306               overflow_size = bytes_read;
 307             }
 308           else
 309             {
 310               char *new_buf =
 311                 scm_gc_malloc_pointerless (overflow_size + bytes_read, "read");
 312
 313               memcpy (new_buf, overflow_buffer, overflow_size);
 314               memcpy (new_buf + overflow_size, buffer, bytes_read);
 315
 316               overflow_buffer = new_buf;
 317               overflow_size += bytes_read;
 318             }
 319         }
 320     }
 321   while (overflow);
 322
 323   if (overflow_size)
 324     *read = overflow_size;
 325   else
 326     *read = bytes_read;
 327
 328   return (overflow_size > 0 ? overflow_buffer : buffer);
 329 }
 330
 331 /* Skip whitespace from PORT and return the first non-whitespace character
 332    read.  Raise an error on end-of-file.  */
 333 static int
 334 flush_ws (SCM port, scm_t_read_opts *opts, const char *eoferr)
 335 {
 336   scm_t_wchar c;
 337   while (1)
 338     switch (c = scm_getc_unlocked (port))
 339       {
 340       case EOF:
 341       goteof:
 342         if (eoferr)
 343           {
 344             scm_i_input_error (eoferr,
 345                                port,
 346                                "end of file",
 347                                SCM_EOL);
 348           }
 349         return c;
 350
 351       case ';':
 352       lp:
 353         switch (c = scm_getc_unlocked (port))
 354           {
 355           case EOF:
 356             goto goteof;
 357           default:
 358             goto lp;
 359           case SCM_LINE_INCREMENTORS:
 360             break;
 361           }
 362         break;
 363
 364       case '#':
 365         switch (c = scm_getc_unlocked (port))
 366           {
 367           case EOF:
 368             eoferr = "read_sharp";
 369             goto goteof;
 370           case '!':
 371             scm_read_shebang (c, port, opts);
 372             break;
 373           case ';':
 374             scm_read_commented_expression (c, port, opts);
 375             break;
 376           case '|':
 377             if (scm_is_false (scm_get_hash_procedure (c)))
 378               {
 379                 scm_read_r6rs_block_comment (c, port);
 380                 break;
 381               }
 382             /* fall through */
 383           default:
 384             scm_ungetc_unlocked (c, port);
 385             return '#';
 386           }
 387         break;
 388
 389       case SCM_LINE_INCREMENTORS:
 390       case SCM_SINGLE_SPACES:
 391       case '\t':
 392         break;
 393
 394       default:
 395         return c;
 396       }
 397
 398   return 0;
 399 }
 400
 401
 402 \f
 403 /* Token readers.  */
 404
 405 static SCM scm_read_expression (SCM port, scm_t_read_opts *opts);
 406 static SCM scm_read_sharp (int chr, SCM port, scm_t_read_opts *opts,
 407                            long line, int column);
 408
 409
 410 static SCM
 411 maybe_annotate_source (SCM x, SCM port, scm_t_read_opts *opts,
 412                        long line, int column)
 413 {
 414   /* This condition can be caused by a user calling
 415      set-port-column!.  */
 416   if (line < 0 || column < 0)
 417     return x;
 418
 419   if (opts->record_positions_p)
 420     scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
 421   return x;
 422 }
 423
 424 static SCM
 425 scm_read_sexp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 426 #define FUNC_NAME "scm_i_lreadparen"
 427 {
 428   int c;
 429   SCM tmp, tl, ans = SCM_EOL;
 430   const int curly_list_p = (chr == '{') && opts->curly_infix_p;
 431   const int terminating_char = ((chr == '{') ? '}'
 432                                 : ((chr == '[') ? ']'
 433                                    : ')'));
 434
 435   /* Need to capture line and column numbers here. */
 436   long line = SCM_LINUM (port);
 437   int column = SCM_COL (port) - 1;
 438
 439   c = flush_ws (port, opts, FUNC_NAME);
 440   if (terminating_char == c)
 441     return SCM_EOL;
 442
 443   scm_ungetc_unlocked (c, port);
 444   tmp = scm_read_expression (port, opts);
 445
 446   /* Note that it is possible for scm_read_expression to return
 447      scm_sym_dot, but not as part of a dotted pair: as in #{.}#.  So
 448      check that it's a real dot by checking `c'.  */
 449   if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
 450     {
 451       ans = scm_read_expression (port, opts);
 452       if (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
 453         scm_i_input_error (FUNC_NAME, port, "missing close paren",
 454                            SCM_EOL);
 455       return ans;
 456     }
 457
 458   /* Build the head of the list structure. */
 459   ans = tl = scm_cons (tmp, SCM_EOL);
 460
 461   while (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
 462     {
 463       SCM new_tail;
 464
 465       if (c == ')' || (c == ']' && opts->square_brackets_p)
 466           || ((c == '}' || c == ']') && opts->curly_infix_p))
 467         scm_i_input_error (FUNC_NAME, port,
 468                            "in pair: mismatched close paren: ~A",
 469                            scm_list_1 (SCM_MAKE_CHAR (c)));
 470
 471       scm_ungetc_unlocked (c, port);
 472       tmp = scm_read_expression (port, opts);
 473
 474       /* See above note about scm_sym_dot.  */
 475       if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
 476         {
 477           SCM_SETCDR (tl, scm_read_expression (port, opts));
 478
 479           c = flush_ws (port, opts, FUNC_NAME);
 480           if (terminating_char != c)
 481             scm_i_input_error (FUNC_NAME, port,
 482                                "in pair: missing close paren", SCM_EOL);
 483           break;
 484         }
 485
 486       new_tail = scm_cons (tmp, SCM_EOL);
 487       SCM_SETCDR (tl, new_tail);
 488       tl = new_tail;
 489     }
 490
 491   if (curly_list_p)
 492     {
 493       /* In addition to finding the length, 'scm_ilength' checks for
 494          improper or circular lists, in which case it returns -1. */
 495       int len = scm_ilength (ans);
 496
 497       /* The (len == 0) case is handled above */
 498       if (len == 1)
 499         /* Return directly to avoid re-annotating the element's source
 500            location with the position of the outer brace.  Also, it
 501            might not be possible to annotate the element. */
 502         return scm_car (ans);  /* {e} => e */
 503       else if (len == 2)
 504         ;  /* Leave the list unchanged: {e1 e2} => (e1 e2) */
 505       else if (len >= 3 && (len & 1))
 506         {
 507           /* It's a proper list whose length is odd and at least 3.  If
 508              the elements at odd indices (the infix operator positions)
 509              are all 'equal?', then it's a simple curly-infix list.
 510              Otherwise it's a mixed curly-infix list. */
 511           SCM op = scm_cadr (ans);
 512
 513           /* Check to see if the elements at odd indices are 'equal?' */
 514           for (tl = scm_cdddr (ans); ; tl = scm_cddr (tl))
 515             {
 516               if (scm_is_null (tl))
 517                 {
 518                   /* Convert simple curly-infix list to prefix:
 519                      {a <op> b <op> ...} => (<op> a b ...) */
 520                   tl = ans;
 521                   while (scm_is_pair (scm_cdr (tl)))
 522                     {
 523                       tmp = scm_cddr (tl);
 524                       SCM_SETCDR (tl, tmp);
 525                       tl = tmp;
 526                     }
 527                   ans = scm_cons (op, ans);
 528                   break;
 529                 }
 530               else if (scm_is_false (scm_equal_p (op, scm_car (tl))))
 531                 {
 532                   /* Mixed curly-infix list: {e ...} => ($nfx$ e ...) */
 533                   ans = scm_cons (sym_nfx, ans);
 534                   break;
 535                 }
 536             }
 537         }
 538       else
 539         /* Mixed curly-infix (possibly improper) list:
 540            {e . tail} => ($nfx$ e . tail) */
 541         ans = scm_cons (sym_nfx, ans);
 542     }
 543
 544   return maybe_annotate_source (ans, port, opts, line, column);
 545 }
 546 #undef FUNC_NAME
 547
 548
 549 /* Read a hexadecimal number NDIGITS in length.  Put its value into the variable
 550    C.  If TERMINATOR is non-null, terminate early if the TERMINATOR character is
 551    found.  */
 552 #define SCM_READ_HEX_ESCAPE(ndigits, terminator)                   \
 553   do                                                               \
 554     {                                                              \
 555       scm_t_wchar a;                                               \
 556       size_t i = 0;                                                \
 557       c = 0;                                                       \
 558       while (i < ndigits)                                          \
 559         {                                                          \
 560           a = scm_getc_unlocked (port);                                     \
 561           if (a == EOF)                                            \
 562             goto str_eof;                                          \
 563           if (terminator                                           \
 564               && (a == (scm_t_wchar) terminator)                   \
 565               && (i > 0))                                          \
 566             break;                                                 \
 567           if ('0' <= a && a <= '9')                                \
 568             a -= '0';                                              \
 569           else if ('A' <= a && a <= 'F')                           \
 570             a = a - 'A' + 10;                                      \
 571           else if ('a' <= a && a <= 'f')                           \
 572             a = a - 'a' + 10;                                      \
 573           else                                                     \
 574             {                                                      \
 575               c = a;                                               \
 576               goto bad_escaped;                                    \
 577             }                                                      \
 578           c = c * 16 + a;                                          \
 579           i ++;                                                    \
 580         }                                                          \
 581     } while (0)
 582
 583 static void
 584 skip_intraline_whitespace (SCM port)
 585 {
 586   scm_t_wchar c;
 587
 588   do
 589     {
 590       c = scm_getc_unlocked (port);
 591       if (c == EOF)
 592         return;
 593     }
 594   while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
 595
 596   scm_ungetc_unlocked (c, port);
 597 }
 598
 599 /* Read either a double-quoted string or an R7RS-style symbol delimited
 600    by vertical lines, depending on the value of 'chr' ('"' or '|').
 601    Regardless, the result is always returned as a string.  */
 602 static SCM
 603 scm_read_string_like_syntax (int chr, SCM port, scm_t_read_opts *opts)
 604 #define FUNC_NAME "scm_lreadr"
 605 {
 606   /* For strings smaller than C_STR, this function creates only one Scheme
 607      object (the string returned).  */
 608
 609   SCM str = SCM_EOL;
 610   size_t c_str_len = 0;
 611   scm_t_wchar c, c_str[READER_STRING_BUFFER_SIZE];
 612
 613   /* Need to capture line and column numbers here. */
 614   long line = SCM_LINUM (port);
 615   int column = SCM_COL (port) - 1;
 616
 617   while (chr != (c = scm_getc_unlocked (port)))
 618     {
 619       if (c == EOF)
 620         {
 621         str_eof:
 622           scm_i_input_error (FUNC_NAME, port,
 623                              (chr == '|'
 624                               ? "end of file in symbol"
 625                               : "end of file in string constant"),
 626                              SCM_EOL);
 627         }
 628
 629       if (c_str_len + 1 >= READER_STRING_BUFFER_SIZE)
 630         {
 631           str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
 632           c_str_len = 0;
 633         }
 634
 635       if (c == '\\')
 636         {
 637           switch (c = scm_getc_unlocked (port))
 638             {
 639             case EOF:
 640               goto str_eof;
 641             case '|':
 642             case '\\':
 643               break;
 644             case '\n':
 645               if (opts->hungry_eol_escapes_p)
 646                 skip_intraline_whitespace (port);
 647               continue;
 648             case '0':
 649               c = '\0';
 650               break;
 651             case 'f':
 652               c = '\f';
 653               break;
 654             case 'n':
 655               c = '\n';
 656               break;
 657             case 'r':
 658               c = '\r';
 659               break;
 660             case 't':
 661               c = '\t';
 662               break;
 663             case 'a':
 664               c = '\007';
 665               break;
 666             case 'v':
 667               c = '\v';
 668               break;
 669             case 'b':
 670               c = '\010';
 671               break;
 672             case 'x':
 673               if (opts->r6rs_escapes_p || chr == '|')
 674                 SCM_READ_HEX_ESCAPE (10, ';');
 675               else
 676                 SCM_READ_HEX_ESCAPE (2, '\0');
 677               break;
 678             case 'u':
 679               if (!opts->r6rs_escapes_p)
 680                 {
 681                   SCM_READ_HEX_ESCAPE (4, '\0');
 682                   break;
 683                 }
 684             case 'U':
 685               if (!opts->r6rs_escapes_p)
 686                 {
 687                   SCM_READ_HEX_ESCAPE (6, '\0');
 688                   break;
 689                 }
 690             default:
 691               if (c == chr)
 692                 break;
 693             bad_escaped:
 694               scm_i_input_error (FUNC_NAME, port,
 695                                  "illegal character in escape sequence: ~S",
 696                                  scm_list_1 (SCM_MAKE_CHAR (c)));
 697             }
 698         }
 699
 700       c_str[c_str_len++] = c;
 701     }
 702
 703   if (scm_is_null (str))
 704     /* Fast path: we got a string that fits in C_STR.  */
 705     str = scm_from_utf32_stringn (c_str, c_str_len);
 706   else
 707     {
 708       if (c_str_len > 0)
 709         str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
 710
 711       str = scm_string_concatenate_reverse (str, SCM_UNDEFINED, SCM_UNDEFINED);
 712     }
 713
 714   return maybe_annotate_source (str, port, opts, line, column);
 715 }
 716 #undef FUNC_NAME
 717
 718 static SCM
 719 scm_read_string (int chr, SCM port, scm_t_read_opts *opts)
 720 {
 721   return scm_read_string_like_syntax (chr, port, opts);
 722 }
 723
 724 static SCM
 725 scm_read_r7rs_symbol (int chr, SCM port, scm_t_read_opts *opts)
 726 {
 727   return scm_string_to_symbol (scm_read_string_like_syntax (chr, port, opts));
 728 }
 729
 730 static SCM
 731 scm_read_number (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 732 {
 733   SCM result, str = SCM_EOL;
 734   char local_buffer[READER_BUFFER_SIZE], *buffer;
 735   size_t bytes_read;
 736
 737   /* Need to capture line and column numbers here. */
 738   long line = SCM_LINUM (port);
 739   int column = SCM_COL (port) - 1;
 740
 741   scm_ungetc_unlocked (chr, port);
 742   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 743                                 &bytes_read);
 744
 745   str = scm_from_port_stringn (buffer, bytes_read, port);
 746
 747   result = scm_string_to_number (str, SCM_UNDEFINED);
 748   if (scm_is_false (result))
 749     {
 750       /* Return a symbol instead of a number */
 751       if (opts->case_insensitive_p)
 752         str = scm_string_downcase_x (str);
 753       result = scm_string_to_symbol (str);
 754     }
 755   else if (SCM_NIMP (result))
 756     result = maybe_annotate_source (result, port, opts, line, column);
 757
 758   SCM_COL (port) += scm_i_string_length (str);
 759   return result;
 760 }
 761
 762 static SCM
 763 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 764 {
 765   SCM result;
 766   int ends_with_colon = 0;
 767   size_t bytes_read;
 768   int postfix = (opts->keyword_style == KEYWORD_STYLE_POSTFIX);
 769   char local_buffer[READER_BUFFER_SIZE], *buffer;
 770   SCM str;
 771
 772   scm_ungetc_unlocked (chr, port);
 773   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 774                                 &bytes_read);
 775   if (bytes_read > 0)
 776     ends_with_colon = buffer[bytes_read - 1] == ':';
 777
 778   if (postfix && ends_with_colon && (bytes_read > 1))
 779     {
 780       str = scm_from_port_stringn (buffer, bytes_read - 1, port);
 781
 782       if (opts->case_insensitive_p)
 783         str = scm_string_downcase_x (str);
 784       result = scm_symbol_to_keyword (scm_string_to_symbol (str));
 785     }
 786   else
 787     {
 788       str = scm_from_port_stringn (buffer, bytes_read, port);
 789
 790       if (opts->case_insensitive_p)
 791         str = scm_string_downcase_x (str);
 792       result = scm_string_to_symbol (str);
 793     }
 794
 795   SCM_COL (port) += scm_i_string_length (str);
 796   return result;
 797 }
 798
 799 static SCM
 800 scm_read_number_and_radix (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 801 #define FUNC_NAME "scm_lreadr"
 802 {
 803   SCM result;
 804   size_t read;
 805   char local_buffer[READER_BUFFER_SIZE], *buffer;
 806   unsigned int radix;
 807   SCM str;
 808
 809   switch (chr)
 810     {
 811     case 'B':
 812     case 'b':
 813       radix = 2;
 814       break;
 815
 816     case 'o':
 817     case 'O':
 818       radix = 8;
 819       break;
 820
 821     case 'd':
 822     case 'D':
 823       radix = 10;
 824       break;
 825
 826     case 'x':
 827     case 'X':
 828       radix = 16;
 829       break;
 830
 831     default:
 832       scm_ungetc_unlocked (chr, port);
 833       scm_ungetc_unlocked ('#', port);
 834       radix = 10;
 835     }
 836
 837   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 838                                 &read);
 839
 840   str = scm_from_port_stringn (buffer, read, port);
 841
 842   result = scm_string_to_number (str, scm_from_uint (radix));
 843
 844   SCM_COL (port) += scm_i_string_length (str);
 845
 846   if (scm_is_true (result))
 847     return result;
 848
 849   scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
 850
 851   return SCM_BOOL_F;
 852 }
 853 #undef FUNC_NAME
 854
 855 static SCM
 856 scm_read_quote (int chr, SCM port, scm_t_read_opts *opts)
 857 {
 858   SCM p;
 859   long line = SCM_LINUM (port);
 860   int column = SCM_COL (port) - 1;
 861
 862   switch (chr)
 863     {
 864     case '`':
 865       p = scm_sym_quasiquote;
 866       break;
 867
 868     case '\'':
 869       p = scm_sym_quote;
 870       break;
 871
 872     case ',':
 873       {
 874         scm_t_wchar c;
 875
 876         c = scm_getc_unlocked (port);
 877         if ('@' == c)
 878           p = scm_sym_uq_splicing;
 879         else
 880           {
 881             scm_ungetc_unlocked (c, port);
 882             p = scm_sym_unquote;
 883           }
 884         break;
 885       }
 886
 887     default:
 888       fprintf (stderr, "%s: unhandled quote character (%i)\n",
 889                "scm_read_quote", chr);
 890       abort ();
 891     }
 892
 893   p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
 894   return maybe_annotate_source (p, port, opts, line, column);
 895 }
 896
 897 SCM_SYMBOL (sym_syntax, "syntax");
 898 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
 899 SCM_SYMBOL (sym_unsyntax, "unsyntax");
 900 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
 901
 902 static SCM
 903 scm_read_syntax (int chr, SCM port, scm_t_read_opts *opts)
 904 {
 905   SCM p;
 906   long line = SCM_LINUM (port);
 907   int column = SCM_COL (port) - 1;
 908
 909   switch (chr)
 910     {
 911     case '`':
 912       p = sym_quasisyntax;
 913       break;
 914
 915     case '\'':
 916       p = sym_syntax;
 917       break;
 918
 919     case ',':
 920       {
 921         int c;
 922
 923         c = scm_getc_unlocked (port);
 924         if ('@' == c)
 925           p = sym_unsyntax_splicing;
 926         else
 927           {
 928             scm_ungetc_unlocked (c, port);
 929             p = sym_unsyntax;
 930           }
 931         break;
 932       }
 933
 934     default:
 935       fprintf (stderr, "%s: unhandled syntax character (%i)\n",
 936                "scm_read_syntax", chr);
 937       abort ();
 938     }
 939
 940   p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
 941   return maybe_annotate_source (p, port, opts, line, column);
 942 }
 943
 944 static SCM
 945 scm_read_nil (int chr, SCM port, scm_t_read_opts *opts)
 946 {
 947   SCM id = scm_read_mixed_case_symbol (chr, port, opts);
 948
 949   if (!scm_is_eq (id, sym_nil))
 950     scm_i_input_error ("scm_read_nil", port,
 951                        "unexpected input while reading #nil: ~a",
 952                        scm_list_1 (id));
 953
 954   return SCM_ELISP_NIL;
 955 }
 956
 957 static SCM
 958 scm_read_semicolon_comment (int chr, SCM port)
 959 {
 960   int c;
 961
 962   /* We use the get_byte here because there is no need to get the
 963      locale correct with comment input. This presumes that newline
 964      always represents itself no matter what the encoding is.  */
 965   for (c = scm_get_byte_or_eof_unlocked (port);
 966        (c != EOF) && (c != '\n');
 967        c = scm_get_byte_or_eof_unlocked (port));
 968
 969   return SCM_UNSPECIFIED;
 970 }
 971
 972 /* If the EXPECTED_CHARS are the next ones available from PORT, then
 973    consume them and return 1.  Otherwise leave the port position where
 974    it was and return 0.  EXPECTED_CHARS should be all lowercase, and
 975    will be matched case-insensitively against the characters read from
 976    PORT. */
 977 static int
 978 try_read_ci_chars (SCM port, const char *expected_chars)
 979 {
 980   int num_chars_wanted = strlen (expected_chars);
 981   int num_chars_read = 0;
 982   char *chars_read = alloca (num_chars_wanted);
 983   int c;
 984
 985   while (num_chars_read < num_chars_wanted)
 986     {
 987       c = scm_getc_unlocked (port);
 988       if (c == EOF)
 989         break;
 990       else if (c_tolower (c) != expected_chars[num_chars_read])
 991         {
 992           scm_ungetc_unlocked (c, port);
 993           break;
 994         }
 995       else
 996         chars_read[num_chars_read++] = c;
 997     }
 998
 999   if (num_chars_read == num_chars_wanted)
1000     return 1;
1001   else
1002     {
1003       while (num_chars_read > 0)
1004         scm_ungetc_unlocked (chars_read[--num_chars_read], port);
1005       return 0;
1006     }
1007 }
1008
1009 \f
1010 /* Sharp readers, i.e. readers called after a `#' sign has been read.  */
1011
1012 static SCM
1013 scm_read_boolean (int chr, SCM port)
1014 {
1015   switch (chr)
1016     {
1017     case 't':
1018     case 'T':
1019       try_read_ci_chars (port, "rue");
1020       return SCM_BOOL_T;
1021
1022     case 'f':
1023     case 'F':
1024       try_read_ci_chars (port, "alse");
1025       return SCM_BOOL_F;
1026     }
1027
1028   return SCM_UNSPECIFIED;
1029 }
1030
1031 static SCM
1032 scm_read_character (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
1033 #define FUNC_NAME "scm_lreadr"
1034 {
1035   char buffer[READER_CHAR_NAME_MAX_SIZE];
1036   SCM charname;
1037   size_t charname_len, bytes_read;
1038   scm_t_wchar cp;
1039   int overflow;
1040   scm_t_port_internal *pti;
1041
1042   overflow = read_token (port, opts, buffer, READER_CHAR_NAME_MAX_SIZE,
1043                          &bytes_read);
1044   if (overflow)
1045     scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
1046
1047   if (bytes_read == 0)
1048     {
1049       chr = scm_getc_unlocked (port);
1050       if (chr == EOF)
1051         scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
1052                            "while reading character", SCM_EOL);
1053
1054       /* CHR must be a token delimiter, like a whitespace.  */
1055       return (SCM_MAKE_CHAR (chr));
1056     }
1057
1058   pti = SCM_PORT_GET_INTERNAL (port);
1059
1060   /* Simple ASCII characters can be processed immediately.  Also, simple
1061      ISO-8859-1 characters can be processed immediately if the encoding for this
1062      port is ISO-8859-1.  */
1063   if (bytes_read == 1 &&
1064       ((unsigned char) buffer[0] <= 127
1065        || pti->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1))
1066     {
1067       SCM_COL (port) += 1;
1068       return SCM_MAKE_CHAR (buffer[0]);
1069     }
1070
1071   /* Otherwise, convert the buffer into a proper scheme string for
1072      processing.  */
1073   charname = scm_from_port_stringn (buffer, bytes_read, port);
1074   charname_len = scm_i_string_length (charname);
1075   SCM_COL (port) += charname_len;
1076   cp = scm_i_string_ref (charname, 0);
1077   if (charname_len == 1)
1078     return SCM_MAKE_CHAR (cp);
1079
1080   /* Ignore dotted circles, which may be used to keep combining characters from
1081      combining with the backslash in #\charname.  */
1082   if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
1083     return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
1084
1085   if (cp >= '0' && cp < '8')
1086     {
1087       /* Dirk:FIXME::  This type of character syntax is not R5RS
1088        * compliant.  Further, it should be verified that the constant
1089        * does only consist of octal digits.  */
1090       SCM p = scm_string_to_number (charname, scm_from_uint (8));
1091       if (SCM_I_INUMP (p))
1092         {
1093           scm_t_wchar c = scm_to_uint32 (p);
1094           if (SCM_IS_UNICODE_CHAR (c))
1095             return SCM_MAKE_CHAR (c);
1096           else
1097             scm_i_input_error (FUNC_NAME, port,
1098                                "out-of-range octal character escape: ~a",
1099                                scm_list_1 (charname));
1100         }
1101     }
1102
1103   if (cp == 'x' && (charname_len > 1))
1104     {
1105       SCM p;
1106
1107       /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1108       p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
1109                                 scm_from_uint (16));
1110       if (SCM_I_INUMP (p))
1111         {
1112           scm_t_wchar c = scm_to_uint32 (p);
1113           if (SCM_IS_UNICODE_CHAR (c))
1114             return SCM_MAKE_CHAR (c);
1115           else
1116             scm_i_input_error (FUNC_NAME, port,
1117                                "out-of-range hex character escape: ~a",
1118                                scm_list_1 (charname));
1119         }
1120     }
1121
1122   /* The names of characters should never have non-Latin1
1123      characters.  */
1124   if (scm_i_is_narrow_string (charname)
1125       || scm_i_try_narrow_string (charname))
1126     { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
1127                                        charname_len);
1128       if (scm_is_true (ch))
1129         return ch;
1130     }
1131
1132   scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
1133                      scm_list_1 (charname));
1134
1135   return SCM_UNSPECIFIED;
1136 }
1137 #undef FUNC_NAME
1138
1139 static SCM
1140 scm_read_keyword (int chr, SCM port, scm_t_read_opts *opts)
1141 {
1142   SCM symbol;
1143
1144   /* Read the symbol that comprises the keyword.  Doing this instead of
1145      invoking a specific symbol reader function allows `scm_read_keyword ()'
1146      to adapt to the delimiters currently valid of symbols.
1147
1148      XXX: This implementation allows sloppy syntaxes like `#:  key'.  */
1149   symbol = scm_read_expression (port, opts);
1150   if (!scm_is_symbol (symbol))
1151     scm_i_input_error ("scm_read_keyword", port,
1152                        "keyword prefix `~a' not followed by a symbol: ~s",
1153                        scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1154
1155   return (scm_symbol_to_keyword (symbol));
1156 }
1157
1158 static SCM
1159 scm_read_vector (int chr, SCM port, scm_t_read_opts *opts,
1160                  long line, int column)
1161 {
1162   /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1163      guarantee that it's going to do what we want.  After all, this is an
1164      implementation detail of `scm_read_vector ()', not a desirable
1165      property.  */
1166   return maybe_annotate_source (scm_vector (scm_read_sexp (chr, port, opts)),
1167                                 port, opts, line, column);
1168 }
1169
1170 /* Helper used by scm_read_array */
1171 static int
1172 read_decimal_integer (SCM port, int c, ssize_t *resp)
1173 {
1174   ssize_t sign = 1;
1175   ssize_t res = 0;
1176   int got_it = 0;
1177
1178   if (c == '-')
1179     {
1180       sign = -1;
1181       c = scm_getc_unlocked (port);
1182     }
1183
1184   while ('0' <= c && c <= '9')
1185     {
1186       if (((SSIZE_MAX - (c-'0')) / 10) <= res)
1187         scm_i_input_error ("read_decimal_integer", port,
1188                            "number too large", SCM_EOL);
1189       res = 10*res + c-'0';
1190       got_it = 1;
1191       c = scm_getc_unlocked (port);
1192     }
1193
1194   if (got_it)
1195     *resp = sign * res;
1196   return c;
1197 }
1198
1199 /* Read an array.  This function can also read vectors and uniform
1200    vectors.  Also, the conflict between '#f' and '#f32' and '#f64' is
1201    handled here.
1202
1203    C is the first character read after the '#'. */
1204 static SCM
1205 scm_read_array (int c, SCM port, scm_t_read_opts *opts, long line, int column)
1206 {
1207   ssize_t rank;
1208   scm_t_wchar tag_buf[8];
1209   int tag_len;
1210
1211   SCM tag, shape = SCM_BOOL_F, elements, array;
1212
1213   /* XXX - shortcut for ordinary vectors.  Shouldn't be necessary but
1214      the array code can not deal with zero-length dimensions yet, and
1215      we want to allow zero-length vectors, of course. */
1216   if (c == '(')
1217     return scm_read_vector (c, port, opts, line, column);
1218
1219   /* Disambiguate between '#f' and uniform floating point vectors. */
1220   if (c == 'f')
1221     {
1222       c = scm_getc_unlocked (port);
1223       if (c != '3' && c != '6')
1224         {
1225           if (c == 'a' && try_read_ci_chars (port, "lse"))
1226             return SCM_BOOL_F;
1227           else if (c != EOF)
1228             scm_ungetc_unlocked (c, port);
1229           return SCM_BOOL_F;
1230         }
1231       rank = 1;
1232       tag_buf[0] = 'f';
1233       tag_len = 1;
1234       goto continue_reading_tag;
1235     }
1236
1237   /* Read rank. */
1238   rank = 1;
1239   c = read_decimal_integer (port, c, &rank);
1240   if (rank < 0)
1241     scm_i_input_error (NULL, port, "array rank must be non-negative",
1242                        SCM_EOL);
1243
1244   /* Read tag. */
1245   tag_len = 0;
1246  continue_reading_tag:
1247   while (c != EOF && c != '(' && c != '@' && c != ':'
1248          && tag_len < sizeof tag_buf / sizeof tag_buf[0])
1249     {
1250       tag_buf[tag_len++] = c;
1251       c = scm_getc_unlocked (port);
1252     }
1253   if (tag_len == 0)
1254     tag = SCM_BOOL_T;
1255   else
1256     {
1257       tag = scm_string_to_symbol (scm_from_utf32_stringn (tag_buf, tag_len));
1258       if (tag_len == sizeof tag_buf / sizeof tag_buf[0])
1259         scm_i_input_error (NULL, port, "invalid array tag, starting with: ~a",
1260                            scm_list_1 (tag));
1261     }
1262
1263   /* Read shape. */
1264   if (c == '@' || c == ':')
1265     {
1266       shape = SCM_EOL;
1267
1268       do
1269         {
1270           ssize_t lbnd = 0, len = 0;
1271           SCM s;
1272
1273           if (c == '@')
1274             {
1275               c = scm_getc_unlocked (port);
1276               c = read_decimal_integer (port, c, &lbnd);
1277             }
1278
1279           s = scm_from_ssize_t (lbnd);
1280
1281           if (c == ':')
1282             {
1283               c = scm_getc_unlocked (port);
1284               c = read_decimal_integer (port, c, &len);
1285               if (len < 0)
1286                 scm_i_input_error (NULL, port,
1287                                    "array length must be non-negative",
1288                                    SCM_EOL);
1289
1290               s = scm_list_2 (s, scm_from_ssize_t (lbnd+len-1));
1291             }
1292
1293           shape = scm_cons (s, shape);
1294         } while (c == '@' || c == ':');
1295
1296       shape = scm_reverse_x (shape, SCM_EOL);
1297     }
1298
1299   /* Read nested lists of elements. */
1300   if (c != '(')
1301     scm_i_input_error (NULL, port,
1302                        "missing '(' in vector or array literal",
1303                        SCM_EOL);
1304   elements = scm_read_sexp (c, port, opts);
1305
1306   if (scm_is_false (shape))
1307     shape = scm_from_ssize_t (rank);
1308   else if (scm_ilength (shape) != rank)
1309     scm_i_input_error
1310       (NULL, port,
1311        "the number of shape specifications must match the array rank",
1312        SCM_EOL);
1313
1314   /* Handle special print syntax of rank zero arrays; see
1315      scm_i_print_array for a rationale. */
1316   if (rank == 0)
1317     {
1318       if (!scm_is_pair (elements))
1319         scm_i_input_error (NULL, port,
1320                            "too few elements in array literal, need 1",
1321                            SCM_EOL);
1322       if (!scm_is_null (SCM_CDR (elements)))
1323         scm_i_input_error (NULL, port,
1324                            "too many elements in array literal, want 1",
1325                            SCM_EOL);
1326       elements = SCM_CAR (elements);
1327     }
1328
1329   /* Construct array, annotate with source location, and return. */
1330   array = scm_list_to_typed_array (tag, shape, elements);
1331   return maybe_annotate_source (array, port, opts, line, column);
1332 }
1333
1334 static SCM
1335 scm_read_srfi4_vector (int chr, SCM port, scm_t_read_opts *opts,
1336                        long line, int column)
1337 {
1338   return scm_read_array (chr, port, opts, line, column);
1339 }
1340
1341 static SCM
1342 scm_read_bytevector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1343                      long line, int column)
1344 {
1345   chr = scm_getc_unlocked (port);
1346   if (chr != 'u')
1347     goto syntax;
1348
1349   chr = scm_getc_unlocked (port);
1350   if (chr != '8')
1351     goto syntax;
1352
1353   chr = scm_getc_unlocked (port);
1354   if (chr != '(')
1355     goto syntax;
1356
1357   return maybe_annotate_source
1358     (scm_u8_list_to_bytevector (scm_read_sexp (chr, port, opts)),
1359      port, opts, line, column);
1360
1361  syntax:
1362   scm_i_input_error ("read_bytevector", port,
1363                      "invalid bytevector prefix",
1364                      SCM_MAKE_CHAR (chr));
1365   return SCM_UNSPECIFIED;
1366 }
1367
1368 static SCM
1369 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1370                            long line, int column)
1371 {
1372   /* Read the `#*10101'-style read syntax for bit vectors in Guile.  This is
1373      terribly inefficient but who cares?  */
1374   SCM s_bits = SCM_EOL;
1375
1376   for (chr = scm_getc_unlocked (port);
1377        (chr != EOF) && ((chr == '0') || (chr == '1'));
1378        chr = scm_getc_unlocked (port))
1379     {
1380       s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1381     }
1382
1383   if (chr != EOF)
1384     scm_ungetc_unlocked (chr, port);
1385
1386   return maybe_annotate_source
1387     (scm_bitvector (scm_reverse_x (s_bits, SCM_EOL)),
1388      port, opts, line, column);
1389 }
1390
1391 static SCM
1392 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1393 {
1394   int bang_seen = 0;
1395
1396   for (;;)
1397     {
1398       int c = scm_getc_unlocked (port);
1399
1400       if (c == EOF)
1401         scm_i_input_error ("skip_block_comment", port,
1402                            "unterminated `#! ... !#' comment", SCM_EOL);
1403
1404       if (c == '!')
1405         bang_seen = 1;
1406       else if (c == '#' && bang_seen)
1407         break;
1408       else
1409         bang_seen = 0;
1410     }
1411
1412   return SCM_UNSPECIFIED;
1413 }
1414
1415 static void set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts,
1416                                          int value);
1417 static void set_port_square_brackets_p (SCM port, scm_t_read_opts *opts,
1418                                         int value);
1419 static void set_port_curly_infix_p (SCM port, scm_t_read_opts *opts,
1420                                     int value);
1421
1422 static SCM
1423 scm_read_shebang (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
1424 {
1425   char name[READER_DIRECTIVE_NAME_MAX_SIZE + 1];
1426   int c;
1427   int i = 0;
1428
1429   while (i <= READER_DIRECTIVE_NAME_MAX_SIZE)
1430     {
1431       c = scm_getc_unlocked (port);
1432       if (c == EOF)
1433         scm_i_input_error ("skip_block_comment", port,
1434                            "unterminated `#! ... !#' comment", SCM_EOL);
1435       else if (('a' <= c && c <= 'z') || ('0' <= c && c <= '9') || c == '-')
1436         name[i++] = c;
1437       else if (CHAR_IS_DELIMITER (c))
1438         {
1439           scm_ungetc_unlocked (c, port);
1440           name[i] = '\0';
1441           if (0 == strcmp ("r6rs", name))
1442             ;  /* Silently ignore */
1443           else if (0 == strcmp ("fold-case", name))
1444             set_port_case_insensitive_p (port, opts, 1);
1445           else if (0 == strcmp ("no-fold-case", name))
1446             set_port_case_insensitive_p (port, opts, 0);
1447           else if (0 == strcmp ("curly-infix", name))
1448             set_port_curly_infix_p (port, opts, 1);
1449           else if (0 == strcmp ("curly-infix-and-bracket-lists", name))
1450             {
1451               set_port_curly_infix_p (port, opts, 1);
1452               set_port_square_brackets_p (port, opts, 0);
1453             }
1454           else
1455             break;
1456
1457           return SCM_UNSPECIFIED;
1458         }
1459       else
1460         {
1461           scm_ungetc_unlocked (c, port);
1462           break;
1463         }
1464     }
1465   while (i > 0)
1466     scm_ungetc_unlocked (name[--i], port);
1467   return scm_read_scsh_block_comment (chr, port);
1468 }
1469
1470 static SCM
1471 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1472 {
1473   /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1474      nested.  So care must be taken.  */
1475   int nesting_level = 1;
1476
1477   int a = scm_getc_unlocked (port);
1478
1479   if (a == EOF)
1480     scm_i_input_error ("scm_read_r6rs_block_comment", port,
1481                        "unterminated `#| ... |#' comment", SCM_EOL);
1482
1483   while (nesting_level > 0)
1484     {
1485       int b = scm_getc_unlocked (port);
1486
1487       if (b == EOF)
1488         scm_i_input_error ("scm_read_r6rs_block_comment", port,
1489                            "unterminated `#| ... |#' comment", SCM_EOL);
1490
1491       if (a == '|' && b == '#')
1492         {
1493           nesting_level--;
1494           b = EOF;
1495         }
1496       else if (a == '#' && b == '|')
1497         {
1498           nesting_level++;
1499           b = EOF;
1500         }
1501
1502       a = b;
1503     }
1504
1505   return SCM_UNSPECIFIED;
1506 }
1507
1508 static SCM
1509 scm_read_commented_expression (scm_t_wchar chr, SCM port,
1510                                scm_t_read_opts *opts)
1511 {
1512   scm_t_wchar c;
1513
1514   c = flush_ws (port, opts, (char *) NULL);
1515   if (EOF == c)
1516     scm_i_input_error ("read_commented_expression", port,
1517                        "no expression after #; comment", SCM_EOL);
1518   scm_ungetc_unlocked (c, port);
1519   scm_read_expression (port, opts);
1520   return SCM_UNSPECIFIED;
1521 }
1522
1523 static SCM
1524 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1525 {
1526   /* Guile's extended symbol read syntax looks like this:
1527
1528        #{This is all a symbol name}#
1529
1530      So here, CHR is expected to be `{'.  */
1531   int saw_brace = 0;
1532   size_t len = 0;
1533   SCM buf = scm_i_make_string (1024, NULL, 0);
1534
1535   buf = scm_i_string_start_writing (buf);
1536
1537   while ((chr = scm_getc_unlocked (port)) != EOF)
1538     {
1539       if (saw_brace)
1540         {
1541           if (chr == '#')
1542             {
1543               break;
1544             }
1545           else
1546             {
1547               saw_brace = 0;
1548               scm_i_string_set_x (buf, len++, '}');
1549             }
1550         }
1551
1552       if (chr == '}')
1553         saw_brace = 1;
1554       else if (chr == '\\')
1555         {
1556           /* It used to be that print.c would print extended-read-syntax
1557              symbols with backslashes before "non-standard" chars, but
1558              this routine wouldn't do anything with those escapes.
1559              Bummer.  What we've done is to change print.c to output
1560              R6RS hex escapes for those characters, relying on the fact
1561              that the extended read syntax would never put a `\' before
1562              an `x'.  For now, we just ignore other instances of
1563              backslash in the string.  */
1564           switch ((chr = scm_getc_unlocked (port)))
1565             {
1566             case EOF:
1567               goto done;
1568             case 'x':
1569               {
1570                 scm_t_wchar c;
1571
1572                 SCM_READ_HEX_ESCAPE (10, ';');
1573                 scm_i_string_set_x (buf, len++, c);
1574                 break;
1575
1576               str_eof:
1577                 chr = EOF;
1578                 goto done;
1579
1580               bad_escaped:
1581                 scm_i_string_stop_writing ();
1582                 scm_i_input_error ("scm_read_extended_symbol", port,
1583                                    "illegal character in escape sequence: ~S",
1584                                    scm_list_1 (SCM_MAKE_CHAR (c)));
1585                 break;
1586               }
1587             default:
1588               scm_i_string_set_x (buf, len++, chr);
1589               break;
1590             }
1591         }
1592       else
1593         scm_i_string_set_x (buf, len++, chr);
1594
1595       if (len >= scm_i_string_length (buf) - 2)
1596         {
1597           SCM addy;
1598
1599           scm_i_string_stop_writing ();
1600           addy = scm_i_make_string (1024, NULL, 0);
1601           buf = scm_string_append (scm_list_2 (buf, addy));
1602           len = 0;
1603           buf = scm_i_string_start_writing (buf);
1604         }
1605     }
1606
1607  done:
1608   scm_i_string_stop_writing ();
1609   if (chr == EOF)
1610     scm_i_input_error ("scm_read_extended_symbol", port,
1611                        "end of file while reading symbol", SCM_EOL);
1612
1613   return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1614 }
1615
1616
1617 \f
1618 /* Top-level token readers, i.e., dispatchers.  */
1619
1620 static SCM
1621 scm_read_sharp_extension (int chr, SCM port, scm_t_read_opts *opts)
1622 {
1623   SCM proc;
1624
1625   proc = scm_get_hash_procedure (chr);
1626   if (scm_is_true (scm_procedure_p (proc)))
1627     {
1628       long line = SCM_LINUM (port);
1629       int column = SCM_COL (port) - 2;
1630       SCM got;
1631
1632       got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1633
1634       if (opts->record_positions_p && SCM_NIMP (got)
1635           && !scm_i_has_source_properties (got))
1636         scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1637
1638       return got;
1639     }
1640
1641   return SCM_UNSPECIFIED;
1642 }
1643
1644 /* The reader for the sharp `#' character.  It basically dispatches reads
1645    among the above token readers.   */
1646 static SCM
1647 scm_read_sharp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1648                 long line, int column)
1649 #define FUNC_NAME "scm_lreadr"
1650 {
1651   SCM result;
1652
1653   chr = scm_getc_unlocked (port);
1654
1655   result = scm_read_sharp_extension (chr, port, opts);
1656   if (!scm_is_eq (result, SCM_UNSPECIFIED))
1657     return result;
1658
1659   switch (chr)
1660     {
1661     case '\\':
1662       return (scm_read_character (chr, port, opts));
1663     case '(':
1664       return (scm_read_vector (chr, port, opts, line, column));
1665     case 's':
1666     case 'u':
1667     case 'f':
1668     case 'c':
1669       /* This one may return either a boolean or an SRFI-4 vector.  */
1670       return (scm_read_srfi4_vector (chr, port, opts, line, column));
1671     case 'v':
1672       return (scm_read_bytevector (chr, port, opts, line, column));
1673     case '*':
1674       return (scm_read_guile_bit_vector (chr, port, opts, line, column));
1675     case 't':
1676     case 'T':
1677     case 'F':
1678       return (scm_read_boolean (chr, port));
1679     case ':':
1680       return (scm_read_keyword (chr, port, opts));
1681     case '0': case '1': case '2': case '3': case '4':
1682     case '5': case '6': case '7': case '8': case '9':
1683     case '@':
1684       return (scm_read_array (chr, port, opts, line, column));
1685
1686     case 'i':
1687     case 'e':
1688     case 'b':
1689     case 'B':
1690     case 'o':
1691     case 'O':
1692     case 'd':
1693     case 'D':
1694     case 'x':
1695     case 'X':
1696     case 'I':
1697     case 'E':
1698       return (scm_read_number_and_radix (chr, port, opts));
1699     case '{':
1700       return (scm_read_extended_symbol (chr, port));
1701     case '!':
1702       return (scm_read_shebang (chr, port, opts));
1703     case ';':
1704       return (scm_read_commented_expression (chr, port, opts));
1705     case '`':
1706     case '\'':
1707     case ',':
1708       return (scm_read_syntax (chr, port, opts));
1709     case 'n':
1710       return (scm_read_nil (chr, port, opts));
1711     default:
1712       result = scm_read_sharp_extension (chr, port, opts);
1713       if (scm_is_eq (result, SCM_UNSPECIFIED))
1714         {
1715           /* To remain compatible with 1.8 and earlier, the following
1716              characters have lower precedence than `read-hash-extend'
1717              characters.  */
1718           switch (chr)
1719             {
1720             case '|':
1721               return scm_read_r6rs_block_comment (chr, port);
1722             default:
1723               scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1724                                  scm_list_1 (SCM_MAKE_CHAR (chr)));
1725             }
1726         }
1727       else
1728         return result;
1729     }
1730
1731   return SCM_UNSPECIFIED;
1732 }
1733 #undef FUNC_NAME
1734
1735 static SCM
1736 read_inner_expression (SCM port, scm_t_read_opts *opts)
1737 #define FUNC_NAME "read_inner_expression"
1738 {
1739   while (1)
1740     {
1741       scm_t_wchar chr;
1742
1743       chr = scm_getc_unlocked (port);
1744
1745       switch (chr)
1746         {
1747         case SCM_WHITE_SPACES:
1748         case SCM_LINE_INCREMENTORS:
1749           break;
1750         case ';':
1751           (void) scm_read_semicolon_comment (chr, port);
1752           break;
1753         case '{':
1754           if (opts->curly_infix_p)
1755             {
1756               if (opts->neoteric_p)
1757                 return scm_read_sexp (chr, port, opts);
1758               else
1759                 {
1760                   SCM expr;
1761
1762                   /* Enable neoteric expressions within curly braces */
1763                   opts->neoteric_p = 1;
1764                   expr = scm_read_sexp (chr, port, opts);
1765                   opts->neoteric_p = 0;
1766                   return expr;
1767                 }
1768             }
1769           else
1770             return scm_read_mixed_case_symbol (chr, port, opts);
1771         case '[':
1772           if (opts->square_brackets_p)
1773             return scm_read_sexp (chr, port, opts);
1774           else if (opts->curly_infix_p)
1775             {
1776               /* The syntax of neoteric expressions requires that '[' be
1777                  a delimiter when curly-infix is enabled, so it cannot
1778                  be part of an unescaped symbol.  We might as well do
1779                  something useful with it, so we adopt Kawa's convention:
1780                  [...] => ($bracket-list$ ...) */
1781               long line = SCM_LINUM (port);
1782               int column = SCM_COL (port) - 1;
1783               return maybe_annotate_source
1784                 (scm_cons (sym_bracket_list, scm_read_sexp (chr, port, opts)),
1785                  port, opts, line, column);
1786             }
1787           else
1788             return scm_read_mixed_case_symbol (chr, port, opts);
1789         case '(':
1790           return (scm_read_sexp (chr, port, opts));
1791         case '"':
1792           return (scm_read_string (chr, port, opts));
1793         case '|':
1794           if (opts->r7rs_symbols_p)
1795             return scm_read_r7rs_symbol (chr, port, opts);
1796           else
1797             return scm_read_mixed_case_symbol (chr, port, opts);
1798         case '\'':
1799         case '`':
1800         case ',':
1801           return (scm_read_quote (chr, port, opts));
1802         case '#':
1803           {
1804             long line  = SCM_LINUM (port);
1805             int column = SCM_COL (port) - 1;
1806             SCM result = scm_read_sharp (chr, port, opts, line, column);
1807             if (scm_is_eq (result, SCM_UNSPECIFIED))
1808               /* We read a comment or some such.  */
1809               break;
1810             else
1811               return result;
1812           }
1813         case ')':
1814           scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1815           break;
1816         case '}':
1817           if (opts->curly_infix_p)
1818             scm_i_input_error (FUNC_NAME, port, "unexpected \"}\"", SCM_EOL);
1819           else
1820             return scm_read_mixed_case_symbol (chr, port, opts);
1821         case ']':
1822           if (opts->square_brackets_p)
1823             scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1824           /* otherwise fall through */
1825         case EOF:
1826           return SCM_EOF_VAL;
1827         case ':':
1828           if (opts->keyword_style == KEYWORD_STYLE_PREFIX)
1829             return scm_symbol_to_keyword (scm_read_expression (port, opts));
1830           /* Fall through.  */
1831
1832         default:
1833           {
1834             if (((chr >= '0') && (chr <= '9'))
1835                 || (strchr ("+-.", chr)))
1836               return (scm_read_number (chr, port, opts));
1837             else
1838               return (scm_read_mixed_case_symbol (chr, port, opts));
1839           }
1840         }
1841     }
1842 }
1843 #undef FUNC_NAME
1844
1845 static SCM
1846 scm_read_expression (SCM port, scm_t_read_opts *opts)
1847 #define FUNC_NAME "scm_read_expression"
1848 {
1849   if (!opts->neoteric_p)
1850     return read_inner_expression (port, opts);
1851   else
1852     {
1853       long line = 0;
1854       int column = 0;
1855       SCM expr;
1856
1857       if (opts->record_positions_p)
1858         {
1859           /* We need to get the position of the first non-whitespace
1860              character in order to correctly annotate neoteric
1861              expressions.  For example, for the expression 'f(x)', the
1862              first call to 'read_inner_expression' reads the 'f' (which
1863              cannot be annotated), and then we later read the '(x)' and
1864              use it to construct the new list (f x). */
1865           int c = flush_ws (port, opts, (char *) NULL);
1866           if (c == EOF)
1867             return SCM_EOF_VAL;
1868           scm_ungetc_unlocked (c, port);
1869           line = SCM_LINUM (port);
1870           column = SCM_COL (port);
1871         }
1872
1873       expr = read_inner_expression (port, opts);
1874
1875       /* 'expr' is the first component of the neoteric expression.  Now
1876          we loop, and as long as the next character is '(', '[', or '{',
1877          (without any intervening whitespace), we use it to construct a
1878          new expression.  For example, f{n - 1}(x) => ((f (- n 1)) x). */
1879       for (;;)
1880         {
1881           int chr = scm_getc_unlocked (port);
1882
1883           if (chr == '(')
1884             /* e(...) => (e ...) */
1885             expr = scm_cons (expr, scm_read_sexp (chr, port, opts));
1886           else if (chr == '[')
1887             /* e[...] => ($bracket-apply$ e ...) */
1888             expr = scm_cons (sym_bracket_apply,
1889                              scm_cons (expr,
1890                                        scm_read_sexp (chr, port, opts)));
1891           else if (chr == '{')
1892             {
1893               SCM arg = scm_read_sexp (chr, port, opts);
1894
1895               if (scm_is_null (arg))
1896                 expr = scm_list_1 (expr);       /* e{} => (e) */
1897               else
1898                 expr = scm_list_2 (expr, arg);  /* e{...} => (e {...}) */
1899             }
1900           else
1901             {
1902               if (chr != EOF)
1903                 scm_ungetc_unlocked (chr, port);
1904               break;
1905             }
1906           maybe_annotate_source (expr, port, opts, line, column);
1907         }
1908       return expr;
1909     }
1910 }
1911 #undef FUNC_NAME
1912
1913 \f
1914 /* Actual reader.  */
1915
1916 static void init_read_options (SCM port, scm_t_read_opts *opts);
1917
1918 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1919             (SCM port),
1920             "Read an s-expression from the input port @var{port}, or from\n"
1921             "the current input port if @var{port} is not specified.\n"
1922             "Any whitespace before the next token is discarded.")
1923 #define FUNC_NAME s_scm_read
1924 {
1925   scm_t_read_opts opts;
1926   int c;
1927
1928   if (SCM_UNBNDP (port))
1929     port = scm_current_input_port ();
1930   SCM_VALIDATE_OPINPORT (1, port);
1931
1932   init_read_options (port, &opts);
1933
1934   c = flush_ws (port, &opts, (char *) NULL);
1935   if (EOF == c)
1936     return SCM_EOF_VAL;
1937   scm_ungetc_unlocked (c, port);
1938
1939   return (scm_read_expression (port, &opts));
1940 }
1941 #undef FUNC_NAME
1942
1943
1944 \f
1945
1946 /* Manipulate the read-hash-procedures alist.  This could be written in
1947    Scheme, but maybe it will also be used by C code during initialisation.  */
1948 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1949             (SCM chr, SCM proc),
1950             "Install the procedure @var{proc} for reading expressions\n"
1951             "starting with the character sequence @code{#} and @var{chr}.\n"
1952             "@var{proc} will be called with two arguments:  the character\n"
1953             "@var{chr} and the port to read further data from. The object\n"
1954             "returned will be the return value of @code{read}. \n"
1955             "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1956             )
1957 #define FUNC_NAME s_scm_read_hash_extend
1958 {
1959   SCM this;
1960   SCM prev;
1961
1962   SCM_VALIDATE_CHAR (1, chr);
1963   SCM_ASSERT (scm_is_false (proc)
1964               || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1965               proc, SCM_ARG2, FUNC_NAME);
1966
1967   /* Check if chr is already in the alist.  */
1968   this = scm_i_read_hash_procedures_ref ();
1969   prev = SCM_BOOL_F;
1970   while (1)
1971     {
1972       if (scm_is_null (this))
1973         {
1974           /* not found, so add it to the beginning.  */
1975           if (scm_is_true (proc))
1976             {
1977               SCM new = scm_cons (scm_cons (chr, proc),
1978                                   scm_i_read_hash_procedures_ref ());
1979               scm_i_read_hash_procedures_set_x (new);
1980             }
1981           break;
1982         }
1983       if (scm_is_eq (chr, SCM_CAAR (this)))
1984         {
1985           /* already in the alist.  */
1986           if (scm_is_false (proc))
1987             {
1988               /* remove it.  */
1989               if (scm_is_false (prev))
1990                 {
1991                   SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1992                   scm_i_read_hash_procedures_set_x (rest);
1993                 }
1994               else
1995                 scm_set_cdr_x (prev, SCM_CDR (this));
1996             }
1997           else
1998             {
1999               /* replace it.  */
2000               scm_set_cdr_x (SCM_CAR (this), proc);
2001             }
2002           break;
2003         }
2004       prev = this;
2005       this = SCM_CDR (this);
2006     }
2007
2008   return SCM_UNSPECIFIED;
2009 }
2010 #undef FUNC_NAME
2011
2012 /* Recover the read-hash procedure corresponding to char c.  */
2013 static SCM
2014 scm_get_hash_procedure (int c)
2015 {
2016   SCM rest = scm_i_read_hash_procedures_ref ();
2017
2018   while (1)
2019     {
2020       if (scm_is_null (rest))
2021         return SCM_BOOL_F;
2022
2023       if (SCM_CHAR (SCM_CAAR (rest)) == c)
2024         return SCM_CDAR (rest);
2025
2026       rest = SCM_CDR (rest);
2027     }
2028 }
2029
2030 static int
2031 is_encoding_char (char c)
2032 {
2033   if (c >= 'a' && c <= 'z') return 1;
2034   if (c >= 'A' && c <= 'Z') return 1;
2035   if (c >= '0' && c <= '9') return 1;
2036   return strchr ("_-.:/,+=()", c) != NULL;
2037 }
2038
2039 /* Maximum size of an encoding name.  This is a bit more than the
2040    longest name listed at
2041    <http://www.iana.org/assignments/character-sets> ("ISO-2022-JP-2", 13
2042    characters.)  */
2043 #define ENCODING_NAME_MAX_SIZE 20
2044
2045 /* Number of bytes at the beginning or end of a file that are scanned
2046    for a "coding:" declaration.  */
2047 #define SCM_ENCODING_SEARCH_SIZE (500 + ENCODING_NAME_MAX_SIZE)
2048
2049
2050 /* Search the SCM_ENCODING_SEARCH_SIZE bytes of a file for an Emacs-like
2051    coding declaration.  Returns either NULL or a string whose storage
2052    has been allocated with `scm_gc_malloc'.  */
2053 char *
2054 scm_i_scan_for_encoding (SCM port)
2055 {
2056   scm_t_port *pt;
2057   char header[SCM_ENCODING_SEARCH_SIZE+1];
2058   size_t bytes_read, encoding_length, i;
2059   char *encoding = NULL;
2060   char *pos, *encoding_start;
2061   int in_comment;
2062
2063   pt = SCM_PTAB_ENTRY (port);
2064
2065   if (pt->rw_active == SCM_PORT_WRITE)
2066     scm_flush_unlocked (port);
2067
2068   if (pt->rw_random)
2069     pt->rw_active = SCM_PORT_READ;
2070
2071   if (pt->read_pos == pt->read_end)
2072     {
2073       /* We can use the read buffer, and thus avoid a seek. */
2074       if (scm_fill_input_unlocked (port) == EOF)
2075         return NULL;
2076
2077       bytes_read = pt->read_end - pt->read_pos;
2078       if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
2079         bytes_read = SCM_ENCODING_SEARCH_SIZE;
2080
2081       if (bytes_read <= 1)
2082         /* An unbuffered port -- don't scan.  */
2083         return NULL;
2084
2085       memcpy (header, pt->read_pos, bytes_read);
2086       header[bytes_read] = '\0';
2087     }
2088   else
2089     {
2090       /* Try to read some bytes and then seek back.  Not all ports
2091          support seeking back; and indeed some file ports (like
2092          /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
2093          check performed by SCM_FPORT_FDES---but fail to seek
2094          backwards.  Hence this block comes second.  We prefer to use
2095          the read buffer in-place.  */
2096       if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
2097         return NULL;
2098
2099       bytes_read = scm_c_read_unlocked (port, header, SCM_ENCODING_SEARCH_SIZE);
2100       header[bytes_read] = '\0';
2101       scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
2102     }
2103
2104   /* search past "coding[:=]" */
2105   pos = header;
2106   while (1)
2107     {
2108       if ((pos = strstr(pos, "coding")) == NULL)
2109         return NULL;
2110
2111       pos += strlen ("coding");
2112       if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
2113           (*pos == ':' || *pos == '='))
2114         {
2115           pos ++;
2116           break;
2117         }
2118     }
2119
2120   /* skip spaces */
2121   while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
2122          (*pos == ' ' || *pos == '\t'))
2123     pos ++;
2124
2125   if (pos - header >= SCM_ENCODING_SEARCH_SIZE - ENCODING_NAME_MAX_SIZE)
2126     /* We found the "coding:" string, but there is probably not enough
2127        room to store an encoding name in its entirety, so ignore it.
2128        This makes sure we do not end up returning a truncated encoding
2129        name.  */
2130     return NULL;
2131
2132   /* grab the next token */
2133   encoding_start = pos;
2134   i = 0;
2135   while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
2136          && encoding_start + i - header < bytes_read
2137          && is_encoding_char (encoding_start[i]))
2138     i++;
2139
2140   encoding_length = i;
2141   if (encoding_length == 0)
2142     return NULL;
2143
2144   encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
2145
2146   /* push backwards to make sure we were in a comment */
2147   in_comment = 0;
2148   pos = encoding_start;
2149   while (pos >= header)
2150     {
2151       if (*pos == ';')
2152         {
2153           in_comment = 1;
2154           break;
2155         }
2156       else if (*pos == '\n' || pos == header)
2157         {
2158           /* This wasn't in a semicolon comment. Check for a
2159            hash-bang comment. */
2160           char *beg = strstr (header, "#!");
2161           char *end = strstr (header, "!#");
2162           if (beg < encoding_start && encoding_start + encoding_length <= end)
2163             in_comment = 1;
2164           break;
2165         }
2166       else
2167         {
2168           pos --;
2169           continue;
2170         }
2171     }
2172   if (!in_comment)
2173     /* This wasn't in a comment */
2174     return NULL;
2175
2176   return encoding;
2177 }
2178
2179 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
2180             (SCM port),
2181             "Scans the port for an Emacs-like character coding declaration\n"
2182             "near the top of the contents of a port with random-accessible contents.\n"
2183             "The coding declaration is of the form\n"
2184             "@code{coding: XXXXX} and must appear in a scheme comment.\n"
2185             "\n"
2186             "Returns a string containing the character encoding of the file\n"
2187             "if a declaration was found, or @code{#f} otherwise.\n")
2188 #define FUNC_NAME s_scm_file_encoding
2189 {
2190   char *enc;
2191   SCM s_enc;
2192
2193   SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
2194
2195   enc = scm_i_scan_for_encoding (port);
2196   if (enc == NULL)
2197     return SCM_BOOL_F;
2198   else
2199     {
2200       s_enc = scm_string_upcase (scm_from_locale_string (enc));
2201       return s_enc;
2202     }
2203
2204   return SCM_BOOL_F;
2205 }
2206 #undef FUNC_NAME
2207
2208 \f
2209 /* Per-port read options.
2210
2211    We store per-port read options in the 'port-read-options' port
2212    property, which is stored in the internal port structure.  The value
2213    stored is a single integer that contains a two-bit field for each
2214    read option.
2215
2216    If a bit field contains READ_OPTION_INHERIT (3), that indicates that
2217    the applicable value should be inherited from the corresponding
2218    global read option.  Otherwise, the bit field contains the value of
2219    the read option.  For boolean read options that have been set
2220    per-port, the possible values are 0 or 1.  If the 'keyword_style'
2221    read option has been set per-port, its possible values are those in
2222    'enum t_keyword_style'. */
2223
2224 /* Key to read options in port properties. */
2225 SCM_SYMBOL (sym_port_read_options, "port-read-options");
2226
2227 /* Offsets of bit fields for each per-port override */
2228 #define READ_OPTION_COPY_SOURCE_P          0
2229 #define READ_OPTION_RECORD_POSITIONS_P     2
2230 #define READ_OPTION_CASE_INSENSITIVE_P     4
2231 #define READ_OPTION_KEYWORD_STYLE          6
2232 #define READ_OPTION_R6RS_ESCAPES_P         8
2233 #define READ_OPTION_SQUARE_BRACKETS_P     10
2234 #define READ_OPTION_HUNGRY_EOL_ESCAPES_P  12
2235 #define READ_OPTION_CURLY_INFIX_P         14
2236 #define READ_OPTION_R7RS_SYMBOLS_P        16
2237
2238 /* The total width in bits of the per-port overrides */
2239 #define READ_OPTIONS_NUM_BITS             18
2240
2241 #define READ_OPTIONS_INHERIT_ALL  ((1UL << READ_OPTIONS_NUM_BITS) - 1)
2242 #define READ_OPTIONS_MAX_VALUE    READ_OPTIONS_INHERIT_ALL
2243
2244 #define READ_OPTION_MASK     3
2245 #define READ_OPTION_INHERIT  3
2246
2247 static void
2248 set_port_read_option (SCM port, int option, int new_value)
2249 {
2250   SCM scm_read_options;
2251   unsigned int read_options;
2252
2253   new_value &= READ_OPTION_MASK;
2254
2255   scm_dynwind_begin (0);
2256   scm_dynwind_lock_port (port);
2257
2258   scm_read_options = scm_i_port_property (port, sym_port_read_options);
2259   if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2260     read_options = scm_to_uint (scm_read_options);
2261   else
2262     read_options = READ_OPTIONS_INHERIT_ALL;
2263   read_options &= ~(READ_OPTION_MASK << option);
2264   read_options |= new_value << option;
2265   scm_read_options = scm_from_uint (read_options);
2266   scm_i_set_port_property_x (port, sym_port_read_options, scm_read_options);
2267
2268   scm_dynwind_end ();
2269 }
2270
2271 /* Set OPTS and PORT's case-insensitivity according to VALUE. */
2272 static void
2273 set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts, int value)
2274 {
2275   value = !!value;
2276   opts->case_insensitive_p = value;
2277   set_port_read_option (port, READ_OPTION_CASE_INSENSITIVE_P, value);
2278 }
2279
2280 /* Set OPTS and PORT's square_brackets_p option according to VALUE. */
2281 static void
2282 set_port_square_brackets_p (SCM port, scm_t_read_opts *opts, int value)
2283 {
2284   value = !!value;
2285   opts->square_brackets_p = value;
2286   set_port_read_option (port, READ_OPTION_SQUARE_BRACKETS_P, value);
2287 }
2288
2289 /* Set OPTS and PORT's curly_infix_p option according to VALUE. */
2290 static void
2291 set_port_curly_infix_p (SCM port, scm_t_read_opts *opts, int value)
2292 {
2293   value = !!value;
2294   opts->curly_infix_p = value;
2295   set_port_read_option (port, READ_OPTION_CURLY_INFIX_P, value);
2296 }
2297
2298 /* Initialize OPTS based on PORT's read options and the global read
2299    options. */
2300 static void
2301 init_read_options (SCM port, scm_t_read_opts *opts)
2302 {
2303   SCM val, scm_read_options;
2304   unsigned int read_options, x;
2305
2306   scm_read_options = scm_i_port_property (port, sym_port_read_options);
2307
2308   if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2309     read_options = scm_to_uint (scm_read_options);
2310   else
2311     read_options = READ_OPTIONS_INHERIT_ALL;
2312
2313   x = READ_OPTION_MASK & (read_options >> READ_OPTION_KEYWORD_STYLE);
2314   if (x == READ_OPTION_INHERIT)
2315     {
2316       val = SCM_PACK (SCM_KEYWORD_STYLE);
2317       if (scm_is_eq (val, scm_keyword_prefix))
2318         x = KEYWORD_STYLE_PREFIX;
2319       else if (scm_is_eq (val, scm_keyword_postfix))
2320         x = KEYWORD_STYLE_POSTFIX;
2321       else
2322         x = KEYWORD_STYLE_HASH_PREFIX;
2323     }
2324   opts->keyword_style = x;
2325
2326 #define RESOLVE_BOOLEAN_OPTION(NAME, name)                              \
2327   do                                                                    \
2328     {                                                                   \
2329       x = READ_OPTION_MASK & (read_options >> READ_OPTION_ ## NAME);    \
2330       if (x == READ_OPTION_INHERIT)                                     \
2331         x = !!SCM_ ## NAME;                                             \
2332           opts->name = x;                                               \
2333     }                                                                   \
2334   while (0)
2335
2336   RESOLVE_BOOLEAN_OPTION (COPY_SOURCE_P,        copy_source_p);
2337   RESOLVE_BOOLEAN_OPTION (RECORD_POSITIONS_P,   record_positions_p);
2338   RESOLVE_BOOLEAN_OPTION (CASE_INSENSITIVE_P,   case_insensitive_p);
2339   RESOLVE_BOOLEAN_OPTION (R6RS_ESCAPES_P,       r6rs_escapes_p);
2340   RESOLVE_BOOLEAN_OPTION (SQUARE_BRACKETS_P,    square_brackets_p);
2341   RESOLVE_BOOLEAN_OPTION (HUNGRY_EOL_ESCAPES_P, hungry_eol_escapes_p);
2342   RESOLVE_BOOLEAN_OPTION (CURLY_INFIX_P,        curly_infix_p);
2343   RESOLVE_BOOLEAN_OPTION (R7RS_SYMBOLS_P,       r7rs_symbols_p);
2344
2345 #undef RESOLVE_BOOLEAN_OPTION
2346
2347   opts->neoteric_p = 0;
2348 }
2349
2350 void
2351 scm_init_read ()
2352 {
2353   SCM read_hash_procs;
2354
2355   read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
2356
2357   scm_i_read_hash_procedures =
2358     SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
2359
2360   scm_init_opts (scm_read_options, scm_read_opts);
2361 #include "libguile/read.x"
2362 }
2363
2364 /*
2365   Local Variables:
2366   c-file-style: "gnu"
2367   End:
2368 */