libguile/read.c

   1 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
   2  *   2007, 2008, 2009, 2010, 2011, 2012, 2014 Free Software Foundation, Inc.
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public License
   6  * as published by the Free Software Foundation; either version 3 of
   7  * the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful, but
  10  * WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  17  * 02110-1301 USA
  18  */
  19
  20
  21 \f
  22
  23 #ifdef HAVE_CONFIG_H
  24 # include <config.h>
  25 #endif
  26
  27 #include <stdio.h>
  28 #include <string.h>
  29 #include <unistd.h>
  30 #include <unicase.h>
  31 #include <unictype.h>
  32 #include <c-strcase.h>
  33 #include <c-ctype.h>
  34
  35 #include "libguile/_scm.h"
  36 #include "libguile/bytevectors.h"
  37 #include "libguile/chars.h"
  38 #include "libguile/eval.h"
  39 #include "libguile/arrays.h"
  40 #include "libguile/bitvectors.h"
  41 #include "libguile/keywords.h"
  42 #include "libguile/alist.h"
  43 #include "libguile/srcprop.h"
  44 #include "libguile/hashtab.h"
  45 #include "libguile/hash.h"
  46 #include "libguile/ports.h"
  47 #include "libguile/ports-internal.h"
  48 #include "libguile/fports.h"
  49 #include "libguile/root.h"
  50 #include "libguile/strings.h"
  51 #include "libguile/strports.h"
  52 #include "libguile/vectors.h"
  53 #include "libguile/validate.h"
  54 #include "libguile/srfi-4.h"
  55 #include "libguile/srfi-13.h"
  56
  57 #include "libguile/read.h"
  58 #include "libguile/private-options.h"
  59
  60
  61 \f
  62
  63 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
  64 SCM_SYMBOL (scm_keyword_prefix, "prefix");
  65 SCM_SYMBOL (scm_keyword_postfix, "postfix");
  66 SCM_SYMBOL (sym_nil, "nil");
  67
  68 /* SRFI-105 curly infix expression support */
  69 SCM_SYMBOL (sym_nfx, "$nfx$");
  70 SCM_SYMBOL (sym_bracket_list, "$bracket-list$");
  71 SCM_SYMBOL (sym_bracket_apply, "$bracket-apply$");
  72
  73 scm_t_option scm_read_opts[] =
  74   {
  75     { SCM_OPTION_BOOLEAN, "copy", 0,
  76       "Copy source code expressions." },
  77     { SCM_OPTION_BOOLEAN, "positions", 1,
  78       "Record positions of source code expressions." },
  79     { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
  80       "Convert symbols to lower case."},
  81     { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
  82       "Style of keyword recognition: #f, 'prefix or 'postfix."},
  83     { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
  84       "Use R6RS variable-length character and string hex escapes."},
  85     { SCM_OPTION_BOOLEAN, "square-brackets", 1,
  86       "Treat `[' and `]' as parentheses, for R6RS compatibility."},
  87     { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
  88       "In strings, consume leading whitespace after an escaped end-of-line."},
  89     { SCM_OPTION_BOOLEAN, "curly-infix", 0,
  90       "Support SRFI-105 curly infix expressions."},
  91     { SCM_OPTION_BOOLEAN, "r7rs-symbols", 0,
  92       "Support R7RS |...| symbol notation."},
  93     { 0, },
  94   };
  95
  96 /* Internal read options structure.  This is initialized by 'scm_read'
  97    from the global and per-port read options, and a pointer is passed
  98    down to all helper functions. */
  99
 100 enum t_keyword_style
 101   {
 102     KEYWORD_STYLE_HASH_PREFIX,
 103     KEYWORD_STYLE_PREFIX,
 104     KEYWORD_STYLE_POSTFIX
 105   };
 106
 107 struct t_read_opts
 108 {
 109   enum t_keyword_style keyword_style;
 110   unsigned int copy_source_p        : 1;
 111   unsigned int record_positions_p   : 1;
 112   unsigned int case_insensitive_p   : 1;
 113   unsigned int r6rs_escapes_p       : 1;
 114   unsigned int square_brackets_p    : 1;
 115   unsigned int hungry_eol_escapes_p : 1;
 116   unsigned int curly_infix_p        : 1;
 117   unsigned int neoteric_p           : 1;
 118   unsigned int r7rs_symbols_p       : 1;
 119 };
 120
 121 typedef struct t_read_opts scm_t_read_opts;
 122
 123
 124 /*
 125   Give meaningful error messages for errors
 126
 127   We use the format
 128
 129   FILE:LINE:COL: MESSAGE
 130   This happened in ....
 131
 132   This is not standard GNU format, but the test-suite likes the real
 133   message to be in front.
 134
 135  */
 136
 137
 138 void
 139 scm_i_input_error (char const *function,
 140                    SCM port, const char *message, SCM arg)
 141 {
 142   SCM fn = (scm_is_string (SCM_FILENAME(port))
 143             ? SCM_FILENAME(port)
 144             : scm_from_locale_string ("#<unknown port>"));
 145
 146   SCM string_port = scm_open_output_string ();
 147   SCM string = SCM_EOL;
 148   scm_simple_format (string_port,
 149                      scm_from_locale_string ("~A:~S:~S: ~A"),
 150                      scm_list_4 (fn,
 151                                  scm_from_long (SCM_LINUM (port) + 1),
 152                                  scm_from_int (SCM_COL (port) + 1),
 153                                  scm_from_locale_string (message)));
 154
 155   string = scm_get_output_string (string_port);
 156   scm_close_output_port (string_port);
 157   scm_error_scm (scm_from_latin1_symbol ("read-error"),
 158                  function? scm_from_locale_string (function) : SCM_BOOL_F,
 159                  string,
 160                  arg,
 161                  SCM_BOOL_F);
 162 }
 163
 164
 165 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
 166             (SCM setting),
 167             "Option interface for the read options. Instead of using\n"
 168             "this procedure directly, use the procedures @code{read-enable},\n"
 169             "@code{read-disable}, @code{read-set!} and @code{read-options}.")
 170 #define FUNC_NAME s_scm_read_options
 171 {
 172   SCM ans = scm_options (setting,
 173                          scm_read_opts,
 174                          FUNC_NAME);
 175   if (SCM_COPY_SOURCE_P)
 176     SCM_RECORD_POSITIONS_P = 1;
 177   return ans;
 178 }
 179 #undef FUNC_NAME
 180
 181 /* A fluid referring to an association list mapping extra hash
 182    characters to procedures.  */
 183 static SCM *scm_i_read_hash_procedures;
 184
 185 static SCM
 186 scm_i_read_hash_procedures_ref (void)
 187 {
 188   return scm_fluid_ref (*scm_i_read_hash_procedures);
 189 }
 190
 191 static void
 192 scm_i_read_hash_procedures_set_x (SCM value)
 193 {
 194   scm_fluid_set_x (*scm_i_read_hash_procedures, value);
 195 }
 196
 197 \f
 198 /* Token readers.  */
 199
 200
 201 /* Size of the C buffer used to read symbols and numbers.  */
 202 #define READER_BUFFER_SIZE            128
 203
 204 /* Number of 32-bit codepoints in the buffer used to read strings.  */
 205 #define READER_STRING_BUFFER_SIZE     128
 206
 207 /* The maximum size of Scheme character names.  */
 208 #define READER_CHAR_NAME_MAX_SIZE      50
 209
 210 /* The maximum size of reader directive names.  */
 211 #define READER_DIRECTIVE_NAME_MAX_SIZE 50
 212
 213
 214 /* `isblank' is only in C99.  */
 215 #define CHAR_IS_BLANK_(_chr)                                    \
 216   (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n')      \
 217    || ((_chr) == '\f') || ((_chr) == '\r'))
 218
 219 #ifdef MSDOS
 220 # define CHAR_IS_BLANK(_chr)                    \
 221   ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
 222 #else
 223 # define CHAR_IS_BLANK CHAR_IS_BLANK_
 224 #endif
 225
 226
 227 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
 228    structure'').  */
 229 #define CHAR_IS_R5RS_DELIMITER(c)                               \
 230   (CHAR_IS_BLANK (c)                                            \
 231    || (c) == ')' || (c) == '(' || (c) == ';' || (c) == '"')
 232
 233 #define CHAR_IS_DELIMITER(c)                                    \
 234   (CHAR_IS_R5RS_DELIMITER (c)                                   \
 235    || (((c) == ']' || (c) == '[') && (opts->square_brackets_p   \
 236                                       || opts->curly_infix_p))  \
 237    || (((c) == '}' || (c) == '{') && opts->curly_infix_p))
 238
 239 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
 240    Structure''.  */
 241 #define CHAR_IS_EXPONENT_MARKER(_chr)                           \
 242   (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f')        \
 243    || ((_chr) == 'd') || ((_chr) == 'l'))
 244
 245 /* Read an SCSH block comment.  */
 246 static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
 247 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
 248 static SCM scm_read_commented_expression (scm_t_wchar, SCM, scm_t_read_opts *);
 249 static SCM scm_read_shebang (scm_t_wchar, SCM, scm_t_read_opts *);
 250 static SCM scm_get_hash_procedure (int);
 251
 252 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 253    result in the pre-allocated buffer BUF.  Return zero if the whole token has
 254    fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
 255    bytes actually read.  */
 256 static int
 257 read_token (SCM port, scm_t_read_opts *opts,
 258             char *buf, size_t buf_size, size_t *read)
 259 {
 260    *read = 0;
 261
 262    while (*read < buf_size)
 263      {
 264        int chr;
 265
 266        chr = scm_get_byte_or_eof_unlocked (port);
 267
 268        if (chr == EOF)
 269         return 0;
 270       else if (CHAR_IS_DELIMITER (chr))
 271         {
 272           scm_unget_byte_unlocked (chr, port);
 273           return 0;
 274         }
 275       else
 276         {
 277           *buf = (char) chr;
 278           buf++, (*read)++;
 279         }
 280      }
 281
 282    return 1;
 283  }
 284
 285 /* Like `read_token', but return either BUFFER, or a GC-allocated buffer
 286    if the token doesn't fit in BUFFER_SIZE bytes.  */
 287 static char *
 288 read_complete_token (SCM port, scm_t_read_opts *opts,
 289                      char *buffer, size_t buffer_size, size_t *read)
 290 {
 291   int overflow = 0;
 292   size_t bytes_read, overflow_size = 0;
 293   char *overflow_buffer = NULL;
 294
 295   do
 296     {
 297       overflow = read_token (port, opts, buffer, buffer_size, &bytes_read);
 298       if (bytes_read == 0)
 299         break;
 300       if (overflow || overflow_size != 0)
 301         {
 302           if (overflow_size == 0)
 303             {
 304               overflow_buffer = scm_gc_malloc_pointerless (bytes_read, "read");
 305               memcpy (overflow_buffer, buffer, bytes_read);
 306               overflow_size = bytes_read;
 307             }
 308           else
 309             {
 310               char *new_buf =
 311                 scm_gc_malloc_pointerless (overflow_size + bytes_read, "read");
 312
 313               memcpy (new_buf, overflow_buffer, overflow_size);
 314               memcpy (new_buf + overflow_size, buffer, bytes_read);
 315
 316               overflow_buffer = new_buf;
 317               overflow_size += bytes_read;
 318             }
 319         }
 320     }
 321   while (overflow);
 322
 323   if (overflow_size)
 324     *read = overflow_size;
 325   else
 326     *read = bytes_read;
 327
 328   return (overflow_size > 0 ? overflow_buffer : buffer);
 329 }
 330
 331 /* Skip whitespace from PORT and return the first non-whitespace character
 332    read.  Raise an error on end-of-file.  */
 333 static int
 334 flush_ws (SCM port, scm_t_read_opts *opts, const char *eoferr)
 335 {
 336   scm_t_wchar c;
 337   while (1)
 338     switch (c = scm_getc_unlocked (port))
 339       {
 340       case EOF:
 341       goteof:
 342         if (eoferr)
 343           {
 344             scm_i_input_error (eoferr,
 345                                port,
 346                                "end of file",
 347                                SCM_EOL);
 348           }
 349         return c;
 350
 351       case ';':
 352       lp:
 353         switch (c = scm_getc_unlocked (port))
 354           {
 355           case EOF:
 356             goto goteof;
 357           default:
 358             goto lp;
 359           case SCM_LINE_INCREMENTORS:
 360             break;
 361           }
 362         break;
 363
 364       case '#':
 365         switch (c = scm_getc_unlocked (port))
 366           {
 367           case EOF:
 368             eoferr = "read_sharp";
 369             goto goteof;
 370           case '!':
 371             scm_read_shebang (c, port, opts);
 372             break;
 373           case ';':
 374             scm_read_commented_expression (c, port, opts);
 375             break;
 376           case '|':
 377             if (scm_is_false (scm_get_hash_procedure (c)))
 378               {
 379                 scm_read_r6rs_block_comment (c, port);
 380                 break;
 381               }
 382             /* fall through */
 383           default:
 384             scm_ungetc_unlocked (c, port);
 385             return '#';
 386           }
 387         break;
 388
 389       case SCM_LINE_INCREMENTORS:
 390       case SCM_SINGLE_SPACES:
 391       case '\t':
 392         break;
 393
 394       default:
 395         return c;
 396       }
 397
 398   return 0;
 399 }
 400
 401
 402 \f
 403 /* Token readers.  */
 404
 405 static SCM scm_read_expression (SCM port, scm_t_read_opts *opts);
 406 static SCM scm_read_sharp (int chr, SCM port, scm_t_read_opts *opts,
 407                            long line, int column);
 408
 409
 410 static SCM
 411 maybe_annotate_source (SCM x, SCM port, scm_t_read_opts *opts,
 412                        long line, int column)
 413 {
 414   if (opts->record_positions_p)
 415     scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
 416   return x;
 417 }
 418
 419 static SCM
 420 scm_read_sexp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 421 #define FUNC_NAME "scm_i_lreadparen"
 422 {
 423   int c;
 424   SCM tmp, tl, ans = SCM_EOL;
 425   const int curly_list_p = (chr == '{') && opts->curly_infix_p;
 426   const int terminating_char = ((chr == '{') ? '}'
 427                                 : ((chr == '[') ? ']'
 428                                    : ')'));
 429
 430   /* Need to capture line and column numbers here. */
 431   long line = SCM_LINUM (port);
 432   int column = SCM_COL (port) - 1;
 433
 434   c = flush_ws (port, opts, FUNC_NAME);
 435   if (terminating_char == c)
 436     return SCM_EOL;
 437
 438   scm_ungetc_unlocked (c, port);
 439   tmp = scm_read_expression (port, opts);
 440
 441   /* Note that it is possible for scm_read_expression to return
 442      scm_sym_dot, but not as part of a dotted pair: as in #{.}#.  So
 443      check that it's a real dot by checking `c'.  */
 444   if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
 445     {
 446       ans = scm_read_expression (port, opts);
 447       if (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
 448         scm_i_input_error (FUNC_NAME, port, "missing close paren",
 449                            SCM_EOL);
 450       return ans;
 451     }
 452
 453   /* Build the head of the list structure. */
 454   ans = tl = scm_cons (tmp, SCM_EOL);
 455
 456   while (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
 457     {
 458       SCM new_tail;
 459
 460       if (c == ')' || (c == ']' && opts->square_brackets_p)
 461           || ((c == '}' || c == ']') && opts->curly_infix_p))
 462         scm_i_input_error (FUNC_NAME, port,
 463                            "in pair: mismatched close paren: ~A",
 464                            scm_list_1 (SCM_MAKE_CHAR (c)));
 465
 466       scm_ungetc_unlocked (c, port);
 467       tmp = scm_read_expression (port, opts);
 468
 469       /* See above note about scm_sym_dot.  */
 470       if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
 471         {
 472           SCM_SETCDR (tl, scm_read_expression (port, opts));
 473
 474           c = flush_ws (port, opts, FUNC_NAME);
 475           if (terminating_char != c)
 476             scm_i_input_error (FUNC_NAME, port,
 477                                "in pair: missing close paren", SCM_EOL);
 478           break;
 479         }
 480
 481       new_tail = scm_cons (tmp, SCM_EOL);
 482       SCM_SETCDR (tl, new_tail);
 483       tl = new_tail;
 484     }
 485
 486   if (curly_list_p)
 487     {
 488       /* In addition to finding the length, 'scm_ilength' checks for
 489          improper or circular lists, in which case it returns -1. */
 490       int len = scm_ilength (ans);
 491
 492       /* The (len == 0) case is handled above */
 493       if (len == 1)
 494         /* Return directly to avoid re-annotating the element's source
 495            location with the position of the outer brace.  Also, it
 496            might not be possible to annotate the element. */
 497         return scm_car (ans);  /* {e} => e */
 498       else if (len == 2)
 499         ;  /* Leave the list unchanged: {e1 e2} => (e1 e2) */
 500       else if (len >= 3 && (len & 1))
 501         {
 502           /* It's a proper list whose length is odd and at least 3.  If
 503              the elements at odd indices (the infix operator positions)
 504              are all 'equal?', then it's a simple curly-infix list.
 505              Otherwise it's a mixed curly-infix list. */
 506           SCM op = scm_cadr (ans);
 507
 508           /* Check to see if the elements at odd indices are 'equal?' */
 509           for (tl = scm_cdddr (ans); ; tl = scm_cddr (tl))
 510             {
 511               if (scm_is_null (tl))
 512                 {
 513                   /* Convert simple curly-infix list to prefix:
 514                      {a <op> b <op> ...} => (<op> a b ...) */
 515                   tl = ans;
 516                   while (scm_is_pair (scm_cdr (tl)))
 517                     {
 518                       tmp = scm_cddr (tl);
 519                       SCM_SETCDR (tl, tmp);
 520                       tl = tmp;
 521                     }
 522                   ans = scm_cons (op, ans);
 523                   break;
 524                 }
 525               else if (scm_is_false (scm_equal_p (op, scm_car (tl))))
 526                 {
 527                   /* Mixed curly-infix list: {e ...} => ($nfx$ e ...) */
 528                   ans = scm_cons (sym_nfx, ans);
 529                   break;
 530                 }
 531             }
 532         }
 533       else
 534         /* Mixed curly-infix (possibly improper) list:
 535            {e . tail} => ($nfx$ e . tail) */
 536         ans = scm_cons (sym_nfx, ans);
 537     }
 538
 539   return maybe_annotate_source (ans, port, opts, line, column);
 540 }
 541 #undef FUNC_NAME
 542
 543
 544 /* Read a hexadecimal number NDIGITS in length.  Put its value into the variable
 545    C.  If TERMINATOR is non-null, terminate early if the TERMINATOR character is
 546    found.  */
 547 #define SCM_READ_HEX_ESCAPE(ndigits, terminator)                   \
 548   do                                                               \
 549     {                                                              \
 550       scm_t_wchar a;                                               \
 551       size_t i = 0;                                                \
 552       c = 0;                                                       \
 553       while (i < ndigits)                                          \
 554         {                                                          \
 555           a = scm_getc_unlocked (port);                                     \
 556           if (a == EOF)                                            \
 557             goto str_eof;                                          \
 558           if (terminator                                           \
 559               && (a == (scm_t_wchar) terminator)                   \
 560               && (i > 0))                                          \
 561             break;                                                 \
 562           if ('0' <= a && a <= '9')                                \
 563             a -= '0';                                              \
 564           else if ('A' <= a && a <= 'F')                           \
 565             a = a - 'A' + 10;                                      \
 566           else if ('a' <= a && a <= 'f')                           \
 567             a = a - 'a' + 10;                                      \
 568           else                                                     \
 569             {                                                      \
 570               c = a;                                               \
 571               goto bad_escaped;                                    \
 572             }                                                      \
 573           c = c * 16 + a;                                          \
 574           i ++;                                                    \
 575         }                                                          \
 576     } while (0)
 577
 578 static void
 579 skip_intraline_whitespace (SCM port)
 580 {
 581   scm_t_wchar c;
 582
 583   do
 584     {
 585       c = scm_getc_unlocked (port);
 586       if (c == EOF)
 587         return;
 588     }
 589   while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
 590
 591   scm_ungetc_unlocked (c, port);
 592 }
 593
 594 /* Read either a double-quoted string or an R7RS-style symbol delimited
 595    by vertical lines, depending on the value of 'chr' ('"' or '|').
 596    Regardless, the result is always returned as a string.  */
 597 static SCM
 598 scm_read_string_like_syntax (int chr, SCM port, scm_t_read_opts *opts)
 599 #define FUNC_NAME "scm_lreadr"
 600 {
 601   /* For strings smaller than C_STR, this function creates only one Scheme
 602      object (the string returned).  */
 603
 604   SCM str = SCM_EOL;
 605   size_t c_str_len = 0;
 606   scm_t_wchar c, c_str[READER_STRING_BUFFER_SIZE];
 607
 608   /* Need to capture line and column numbers here. */
 609   long line = SCM_LINUM (port);
 610   int column = SCM_COL (port) - 1;
 611
 612   while (chr != (c = scm_getc_unlocked (port)))
 613     {
 614       if (c == EOF)
 615         {
 616         str_eof:
 617           scm_i_input_error (FUNC_NAME, port,
 618                              (chr == '|'
 619                               ? "end of file in symbol"
 620                               : "end of file in string constant"),
 621                              SCM_EOL);
 622         }
 623
 624       if (c_str_len + 1 >= READER_STRING_BUFFER_SIZE)
 625         {
 626           str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
 627           c_str_len = 0;
 628         }
 629
 630       if (c == '\\')
 631         {
 632           switch (c = scm_getc_unlocked (port))
 633             {
 634             case EOF:
 635               goto str_eof;
 636             case '|':
 637             case '\\':
 638               break;
 639             case '\n':
 640               if (opts->hungry_eol_escapes_p)
 641                 skip_intraline_whitespace (port);
 642               continue;
 643             case '0':
 644               c = '\0';
 645               break;
 646             case 'f':
 647               c = '\f';
 648               break;
 649             case 'n':
 650               c = '\n';
 651               break;
 652             case 'r':
 653               c = '\r';
 654               break;
 655             case 't':
 656               c = '\t';
 657               break;
 658             case 'a':
 659               c = '\007';
 660               break;
 661             case 'v':
 662               c = '\v';
 663               break;
 664             case 'b':
 665               c = '\010';
 666               break;
 667             case 'x':
 668               if (opts->r6rs_escapes_p || chr == '|')
 669                 SCM_READ_HEX_ESCAPE (10, ';');
 670               else
 671                 SCM_READ_HEX_ESCAPE (2, '\0');
 672               break;
 673             case 'u':
 674               if (!opts->r6rs_escapes_p)
 675                 {
 676                   SCM_READ_HEX_ESCAPE (4, '\0');
 677                   break;
 678                 }
 679             case 'U':
 680               if (!opts->r6rs_escapes_p)
 681                 {
 682                   SCM_READ_HEX_ESCAPE (6, '\0');
 683                   break;
 684                 }
 685             default:
 686               if (c == chr)
 687                 break;
 688             bad_escaped:
 689               scm_i_input_error (FUNC_NAME, port,
 690                                  "illegal character in escape sequence: ~S",
 691                                  scm_list_1 (SCM_MAKE_CHAR (c)));
 692             }
 693         }
 694
 695       c_str[c_str_len++] = c;
 696     }
 697
 698   if (scm_is_null (str))
 699     /* Fast path: we got a string that fits in C_STR.  */
 700     str = scm_from_utf32_stringn (c_str, c_str_len);
 701   else
 702     {
 703       if (c_str_len > 0)
 704         str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
 705
 706       str = scm_string_concatenate_reverse (str, SCM_UNDEFINED, SCM_UNDEFINED);
 707     }
 708
 709   return maybe_annotate_source (str, port, opts, line, column);
 710 }
 711 #undef FUNC_NAME
 712
 713 static SCM
 714 scm_read_string (int chr, SCM port, scm_t_read_opts *opts)
 715 {
 716   return scm_read_string_like_syntax (chr, port, opts);
 717 }
 718
 719 static SCM
 720 scm_read_r7rs_symbol (int chr, SCM port, scm_t_read_opts *opts)
 721 {
 722   return scm_string_to_symbol (scm_read_string_like_syntax (chr, port, opts));
 723 }
 724
 725 static SCM
 726 scm_read_number (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 727 {
 728   SCM result, str = SCM_EOL;
 729   char local_buffer[READER_BUFFER_SIZE], *buffer;
 730   size_t bytes_read;
 731
 732   /* Need to capture line and column numbers here. */
 733   long line = SCM_LINUM (port);
 734   int column = SCM_COL (port) - 1;
 735
 736   scm_ungetc_unlocked (chr, port);
 737   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 738                                 &bytes_read);
 739
 740   str = scm_from_port_stringn (buffer, bytes_read, port);
 741
 742   result = scm_string_to_number (str, SCM_UNDEFINED);
 743   if (scm_is_false (result))
 744     {
 745       /* Return a symbol instead of a number */
 746       if (opts->case_insensitive_p)
 747         str = scm_string_downcase_x (str);
 748       result = scm_string_to_symbol (str);
 749     }
 750   else if (SCM_NIMP (result))
 751     result = maybe_annotate_source (result, port, opts, line, column);
 752
 753   SCM_COL (port) += scm_i_string_length (str);
 754   return result;
 755 }
 756
 757 static SCM
 758 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 759 {
 760   SCM result;
 761   int ends_with_colon = 0;
 762   size_t bytes_read;
 763   int postfix = (opts->keyword_style == KEYWORD_STYLE_POSTFIX);
 764   char local_buffer[READER_BUFFER_SIZE], *buffer;
 765   SCM str;
 766
 767   scm_ungetc_unlocked (chr, port);
 768   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 769                                 &bytes_read);
 770   if (bytes_read > 0)
 771     ends_with_colon = buffer[bytes_read - 1] == ':';
 772
 773   if (postfix && ends_with_colon && (bytes_read > 1))
 774     {
 775       str = scm_from_port_stringn (buffer, bytes_read - 1, port);
 776
 777       if (opts->case_insensitive_p)
 778         str = scm_string_downcase_x (str);
 779       result = scm_symbol_to_keyword (scm_string_to_symbol (str));
 780     }
 781   else
 782     {
 783       str = scm_from_port_stringn (buffer, bytes_read, port);
 784
 785       if (opts->case_insensitive_p)
 786         str = scm_string_downcase_x (str);
 787       result = scm_string_to_symbol (str);
 788     }
 789
 790   SCM_COL (port) += scm_i_string_length (str);
 791   return result;
 792 }
 793
 794 static SCM
 795 scm_read_number_and_radix (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 796 #define FUNC_NAME "scm_lreadr"
 797 {
 798   SCM result;
 799   size_t read;
 800   char local_buffer[READER_BUFFER_SIZE], *buffer;
 801   unsigned int radix;
 802   SCM str;
 803
 804   switch (chr)
 805     {
 806     case 'B':
 807     case 'b':
 808       radix = 2;
 809       break;
 810
 811     case 'o':
 812     case 'O':
 813       radix = 8;
 814       break;
 815
 816     case 'd':
 817     case 'D':
 818       radix = 10;
 819       break;
 820
 821     case 'x':
 822     case 'X':
 823       radix = 16;
 824       break;
 825
 826     default:
 827       scm_ungetc_unlocked (chr, port);
 828       scm_ungetc_unlocked ('#', port);
 829       radix = 10;
 830     }
 831
 832   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 833                                 &read);
 834
 835   str = scm_from_port_stringn (buffer, read, port);
 836
 837   result = scm_string_to_number (str, scm_from_uint (radix));
 838
 839   SCM_COL (port) += scm_i_string_length (str);
 840
 841   if (scm_is_true (result))
 842     return result;
 843
 844   scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
 845
 846   return SCM_BOOL_F;
 847 }
 848 #undef FUNC_NAME
 849
 850 static SCM
 851 scm_read_quote (int chr, SCM port, scm_t_read_opts *opts)
 852 {
 853   SCM p;
 854   long line = SCM_LINUM (port);
 855   int column = SCM_COL (port) - 1;
 856
 857   switch (chr)
 858     {
 859     case '`':
 860       p = scm_sym_quasiquote;
 861       break;
 862
 863     case '\'':
 864       p = scm_sym_quote;
 865       break;
 866
 867     case ',':
 868       {
 869         scm_t_wchar c;
 870
 871         c = scm_getc_unlocked (port);
 872         if ('@' == c)
 873           p = scm_sym_uq_splicing;
 874         else
 875           {
 876             scm_ungetc_unlocked (c, port);
 877             p = scm_sym_unquote;
 878           }
 879         break;
 880       }
 881
 882     default:
 883       fprintf (stderr, "%s: unhandled quote character (%i)\n",
 884                "scm_read_quote", chr);
 885       abort ();
 886     }
 887
 888   p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
 889   return maybe_annotate_source (p, port, opts, line, column);
 890 }
 891
 892 SCM_SYMBOL (sym_syntax, "syntax");
 893 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
 894 SCM_SYMBOL (sym_unsyntax, "unsyntax");
 895 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
 896
 897 static SCM
 898 scm_read_syntax (int chr, SCM port, scm_t_read_opts *opts)
 899 {
 900   SCM p;
 901   long line = SCM_LINUM (port);
 902   int column = SCM_COL (port) - 1;
 903
 904   switch (chr)
 905     {
 906     case '`':
 907       p = sym_quasisyntax;
 908       break;
 909
 910     case '\'':
 911       p = sym_syntax;
 912       break;
 913
 914     case ',':
 915       {
 916         int c;
 917
 918         c = scm_getc_unlocked (port);
 919         if ('@' == c)
 920           p = sym_unsyntax_splicing;
 921         else
 922           {
 923             scm_ungetc_unlocked (c, port);
 924             p = sym_unsyntax;
 925           }
 926         break;
 927       }
 928
 929     default:
 930       fprintf (stderr, "%s: unhandled syntax character (%i)\n",
 931                "scm_read_syntax", chr);
 932       abort ();
 933     }
 934
 935   p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
 936   return maybe_annotate_source (p, port, opts, line, column);
 937 }
 938
 939 static SCM
 940 scm_read_nil (int chr, SCM port, scm_t_read_opts *opts)
 941 {
 942   SCM id = scm_read_mixed_case_symbol (chr, port, opts);
 943
 944   if (!scm_is_eq (id, sym_nil))
 945     scm_i_input_error ("scm_read_nil", port,
 946                        "unexpected input while reading #nil: ~a",
 947                        scm_list_1 (id));
 948
 949   return SCM_ELISP_NIL;
 950 }
 951
 952 static SCM
 953 scm_read_semicolon_comment (int chr, SCM port)
 954 {
 955   int c;
 956
 957   /* We use the get_byte here because there is no need to get the
 958      locale correct with comment input. This presumes that newline
 959      always represents itself no matter what the encoding is.  */
 960   for (c = scm_get_byte_or_eof_unlocked (port);
 961        (c != EOF) && (c != '\n');
 962        c = scm_get_byte_or_eof_unlocked (port));
 963
 964   return SCM_UNSPECIFIED;
 965 }
 966
 967 /* If the EXPECTED_CHARS are the next ones available from PORT, then
 968    consume them and return 1.  Otherwise leave the port position where
 969    it was and return 0.  EXPECTED_CHARS should be all lowercase, and
 970    will be matched case-insensitively against the characters read from
 971    PORT. */
 972 static int
 973 try_read_ci_chars (SCM port, const char *expected_chars)
 974 {
 975   int num_chars_wanted = strlen (expected_chars);
 976   int num_chars_read = 0;
 977   char *chars_read = alloca (num_chars_wanted);
 978   int c;
 979
 980   while (num_chars_read < num_chars_wanted)
 981     {
 982       c = scm_getc_unlocked (port);
 983       if (c == EOF)
 984         break;
 985       else if (c_tolower (c) != expected_chars[num_chars_read])
 986         {
 987           scm_ungetc_unlocked (c, port);
 988           break;
 989         }
 990       else
 991         chars_read[num_chars_read++] = c;
 992     }
 993
 994   if (num_chars_read == num_chars_wanted)
 995     return 1;
 996   else
 997     {
 998       while (num_chars_read > 0)
 999         scm_ungetc_unlocked (chars_read[--num_chars_read], port);
1000       return 0;
1001     }
1002 }
1003
1004 \f
1005 /* Sharp readers, i.e. readers called after a `#' sign has been read.  */
1006
1007 static SCM
1008 scm_read_boolean (int chr, SCM port)
1009 {
1010   switch (chr)
1011     {
1012     case 't':
1013     case 'T':
1014       try_read_ci_chars (port, "rue");
1015       return SCM_BOOL_T;
1016
1017     case 'f':
1018     case 'F':
1019       try_read_ci_chars (port, "alse");
1020       return SCM_BOOL_F;
1021     }
1022
1023   return SCM_UNSPECIFIED;
1024 }
1025
1026 static SCM
1027 scm_read_character (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
1028 #define FUNC_NAME "scm_lreadr"
1029 {
1030   char buffer[READER_CHAR_NAME_MAX_SIZE];
1031   SCM charname;
1032   size_t charname_len, bytes_read;
1033   scm_t_wchar cp;
1034   int overflow;
1035   scm_t_port_internal *pti;
1036
1037   overflow = read_token (port, opts, buffer, READER_CHAR_NAME_MAX_SIZE,
1038                          &bytes_read);
1039   if (overflow)
1040     scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
1041
1042   if (bytes_read == 0)
1043     {
1044       chr = scm_getc_unlocked (port);
1045       if (chr == EOF)
1046         scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
1047                            "while reading character", SCM_EOL);
1048
1049       /* CHR must be a token delimiter, like a whitespace.  */
1050       return (SCM_MAKE_CHAR (chr));
1051     }
1052
1053   pti = SCM_PORT_GET_INTERNAL (port);
1054
1055   /* Simple ASCII characters can be processed immediately.  Also, simple
1056      ISO-8859-1 characters can be processed immediately if the encoding for this
1057      port is ISO-8859-1.  */
1058   if (bytes_read == 1 &&
1059       ((unsigned char) buffer[0] <= 127
1060        || pti->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1))
1061     {
1062       SCM_COL (port) += 1;
1063       return SCM_MAKE_CHAR (buffer[0]);
1064     }
1065
1066   /* Otherwise, convert the buffer into a proper scheme string for
1067      processing.  */
1068   charname = scm_from_port_stringn (buffer, bytes_read, port);
1069   charname_len = scm_i_string_length (charname);
1070   SCM_COL (port) += charname_len;
1071   cp = scm_i_string_ref (charname, 0);
1072   if (charname_len == 1)
1073     return SCM_MAKE_CHAR (cp);
1074
1075   /* Ignore dotted circles, which may be used to keep combining characters from
1076      combining with the backslash in #\charname.  */
1077   if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
1078     return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
1079
1080   if (cp >= '0' && cp < '8')
1081     {
1082       /* Dirk:FIXME::  This type of character syntax is not R5RS
1083        * compliant.  Further, it should be verified that the constant
1084        * does only consist of octal digits.  */
1085       SCM p = scm_string_to_number (charname, scm_from_uint (8));
1086       if (SCM_I_INUMP (p))
1087         {
1088           scm_t_wchar c = scm_to_uint32 (p);
1089           if (SCM_IS_UNICODE_CHAR (c))
1090             return SCM_MAKE_CHAR (c);
1091           else
1092             scm_i_input_error (FUNC_NAME, port,
1093                                "out-of-range octal character escape: ~a",
1094                                scm_list_1 (charname));
1095         }
1096     }
1097
1098   if (cp == 'x' && (charname_len > 1))
1099     {
1100       SCM p;
1101
1102       /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1103       p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
1104                                 scm_from_uint (16));
1105       if (SCM_I_INUMP (p))
1106         {
1107           scm_t_wchar c = scm_to_uint32 (p);
1108           if (SCM_IS_UNICODE_CHAR (c))
1109             return SCM_MAKE_CHAR (c);
1110           else
1111             scm_i_input_error (FUNC_NAME, port,
1112                                "out-of-range hex character escape: ~a",
1113                                scm_list_1 (charname));
1114         }
1115     }
1116
1117   /* The names of characters should never have non-Latin1
1118      characters.  */
1119   if (scm_i_is_narrow_string (charname)
1120       || scm_i_try_narrow_string (charname))
1121     { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
1122                                        charname_len);
1123       if (scm_is_true (ch))
1124         return ch;
1125     }
1126
1127   scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
1128                      scm_list_1 (charname));
1129
1130   return SCM_UNSPECIFIED;
1131 }
1132 #undef FUNC_NAME
1133
1134 static SCM
1135 scm_read_keyword (int chr, SCM port, scm_t_read_opts *opts)
1136 {
1137   SCM symbol;
1138
1139   /* Read the symbol that comprises the keyword.  Doing this instead of
1140      invoking a specific symbol reader function allows `scm_read_keyword ()'
1141      to adapt to the delimiters currently valid of symbols.
1142
1143      XXX: This implementation allows sloppy syntaxes like `#:  key'.  */
1144   symbol = scm_read_expression (port, opts);
1145   if (!scm_is_symbol (symbol))
1146     scm_i_input_error ("scm_read_keyword", port,
1147                        "keyword prefix `~a' not followed by a symbol: ~s",
1148                        scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1149
1150   return (scm_symbol_to_keyword (symbol));
1151 }
1152
1153 static SCM
1154 scm_read_vector (int chr, SCM port, scm_t_read_opts *opts,
1155                  long line, int column)
1156 {
1157   /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1158      guarantee that it's going to do what we want.  After all, this is an
1159      implementation detail of `scm_read_vector ()', not a desirable
1160      property.  */
1161   return maybe_annotate_source (scm_vector (scm_read_sexp (chr, port, opts)),
1162                                 port, opts, line, column);
1163 }
1164
1165 /* Helper used by scm_read_array */
1166 static int
1167 read_decimal_integer (SCM port, int c, ssize_t *resp)
1168 {
1169   ssize_t sign = 1;
1170   ssize_t res = 0;
1171   int got_it = 0;
1172
1173   if (c == '-')
1174     {
1175       sign = -1;
1176       c = scm_getc_unlocked (port);
1177     }
1178
1179   while ('0' <= c && c <= '9')
1180     {
1181       if (((SSIZE_MAX - (c-'0')) / 10) <= res)
1182         scm_i_input_error ("read_decimal_integer", port,
1183                            "number too large", SCM_EOL);
1184       res = 10*res + c-'0';
1185       got_it = 1;
1186       c = scm_getc_unlocked (port);
1187     }
1188
1189   if (got_it)
1190     *resp = sign * res;
1191   return c;
1192 }
1193
1194 /* Read an array.  This function can also read vectors and uniform
1195    vectors.  Also, the conflict between '#f' and '#f32' and '#f64' is
1196    handled here.
1197
1198    C is the first character read after the '#'. */
1199 static SCM
1200 scm_read_array (int c, SCM port, scm_t_read_opts *opts, long line, int column)
1201 {
1202   ssize_t rank;
1203   scm_t_wchar tag_buf[8];
1204   int tag_len;
1205
1206   SCM tag, shape = SCM_BOOL_F, elements, array;
1207
1208   /* XXX - shortcut for ordinary vectors.  Shouldn't be necessary but
1209      the array code can not deal with zero-length dimensions yet, and
1210      we want to allow zero-length vectors, of course. */
1211   if (c == '(')
1212     return scm_read_vector (c, port, opts, line, column);
1213
1214   /* Disambiguate between '#f' and uniform floating point vectors. */
1215   if (c == 'f')
1216     {
1217       c = scm_getc_unlocked (port);
1218       if (c != '3' && c != '6')
1219         {
1220           if (c == 'a' && try_read_ci_chars (port, "lse"))
1221             return SCM_BOOL_F;
1222           else if (c != EOF)
1223             scm_ungetc_unlocked (c, port);
1224           return SCM_BOOL_F;
1225         }
1226       rank = 1;
1227       tag_buf[0] = 'f';
1228       tag_len = 1;
1229       goto continue_reading_tag;
1230     }
1231
1232   /* Read rank. */
1233   rank = 1;
1234   c = read_decimal_integer (port, c, &rank);
1235   if (rank < 0)
1236     scm_i_input_error (NULL, port, "array rank must be non-negative",
1237                        SCM_EOL);
1238
1239   /* Read tag. */
1240   tag_len = 0;
1241  continue_reading_tag:
1242   while (c != EOF && c != '(' && c != '@' && c != ':'
1243          && tag_len < sizeof tag_buf / sizeof tag_buf[0])
1244     {
1245       tag_buf[tag_len++] = c;
1246       c = scm_getc_unlocked (port);
1247     }
1248   if (tag_len == 0)
1249     tag = SCM_BOOL_T;
1250   else
1251     {
1252       tag = scm_string_to_symbol (scm_from_utf32_stringn (tag_buf, tag_len));
1253       if (tag_len == sizeof tag_buf / sizeof tag_buf[0])
1254         scm_i_input_error (NULL, port, "invalid array tag, starting with: ~a",
1255                            scm_list_1 (tag));
1256     }
1257
1258   /* Read shape. */
1259   if (c == '@' || c == ':')
1260     {
1261       shape = SCM_EOL;
1262
1263       do
1264         {
1265           ssize_t lbnd = 0, len = 0;
1266           SCM s;
1267
1268           if (c == '@')
1269             {
1270               c = scm_getc_unlocked (port);
1271               c = read_decimal_integer (port, c, &lbnd);
1272             }
1273
1274           s = scm_from_ssize_t (lbnd);
1275
1276           if (c == ':')
1277             {
1278               c = scm_getc_unlocked (port);
1279               c = read_decimal_integer (port, c, &len);
1280               if (len < 0)
1281                 scm_i_input_error (NULL, port,
1282                                    "array length must be non-negative",
1283                                    SCM_EOL);
1284
1285               s = scm_list_2 (s, scm_from_ssize_t (lbnd+len-1));
1286             }
1287
1288           shape = scm_cons (s, shape);
1289         } while (c == '@' || c == ':');
1290
1291       shape = scm_reverse_x (shape, SCM_EOL);
1292     }
1293
1294   /* Read nested lists of elements. */
1295   if (c != '(')
1296     scm_i_input_error (NULL, port,
1297                        "missing '(' in vector or array literal",
1298                        SCM_EOL);
1299   elements = scm_read_sexp (c, port, opts);
1300
1301   if (scm_is_false (shape))
1302     shape = scm_from_ssize_t (rank);
1303   else if (scm_ilength (shape) != rank)
1304     scm_i_input_error
1305       (NULL, port,
1306        "the number of shape specifications must match the array rank",
1307        SCM_EOL);
1308
1309   /* Handle special print syntax of rank zero arrays; see
1310      scm_i_print_array for a rationale. */
1311   if (rank == 0)
1312     {
1313       if (!scm_is_pair (elements))
1314         scm_i_input_error (NULL, port,
1315                            "too few elements in array literal, need 1",
1316                            SCM_EOL);
1317       if (!scm_is_null (SCM_CDR (elements)))
1318         scm_i_input_error (NULL, port,
1319                            "too many elements in array literal, want 1",
1320                            SCM_EOL);
1321       elements = SCM_CAR (elements);
1322     }
1323
1324   /* Construct array, annotate with source location, and return. */
1325   array = scm_list_to_typed_array (tag, shape, elements);
1326   return maybe_annotate_source (array, port, opts, line, column);
1327 }
1328
1329 static SCM
1330 scm_read_srfi4_vector (int chr, SCM port, scm_t_read_opts *opts,
1331                        long line, int column)
1332 {
1333   return scm_read_array (chr, port, opts, line, column);
1334 }
1335
1336 static SCM
1337 scm_read_bytevector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1338                      long line, int column)
1339 {
1340   chr = scm_getc_unlocked (port);
1341   if (chr != 'u')
1342     goto syntax;
1343
1344   chr = scm_getc_unlocked (port);
1345   if (chr != '8')
1346     goto syntax;
1347
1348   chr = scm_getc_unlocked (port);
1349   if (chr != '(')
1350     goto syntax;
1351
1352   return maybe_annotate_source
1353     (scm_u8_list_to_bytevector (scm_read_sexp (chr, port, opts)),
1354      port, opts, line, column);
1355
1356  syntax:
1357   scm_i_input_error ("read_bytevector", port,
1358                      "invalid bytevector prefix",
1359                      SCM_MAKE_CHAR (chr));
1360   return SCM_UNSPECIFIED;
1361 }
1362
1363 static SCM
1364 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1365                            long line, int column)
1366 {
1367   /* Read the `#*10101'-style read syntax for bit vectors in Guile.  This is
1368      terribly inefficient but who cares?  */
1369   SCM s_bits = SCM_EOL;
1370
1371   for (chr = scm_getc_unlocked (port);
1372        (chr != EOF) && ((chr == '0') || (chr == '1'));
1373        chr = scm_getc_unlocked (port))
1374     {
1375       s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1376     }
1377
1378   if (chr != EOF)
1379     scm_ungetc_unlocked (chr, port);
1380
1381   return maybe_annotate_source
1382     (scm_bitvector (scm_reverse_x (s_bits, SCM_EOL)),
1383      port, opts, line, column);
1384 }
1385
1386 static SCM
1387 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1388 {
1389   int bang_seen = 0;
1390
1391   for (;;)
1392     {
1393       int c = scm_getc_unlocked (port);
1394
1395       if (c == EOF)
1396         scm_i_input_error ("skip_block_comment", port,
1397                            "unterminated `#! ... !#' comment", SCM_EOL);
1398
1399       if (c == '!')
1400         bang_seen = 1;
1401       else if (c == '#' && bang_seen)
1402         break;
1403       else
1404         bang_seen = 0;
1405     }
1406
1407   return SCM_UNSPECIFIED;
1408 }
1409
1410 static void set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts,
1411                                          int value);
1412 static void set_port_square_brackets_p (SCM port, scm_t_read_opts *opts,
1413                                         int value);
1414 static void set_port_curly_infix_p (SCM port, scm_t_read_opts *opts,
1415                                     int value);
1416
1417 static SCM
1418 scm_read_shebang (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
1419 {
1420   char name[READER_DIRECTIVE_NAME_MAX_SIZE + 1];
1421   int c;
1422   int i = 0;
1423
1424   while (i <= READER_DIRECTIVE_NAME_MAX_SIZE)
1425     {
1426       c = scm_getc_unlocked (port);
1427       if (c == EOF)
1428         scm_i_input_error ("skip_block_comment", port,
1429                            "unterminated `#! ... !#' comment", SCM_EOL);
1430       else if (('a' <= c && c <= 'z') || ('0' <= c && c <= '9') || c == '-')
1431         name[i++] = c;
1432       else if (CHAR_IS_DELIMITER (c))
1433         {
1434           scm_ungetc_unlocked (c, port);
1435           name[i] = '\0';
1436           if (0 == strcmp ("r6rs", name))
1437             ;  /* Silently ignore */
1438           else if (0 == strcmp ("fold-case", name))
1439             set_port_case_insensitive_p (port, opts, 1);
1440           else if (0 == strcmp ("no-fold-case", name))
1441             set_port_case_insensitive_p (port, opts, 0);
1442           else if (0 == strcmp ("curly-infix", name))
1443             set_port_curly_infix_p (port, opts, 1);
1444           else if (0 == strcmp ("curly-infix-and-bracket-lists", name))
1445             {
1446               set_port_curly_infix_p (port, opts, 1);
1447               set_port_square_brackets_p (port, opts, 0);
1448             }
1449           else
1450             break;
1451
1452           return SCM_UNSPECIFIED;
1453         }
1454       else
1455         {
1456           scm_ungetc_unlocked (c, port);
1457           break;
1458         }
1459     }
1460   while (i > 0)
1461     scm_ungetc_unlocked (name[--i], port);
1462   return scm_read_scsh_block_comment (chr, port);
1463 }
1464
1465 static SCM
1466 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1467 {
1468   /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1469      nested.  So care must be taken.  */
1470   int nesting_level = 1;
1471
1472   int a = scm_getc_unlocked (port);
1473
1474   if (a == EOF)
1475     scm_i_input_error ("scm_read_r6rs_block_comment", port,
1476                        "unterminated `#| ... |#' comment", SCM_EOL);
1477
1478   while (nesting_level > 0)
1479     {
1480       int b = scm_getc_unlocked (port);
1481
1482       if (b == EOF)
1483         scm_i_input_error ("scm_read_r6rs_block_comment", port,
1484                            "unterminated `#| ... |#' comment", SCM_EOL);
1485
1486       if (a == '|' && b == '#')
1487         {
1488           nesting_level--;
1489           b = EOF;
1490         }
1491       else if (a == '#' && b == '|')
1492         {
1493           nesting_level++;
1494           b = EOF;
1495         }
1496
1497       a = b;
1498     }
1499
1500   return SCM_UNSPECIFIED;
1501 }
1502
1503 static SCM
1504 scm_read_commented_expression (scm_t_wchar chr, SCM port,
1505                                scm_t_read_opts *opts)
1506 {
1507   scm_t_wchar c;
1508
1509   c = flush_ws (port, opts, (char *) NULL);
1510   if (EOF == c)
1511     scm_i_input_error ("read_commented_expression", port,
1512                        "no expression after #; comment", SCM_EOL);
1513   scm_ungetc_unlocked (c, port);
1514   scm_read_expression (port, opts);
1515   return SCM_UNSPECIFIED;
1516 }
1517
1518 static SCM
1519 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1520 {
1521   /* Guile's extended symbol read syntax looks like this:
1522
1523        #{This is all a symbol name}#
1524
1525      So here, CHR is expected to be `{'.  */
1526   int saw_brace = 0;
1527   size_t len = 0;
1528   SCM buf = scm_i_make_string (1024, NULL, 0);
1529
1530   buf = scm_i_string_start_writing (buf);
1531
1532   while ((chr = scm_getc_unlocked (port)) != EOF)
1533     {
1534       if (saw_brace)
1535         {
1536           if (chr == '#')
1537             {
1538               break;
1539             }
1540           else
1541             {
1542               saw_brace = 0;
1543               scm_i_string_set_x (buf, len++, '}');
1544             }
1545         }
1546
1547       if (chr == '}')
1548         saw_brace = 1;
1549       else if (chr == '\\')
1550         {
1551           /* It used to be that print.c would print extended-read-syntax
1552              symbols with backslashes before "non-standard" chars, but
1553              this routine wouldn't do anything with those escapes.
1554              Bummer.  What we've done is to change print.c to output
1555              R6RS hex escapes for those characters, relying on the fact
1556              that the extended read syntax would never put a `\' before
1557              an `x'.  For now, we just ignore other instances of
1558              backslash in the string.  */
1559           switch ((chr = scm_getc_unlocked (port)))
1560             {
1561             case EOF:
1562               goto done;
1563             case 'x':
1564               {
1565                 scm_t_wchar c;
1566
1567                 SCM_READ_HEX_ESCAPE (10, ';');
1568                 scm_i_string_set_x (buf, len++, c);
1569                 break;
1570
1571               str_eof:
1572                 chr = EOF;
1573                 goto done;
1574
1575               bad_escaped:
1576                 scm_i_string_stop_writing ();
1577                 scm_i_input_error ("scm_read_extended_symbol", port,
1578                                    "illegal character in escape sequence: ~S",
1579                                    scm_list_1 (SCM_MAKE_CHAR (c)));
1580                 break;
1581               }
1582             default:
1583               scm_i_string_set_x (buf, len++, chr);
1584               break;
1585             }
1586         }
1587       else
1588         scm_i_string_set_x (buf, len++, chr);
1589
1590       if (len >= scm_i_string_length (buf) - 2)
1591         {
1592           SCM addy;
1593
1594           scm_i_string_stop_writing ();
1595           addy = scm_i_make_string (1024, NULL, 0);
1596           buf = scm_string_append (scm_list_2 (buf, addy));
1597           len = 0;
1598           buf = scm_i_string_start_writing (buf);
1599         }
1600     }
1601
1602  done:
1603   scm_i_string_stop_writing ();
1604   if (chr == EOF)
1605     scm_i_input_error ("scm_read_extended_symbol", port,
1606                        "end of file while reading symbol", SCM_EOL);
1607
1608   return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1609 }
1610
1611
1612 \f
1613 /* Top-level token readers, i.e., dispatchers.  */
1614
1615 static SCM
1616 scm_read_sharp_extension (int chr, SCM port, scm_t_read_opts *opts)
1617 {
1618   SCM proc;
1619
1620   proc = scm_get_hash_procedure (chr);
1621   if (scm_is_true (scm_procedure_p (proc)))
1622     {
1623       long line = SCM_LINUM (port);
1624       int column = SCM_COL (port) - 2;
1625       SCM got;
1626
1627       got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1628
1629       if (opts->record_positions_p && SCM_NIMP (got)
1630           && !scm_i_has_source_properties (got))
1631         scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1632
1633       return got;
1634     }
1635
1636   return SCM_UNSPECIFIED;
1637 }
1638
1639 /* The reader for the sharp `#' character.  It basically dispatches reads
1640    among the above token readers.   */
1641 static SCM
1642 scm_read_sharp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1643                 long line, int column)
1644 #define FUNC_NAME "scm_lreadr"
1645 {
1646   SCM result;
1647
1648   chr = scm_getc_unlocked (port);
1649
1650   result = scm_read_sharp_extension (chr, port, opts);
1651   if (!scm_is_eq (result, SCM_UNSPECIFIED))
1652     return result;
1653
1654   switch (chr)
1655     {
1656     case '\\':
1657       return (scm_read_character (chr, port, opts));
1658     case '(':
1659       return (scm_read_vector (chr, port, opts, line, column));
1660     case 's':
1661     case 'u':
1662     case 'f':
1663     case 'c':
1664       /* This one may return either a boolean or an SRFI-4 vector.  */
1665       return (scm_read_srfi4_vector (chr, port, opts, line, column));
1666     case 'v':
1667       return (scm_read_bytevector (chr, port, opts, line, column));
1668     case '*':
1669       return (scm_read_guile_bit_vector (chr, port, opts, line, column));
1670     case 't':
1671     case 'T':
1672     case 'F':
1673       return (scm_read_boolean (chr, port));
1674     case ':':
1675       return (scm_read_keyword (chr, port, opts));
1676     case '0': case '1': case '2': case '3': case '4':
1677     case '5': case '6': case '7': case '8': case '9':
1678     case '@':
1679       return (scm_read_array (chr, port, opts, line, column));
1680
1681     case 'i':
1682     case 'e':
1683     case 'b':
1684     case 'B':
1685     case 'o':
1686     case 'O':
1687     case 'd':
1688     case 'D':
1689     case 'x':
1690     case 'X':
1691     case 'I':
1692     case 'E':
1693       return (scm_read_number_and_radix (chr, port, opts));
1694     case '{':
1695       return (scm_read_extended_symbol (chr, port));
1696     case '!':
1697       return (scm_read_shebang (chr, port, opts));
1698     case ';':
1699       return (scm_read_commented_expression (chr, port, opts));
1700     case '`':
1701     case '\'':
1702     case ',':
1703       return (scm_read_syntax (chr, port, opts));
1704     case 'n':
1705       return (scm_read_nil (chr, port, opts));
1706     default:
1707       result = scm_read_sharp_extension (chr, port, opts);
1708       if (scm_is_eq (result, SCM_UNSPECIFIED))
1709         {
1710           /* To remain compatible with 1.8 and earlier, the following
1711              characters have lower precedence than `read-hash-extend'
1712              characters.  */
1713           switch (chr)
1714             {
1715             case '|':
1716               return scm_read_r6rs_block_comment (chr, port);
1717             default:
1718               scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1719                                  scm_list_1 (SCM_MAKE_CHAR (chr)));
1720             }
1721         }
1722       else
1723         return result;
1724     }
1725
1726   return SCM_UNSPECIFIED;
1727 }
1728 #undef FUNC_NAME
1729
1730 static SCM
1731 read_inner_expression (SCM port, scm_t_read_opts *opts)
1732 #define FUNC_NAME "read_inner_expression"
1733 {
1734   while (1)
1735     {
1736       scm_t_wchar chr;
1737
1738       chr = scm_getc_unlocked (port);
1739
1740       switch (chr)
1741         {
1742         case SCM_WHITE_SPACES:
1743         case SCM_LINE_INCREMENTORS:
1744           break;
1745         case ';':
1746           (void) scm_read_semicolon_comment (chr, port);
1747           break;
1748         case '{':
1749           if (opts->curly_infix_p)
1750             {
1751               if (opts->neoteric_p)
1752                 return scm_read_sexp (chr, port, opts);
1753               else
1754                 {
1755                   SCM expr;
1756
1757                   /* Enable neoteric expressions within curly braces */
1758                   opts->neoteric_p = 1;
1759                   expr = scm_read_sexp (chr, port, opts);
1760                   opts->neoteric_p = 0;
1761                   return expr;
1762                 }
1763             }
1764           else
1765             return scm_read_mixed_case_symbol (chr, port, opts);
1766         case '[':
1767           if (opts->square_brackets_p)
1768             return scm_read_sexp (chr, port, opts);
1769           else if (opts->curly_infix_p)
1770             {
1771               /* The syntax of neoteric expressions requires that '[' be
1772                  a delimiter when curly-infix is enabled, so it cannot
1773                  be part of an unescaped symbol.  We might as well do
1774                  something useful with it, so we adopt Kawa's convention:
1775                  [...] => ($bracket-list$ ...) */
1776               long line = SCM_LINUM (port);
1777               int column = SCM_COL (port) - 1;
1778               return maybe_annotate_source
1779                 (scm_cons (sym_bracket_list, scm_read_sexp (chr, port, opts)),
1780                  port, opts, line, column);
1781             }
1782           else
1783             return scm_read_mixed_case_symbol (chr, port, opts);
1784         case '(':
1785           return (scm_read_sexp (chr, port, opts));
1786         case '"':
1787           return (scm_read_string (chr, port, opts));
1788         case '|':
1789           if (opts->r7rs_symbols_p)
1790             return scm_read_r7rs_symbol (chr, port, opts);
1791           else
1792             return scm_read_mixed_case_symbol (chr, port, opts);
1793         case '\'':
1794         case '`':
1795         case ',':
1796           return (scm_read_quote (chr, port, opts));
1797         case '#':
1798           {
1799             long line  = SCM_LINUM (port);
1800             int column = SCM_COL (port) - 1;
1801             SCM result = scm_read_sharp (chr, port, opts, line, column);
1802             if (scm_is_eq (result, SCM_UNSPECIFIED))
1803               /* We read a comment or some such.  */
1804               break;
1805             else
1806               return result;
1807           }
1808         case ')':
1809           scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1810           break;
1811         case '}':
1812           if (opts->curly_infix_p)
1813             scm_i_input_error (FUNC_NAME, port, "unexpected \"}\"", SCM_EOL);
1814           else
1815             return scm_read_mixed_case_symbol (chr, port, opts);
1816         case ']':
1817           if (opts->square_brackets_p)
1818             scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1819           /* otherwise fall through */
1820         case EOF:
1821           return SCM_EOF_VAL;
1822         case ':':
1823           if (opts->keyword_style == KEYWORD_STYLE_PREFIX)
1824             return scm_symbol_to_keyword (scm_read_expression (port, opts));
1825           /* Fall through.  */
1826
1827         default:
1828           {
1829             if (((chr >= '0') && (chr <= '9'))
1830                 || (strchr ("+-.", chr)))
1831               return (scm_read_number (chr, port, opts));
1832             else
1833               return (scm_read_mixed_case_symbol (chr, port, opts));
1834           }
1835         }
1836     }
1837 }
1838 #undef FUNC_NAME
1839
1840 static SCM
1841 scm_read_expression (SCM port, scm_t_read_opts *opts)
1842 #define FUNC_NAME "scm_read_expression"
1843 {
1844   if (!opts->neoteric_p)
1845     return read_inner_expression (port, opts);
1846   else
1847     {
1848       long line = 0;
1849       int column = 0;
1850       SCM expr;
1851
1852       if (opts->record_positions_p)
1853         {
1854           /* We need to get the position of the first non-whitespace
1855              character in order to correctly annotate neoteric
1856              expressions.  For example, for the expression 'f(x)', the
1857              first call to 'read_inner_expression' reads the 'f' (which
1858              cannot be annotated), and then we later read the '(x)' and
1859              use it to construct the new list (f x). */
1860           int c = flush_ws (port, opts, (char *) NULL);
1861           if (c == EOF)
1862             return SCM_EOF_VAL;
1863           scm_ungetc_unlocked (c, port);
1864           line = SCM_LINUM (port);
1865           column = SCM_COL (port);
1866         }
1867
1868       expr = read_inner_expression (port, opts);
1869
1870       /* 'expr' is the first component of the neoteric expression.  Now
1871          we loop, and as long as the next character is '(', '[', or '{',
1872          (without any intervening whitespace), we use it to construct a
1873          new expression.  For example, f{n - 1}(x) => ((f (- n 1)) x). */
1874       for (;;)
1875         {
1876           int chr = scm_getc_unlocked (port);
1877
1878           if (chr == '(')
1879             /* e(...) => (e ...) */
1880             expr = scm_cons (expr, scm_read_sexp (chr, port, opts));
1881           else if (chr == '[')
1882             /* e[...] => ($bracket-apply$ e ...) */
1883             expr = scm_cons (sym_bracket_apply,
1884                              scm_cons (expr,
1885                                        scm_read_sexp (chr, port, opts)));
1886           else if (chr == '{')
1887             {
1888               SCM arg = scm_read_sexp (chr, port, opts);
1889
1890               if (scm_is_null (arg))
1891                 expr = scm_list_1 (expr);       /* e{} => (e) */
1892               else
1893                 expr = scm_list_2 (expr, arg);  /* e{...} => (e {...}) */
1894             }
1895           else
1896             {
1897               if (chr != EOF)
1898                 scm_ungetc_unlocked (chr, port);
1899               break;
1900             }
1901           maybe_annotate_source (expr, port, opts, line, column);
1902         }
1903       return expr;
1904     }
1905 }
1906 #undef FUNC_NAME
1907
1908 \f
1909 /* Actual reader.  */
1910
1911 static void init_read_options (SCM port, scm_t_read_opts *opts);
1912
1913 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1914             (SCM port),
1915             "Read an s-expression from the input port @var{port}, or from\n"
1916             "the current input port if @var{port} is not specified.\n"
1917             "Any whitespace before the next token is discarded.")
1918 #define FUNC_NAME s_scm_read
1919 {
1920   scm_t_read_opts opts;
1921   int c;
1922
1923   if (SCM_UNBNDP (port))
1924     port = scm_current_input_port ();
1925   SCM_VALIDATE_OPINPORT (1, port);
1926
1927   init_read_options (port, &opts);
1928
1929   c = flush_ws (port, &opts, (char *) NULL);
1930   if (EOF == c)
1931     return SCM_EOF_VAL;
1932   scm_ungetc_unlocked (c, port);
1933
1934   return (scm_read_expression (port, &opts));
1935 }
1936 #undef FUNC_NAME
1937
1938
1939 \f
1940
1941 /* Manipulate the read-hash-procedures alist.  This could be written in
1942    Scheme, but maybe it will also be used by C code during initialisation.  */
1943 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1944             (SCM chr, SCM proc),
1945             "Install the procedure @var{proc} for reading expressions\n"
1946             "starting with the character sequence @code{#} and @var{chr}.\n"
1947             "@var{proc} will be called with two arguments:  the character\n"
1948             "@var{chr} and the port to read further data from. The object\n"
1949             "returned will be the return value of @code{read}. \n"
1950             "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1951             )
1952 #define FUNC_NAME s_scm_read_hash_extend
1953 {
1954   SCM this;
1955   SCM prev;
1956
1957   SCM_VALIDATE_CHAR (1, chr);
1958   SCM_ASSERT (scm_is_false (proc)
1959               || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1960               proc, SCM_ARG2, FUNC_NAME);
1961
1962   /* Check if chr is already in the alist.  */
1963   this = scm_i_read_hash_procedures_ref ();
1964   prev = SCM_BOOL_F;
1965   while (1)
1966     {
1967       if (scm_is_null (this))
1968         {
1969           /* not found, so add it to the beginning.  */
1970           if (scm_is_true (proc))
1971             {
1972               SCM new = scm_cons (scm_cons (chr, proc),
1973                                   scm_i_read_hash_procedures_ref ());
1974               scm_i_read_hash_procedures_set_x (new);
1975             }
1976           break;
1977         }
1978       if (scm_is_eq (chr, SCM_CAAR (this)))
1979         {
1980           /* already in the alist.  */
1981           if (scm_is_false (proc))
1982             {
1983               /* remove it.  */
1984               if (scm_is_false (prev))
1985                 {
1986                   SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1987                   scm_i_read_hash_procedures_set_x (rest);
1988                 }
1989               else
1990                 scm_set_cdr_x (prev, SCM_CDR (this));
1991             }
1992           else
1993             {
1994               /* replace it.  */
1995               scm_set_cdr_x (SCM_CAR (this), proc);
1996             }
1997           break;
1998         }
1999       prev = this;
2000       this = SCM_CDR (this);
2001     }
2002
2003   return SCM_UNSPECIFIED;
2004 }
2005 #undef FUNC_NAME
2006
2007 /* Recover the read-hash procedure corresponding to char c.  */
2008 static SCM
2009 scm_get_hash_procedure (int c)
2010 {
2011   SCM rest = scm_i_read_hash_procedures_ref ();
2012
2013   while (1)
2014     {
2015       if (scm_is_null (rest))
2016         return SCM_BOOL_F;
2017
2018       if (SCM_CHAR (SCM_CAAR (rest)) == c)
2019         return SCM_CDAR (rest);
2020
2021       rest = SCM_CDR (rest);
2022     }
2023 }
2024
2025 static int
2026 is_encoding_char (char c)
2027 {
2028   if (c >= 'a' && c <= 'z') return 1;
2029   if (c >= 'A' && c <= 'Z') return 1;
2030   if (c >= '0' && c <= '9') return 1;
2031   return strchr ("_-.:/,+=()", c) != NULL;
2032 }
2033
2034 /* Maximum size of an encoding name.  This is a bit more than the
2035    longest name listed at
2036    <http://www.iana.org/assignments/character-sets> ("ISO-2022-JP-2", 13
2037    characters.)  */
2038 #define ENCODING_NAME_MAX_SIZE 20
2039
2040 /* Number of bytes at the beginning or end of a file that are scanned
2041    for a "coding:" declaration.  */
2042 #define SCM_ENCODING_SEARCH_SIZE (500 + ENCODING_NAME_MAX_SIZE)
2043
2044
2045 /* Search the SCM_ENCODING_SEARCH_SIZE bytes of a file for an Emacs-like
2046    coding declaration.  Returns either NULL or a string whose storage
2047    has been allocated with `scm_gc_malloc'.  */
2048 char *
2049 scm_i_scan_for_encoding (SCM port)
2050 {
2051   scm_t_port *pt;
2052   char header[SCM_ENCODING_SEARCH_SIZE+1];
2053   size_t bytes_read, encoding_length, i;
2054   char *encoding = NULL;
2055   char *pos, *encoding_start;
2056   int in_comment;
2057
2058   pt = SCM_PTAB_ENTRY (port);
2059
2060   if (pt->rw_active == SCM_PORT_WRITE)
2061     scm_flush_unlocked (port);
2062
2063   if (pt->rw_random)
2064     pt->rw_active = SCM_PORT_READ;
2065
2066   if (pt->read_pos == pt->read_end)
2067     {
2068       /* We can use the read buffer, and thus avoid a seek. */
2069       if (scm_fill_input_unlocked (port) == EOF)
2070         return NULL;
2071
2072       bytes_read = pt->read_end - pt->read_pos;
2073       if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
2074         bytes_read = SCM_ENCODING_SEARCH_SIZE;
2075
2076       if (bytes_read <= 1)
2077         /* An unbuffered port -- don't scan.  */
2078         return NULL;
2079
2080       memcpy (header, pt->read_pos, bytes_read);
2081       header[bytes_read] = '\0';
2082     }
2083   else
2084     {
2085       /* Try to read some bytes and then seek back.  Not all ports
2086          support seeking back; and indeed some file ports (like
2087          /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
2088          check performed by SCM_FPORT_FDES---but fail to seek
2089          backwards.  Hence this block comes second.  We prefer to use
2090          the read buffer in-place.  */
2091       if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
2092         return NULL;
2093
2094       bytes_read = scm_c_read_unlocked (port, header, SCM_ENCODING_SEARCH_SIZE);
2095       header[bytes_read] = '\0';
2096       scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
2097     }
2098
2099   /* search past "coding[:=]" */
2100   pos = header;
2101   while (1)
2102     {
2103       if ((pos = strstr(pos, "coding")) == NULL)
2104         return NULL;
2105
2106       pos += strlen ("coding");
2107       if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
2108           (*pos == ':' || *pos == '='))
2109         {
2110           pos ++;
2111           break;
2112         }
2113     }
2114
2115   /* skip spaces */
2116   while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
2117          (*pos == ' ' || *pos == '\t'))
2118     pos ++;
2119
2120   if (pos - header >= SCM_ENCODING_SEARCH_SIZE - ENCODING_NAME_MAX_SIZE)
2121     /* We found the "coding:" string, but there is probably not enough
2122        room to store an encoding name in its entirety, so ignore it.
2123        This makes sure we do not end up returning a truncated encoding
2124        name.  */
2125     return NULL;
2126
2127   /* grab the next token */
2128   encoding_start = pos;
2129   i = 0;
2130   while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
2131          && encoding_start + i - header < bytes_read
2132          && is_encoding_char (encoding_start[i]))
2133     i++;
2134
2135   encoding_length = i;
2136   if (encoding_length == 0)
2137     return NULL;
2138
2139   encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
2140
2141   /* push backwards to make sure we were in a comment */
2142   in_comment = 0;
2143   pos = encoding_start;
2144   while (pos >= header)
2145     {
2146       if (*pos == ';')
2147         {
2148           in_comment = 1;
2149           break;
2150         }
2151       else if (*pos == '\n' || pos == header)
2152         {
2153           /* This wasn't in a semicolon comment. Check for a
2154            hash-bang comment. */
2155           char *beg = strstr (header, "#!");
2156           char *end = strstr (header, "!#");
2157           if (beg < encoding_start && encoding_start + encoding_length <= end)
2158             in_comment = 1;
2159           break;
2160         }
2161       else
2162         {
2163           pos --;
2164           continue;
2165         }
2166     }
2167   if (!in_comment)
2168     /* This wasn't in a comment */
2169     return NULL;
2170
2171   return encoding;
2172 }
2173
2174 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
2175             (SCM port),
2176             "Scans the port for an Emacs-like character coding declaration\n"
2177             "near the top of the contents of a port with random-accessible contents.\n"
2178             "The coding declaration is of the form\n"
2179             "@code{coding: XXXXX} and must appear in a scheme comment.\n"
2180             "\n"
2181             "Returns a string containing the character encoding of the file\n"
2182             "if a declaration was found, or @code{#f} otherwise.\n")
2183 #define FUNC_NAME s_scm_file_encoding
2184 {
2185   char *enc;
2186   SCM s_enc;
2187
2188   SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
2189
2190   enc = scm_i_scan_for_encoding (port);
2191   if (enc == NULL)
2192     return SCM_BOOL_F;
2193   else
2194     {
2195       s_enc = scm_string_upcase (scm_from_locale_string (enc));
2196       return s_enc;
2197     }
2198
2199   return SCM_BOOL_F;
2200 }
2201 #undef FUNC_NAME
2202
2203 \f
2204 /* Per-port read options.
2205
2206    We store per-port read options in the 'port-read-options' port
2207    property, which is stored in the internal port structure.  The value
2208    stored is a single integer that contains a two-bit field for each
2209    read option.
2210
2211    If a bit field contains READ_OPTION_INHERIT (3), that indicates that
2212    the applicable value should be inherited from the corresponding
2213    global read option.  Otherwise, the bit field contains the value of
2214    the read option.  For boolean read options that have been set
2215    per-port, the possible values are 0 or 1.  If the 'keyword_style'
2216    read option has been set per-port, its possible values are those in
2217    'enum t_keyword_style'. */
2218
2219 /* Key to read options in port properties. */
2220 SCM_SYMBOL (sym_port_read_options, "port-read-options");
2221
2222 /* Offsets of bit fields for each per-port override */
2223 #define READ_OPTION_COPY_SOURCE_P          0
2224 #define READ_OPTION_RECORD_POSITIONS_P     2
2225 #define READ_OPTION_CASE_INSENSITIVE_P     4
2226 #define READ_OPTION_KEYWORD_STYLE          6
2227 #define READ_OPTION_R6RS_ESCAPES_P         8
2228 #define READ_OPTION_SQUARE_BRACKETS_P     10
2229 #define READ_OPTION_HUNGRY_EOL_ESCAPES_P  12
2230 #define READ_OPTION_CURLY_INFIX_P         14
2231 #define READ_OPTION_R7RS_SYMBOLS_P        16
2232
2233 /* The total width in bits of the per-port overrides */
2234 #define READ_OPTIONS_NUM_BITS             18
2235
2236 #define READ_OPTIONS_INHERIT_ALL  ((1UL << READ_OPTIONS_NUM_BITS) - 1)
2237 #define READ_OPTIONS_MAX_VALUE    READ_OPTIONS_INHERIT_ALL
2238
2239 #define READ_OPTION_MASK     3
2240 #define READ_OPTION_INHERIT  3
2241
2242 static void
2243 set_port_read_option (SCM port, int option, int new_value)
2244 {
2245   SCM scm_read_options;
2246   unsigned int read_options;
2247
2248   new_value &= READ_OPTION_MASK;
2249
2250   scm_dynwind_begin (0);
2251   scm_dynwind_lock_port (port);
2252
2253   scm_read_options = scm_i_port_property (port, sym_port_read_options);
2254   if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2255     read_options = scm_to_uint (scm_read_options);
2256   else
2257     read_options = READ_OPTIONS_INHERIT_ALL;
2258   read_options &= ~(READ_OPTION_MASK << option);
2259   read_options |= new_value << option;
2260   scm_read_options = scm_from_uint (read_options);
2261   scm_i_set_port_property_x (port, sym_port_read_options, scm_read_options);
2262
2263   scm_dynwind_end ();
2264 }
2265
2266 /* Set OPTS and PORT's case-insensitivity according to VALUE. */
2267 static void
2268 set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts, int value)
2269 {
2270   value = !!value;
2271   opts->case_insensitive_p = value;
2272   set_port_read_option (port, READ_OPTION_CASE_INSENSITIVE_P, value);
2273 }
2274
2275 /* Set OPTS and PORT's square_brackets_p option according to VALUE. */
2276 static void
2277 set_port_square_brackets_p (SCM port, scm_t_read_opts *opts, int value)
2278 {
2279   value = !!value;
2280   opts->square_brackets_p = value;
2281   set_port_read_option (port, READ_OPTION_SQUARE_BRACKETS_P, value);
2282 }
2283
2284 /* Set OPTS and PORT's curly_infix_p option according to VALUE. */
2285 static void
2286 set_port_curly_infix_p (SCM port, scm_t_read_opts *opts, int value)
2287 {
2288   value = !!value;
2289   opts->curly_infix_p = value;
2290   set_port_read_option (port, READ_OPTION_CURLY_INFIX_P, value);
2291 }
2292
2293 /* Initialize OPTS based on PORT's read options and the global read
2294    options. */
2295 static void
2296 init_read_options (SCM port, scm_t_read_opts *opts)
2297 {
2298   SCM val, scm_read_options;
2299   unsigned int read_options, x;
2300
2301   scm_read_options = scm_i_port_property (port, sym_port_read_options);
2302
2303   if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2304     read_options = scm_to_uint (scm_read_options);
2305   else
2306     read_options = READ_OPTIONS_INHERIT_ALL;
2307
2308   x = READ_OPTION_MASK & (read_options >> READ_OPTION_KEYWORD_STYLE);
2309   if (x == READ_OPTION_INHERIT)
2310     {
2311       val = SCM_PACK (SCM_KEYWORD_STYLE);
2312       if (scm_is_eq (val, scm_keyword_prefix))
2313         x = KEYWORD_STYLE_PREFIX;
2314       else if (scm_is_eq (val, scm_keyword_postfix))
2315         x = KEYWORD_STYLE_POSTFIX;
2316       else
2317         x = KEYWORD_STYLE_HASH_PREFIX;
2318     }
2319   opts->keyword_style = x;
2320
2321 #define RESOLVE_BOOLEAN_OPTION(NAME, name)                              \
2322   do                                                                    \
2323     {                                                                   \
2324       x = READ_OPTION_MASK & (read_options >> READ_OPTION_ ## NAME);    \
2325       if (x == READ_OPTION_INHERIT)                                     \
2326         x = !!SCM_ ## NAME;                                             \
2327           opts->name = x;                                               \
2328     }                                                                   \
2329   while (0)
2330
2331   RESOLVE_BOOLEAN_OPTION (COPY_SOURCE_P,        copy_source_p);
2332   RESOLVE_BOOLEAN_OPTION (RECORD_POSITIONS_P,   record_positions_p);
2333   RESOLVE_BOOLEAN_OPTION (CASE_INSENSITIVE_P,   case_insensitive_p);
2334   RESOLVE_BOOLEAN_OPTION (R6RS_ESCAPES_P,       r6rs_escapes_p);
2335   RESOLVE_BOOLEAN_OPTION (SQUARE_BRACKETS_P,    square_brackets_p);
2336   RESOLVE_BOOLEAN_OPTION (HUNGRY_EOL_ESCAPES_P, hungry_eol_escapes_p);
2337   RESOLVE_BOOLEAN_OPTION (CURLY_INFIX_P,        curly_infix_p);
2338   RESOLVE_BOOLEAN_OPTION (R7RS_SYMBOLS_P,       r7rs_symbols_p);
2339
2340 #undef RESOLVE_BOOLEAN_OPTION
2341
2342   opts->neoteric_p = 0;
2343 }
2344
2345 void
2346 scm_init_read ()
2347 {
2348   SCM read_hash_procs;
2349
2350   read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
2351
2352   scm_i_read_hash_procedures =
2353     SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
2354
2355   scm_init_opts (scm_read_options, scm_read_opts);
2356 #include "libguile/read.x"
2357 }
2358
2359 /*
2360   Local Variables:
2361   c-file-style: "gnu"
2362   End:
2363 */