libguile/read.c

   1 /* Copyright (C) 1995-1997, 1999-2001, 2003, 2004, 2006-2012, 2014
   2  *   Free Software Foundation, Inc.
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public License
   6  * as published by the Free Software Foundation; either version 3 of
   7  * the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful, but
  10  * WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  17  * 02110-1301 USA
  18  */
  19
  20
  21 \f
  22
  23 #ifdef HAVE_CONFIG_H
  24 # include <config.h>
  25 #endif
  26
  27 #include <stdio.h>
  28 #include <string.h>
  29 #include <unistd.h>
  30 #include <unicase.h>
  31 #include <unictype.h>
  32 #include <c-strcase.h>
  33 #include <c-ctype.h>
  34
  35 #include "libguile/_scm.h"
  36 #include "libguile/bytevectors.h"
  37 #include "libguile/chars.h"
  38 #include "libguile/eval.h"
  39 #include "libguile/arrays.h"
  40 #include "libguile/bitvectors.h"
  41 #include "libguile/keywords.h"
  42 #include "libguile/alist.h"
  43 #include "libguile/srcprop.h"
  44 #include "libguile/hashtab.h"
  45 #include "libguile/hash.h"
  46 #include "libguile/ports.h"
  47 #include "libguile/ports-internal.h"
  48 #include "libguile/fports.h"
  49 #include "libguile/root.h"
  50 #include "libguile/strings.h"
  51 #include "libguile/strports.h"
  52 #include "libguile/vectors.h"
  53 #include "libguile/validate.h"
  54 #include "libguile/srfi-4.h"
  55 #include "libguile/srfi-13.h"
  56
  57 #include "libguile/read.h"
  58 #include "libguile/private-options.h"
  59
  60
  61 \f
  62
  63 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
  64 SCM_SYMBOL (scm_keyword_prefix, "prefix");
  65 SCM_SYMBOL (scm_keyword_postfix, "postfix");
  66 SCM_SYMBOL (sym_nil, "nil");
  67
  68 /* SRFI-105 curly infix expression support */
  69 SCM_SYMBOL (sym_nfx, "$nfx$");
  70 SCM_SYMBOL (sym_bracket_list, "$bracket-list$");
  71 SCM_SYMBOL (sym_bracket_apply, "$bracket-apply$");
  72
  73 scm_t_option scm_read_opts[] =
  74   {
  75     { SCM_OPTION_BOOLEAN, "copy", 0,
  76       "Copy source code expressions." },
  77     { SCM_OPTION_BOOLEAN, "positions", 1,
  78       "Record positions of source code expressions." },
  79     { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
  80       "Convert symbols to lower case."},
  81     { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
  82       "Style of keyword recognition: #f, 'prefix or 'postfix."},
  83     { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
  84       "Use R6RS variable-length character and string hex escapes."},
  85     { SCM_OPTION_BOOLEAN, "square-brackets", 1,
  86       "Treat `[' and `]' as parentheses, for R6RS compatibility."},
  87     { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
  88       "In strings, consume leading whitespace after an escaped end-of-line."},
  89     { SCM_OPTION_BOOLEAN, "curly-infix", 0,
  90       "Support SRFI-105 curly infix expressions."},
  91     { SCM_OPTION_BOOLEAN, "r7rs-symbols", 0,
  92       "Support R7RS |...| symbol notation."},
  93     { 0, },
  94   };
  95
  96 /* Internal read options structure.  This is initialized by 'scm_read'
  97    from the global and per-port read options, and a pointer is passed
  98    down to all helper functions. */
  99
 100 enum t_keyword_style
 101   {
 102     KEYWORD_STYLE_HASH_PREFIX,
 103     KEYWORD_STYLE_PREFIX,
 104     KEYWORD_STYLE_POSTFIX
 105   };
 106
 107 struct t_read_opts
 108 {
 109   enum t_keyword_style keyword_style;
 110   unsigned int copy_source_p        : 1;
 111   unsigned int record_positions_p   : 1;
 112   unsigned int case_insensitive_p   : 1;
 113   unsigned int r6rs_escapes_p       : 1;
 114   unsigned int square_brackets_p    : 1;
 115   unsigned int hungry_eol_escapes_p : 1;
 116   unsigned int curly_infix_p        : 1;
 117   unsigned int neoteric_p           : 1;
 118   unsigned int r7rs_symbols_p       : 1;
 119 };
 120
 121 typedef struct t_read_opts scm_t_read_opts;
 122
 123
 124 /*
 125   Give meaningful error messages for errors
 126
 127   We use the format
 128
 129   FILE:LINE:COL: MESSAGE
 130   This happened in ....
 131
 132   This is not standard GNU format, but the test-suite likes the real
 133   message to be in front.
 134
 135  */
 136
 137
 138 void
 139 scm_i_input_error (char const *function,
 140                    SCM port, const char *message, SCM arg)
 141 {
 142   SCM fn = (scm_is_string (SCM_FILENAME(port))
 143             ? SCM_FILENAME(port)
 144             : scm_from_locale_string ("#<unknown port>"));
 145
 146   SCM string_port = scm_open_output_string ();
 147   SCM string = SCM_EOL;
 148   scm_simple_format (string_port,
 149                      scm_from_locale_string ("~A:~S:~S: ~A"),
 150                      scm_list_4 (fn,
 151                                  scm_from_long (SCM_LINUM (port) + 1),
 152                                  scm_from_int (SCM_COL (port) + 1),
 153                                  scm_from_locale_string (message)));
 154
 155   string = scm_get_output_string (string_port);
 156   scm_close_output_port (string_port);
 157   scm_error_scm (scm_from_latin1_symbol ("read-error"),
 158                  function? scm_from_locale_string (function) : SCM_BOOL_F,
 159                  string,
 160                  arg,
 161                  SCM_BOOL_F);
 162 }
 163
 164
 165 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
 166             (SCM setting),
 167             "Option interface for the read options. Instead of using\n"
 168             "this procedure directly, use the procedures @code{read-enable},\n"
 169             "@code{read-disable}, @code{read-set!} and @code{read-options}.")
 170 #define FUNC_NAME s_scm_read_options
 171 {
 172   SCM ans = scm_options (setting,
 173                          scm_read_opts,
 174                          FUNC_NAME);
 175   if (SCM_COPY_SOURCE_P)
 176     SCM_RECORD_POSITIONS_P = 1;
 177   return ans;
 178 }
 179 #undef FUNC_NAME
 180
 181 /* A fluid referring to an association list mapping extra hash
 182    characters to procedures.  */
 183 static SCM *scm_i_read_hash_procedures;
 184
 185 static SCM
 186 scm_i_read_hash_procedures_ref (void)
 187 {
 188   return scm_fluid_ref (*scm_i_read_hash_procedures);
 189 }
 190
 191 static void
 192 scm_i_read_hash_procedures_set_x (SCM value)
 193 {
 194   scm_fluid_set_x (*scm_i_read_hash_procedures, value);
 195 }
 196
 197 \f
 198 /* Token readers.  */
 199
 200
 201 /* Size of the C buffer used to read symbols and numbers.  */
 202 #define READER_BUFFER_SIZE            128
 203
 204 /* Number of 32-bit codepoints in the buffer used to read strings.  */
 205 #define READER_STRING_BUFFER_SIZE     128
 206
 207 /* The maximum size of Scheme character names.  */
 208 #define READER_CHAR_NAME_MAX_SIZE      50
 209
 210 /* The maximum size of reader directive names.  */
 211 #define READER_DIRECTIVE_NAME_MAX_SIZE 50
 212
 213
 214 /* `isblank' is only in C99.  */
 215 #define CHAR_IS_BLANK_(_chr)                                    \
 216   (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n')      \
 217    || ((_chr) == '\f') || ((_chr) == '\r'))
 218
 219 #ifdef MSDOS
 220 # define CHAR_IS_BLANK(_chr)                    \
 221   ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
 222 #else
 223 # define CHAR_IS_BLANK CHAR_IS_BLANK_
 224 #endif
 225
 226
 227 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
 228    structure'').  */
 229 #define CHAR_IS_R5RS_DELIMITER(c)                               \
 230   (CHAR_IS_BLANK (c)                                            \
 231    || (c) == ')' || (c) == '(' || (c) == ';' || (c) == '"')
 232
 233 #define CHAR_IS_DELIMITER(c)                                    \
 234   (CHAR_IS_R5RS_DELIMITER (c)                                   \
 235    || (((c) == ']' || (c) == '[') && (opts->square_brackets_p   \
 236                                       || opts->curly_infix_p))  \
 237    || (((c) == '}' || (c) == '{') && opts->curly_infix_p))
 238
 239 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
 240    Structure''.  */
 241 #define CHAR_IS_EXPONENT_MARKER(_chr)                           \
 242   (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f')        \
 243    || ((_chr) == 'd') || ((_chr) == 'l'))
 244
 245 /* Read an SCSH block comment.  */
 246 static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
 247 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
 248 static SCM scm_read_commented_expression (scm_t_wchar, SCM, scm_t_read_opts *);
 249 static SCM scm_read_shebang (scm_t_wchar, SCM, scm_t_read_opts *);
 250 static SCM scm_get_hash_procedure (int);
 251
 252 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 253    result in the pre-allocated buffer BUF.  Return zero if the whole token has
 254    fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
 255    bytes actually read.  */
 256 static int
 257 read_token (SCM port, scm_t_read_opts *opts,
 258             char *buf, size_t buf_size, size_t *read)
 259 {
 260    *read = 0;
 261
 262    while (*read < buf_size)
 263      {
 264        int chr;
 265
 266        chr = scm_get_byte_or_eof_unlocked (port);
 267
 268        if (chr == EOF)
 269         return 0;
 270       else if (CHAR_IS_DELIMITER (chr))
 271         {
 272           scm_unget_byte_unlocked (chr, port);
 273           return 0;
 274         }
 275       else
 276         {
 277           *buf = (char) chr;
 278           buf++, (*read)++;
 279         }
 280      }
 281
 282    return 1;
 283  }
 284
 285 /* Like `read_token', but return either BUFFER, or a GC-allocated buffer
 286    if the token doesn't fit in BUFFER_SIZE bytes.  */
 287 static char *
 288 read_complete_token (SCM port, scm_t_read_opts *opts,
 289                      char *buffer, size_t buffer_size, size_t *read)
 290 {
 291   int overflow = 0;
 292   size_t bytes_read, overflow_size = 0;
 293   char *overflow_buffer = NULL;
 294
 295   do
 296     {
 297       overflow = read_token (port, opts, buffer, buffer_size, &bytes_read);
 298       if (bytes_read == 0)
 299         break;
 300       if (overflow || overflow_size != 0)
 301         {
 302           if (overflow_size == 0)
 303             {
 304               overflow_buffer = scm_gc_malloc_pointerless (bytes_read, "read");
 305               memcpy (overflow_buffer, buffer, bytes_read);
 306               overflow_size = bytes_read;
 307             }
 308           else
 309             {
 310               char *new_buf =
 311                 scm_gc_malloc_pointerless (overflow_size + bytes_read, "read");
 312
 313               memcpy (new_buf, overflow_buffer, overflow_size);
 314               memcpy (new_buf + overflow_size, buffer, bytes_read);
 315
 316               overflow_buffer = new_buf;
 317               overflow_size += bytes_read;
 318             }
 319         }
 320     }
 321   while (overflow);
 322
 323   if (overflow_size)
 324     *read = overflow_size;
 325   else
 326     *read = bytes_read;
 327
 328   return (overflow_size > 0 ? overflow_buffer : buffer);
 329 }
 330
 331 /* Skip whitespace from PORT and return the first non-whitespace character
 332    read.  Raise an error on end-of-file.  */
 333 static int
 334 flush_ws (SCM port, scm_t_read_opts *opts, const char *eoferr)
 335 {
 336   scm_t_wchar c;
 337   while (1)
 338     switch (c = scm_getc_unlocked (port))
 339       {
 340       case EOF:
 341       goteof:
 342         if (eoferr)
 343           {
 344             scm_i_input_error (eoferr,
 345                                port,
 346                                "end of file",
 347                                SCM_EOL);
 348           }
 349         return c;
 350
 351       case ';':
 352       lp:
 353         switch (c = scm_getc_unlocked (port))
 354           {
 355           case EOF:
 356             goto goteof;
 357           default:
 358             goto lp;
 359           case SCM_LINE_INCREMENTORS:
 360             break;
 361           }
 362         break;
 363
 364       case '#':
 365         switch (c = scm_getc_unlocked (port))
 366           {
 367           case EOF:
 368             eoferr = "read_sharp";
 369             goto goteof;
 370           case '!':
 371             scm_read_shebang (c, port, opts);
 372             break;
 373           case ';':
 374             scm_read_commented_expression (c, port, opts);
 375             break;
 376           case '|':
 377             if (scm_is_false (scm_get_hash_procedure (c)))
 378               {
 379                 scm_read_r6rs_block_comment (c, port);
 380                 break;
 381               }
 382             /* fall through */
 383           default:
 384             scm_ungetc_unlocked (c, port);
 385             return '#';
 386           }
 387         break;
 388
 389       case SCM_LINE_INCREMENTORS:
 390       case SCM_SINGLE_SPACES:
 391       case '\t':
 392         break;
 393
 394       default:
 395         return c;
 396       }
 397
 398   return 0;
 399 }
 400
 401
 402 \f
 403 /* Token readers.  */
 404
 405 static SCM scm_read_expression (SCM port, scm_t_read_opts *opts);
 406 static SCM scm_read_sharp (int chr, SCM port, scm_t_read_opts *opts,
 407                            long line, int column);
 408
 409
 410 static SCM
 411 maybe_annotate_source (SCM x, SCM port, scm_t_read_opts *opts,
 412                        long line, int column)
 413 {
 414   /* This condition can be caused by a user calling
 415      set-port-column!.  */
 416   if (line < 0 || column < 0)
 417     return x;
 418
 419   if (opts->record_positions_p)
 420     scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
 421   return x;
 422 }
 423
 424 static SCM
 425 scm_read_sexp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 426 #define FUNC_NAME "scm_i_lreadparen"
 427 {
 428   int c;
 429   SCM tmp, tl, ans = SCM_EOL;
 430   const int curly_list_p = (chr == '{') && opts->curly_infix_p;
 431   const int terminating_char = ((chr == '{') ? '}'
 432                                 : ((chr == '[') ? ']'
 433                                    : ')'));
 434
 435   /* Need to capture line and column numbers here. */
 436   long line = SCM_LINUM (port);
 437   int column = SCM_COL (port) - 1;
 438
 439   c = flush_ws (port, opts, FUNC_NAME);
 440   if (terminating_char == c)
 441     return SCM_EOL;
 442
 443   scm_ungetc_unlocked (c, port);
 444   tmp = scm_read_expression (port, opts);
 445
 446   /* Note that it is possible for scm_read_expression to return
 447      scm_sym_dot, but not as part of a dotted pair: as in #{.}#.  So
 448      check that it's a real dot by checking `c'.  */
 449   if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
 450     {
 451       ans = scm_read_expression (port, opts);
 452       if (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
 453         scm_i_input_error (FUNC_NAME, port, "missing close paren",
 454                            SCM_EOL);
 455       return ans;
 456     }
 457
 458   /* Build the head of the list structure. */
 459   ans = tl = scm_cons (tmp, SCM_EOL);
 460
 461   while (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
 462     {
 463       SCM new_tail;
 464
 465       if (c == ')' || (c == ']' && opts->square_brackets_p)
 466           || ((c == '}' || c == ']') && opts->curly_infix_p))
 467         scm_i_input_error (FUNC_NAME, port,
 468                            "in pair: mismatched close paren: ~A",
 469                            scm_list_1 (SCM_MAKE_CHAR (c)));
 470
 471       scm_ungetc_unlocked (c, port);
 472       tmp = scm_read_expression (port, opts);
 473
 474       /* See above note about scm_sym_dot.  */
 475       if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
 476         {
 477           SCM_SETCDR (tl, scm_read_expression (port, opts));
 478
 479           c = flush_ws (port, opts, FUNC_NAME);
 480           if (terminating_char != c)
 481             scm_i_input_error (FUNC_NAME, port,
 482                                "in pair: missing close paren", SCM_EOL);
 483           break;
 484         }
 485
 486       new_tail = scm_cons (tmp, SCM_EOL);
 487       SCM_SETCDR (tl, new_tail);
 488       tl = new_tail;
 489     }
 490
 491   if (curly_list_p)
 492     {
 493       /* In addition to finding the length, 'scm_ilength' checks for
 494          improper or circular lists, in which case it returns -1. */
 495       int len = scm_ilength (ans);
 496
 497       /* The (len == 0) case is handled above */
 498       if (len == 1)
 499         /* Return directly to avoid re-annotating the element's source
 500            location with the position of the outer brace.  Also, it
 501            might not be possible to annotate the element. */
 502         return scm_car (ans);  /* {e} => e */
 503       else if (len == 2)
 504         ;  /* Leave the list unchanged: {e1 e2} => (e1 e2) */
 505       else if (len >= 3 && (len & 1))
 506         {
 507           /* It's a proper list whose length is odd and at least 3.  If
 508              the elements at odd indices (the infix operator positions)
 509              are all 'equal?', then it's a simple curly-infix list.
 510              Otherwise it's a mixed curly-infix list. */
 511           SCM op = scm_cadr (ans);
 512
 513           /* Check to see if the elements at odd indices are 'equal?' */
 514           for (tl = scm_cdddr (ans); ; tl = scm_cddr (tl))
 515             {
 516               if (scm_is_null (tl))
 517                 {
 518                   /* Convert simple curly-infix list to prefix:
 519                      {a <op> b <op> ...} => (<op> a b ...) */
 520                   tl = ans;
 521                   while (scm_is_pair (scm_cdr (tl)))
 522                     {
 523                       tmp = scm_cddr (tl);
 524                       SCM_SETCDR (tl, tmp);
 525                       tl = tmp;
 526                     }
 527                   ans = scm_cons (op, ans);
 528                   break;
 529                 }
 530               else if (scm_is_false (scm_equal_p (op, scm_car (tl))))
 531                 {
 532                   /* Mixed curly-infix list: {e ...} => ($nfx$ e ...) */
 533                   ans = scm_cons (sym_nfx, ans);
 534                   break;
 535                 }
 536             }
 537         }
 538       else
 539         /* Mixed curly-infix (possibly improper) list:
 540            {e . tail} => ($nfx$ e . tail) */
 541         ans = scm_cons (sym_nfx, ans);
 542     }
 543
 544   return maybe_annotate_source (ans, port, opts, line, column);
 545 }
 546 #undef FUNC_NAME
 547
 548
 549 /* Read a hexadecimal number NDIGITS in length.  Put its value into the variable
 550    C.  If TERMINATOR is non-null, terminate early if the TERMINATOR character is
 551    found.  */
 552 #define SCM_READ_HEX_ESCAPE(ndigits, terminator)                   \
 553   do                                                               \
 554     {                                                              \
 555       scm_t_wchar a;                                               \
 556       size_t i = 0;                                                \
 557       c = 0;                                                       \
 558       while (i < ndigits)                                          \
 559         {                                                          \
 560           a = scm_getc_unlocked (port);                                     \
 561           if (a == EOF)                                            \
 562             goto str_eof;                                          \
 563           if (terminator                                           \
 564               && (a == (scm_t_wchar) terminator)                   \
 565               && (i > 0))                                          \
 566             break;                                                 \
 567           if ('0' <= a && a <= '9')                                \
 568             a -= '0';                                              \
 569           else if ('A' <= a && a <= 'F')                           \
 570             a = a - 'A' + 10;                                      \
 571           else if ('a' <= a && a <= 'f')                           \
 572             a = a - 'a' + 10;                                      \
 573           else                                                     \
 574             {                                                      \
 575               c = a;                                               \
 576               goto bad_escaped;                                    \
 577             }                                                      \
 578           c = c * 16 + a;                                          \
 579           i ++;                                                    \
 580         }                                                          \
 581     } while (0)
 582
 583 static void
 584 skip_intraline_whitespace (SCM port)
 585 {
 586   scm_t_wchar c;
 587
 588   do
 589     {
 590       c = scm_getc_unlocked (port);
 591       if (c == EOF)
 592         return;
 593     }
 594   while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
 595
 596   scm_ungetc_unlocked (c, port);
 597 }
 598
 599 /* Read either a double-quoted string or an R7RS-style symbol delimited
 600    by vertical lines, depending on the value of 'chr' ('"' or '|').
 601    Regardless, the result is always returned as a string.  */
 602 static SCM
 603 scm_read_string_like_syntax (int chr, SCM port, scm_t_read_opts *opts)
 604 #define FUNC_NAME "scm_lreadr"
 605 {
 606   /* For strings smaller than C_STR, this function creates only one Scheme
 607      object (the string returned).  */
 608
 609   SCM str = SCM_EOL;
 610   size_t c_str_len = 0;
 611   scm_t_wchar c, c_str[READER_STRING_BUFFER_SIZE];
 612
 613   /* Need to capture line and column numbers here. */
 614   long line = SCM_LINUM (port);
 615   int column = SCM_COL (port) - 1;
 616
 617   while (chr != (c = scm_getc_unlocked (port)))
 618     {
 619       if (c == EOF)
 620         {
 621         str_eof:
 622           scm_i_input_error (FUNC_NAME, port,
 623                              (chr == '|'
 624                               ? "end of file in symbol"
 625                               : "end of file in string constant"),
 626                              SCM_EOL);
 627         }
 628
 629       if (c_str_len + 1 >= READER_STRING_BUFFER_SIZE)
 630         {
 631           str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
 632           c_str_len = 0;
 633         }
 634
 635       if (c == '\\')
 636         {
 637           switch (c = scm_getc_unlocked (port))
 638             {
 639             case EOF:
 640               goto str_eof;
 641             case '|':
 642             case '\\':
 643             case '(':  /* Accept "\(" for use at the beginning of lines
 644                           in multiline strings to avoid confusing emacs
 645                           lisp modes.  */
 646               break;
 647             case '\n':
 648               if (opts->hungry_eol_escapes_p)
 649                 skip_intraline_whitespace (port);
 650               continue;
 651             case '0':
 652               c = '\0';
 653               break;
 654             case 'f':
 655               c = '\f';
 656               break;
 657             case 'n':
 658               c = '\n';
 659               break;
 660             case 'r':
 661               c = '\r';
 662               break;
 663             case 't':
 664               c = '\t';
 665               break;
 666             case 'a':
 667               c = '\007';
 668               break;
 669             case 'v':
 670               c = '\v';
 671               break;
 672             case 'b':
 673               c = '\010';
 674               break;
 675             case 'x':
 676               if (opts->r6rs_escapes_p || chr == '|')
 677                 SCM_READ_HEX_ESCAPE (10, ';');
 678               else
 679                 SCM_READ_HEX_ESCAPE (2, '\0');
 680               break;
 681             case 'u':
 682               if (!opts->r6rs_escapes_p)
 683                 {
 684                   SCM_READ_HEX_ESCAPE (4, '\0');
 685                   break;
 686                 }
 687             case 'U':
 688               if (!opts->r6rs_escapes_p)
 689                 {
 690                   SCM_READ_HEX_ESCAPE (6, '\0');
 691                   break;
 692                 }
 693             default:
 694               if (c == chr)
 695                 break;
 696             bad_escaped:
 697               scm_i_input_error (FUNC_NAME, port,
 698                                  "illegal character in escape sequence: ~S",
 699                                  scm_list_1 (SCM_MAKE_CHAR (c)));
 700             }
 701         }
 702
 703       c_str[c_str_len++] = c;
 704     }
 705
 706   if (scm_is_null (str))
 707     /* Fast path: we got a string that fits in C_STR.  */
 708     str = scm_from_utf32_stringn (c_str, c_str_len);
 709   else
 710     {
 711       if (c_str_len > 0)
 712         str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
 713
 714       str = scm_string_concatenate_reverse (str, SCM_UNDEFINED, SCM_UNDEFINED);
 715     }
 716
 717   return maybe_annotate_source (str, port, opts, line, column);
 718 }
 719 #undef FUNC_NAME
 720
 721 static SCM
 722 scm_read_string (int chr, SCM port, scm_t_read_opts *opts)
 723 {
 724   return scm_read_string_like_syntax (chr, port, opts);
 725 }
 726
 727 static SCM
 728 scm_read_r7rs_symbol (int chr, SCM port, scm_t_read_opts *opts)
 729 {
 730   return scm_string_to_symbol (scm_read_string_like_syntax (chr, port, opts));
 731 }
 732
 733 static SCM
 734 scm_read_number (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 735 {
 736   SCM result, str = SCM_EOL;
 737   char local_buffer[READER_BUFFER_SIZE], *buffer;
 738   size_t bytes_read;
 739
 740   /* Need to capture line and column numbers here. */
 741   long line = SCM_LINUM (port);
 742   int column = SCM_COL (port) - 1;
 743
 744   scm_ungetc_unlocked (chr, port);
 745   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 746                                 &bytes_read);
 747
 748   str = scm_from_port_stringn (buffer, bytes_read, port);
 749
 750   result = scm_string_to_number (str, SCM_UNDEFINED);
 751   if (scm_is_false (result))
 752     {
 753       /* Return a symbol instead of a number */
 754       if (opts->case_insensitive_p)
 755         str = scm_string_downcase_x (str);
 756       result = scm_string_to_symbol (str);
 757     }
 758   else if (SCM_NIMP (result))
 759     result = maybe_annotate_source (result, port, opts, line, column);
 760
 761   SCM_COL (port) += scm_i_string_length (str);
 762   return result;
 763 }
 764
 765 static SCM
 766 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 767 {
 768   SCM result;
 769   int ends_with_colon = 0;
 770   size_t bytes_read;
 771   int postfix = (opts->keyword_style == KEYWORD_STYLE_POSTFIX);
 772   char local_buffer[READER_BUFFER_SIZE], *buffer;
 773   SCM str;
 774
 775   scm_ungetc_unlocked (chr, port);
 776   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 777                                 &bytes_read);
 778   if (bytes_read > 0)
 779     ends_with_colon = buffer[bytes_read - 1] == ':';
 780
 781   if (postfix && ends_with_colon && (bytes_read > 1))
 782     {
 783       str = scm_from_port_stringn (buffer, bytes_read - 1, port);
 784
 785       if (opts->case_insensitive_p)
 786         str = scm_string_downcase_x (str);
 787       result = scm_symbol_to_keyword (scm_string_to_symbol (str));
 788     }
 789   else
 790     {
 791       str = scm_from_port_stringn (buffer, bytes_read, port);
 792
 793       if (opts->case_insensitive_p)
 794         str = scm_string_downcase_x (str);
 795       result = scm_string_to_symbol (str);
 796     }
 797
 798   SCM_COL (port) += scm_i_string_length (str);
 799   return result;
 800 }
 801
 802 static SCM
 803 scm_read_number_and_radix (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 804 #define FUNC_NAME "scm_lreadr"
 805 {
 806   SCM result;
 807   size_t read;
 808   char local_buffer[READER_BUFFER_SIZE], *buffer;
 809   unsigned int radix;
 810   SCM str;
 811
 812   switch (chr)
 813     {
 814     case 'B':
 815     case 'b':
 816       radix = 2;
 817       break;
 818
 819     case 'o':
 820     case 'O':
 821       radix = 8;
 822       break;
 823
 824     case 'd':
 825     case 'D':
 826       radix = 10;
 827       break;
 828
 829     case 'x':
 830     case 'X':
 831       radix = 16;
 832       break;
 833
 834     default:
 835       scm_ungetc_unlocked (chr, port);
 836       scm_ungetc_unlocked ('#', port);
 837       radix = 10;
 838     }
 839
 840   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 841                                 &read);
 842
 843   str = scm_from_port_stringn (buffer, read, port);
 844
 845   result = scm_string_to_number (str, scm_from_uint (radix));
 846
 847   SCM_COL (port) += scm_i_string_length (str);
 848
 849   if (scm_is_true (result))
 850     return result;
 851
 852   scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
 853
 854   return SCM_BOOL_F;
 855 }
 856 #undef FUNC_NAME
 857
 858 static SCM
 859 scm_read_quote (int chr, SCM port, scm_t_read_opts *opts)
 860 {
 861   SCM p;
 862   long line = SCM_LINUM (port);
 863   int column = SCM_COL (port) - 1;
 864
 865   switch (chr)
 866     {
 867     case '`':
 868       p = scm_sym_quasiquote;
 869       break;
 870
 871     case '\'':
 872       p = scm_sym_quote;
 873       break;
 874
 875     case ',':
 876       {
 877         scm_t_wchar c;
 878
 879         c = scm_getc_unlocked (port);
 880         if ('@' == c)
 881           p = scm_sym_uq_splicing;
 882         else
 883           {
 884             scm_ungetc_unlocked (c, port);
 885             p = scm_sym_unquote;
 886           }
 887         break;
 888       }
 889
 890     default:
 891       fprintf (stderr, "%s: unhandled quote character (%i)\n",
 892                "scm_read_quote", chr);
 893       abort ();
 894     }
 895
 896   p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
 897   return maybe_annotate_source (p, port, opts, line, column);
 898 }
 899
 900 SCM_SYMBOL (sym_syntax, "syntax");
 901 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
 902 SCM_SYMBOL (sym_unsyntax, "unsyntax");
 903 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
 904
 905 static SCM
 906 scm_read_syntax (int chr, SCM port, scm_t_read_opts *opts)
 907 {
 908   SCM p;
 909   long line = SCM_LINUM (port);
 910   int column = SCM_COL (port) - 1;
 911
 912   switch (chr)
 913     {
 914     case '`':
 915       p = sym_quasisyntax;
 916       break;
 917
 918     case '\'':
 919       p = sym_syntax;
 920       break;
 921
 922     case ',':
 923       {
 924         int c;
 925
 926         c = scm_getc_unlocked (port);
 927         if ('@' == c)
 928           p = sym_unsyntax_splicing;
 929         else
 930           {
 931             scm_ungetc_unlocked (c, port);
 932             p = sym_unsyntax;
 933           }
 934         break;
 935       }
 936
 937     default:
 938       fprintf (stderr, "%s: unhandled syntax character (%i)\n",
 939                "scm_read_syntax", chr);
 940       abort ();
 941     }
 942
 943   p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
 944   return maybe_annotate_source (p, port, opts, line, column);
 945 }
 946
 947 static SCM
 948 scm_read_nil (int chr, SCM port, scm_t_read_opts *opts)
 949 {
 950   SCM id = scm_read_mixed_case_symbol (chr, port, opts);
 951
 952   if (!scm_is_eq (id, sym_nil))
 953     scm_i_input_error ("scm_read_nil", port,
 954                        "unexpected input while reading #nil: ~a",
 955                        scm_list_1 (id));
 956
 957   return SCM_ELISP_NIL;
 958 }
 959
 960 static SCM
 961 scm_read_semicolon_comment (int chr, SCM port)
 962 {
 963   int c;
 964
 965   /* We use the get_byte here because there is no need to get the
 966      locale correct with comment input. This presumes that newline
 967      always represents itself no matter what the encoding is.  */
 968   for (c = scm_get_byte_or_eof_unlocked (port);
 969        (c != EOF) && (c != '\n');
 970        c = scm_get_byte_or_eof_unlocked (port));
 971
 972   return SCM_UNSPECIFIED;
 973 }
 974
 975 /* If the EXPECTED_CHARS are the next ones available from PORT, then
 976    consume them and return 1.  Otherwise leave the port position where
 977    it was and return 0.  EXPECTED_CHARS should be all lowercase, and
 978    will be matched case-insensitively against the characters read from
 979    PORT. */
 980 static int
 981 try_read_ci_chars (SCM port, const char *expected_chars)
 982 {
 983   int num_chars_wanted = strlen (expected_chars);
 984   int num_chars_read = 0;
 985   char *chars_read = alloca (num_chars_wanted);
 986   int c;
 987
 988   while (num_chars_read < num_chars_wanted)
 989     {
 990       c = scm_getc_unlocked (port);
 991       if (c == EOF)
 992         break;
 993       else if (c_tolower (c) != expected_chars[num_chars_read])
 994         {
 995           scm_ungetc_unlocked (c, port);
 996           break;
 997         }
 998       else
 999         chars_read[num_chars_read++] = c;
1000     }
1001
1002   if (num_chars_read == num_chars_wanted)
1003     return 1;
1004   else
1005     {
1006       while (num_chars_read > 0)
1007         scm_ungetc_unlocked (chars_read[--num_chars_read], port);
1008       return 0;
1009     }
1010 }
1011
1012 \f
1013 /* Sharp readers, i.e. readers called after a `#' sign has been read.  */
1014
1015 static SCM
1016 scm_read_boolean (int chr, SCM port)
1017 {
1018   switch (chr)
1019     {
1020     case 't':
1021     case 'T':
1022       try_read_ci_chars (port, "rue");
1023       return SCM_BOOL_T;
1024
1025     case 'f':
1026     case 'F':
1027       try_read_ci_chars (port, "alse");
1028       return SCM_BOOL_F;
1029     }
1030
1031   return SCM_UNSPECIFIED;
1032 }
1033
1034 static SCM
1035 scm_read_character (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
1036 #define FUNC_NAME "scm_lreadr"
1037 {
1038   char buffer[READER_CHAR_NAME_MAX_SIZE];
1039   SCM charname;
1040   size_t charname_len, bytes_read;
1041   scm_t_wchar cp;
1042   int overflow;
1043   scm_t_port_internal *pti;
1044
1045   overflow = read_token (port, opts, buffer, READER_CHAR_NAME_MAX_SIZE,
1046                          &bytes_read);
1047   if (overflow)
1048     scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
1049
1050   if (bytes_read == 0)
1051     {
1052       chr = scm_getc_unlocked (port);
1053       if (chr == EOF)
1054         scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
1055                            "while reading character", SCM_EOL);
1056
1057       /* CHR must be a token delimiter, like a whitespace.  */
1058       return (SCM_MAKE_CHAR (chr));
1059     }
1060
1061   pti = SCM_PORT_GET_INTERNAL (port);
1062
1063   /* Simple ASCII characters can be processed immediately.  Also, simple
1064      ISO-8859-1 characters can be processed immediately if the encoding for this
1065      port is ISO-8859-1.  */
1066   if (bytes_read == 1 &&
1067       ((unsigned char) buffer[0] <= 127
1068        || pti->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1))
1069     {
1070       SCM_COL (port) += 1;
1071       return SCM_MAKE_CHAR (buffer[0]);
1072     }
1073
1074   /* Otherwise, convert the buffer into a proper scheme string for
1075      processing.  */
1076   charname = scm_from_port_stringn (buffer, bytes_read, port);
1077   charname_len = scm_i_string_length (charname);
1078   SCM_COL (port) += charname_len;
1079   cp = scm_i_string_ref (charname, 0);
1080   if (charname_len == 1)
1081     return SCM_MAKE_CHAR (cp);
1082
1083   /* Ignore dotted circles, which may be used to keep combining characters from
1084      combining with the backslash in #\charname.  */
1085   if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
1086     return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
1087
1088   if (cp >= '0' && cp < '8')
1089     {
1090       /* Dirk:FIXME::  This type of character syntax is not R5RS
1091        * compliant.  Further, it should be verified that the constant
1092        * does only consist of octal digits.  */
1093       SCM p = scm_string_to_number (charname, scm_from_uint (8));
1094       if (SCM_I_INUMP (p))
1095         {
1096           scm_t_wchar c = scm_to_uint32 (p);
1097           if (SCM_IS_UNICODE_CHAR (c))
1098             return SCM_MAKE_CHAR (c);
1099           else
1100             scm_i_input_error (FUNC_NAME, port,
1101                                "out-of-range octal character escape: ~a",
1102                                scm_list_1 (charname));
1103         }
1104     }
1105
1106   if (cp == 'x' && (charname_len > 1))
1107     {
1108       SCM p;
1109
1110       /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1111       p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
1112                                 scm_from_uint (16));
1113       if (SCM_I_INUMP (p))
1114         {
1115           scm_t_wchar c = scm_to_uint32 (p);
1116           if (SCM_IS_UNICODE_CHAR (c))
1117             return SCM_MAKE_CHAR (c);
1118           else
1119             scm_i_input_error (FUNC_NAME, port,
1120                                "out-of-range hex character escape: ~a",
1121                                scm_list_1 (charname));
1122         }
1123     }
1124
1125   /* The names of characters should never have non-Latin1
1126      characters.  */
1127   if (scm_i_is_narrow_string (charname)
1128       || scm_i_try_narrow_string (charname))
1129     { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
1130                                        charname_len);
1131       if (scm_is_true (ch))
1132         return ch;
1133     }
1134
1135   scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
1136                      scm_list_1 (charname));
1137
1138   return SCM_UNSPECIFIED;
1139 }
1140 #undef FUNC_NAME
1141
1142 static SCM
1143 scm_read_keyword (int chr, SCM port, scm_t_read_opts *opts)
1144 {
1145   SCM symbol;
1146
1147   /* Read the symbol that comprises the keyword.  Doing this instead of
1148      invoking a specific symbol reader function allows `scm_read_keyword ()'
1149      to adapt to the delimiters currently valid of symbols.
1150
1151      XXX: This implementation allows sloppy syntaxes like `#:  key'.  */
1152   symbol = scm_read_expression (port, opts);
1153   if (!scm_is_symbol (symbol))
1154     scm_i_input_error ("scm_read_keyword", port,
1155                        "keyword prefix `~a' not followed by a symbol: ~s",
1156                        scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1157
1158   return (scm_symbol_to_keyword (symbol));
1159 }
1160
1161 static SCM
1162 scm_read_vector (int chr, SCM port, scm_t_read_opts *opts,
1163                  long line, int column)
1164 {
1165   /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1166      guarantee that it's going to do what we want.  After all, this is an
1167      implementation detail of `scm_read_vector ()', not a desirable
1168      property.  */
1169   return maybe_annotate_source (scm_vector (scm_read_sexp (chr, port, opts)),
1170                                 port, opts, line, column);
1171 }
1172
1173 /* Helper used by scm_read_array */
1174 static int
1175 read_decimal_integer (SCM port, int c, ssize_t *resp)
1176 {
1177   ssize_t sign = 1;
1178   ssize_t res = 0;
1179   int got_it = 0;
1180
1181   if (c == '-')
1182     {
1183       sign = -1;
1184       c = scm_getc_unlocked (port);
1185     }
1186
1187   while ('0' <= c && c <= '9')
1188     {
1189       if (((SSIZE_MAX - (c-'0')) / 10) <= res)
1190         scm_i_input_error ("read_decimal_integer", port,
1191                            "number too large", SCM_EOL);
1192       res = 10*res + c-'0';
1193       got_it = 1;
1194       c = scm_getc_unlocked (port);
1195     }
1196
1197   if (got_it)
1198     *resp = sign * res;
1199   return c;
1200 }
1201
1202 /* Read an array.  This function can also read vectors and uniform
1203    vectors.  Also, the conflict between '#f' and '#f32' and '#f64' is
1204    handled here.
1205
1206    C is the first character read after the '#'. */
1207 static SCM
1208 scm_read_array (int c, SCM port, scm_t_read_opts *opts, long line, int column)
1209 {
1210   ssize_t rank;
1211   scm_t_wchar tag_buf[8];
1212   int tag_len;
1213
1214   SCM tag, shape = SCM_BOOL_F, elements, array;
1215
1216   /* XXX - shortcut for ordinary vectors.  Shouldn't be necessary but
1217      the array code can not deal with zero-length dimensions yet, and
1218      we want to allow zero-length vectors, of course. */
1219   if (c == '(')
1220     return scm_read_vector (c, port, opts, line, column);
1221
1222   /* Disambiguate between '#f' and uniform floating point vectors. */
1223   if (c == 'f')
1224     {
1225       c = scm_getc_unlocked (port);
1226       if (c != '3' && c != '6')
1227         {
1228           if (c == 'a' && try_read_ci_chars (port, "lse"))
1229             return SCM_BOOL_F;
1230           else if (c != EOF)
1231             scm_ungetc_unlocked (c, port);
1232           return SCM_BOOL_F;
1233         }
1234       rank = 1;
1235       tag_buf[0] = 'f';
1236       tag_len = 1;
1237       goto continue_reading_tag;
1238     }
1239
1240   /* Read rank. */
1241   rank = 1;
1242   c = read_decimal_integer (port, c, &rank);
1243   if (rank < 0)
1244     scm_i_input_error (NULL, port, "array rank must be non-negative",
1245                        SCM_EOL);
1246
1247   /* Read tag. */
1248   tag_len = 0;
1249  continue_reading_tag:
1250   while (c != EOF && c != '(' && c != '@' && c != ':'
1251          && tag_len < sizeof tag_buf / sizeof tag_buf[0])
1252     {
1253       tag_buf[tag_len++] = c;
1254       c = scm_getc_unlocked (port);
1255     }
1256   if (tag_len == 0)
1257     tag = SCM_BOOL_T;
1258   else
1259     {
1260       tag = scm_string_to_symbol (scm_from_utf32_stringn (tag_buf, tag_len));
1261       if (tag_len == sizeof tag_buf / sizeof tag_buf[0])
1262         scm_i_input_error (NULL, port, "invalid array tag, starting with: ~a",
1263                            scm_list_1 (tag));
1264     }
1265
1266   /* Read shape. */
1267   if (c == '@' || c == ':')
1268     {
1269       shape = SCM_EOL;
1270
1271       do
1272         {
1273           ssize_t lbnd = 0, len = 0;
1274           SCM s;
1275
1276           if (c == '@')
1277             {
1278               c = scm_getc_unlocked (port);
1279               c = read_decimal_integer (port, c, &lbnd);
1280             }
1281
1282           s = scm_from_ssize_t (lbnd);
1283
1284           if (c == ':')
1285             {
1286               c = scm_getc_unlocked (port);
1287               c = read_decimal_integer (port, c, &len);
1288               if (len < 0)
1289                 scm_i_input_error (NULL, port,
1290                                    "array length must be non-negative",
1291                                    SCM_EOL);
1292
1293               s = scm_list_2 (s, scm_from_ssize_t (lbnd+len-1));
1294             }
1295
1296           shape = scm_cons (s, shape);
1297         } while (c == '@' || c == ':');
1298
1299       shape = scm_reverse_x (shape, SCM_EOL);
1300     }
1301
1302   /* Read nested lists of elements. */
1303   if (c != '(')
1304     scm_i_input_error (NULL, port,
1305                        "missing '(' in vector or array literal",
1306                        SCM_EOL);
1307   elements = scm_read_sexp (c, port, opts);
1308
1309   if (scm_is_false (shape))
1310     shape = scm_from_ssize_t (rank);
1311   else if (scm_ilength (shape) != rank)
1312     scm_i_input_error
1313       (NULL, port,
1314        "the number of shape specifications must match the array rank",
1315        SCM_EOL);
1316
1317   /* Handle special print syntax of rank zero arrays; see
1318      scm_i_print_array for a rationale. */
1319   if (rank == 0)
1320     {
1321       if (!scm_is_pair (elements))
1322         scm_i_input_error (NULL, port,
1323                            "too few elements in array literal, need 1",
1324                            SCM_EOL);
1325       if (!scm_is_null (SCM_CDR (elements)))
1326         scm_i_input_error (NULL, port,
1327                            "too many elements in array literal, want 1",
1328                            SCM_EOL);
1329       elements = SCM_CAR (elements);
1330     }
1331
1332   /* Construct array, annotate with source location, and return. */
1333   array = scm_list_to_typed_array (tag, shape, elements);
1334   return maybe_annotate_source (array, port, opts, line, column);
1335 }
1336
1337 static SCM
1338 scm_read_srfi4_vector (int chr, SCM port, scm_t_read_opts *opts,
1339                        long line, int column)
1340 {
1341   return scm_read_array (chr, port, opts, line, column);
1342 }
1343
1344 static SCM
1345 scm_read_bytevector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1346                      long line, int column)
1347 {
1348   chr = scm_getc_unlocked (port);
1349   if (chr != 'u')
1350     goto syntax;
1351
1352   chr = scm_getc_unlocked (port);
1353   if (chr != '8')
1354     goto syntax;
1355
1356   chr = scm_getc_unlocked (port);
1357   if (chr != '(')
1358     goto syntax;
1359
1360   return maybe_annotate_source
1361     (scm_u8_list_to_bytevector (scm_read_sexp (chr, port, opts)),
1362      port, opts, line, column);
1363
1364  syntax:
1365   scm_i_input_error ("read_bytevector", port,
1366                      "invalid bytevector prefix",
1367                      SCM_MAKE_CHAR (chr));
1368   return SCM_UNSPECIFIED;
1369 }
1370
1371 static SCM
1372 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1373                            long line, int column)
1374 {
1375   /* Read the `#*10101'-style read syntax for bit vectors in Guile.  This is
1376      terribly inefficient but who cares?  */
1377   SCM s_bits = SCM_EOL;
1378
1379   for (chr = scm_getc_unlocked (port);
1380        (chr != EOF) && ((chr == '0') || (chr == '1'));
1381        chr = scm_getc_unlocked (port))
1382     {
1383       s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1384     }
1385
1386   if (chr != EOF)
1387     scm_ungetc_unlocked (chr, port);
1388
1389   return maybe_annotate_source
1390     (scm_bitvector (scm_reverse_x (s_bits, SCM_EOL)),
1391      port, opts, line, column);
1392 }
1393
1394 static SCM
1395 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1396 {
1397   int bang_seen = 0;
1398
1399   for (;;)
1400     {
1401       int c = scm_getc_unlocked (port);
1402
1403       if (c == EOF)
1404         scm_i_input_error ("skip_block_comment", port,
1405                            "unterminated `#! ... !#' comment", SCM_EOL);
1406
1407       if (c == '!')
1408         bang_seen = 1;
1409       else if (c == '#' && bang_seen)
1410         break;
1411       else
1412         bang_seen = 0;
1413     }
1414
1415   return SCM_UNSPECIFIED;
1416 }
1417
1418 static void set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts,
1419                                          int value);
1420 static void set_port_square_brackets_p (SCM port, scm_t_read_opts *opts,
1421                                         int value);
1422 static void set_port_curly_infix_p (SCM port, scm_t_read_opts *opts,
1423                                     int value);
1424
1425 static SCM
1426 scm_read_shebang (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
1427 {
1428   char name[READER_DIRECTIVE_NAME_MAX_SIZE + 1];
1429   int c;
1430   int i = 0;
1431
1432   while (i <= READER_DIRECTIVE_NAME_MAX_SIZE)
1433     {
1434       c = scm_getc_unlocked (port);
1435       if (c == EOF)
1436         scm_i_input_error ("skip_block_comment", port,
1437                            "unterminated `#! ... !#' comment", SCM_EOL);
1438       else if (('a' <= c && c <= 'z') || ('0' <= c && c <= '9') || c == '-')
1439         name[i++] = c;
1440       else if (CHAR_IS_DELIMITER (c))
1441         {
1442           scm_ungetc_unlocked (c, port);
1443           name[i] = '\0';
1444           if (0 == strcmp ("r6rs", name))
1445             ;  /* Silently ignore */
1446           else if (0 == strcmp ("fold-case", name))
1447             set_port_case_insensitive_p (port, opts, 1);
1448           else if (0 == strcmp ("no-fold-case", name))
1449             set_port_case_insensitive_p (port, opts, 0);
1450           else if (0 == strcmp ("curly-infix", name))
1451             set_port_curly_infix_p (port, opts, 1);
1452           else if (0 == strcmp ("curly-infix-and-bracket-lists", name))
1453             {
1454               set_port_curly_infix_p (port, opts, 1);
1455               set_port_square_brackets_p (port, opts, 0);
1456             }
1457           else
1458             break;
1459
1460           return SCM_UNSPECIFIED;
1461         }
1462       else
1463         {
1464           scm_ungetc_unlocked (c, port);
1465           break;
1466         }
1467     }
1468   while (i > 0)
1469     scm_ungetc_unlocked (name[--i], port);
1470   return scm_read_scsh_block_comment (chr, port);
1471 }
1472
1473 static SCM
1474 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1475 {
1476   /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1477      nested.  So care must be taken.  */
1478   int nesting_level = 1;
1479
1480   int a = scm_getc_unlocked (port);
1481
1482   if (a == EOF)
1483     scm_i_input_error ("scm_read_r6rs_block_comment", port,
1484                        "unterminated `#| ... |#' comment", SCM_EOL);
1485
1486   while (nesting_level > 0)
1487     {
1488       int b = scm_getc_unlocked (port);
1489
1490       if (b == EOF)
1491         scm_i_input_error ("scm_read_r6rs_block_comment", port,
1492                            "unterminated `#| ... |#' comment", SCM_EOL);
1493
1494       if (a == '|' && b == '#')
1495         {
1496           nesting_level--;
1497           b = EOF;
1498         }
1499       else if (a == '#' && b == '|')
1500         {
1501           nesting_level++;
1502           b = EOF;
1503         }
1504
1505       a = b;
1506     }
1507
1508   return SCM_UNSPECIFIED;
1509 }
1510
1511 static SCM
1512 scm_read_commented_expression (scm_t_wchar chr, SCM port,
1513                                scm_t_read_opts *opts)
1514 {
1515   scm_t_wchar c;
1516
1517   c = flush_ws (port, opts, (char *) NULL);
1518   if (EOF == c)
1519     scm_i_input_error ("read_commented_expression", port,
1520                        "no expression after #; comment", SCM_EOL);
1521   scm_ungetc_unlocked (c, port);
1522   scm_read_expression (port, opts);
1523   return SCM_UNSPECIFIED;
1524 }
1525
1526 static SCM
1527 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1528 {
1529   /* Guile's extended symbol read syntax looks like this:
1530
1531        #{This is all a symbol name}#
1532
1533      So here, CHR is expected to be `{'.  */
1534   int saw_brace = 0;
1535   size_t len = 0;
1536   SCM buf = scm_i_make_string (1024, NULL, 0);
1537
1538   buf = scm_i_string_start_writing (buf);
1539
1540   while ((chr = scm_getc_unlocked (port)) != EOF)
1541     {
1542       if (saw_brace)
1543         {
1544           if (chr == '#')
1545             {
1546               break;
1547             }
1548           else
1549             {
1550               saw_brace = 0;
1551               scm_i_string_set_x (buf, len++, '}');
1552             }
1553         }
1554
1555       if (chr == '}')
1556         saw_brace = 1;
1557       else if (chr == '\\')
1558         {
1559           /* It used to be that print.c would print extended-read-syntax
1560              symbols with backslashes before "non-standard" chars, but
1561              this routine wouldn't do anything with those escapes.
1562              Bummer.  What we've done is to change print.c to output
1563              R6RS hex escapes for those characters, relying on the fact
1564              that the extended read syntax would never put a `\' before
1565              an `x'.  For now, we just ignore other instances of
1566              backslash in the string.  */
1567           switch ((chr = scm_getc_unlocked (port)))
1568             {
1569             case EOF:
1570               goto done;
1571             case 'x':
1572               {
1573                 scm_t_wchar c;
1574
1575                 SCM_READ_HEX_ESCAPE (10, ';');
1576                 scm_i_string_set_x (buf, len++, c);
1577                 break;
1578
1579               str_eof:
1580                 chr = EOF;
1581                 goto done;
1582
1583               bad_escaped:
1584                 scm_i_string_stop_writing ();
1585                 scm_i_input_error ("scm_read_extended_symbol", port,
1586                                    "illegal character in escape sequence: ~S",
1587                                    scm_list_1 (SCM_MAKE_CHAR (c)));
1588                 break;
1589               }
1590             default:
1591               scm_i_string_set_x (buf, len++, chr);
1592               break;
1593             }
1594         }
1595       else
1596         scm_i_string_set_x (buf, len++, chr);
1597
1598       if (len >= scm_i_string_length (buf) - 2)
1599         {
1600           SCM addy;
1601
1602           scm_i_string_stop_writing ();
1603           addy = scm_i_make_string (1024, NULL, 0);
1604           buf = scm_string_append (scm_list_2 (buf, addy));
1605           len = 0;
1606           buf = scm_i_string_start_writing (buf);
1607         }
1608     }
1609
1610  done:
1611   scm_i_string_stop_writing ();
1612   if (chr == EOF)
1613     scm_i_input_error ("scm_read_extended_symbol", port,
1614                        "end of file while reading symbol", SCM_EOL);
1615
1616   return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1617 }
1618
1619
1620 \f
1621 /* Top-level token readers, i.e., dispatchers.  */
1622
1623 static SCM
1624 scm_read_sharp_extension (int chr, SCM port, scm_t_read_opts *opts)
1625 {
1626   SCM proc;
1627
1628   proc = scm_get_hash_procedure (chr);
1629   if (scm_is_true (scm_procedure_p (proc)))
1630     {
1631       long line = SCM_LINUM (port);
1632       int column = SCM_COL (port) - 2;
1633       SCM got;
1634
1635       got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1636
1637       if (opts->record_positions_p && SCM_NIMP (got)
1638           && !scm_i_has_source_properties (got))
1639         scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1640
1641       return got;
1642     }
1643
1644   return SCM_UNSPECIFIED;
1645 }
1646
1647 /* The reader for the sharp `#' character.  It basically dispatches reads
1648    among the above token readers.   */
1649 static SCM
1650 scm_read_sharp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1651                 long line, int column)
1652 #define FUNC_NAME "scm_lreadr"
1653 {
1654   SCM result;
1655
1656   chr = scm_getc_unlocked (port);
1657
1658   result = scm_read_sharp_extension (chr, port, opts);
1659   if (!scm_is_eq (result, SCM_UNSPECIFIED))
1660     return result;
1661
1662   switch (chr)
1663     {
1664     case '\\':
1665       return (scm_read_character (chr, port, opts));
1666     case '(':
1667       return (scm_read_vector (chr, port, opts, line, column));
1668     case 's':
1669     case 'u':
1670     case 'f':
1671     case 'c':
1672       /* This one may return either a boolean or an SRFI-4 vector.  */
1673       return (scm_read_srfi4_vector (chr, port, opts, line, column));
1674     case 'v':
1675       return (scm_read_bytevector (chr, port, opts, line, column));
1676     case '*':
1677       return (scm_read_guile_bit_vector (chr, port, opts, line, column));
1678     case 't':
1679     case 'T':
1680     case 'F':
1681       return (scm_read_boolean (chr, port));
1682     case ':':
1683       return (scm_read_keyword (chr, port, opts));
1684     case '0': case '1': case '2': case '3': case '4':
1685     case '5': case '6': case '7': case '8': case '9':
1686     case '@':
1687       return (scm_read_array (chr, port, opts, line, column));
1688
1689     case 'i':
1690     case 'e':
1691     case 'b':
1692     case 'B':
1693     case 'o':
1694     case 'O':
1695     case 'd':
1696     case 'D':
1697     case 'x':
1698     case 'X':
1699     case 'I':
1700     case 'E':
1701       return (scm_read_number_and_radix (chr, port, opts));
1702     case '{':
1703       return (scm_read_extended_symbol (chr, port));
1704     case '!':
1705       return (scm_read_shebang (chr, port, opts));
1706     case ';':
1707       return (scm_read_commented_expression (chr, port, opts));
1708     case '`':
1709     case '\'':
1710     case ',':
1711       return (scm_read_syntax (chr, port, opts));
1712     case 'n':
1713       return (scm_read_nil (chr, port, opts));
1714     default:
1715       result = scm_read_sharp_extension (chr, port, opts);
1716       if (scm_is_eq (result, SCM_UNSPECIFIED))
1717         {
1718           /* To remain compatible with 1.8 and earlier, the following
1719              characters have lower precedence than `read-hash-extend'
1720              characters.  */
1721           switch (chr)
1722             {
1723             case '|':
1724               return scm_read_r6rs_block_comment (chr, port);
1725             default:
1726               scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1727                                  scm_list_1 (SCM_MAKE_CHAR (chr)));
1728             }
1729         }
1730       else
1731         return result;
1732     }
1733
1734   return SCM_UNSPECIFIED;
1735 }
1736 #undef FUNC_NAME
1737
1738 static SCM
1739 read_inner_expression (SCM port, scm_t_read_opts *opts)
1740 #define FUNC_NAME "read_inner_expression"
1741 {
1742   while (1)
1743     {
1744       scm_t_wchar chr;
1745
1746       chr = scm_getc_unlocked (port);
1747
1748       switch (chr)
1749         {
1750         case SCM_WHITE_SPACES:
1751         case SCM_LINE_INCREMENTORS:
1752           break;
1753         case ';':
1754           (void) scm_read_semicolon_comment (chr, port);
1755           break;
1756         case '{':
1757           if (opts->curly_infix_p)
1758             {
1759               if (opts->neoteric_p)
1760                 return scm_read_sexp (chr, port, opts);
1761               else
1762                 {
1763                   SCM expr;
1764
1765                   /* Enable neoteric expressions within curly braces */
1766                   opts->neoteric_p = 1;
1767                   expr = scm_read_sexp (chr, port, opts);
1768                   opts->neoteric_p = 0;
1769                   return expr;
1770                 }
1771             }
1772           else
1773             return scm_read_mixed_case_symbol (chr, port, opts);
1774         case '[':
1775           if (opts->square_brackets_p)
1776             return scm_read_sexp (chr, port, opts);
1777           else if (opts->curly_infix_p)
1778             {
1779               /* The syntax of neoteric expressions requires that '[' be
1780                  a delimiter when curly-infix is enabled, so it cannot
1781                  be part of an unescaped symbol.  We might as well do
1782                  something useful with it, so we adopt Kawa's convention:
1783                  [...] => ($bracket-list$ ...) */
1784               long line = SCM_LINUM (port);
1785               int column = SCM_COL (port) - 1;
1786               return maybe_annotate_source
1787                 (scm_cons (sym_bracket_list, scm_read_sexp (chr, port, opts)),
1788                  port, opts, line, column);
1789             }
1790           else
1791             return scm_read_mixed_case_symbol (chr, port, opts);
1792         case '(':
1793           return (scm_read_sexp (chr, port, opts));
1794         case '"':
1795           return (scm_read_string (chr, port, opts));
1796         case '|':
1797           if (opts->r7rs_symbols_p)
1798             return scm_read_r7rs_symbol (chr, port, opts);
1799           else
1800             return scm_read_mixed_case_symbol (chr, port, opts);
1801         case '\'':
1802         case '`':
1803         case ',':
1804           return (scm_read_quote (chr, port, opts));
1805         case '#':
1806           {
1807             long line  = SCM_LINUM (port);
1808             int column = SCM_COL (port) - 1;
1809             SCM result = scm_read_sharp (chr, port, opts, line, column);
1810             if (scm_is_eq (result, SCM_UNSPECIFIED))
1811               /* We read a comment or some such.  */
1812               break;
1813             else
1814               return result;
1815           }
1816         case ')':
1817           scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1818           break;
1819         case '}':
1820           if (opts->curly_infix_p)
1821             scm_i_input_error (FUNC_NAME, port, "unexpected \"}\"", SCM_EOL);
1822           else
1823             return scm_read_mixed_case_symbol (chr, port, opts);
1824         case ']':
1825           if (opts->square_brackets_p)
1826             scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1827           /* otherwise fall through */
1828         case EOF:
1829           return SCM_EOF_VAL;
1830         case ':':
1831           if (opts->keyword_style == KEYWORD_STYLE_PREFIX)
1832             return scm_symbol_to_keyword (scm_read_expression (port, opts));
1833           /* Fall through.  */
1834
1835         default:
1836           {
1837             if (((chr >= '0') && (chr <= '9'))
1838                 || (strchr ("+-.", chr)))
1839               return (scm_read_number (chr, port, opts));
1840             else
1841               return (scm_read_mixed_case_symbol (chr, port, opts));
1842           }
1843         }
1844     }
1845 }
1846 #undef FUNC_NAME
1847
1848 static SCM
1849 scm_read_expression (SCM port, scm_t_read_opts *opts)
1850 #define FUNC_NAME "scm_read_expression"
1851 {
1852   if (!opts->neoteric_p)
1853     return read_inner_expression (port, opts);
1854   else
1855     {
1856       long line = 0;
1857       int column = 0;
1858       SCM expr;
1859
1860       if (opts->record_positions_p)
1861         {
1862           /* We need to get the position of the first non-whitespace
1863              character in order to correctly annotate neoteric
1864              expressions.  For example, for the expression 'f(x)', the
1865              first call to 'read_inner_expression' reads the 'f' (which
1866              cannot be annotated), and then we later read the '(x)' and
1867              use it to construct the new list (f x). */
1868           int c = flush_ws (port, opts, (char *) NULL);
1869           if (c == EOF)
1870             return SCM_EOF_VAL;
1871           scm_ungetc_unlocked (c, port);
1872           line = SCM_LINUM (port);
1873           column = SCM_COL (port);
1874         }
1875
1876       expr = read_inner_expression (port, opts);
1877
1878       /* 'expr' is the first component of the neoteric expression.  Now
1879          we loop, and as long as the next character is '(', '[', or '{',
1880          (without any intervening whitespace), we use it to construct a
1881          new expression.  For example, f{n - 1}(x) => ((f (- n 1)) x). */
1882       for (;;)
1883         {
1884           int chr = scm_getc_unlocked (port);
1885
1886           if (chr == '(')
1887             /* e(...) => (e ...) */
1888             expr = scm_cons (expr, scm_read_sexp (chr, port, opts));
1889           else if (chr == '[')
1890             /* e[...] => ($bracket-apply$ e ...) */
1891             expr = scm_cons (sym_bracket_apply,
1892                              scm_cons (expr,
1893                                        scm_read_sexp (chr, port, opts)));
1894           else if (chr == '{')
1895             {
1896               SCM arg = scm_read_sexp (chr, port, opts);
1897
1898               if (scm_is_null (arg))
1899                 expr = scm_list_1 (expr);       /* e{} => (e) */
1900               else
1901                 expr = scm_list_2 (expr, arg);  /* e{...} => (e {...}) */
1902             }
1903           else
1904             {
1905               if (chr != EOF)
1906                 scm_ungetc_unlocked (chr, port);
1907               break;
1908             }
1909           maybe_annotate_source (expr, port, opts, line, column);
1910         }
1911       return expr;
1912     }
1913 }
1914 #undef FUNC_NAME
1915
1916 \f
1917 /* Actual reader.  */
1918
1919 static void init_read_options (SCM port, scm_t_read_opts *opts);
1920
1921 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1922             (SCM port),
1923             "Read an s-expression from the input port @var{port}, or from\n"
1924             "the current input port if @var{port} is not specified.\n"
1925             "Any whitespace before the next token is discarded.")
1926 #define FUNC_NAME s_scm_read
1927 {
1928   scm_t_read_opts opts;
1929   int c;
1930
1931   if (SCM_UNBNDP (port))
1932     port = scm_current_input_port ();
1933   SCM_VALIDATE_OPINPORT (1, port);
1934
1935   init_read_options (port, &opts);
1936
1937   c = flush_ws (port, &opts, (char *) NULL);
1938   if (EOF == c)
1939     return SCM_EOF_VAL;
1940   scm_ungetc_unlocked (c, port);
1941
1942   return (scm_read_expression (port, &opts));
1943 }
1944 #undef FUNC_NAME
1945
1946
1947 \f
1948
1949 /* Manipulate the read-hash-procedures alist.  This could be written in
1950    Scheme, but maybe it will also be used by C code during initialisation.  */
1951 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1952             (SCM chr, SCM proc),
1953             "Install the procedure @var{proc} for reading expressions\n"
1954             "starting with the character sequence @code{#} and @var{chr}.\n"
1955             "@var{proc} will be called with two arguments:  the character\n"
1956             "@var{chr} and the port to read further data from. The object\n"
1957             "returned will be the return value of @code{read}. \n"
1958             "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1959             )
1960 #define FUNC_NAME s_scm_read_hash_extend
1961 {
1962   SCM this;
1963   SCM prev;
1964
1965   SCM_VALIDATE_CHAR (1, chr);
1966   SCM_ASSERT (scm_is_false (proc)
1967               || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1968               proc, SCM_ARG2, FUNC_NAME);
1969
1970   /* Check if chr is already in the alist.  */
1971   this = scm_i_read_hash_procedures_ref ();
1972   prev = SCM_BOOL_F;
1973   while (1)
1974     {
1975       if (scm_is_null (this))
1976         {
1977           /* not found, so add it to the beginning.  */
1978           if (scm_is_true (proc))
1979             {
1980               SCM new = scm_cons (scm_cons (chr, proc),
1981                                   scm_i_read_hash_procedures_ref ());
1982               scm_i_read_hash_procedures_set_x (new);
1983             }
1984           break;
1985         }
1986       if (scm_is_eq (chr, SCM_CAAR (this)))
1987         {
1988           /* already in the alist.  */
1989           if (scm_is_false (proc))
1990             {
1991               /* remove it.  */
1992               if (scm_is_false (prev))
1993                 {
1994                   SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1995                   scm_i_read_hash_procedures_set_x (rest);
1996                 }
1997               else
1998                 scm_set_cdr_x (prev, SCM_CDR (this));
1999             }
2000           else
2001             {
2002               /* replace it.  */
2003               scm_set_cdr_x (SCM_CAR (this), proc);
2004             }
2005           break;
2006         }
2007       prev = this;
2008       this = SCM_CDR (this);
2009     }
2010
2011   return SCM_UNSPECIFIED;
2012 }
2013 #undef FUNC_NAME
2014
2015 /* Recover the read-hash procedure corresponding to char c.  */
2016 static SCM
2017 scm_get_hash_procedure (int c)
2018 {
2019   SCM rest = scm_i_read_hash_procedures_ref ();
2020
2021   while (1)
2022     {
2023       if (scm_is_null (rest))
2024         return SCM_BOOL_F;
2025
2026       if (SCM_CHAR (SCM_CAAR (rest)) == c)
2027         return SCM_CDAR (rest);
2028
2029       rest = SCM_CDR (rest);
2030     }
2031 }
2032
2033 static int
2034 is_encoding_char (char c)
2035 {
2036   if (c >= 'a' && c <= 'z') return 1;
2037   if (c >= 'A' && c <= 'Z') return 1;
2038   if (c >= '0' && c <= '9') return 1;
2039   return strchr ("_-.:/,+=()", c) != NULL;
2040 }
2041
2042 /* Maximum size of an encoding name.  This is a bit more than the
2043    longest name listed at
2044    <http://www.iana.org/assignments/character-sets> ("ISO-2022-JP-2", 13
2045    characters.)  */
2046 #define ENCODING_NAME_MAX_SIZE 20
2047
2048 /* Number of bytes at the beginning or end of a file that are scanned
2049    for a "coding:" declaration.  */
2050 #define SCM_ENCODING_SEARCH_SIZE (500 + ENCODING_NAME_MAX_SIZE)
2051
2052
2053 /* Search the SCM_ENCODING_SEARCH_SIZE bytes of a file for an Emacs-like
2054    coding declaration.  Returns either NULL or a string whose storage
2055    has been allocated with `scm_gc_malloc'.  */
2056 char *
2057 scm_i_scan_for_encoding (SCM port)
2058 {
2059   scm_t_port *pt;
2060   char header[SCM_ENCODING_SEARCH_SIZE+1];
2061   size_t bytes_read, encoding_length, i;
2062   char *encoding = NULL;
2063   char *pos, *encoding_start;
2064   int in_comment;
2065
2066   pt = SCM_PTAB_ENTRY (port);
2067
2068   if (pt->rw_active == SCM_PORT_WRITE)
2069     scm_flush_unlocked (port);
2070
2071   if (pt->rw_random)
2072     pt->rw_active = SCM_PORT_READ;
2073
2074   if (pt->read_pos == pt->read_end)
2075     {
2076       /* We can use the read buffer, and thus avoid a seek. */
2077       if (scm_fill_input_unlocked (port) == EOF)
2078         return NULL;
2079
2080       bytes_read = pt->read_end - pt->read_pos;
2081       if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
2082         bytes_read = SCM_ENCODING_SEARCH_SIZE;
2083
2084       if (bytes_read <= 1)
2085         /* An unbuffered port -- don't scan.  */
2086         return NULL;
2087
2088       memcpy (header, pt->read_pos, bytes_read);
2089       header[bytes_read] = '\0';
2090     }
2091   else
2092     {
2093       /* Try to read some bytes and then seek back.  Not all ports
2094          support seeking back; and indeed some file ports (like
2095          /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
2096          check performed by SCM_FPORT_FDES---but fail to seek
2097          backwards.  Hence this block comes second.  We prefer to use
2098          the read buffer in-place.  */
2099       if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
2100         return NULL;
2101
2102       bytes_read = scm_c_read_unlocked (port, header, SCM_ENCODING_SEARCH_SIZE);
2103       header[bytes_read] = '\0';
2104       scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
2105     }
2106
2107   /* search past "coding[:=]" */
2108   pos = header;
2109   while (1)
2110     {
2111       if ((pos = strstr(pos, "coding")) == NULL)
2112         return NULL;
2113
2114       pos += strlen ("coding");
2115       if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
2116           (*pos == ':' || *pos == '='))
2117         {
2118           pos ++;
2119           break;
2120         }
2121     }
2122
2123   /* skip spaces */
2124   while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
2125          (*pos == ' ' || *pos == '\t'))
2126     pos ++;
2127
2128   if (pos - header >= SCM_ENCODING_SEARCH_SIZE - ENCODING_NAME_MAX_SIZE)
2129     /* We found the "coding:" string, but there is probably not enough
2130        room to store an encoding name in its entirety, so ignore it.
2131        This makes sure we do not end up returning a truncated encoding
2132        name.  */
2133     return NULL;
2134
2135   /* grab the next token */
2136   encoding_start = pos;
2137   i = 0;
2138   while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
2139          && encoding_start + i - header < bytes_read
2140          && is_encoding_char (encoding_start[i]))
2141     i++;
2142
2143   encoding_length = i;
2144   if (encoding_length == 0)
2145     return NULL;
2146
2147   encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
2148
2149   /* push backwards to make sure we were in a comment */
2150   in_comment = 0;
2151   pos = encoding_start;
2152   while (pos >= header)
2153     {
2154       if (*pos == ';')
2155         {
2156           in_comment = 1;
2157           break;
2158         }
2159       else if (*pos == '\n' || pos == header)
2160         {
2161           /* This wasn't in a semicolon comment. Check for a
2162            hash-bang comment. */
2163           char *beg = strstr (header, "#!");
2164           char *end = strstr (header, "!#");
2165           if (beg < encoding_start && encoding_start + encoding_length <= end)
2166             in_comment = 1;
2167           break;
2168         }
2169       else
2170         {
2171           pos --;
2172           continue;
2173         }
2174     }
2175   if (!in_comment)
2176     /* This wasn't in a comment */
2177     return NULL;
2178
2179   return encoding;
2180 }
2181
2182 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
2183             (SCM port),
2184             "Scans the port for an Emacs-like character coding declaration\n"
2185             "near the top of the contents of a port with random-accessible contents.\n"
2186             "The coding declaration is of the form\n"
2187             "@code{coding: XXXXX} and must appear in a scheme comment.\n"
2188             "\n"
2189             "Returns a string containing the character encoding of the file\n"
2190             "if a declaration was found, or @code{#f} otherwise.\n")
2191 #define FUNC_NAME s_scm_file_encoding
2192 {
2193   char *enc;
2194   SCM s_enc;
2195
2196   SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
2197
2198   enc = scm_i_scan_for_encoding (port);
2199   if (enc == NULL)
2200     return SCM_BOOL_F;
2201   else
2202     {
2203       s_enc = scm_string_upcase (scm_from_locale_string (enc));
2204       return s_enc;
2205     }
2206
2207   return SCM_BOOL_F;
2208 }
2209 #undef FUNC_NAME
2210
2211 \f
2212 /* Per-port read options.
2213
2214    We store per-port read options in the 'port-read-options' port
2215    property, which is stored in the internal port structure.  The value
2216    stored is a single integer that contains a two-bit field for each
2217    read option.
2218
2219    If a bit field contains READ_OPTION_INHERIT (3), that indicates that
2220    the applicable value should be inherited from the corresponding
2221    global read option.  Otherwise, the bit field contains the value of
2222    the read option.  For boolean read options that have been set
2223    per-port, the possible values are 0 or 1.  If the 'keyword_style'
2224    read option has been set per-port, its possible values are those in
2225    'enum t_keyword_style'. */
2226
2227 /* Key to read options in port properties. */
2228 SCM_SYMBOL (sym_port_read_options, "port-read-options");
2229
2230 /* Offsets of bit fields for each per-port override */
2231 #define READ_OPTION_COPY_SOURCE_P          0
2232 #define READ_OPTION_RECORD_POSITIONS_P     2
2233 #define READ_OPTION_CASE_INSENSITIVE_P     4
2234 #define READ_OPTION_KEYWORD_STYLE          6
2235 #define READ_OPTION_R6RS_ESCAPES_P         8
2236 #define READ_OPTION_SQUARE_BRACKETS_P     10
2237 #define READ_OPTION_HUNGRY_EOL_ESCAPES_P  12
2238 #define READ_OPTION_CURLY_INFIX_P         14
2239 #define READ_OPTION_R7RS_SYMBOLS_P        16
2240
2241 /* The total width in bits of the per-port overrides */
2242 #define READ_OPTIONS_NUM_BITS             18
2243
2244 #define READ_OPTIONS_INHERIT_ALL  ((1UL << READ_OPTIONS_NUM_BITS) - 1)
2245 #define READ_OPTIONS_MAX_VALUE    READ_OPTIONS_INHERIT_ALL
2246
2247 #define READ_OPTION_MASK     3
2248 #define READ_OPTION_INHERIT  3
2249
2250 static void
2251 set_port_read_option (SCM port, int option, int new_value)
2252 {
2253   SCM scm_read_options;
2254   unsigned int read_options;
2255
2256   new_value &= READ_OPTION_MASK;
2257
2258   scm_dynwind_begin (0);
2259   scm_dynwind_lock_port (port);
2260
2261   scm_read_options = scm_i_port_property (port, sym_port_read_options);
2262   if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2263     read_options = scm_to_uint (scm_read_options);
2264   else
2265     read_options = READ_OPTIONS_INHERIT_ALL;
2266   read_options &= ~(READ_OPTION_MASK << option);
2267   read_options |= new_value << option;
2268   scm_read_options = scm_from_uint (read_options);
2269   scm_i_set_port_property_x (port, sym_port_read_options, scm_read_options);
2270
2271   scm_dynwind_end ();
2272 }
2273
2274 /* Set OPTS and PORT's case-insensitivity according to VALUE. */
2275 static void
2276 set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts, int value)
2277 {
2278   value = !!value;
2279   opts->case_insensitive_p = value;
2280   set_port_read_option (port, READ_OPTION_CASE_INSENSITIVE_P, value);
2281 }
2282
2283 /* Set OPTS and PORT's square_brackets_p option according to VALUE. */
2284 static void
2285 set_port_square_brackets_p (SCM port, scm_t_read_opts *opts, int value)
2286 {
2287   value = !!value;
2288   opts->square_brackets_p = value;
2289   set_port_read_option (port, READ_OPTION_SQUARE_BRACKETS_P, value);
2290 }
2291
2292 /* Set OPTS and PORT's curly_infix_p option according to VALUE. */
2293 static void
2294 set_port_curly_infix_p (SCM port, scm_t_read_opts *opts, int value)
2295 {
2296   value = !!value;
2297   opts->curly_infix_p = value;
2298   set_port_read_option (port, READ_OPTION_CURLY_INFIX_P, value);
2299 }
2300
2301 /* Initialize OPTS based on PORT's read options and the global read
2302    options. */
2303 static void
2304 init_read_options (SCM port, scm_t_read_opts *opts)
2305 {
2306   SCM val, scm_read_options;
2307   unsigned int read_options, x;
2308
2309   scm_read_options = scm_i_port_property (port, sym_port_read_options);
2310
2311   if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2312     read_options = scm_to_uint (scm_read_options);
2313   else
2314     read_options = READ_OPTIONS_INHERIT_ALL;
2315
2316   x = READ_OPTION_MASK & (read_options >> READ_OPTION_KEYWORD_STYLE);
2317   if (x == READ_OPTION_INHERIT)
2318     {
2319       val = SCM_PACK (SCM_KEYWORD_STYLE);
2320       if (scm_is_eq (val, scm_keyword_prefix))
2321         x = KEYWORD_STYLE_PREFIX;
2322       else if (scm_is_eq (val, scm_keyword_postfix))
2323         x = KEYWORD_STYLE_POSTFIX;
2324       else
2325         x = KEYWORD_STYLE_HASH_PREFIX;
2326     }
2327   opts->keyword_style = x;
2328
2329 #define RESOLVE_BOOLEAN_OPTION(NAME, name)                              \
2330   do                                                                    \
2331     {                                                                   \
2332       x = READ_OPTION_MASK & (read_options >> READ_OPTION_ ## NAME);    \
2333       if (x == READ_OPTION_INHERIT)                                     \
2334         x = !!SCM_ ## NAME;                                             \
2335           opts->name = x;                                               \
2336     }                                                                   \
2337   while (0)
2338
2339   RESOLVE_BOOLEAN_OPTION (COPY_SOURCE_P,        copy_source_p);
2340   RESOLVE_BOOLEAN_OPTION (RECORD_POSITIONS_P,   record_positions_p);
2341   RESOLVE_BOOLEAN_OPTION (CASE_INSENSITIVE_P,   case_insensitive_p);
2342   RESOLVE_BOOLEAN_OPTION (R6RS_ESCAPES_P,       r6rs_escapes_p);
2343   RESOLVE_BOOLEAN_OPTION (SQUARE_BRACKETS_P,    square_brackets_p);
2344   RESOLVE_BOOLEAN_OPTION (HUNGRY_EOL_ESCAPES_P, hungry_eol_escapes_p);
2345   RESOLVE_BOOLEAN_OPTION (CURLY_INFIX_P,        curly_infix_p);
2346   RESOLVE_BOOLEAN_OPTION (R7RS_SYMBOLS_P,       r7rs_symbols_p);
2347
2348 #undef RESOLVE_BOOLEAN_OPTION
2349
2350   opts->neoteric_p = 0;
2351 }
2352
2353 void
2354 scm_init_read ()
2355 {
2356   SCM read_hash_procs;
2357
2358   read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
2359
2360   scm_i_read_hash_procedures =
2361     SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
2362
2363   scm_init_opts (scm_read_options, scm_read_opts);
2364 #include "libguile/read.x"
2365 }
2366
2367 /*
2368   Local Variables:
2369   c-file-style: "gnu"
2370   End:
2371 */