libguile/read.c

   1 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
   2  *   2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public License
   6  * as published by the Free Software Foundation; either version 3 of
   7  * the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful, but
  10  * WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  17  * 02110-1301 USA
  18  */
  19
  20
  21 \f
  22
  23 #ifdef HAVE_CONFIG_H
  24 # include <config.h>
  25 #endif
  26
  27 #include <stdio.h>
  28 #include <ctype.h>
  29 #include <string.h>
  30 #include <unistd.h>
  31 #include <unicase.h>
  32 #include <unictype.h>
  33
  34 #include "libguile/_scm.h"
  35 #include "libguile/bytevectors.h"
  36 #include "libguile/chars.h"
  37 #include "libguile/eval.h"
  38 #include "libguile/arrays.h"
  39 #include "libguile/bitvectors.h"
  40 #include "libguile/keywords.h"
  41 #include "libguile/alist.h"
  42 #include "libguile/srcprop.h"
  43 #include "libguile/hashtab.h"
  44 #include "libguile/hash.h"
  45 #include "libguile/ports.h"
  46 #include "libguile/fports.h"
  47 #include "libguile/root.h"
  48 #include "libguile/strings.h"
  49 #include "libguile/strports.h"
  50 #include "libguile/vectors.h"
  51 #include "libguile/validate.h"
  52 #include "libguile/srfi-4.h"
  53 #include "libguile/srfi-13.h"
  54
  55 #include "libguile/read.h"
  56 #include "libguile/private-options.h"
  57
  58
  59 \f
  60
  61 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
  62 SCM_SYMBOL (scm_keyword_prefix, "prefix");
  63 SCM_SYMBOL (scm_keyword_postfix, "postfix");
  64 SCM_SYMBOL (sym_nil, "nil");
  65
  66 /* SRFI-105 curly infix expression support */
  67 SCM_SYMBOL (sym_nfx, "$nfx$");
  68 SCM_SYMBOL (sym_bracket_list, "$bracket-list$");
  69 SCM_SYMBOL (sym_bracket_apply, "$bracket-apply$");
  70
  71 scm_t_option scm_read_opts[] =
  72   {
  73     { SCM_OPTION_BOOLEAN, "copy", 0,
  74       "Copy source code expressions." },
  75     { SCM_OPTION_BOOLEAN, "positions", 1,
  76       "Record positions of source code expressions." },
  77     { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
  78       "Convert symbols to lower case."},
  79     { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
  80       "Style of keyword recognition: #f, 'prefix or 'postfix."},
  81     { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
  82       "Use R6RS variable-length character and string hex escapes."},
  83     { SCM_OPTION_BOOLEAN, "square-brackets", 1,
  84       "Treat `[' and `]' as parentheses, for R6RS compatibility."},
  85     { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
  86       "In strings, consume leading whitespace after an escaped end-of-line."},
  87     { SCM_OPTION_BOOLEAN, "curly-infix", 0,
  88       "Support SRFI-105 curly infix expressions."},
  89     { 0, },
  90   };
  91
  92 /* Internal read options structure.  This is initialized by 'scm_read'
  93    from the global and per-port read options, and a pointer is passed
  94    down to all helper functions. */
  95
  96 enum t_keyword_style
  97   {
  98     KEYWORD_STYLE_HASH_PREFIX,
  99     KEYWORD_STYLE_PREFIX,
 100     KEYWORD_STYLE_POSTFIX
 101   };
 102
 103 struct t_read_opts
 104 {
 105   enum t_keyword_style keyword_style;
 106   unsigned int copy_source_p        : 1;
 107   unsigned int record_positions_p   : 1;
 108   unsigned int case_insensitive_p   : 1;
 109   unsigned int r6rs_escapes_p       : 1;
 110   unsigned int square_brackets_p    : 1;
 111   unsigned int hungry_eol_escapes_p : 1;
 112   unsigned int curly_infix_p        : 1;
 113   unsigned int neoteric_p           : 1;
 114 };
 115
 116 typedef struct t_read_opts scm_t_read_opts;
 117
 118
 119 /*
 120   Give meaningful error messages for errors
 121
 122   We use the format
 123
 124   FILE:LINE:COL: MESSAGE
 125   This happened in ....
 126
 127   This is not standard GNU format, but the test-suite likes the real
 128   message to be in front.
 129
 130  */
 131
 132
 133 void
 134 scm_i_input_error (char const *function,
 135                    SCM port, const char *message, SCM arg)
 136 {
 137   SCM fn = (scm_is_string (SCM_FILENAME(port))
 138             ? SCM_FILENAME(port)
 139             : scm_from_locale_string ("#<unknown port>"));
 140
 141   SCM string_port = scm_open_output_string ();
 142   SCM string = SCM_EOL;
 143   scm_simple_format (string_port,
 144                      scm_from_locale_string ("~A:~S:~S: ~A"),
 145                      scm_list_4 (fn,
 146                                  scm_from_long (SCM_LINUM (port) + 1),
 147                                  scm_from_int (SCM_COL (port) + 1),
 148                                  scm_from_locale_string (message)));
 149
 150   string = scm_get_output_string (string_port);
 151   scm_close_output_port (string_port);
 152   scm_error_scm (scm_from_latin1_symbol ("read-error"),
 153                  function? scm_from_locale_string (function) : SCM_BOOL_F,
 154                  string,
 155                  arg,
 156                  SCM_BOOL_F);
 157 }
 158
 159
 160 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
 161             (SCM setting),
 162             "Option interface for the read options. Instead of using\n"
 163             "this procedure directly, use the procedures @code{read-enable},\n"
 164             "@code{read-disable}, @code{read-set!} and @code{read-options}.")
 165 #define FUNC_NAME s_scm_read_options
 166 {
 167   SCM ans = scm_options (setting,
 168                          scm_read_opts,
 169                          FUNC_NAME);
 170   if (SCM_COPY_SOURCE_P)
 171     SCM_RECORD_POSITIONS_P = 1;
 172   return ans;
 173 }
 174 #undef FUNC_NAME
 175
 176 /* A fluid referring to an association list mapping extra hash
 177    characters to procedures.  */
 178 static SCM *scm_i_read_hash_procedures;
 179
 180 static SCM
 181 scm_i_read_hash_procedures_ref (void)
 182 {
 183   return scm_fluid_ref (*scm_i_read_hash_procedures);
 184 }
 185
 186 static void
 187 scm_i_read_hash_procedures_set_x (SCM value)
 188 {
 189   scm_fluid_set_x (*scm_i_read_hash_procedures, value);
 190 }
 191
 192 \f
 193 /* Token readers.  */
 194
 195
 196 /* Size of the C buffer used to read symbols and numbers.  */
 197 #define READER_BUFFER_SIZE            128
 198
 199 /* Number of 32-bit codepoints in the buffer used to read strings.  */
 200 #define READER_STRING_BUFFER_SIZE     128
 201
 202 /* The maximum size of Scheme character names.  */
 203 #define READER_CHAR_NAME_MAX_SIZE      50
 204
 205 /* The maximum size of reader directive names.  */
 206 #define READER_DIRECTIVE_NAME_MAX_SIZE 50
 207
 208
 209 /* `isblank' is only in C99.  */
 210 #define CHAR_IS_BLANK_(_chr)                                    \
 211   (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n')      \
 212    || ((_chr) == '\f') || ((_chr) == '\r'))
 213
 214 #ifdef MSDOS
 215 # define CHAR_IS_BLANK(_chr)                    \
 216   ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
 217 #else
 218 # define CHAR_IS_BLANK CHAR_IS_BLANK_
 219 #endif
 220
 221
 222 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
 223    structure'').  */
 224 #define CHAR_IS_R5RS_DELIMITER(c)                               \
 225   (CHAR_IS_BLANK (c)                                            \
 226    || (c) == ')' || (c) == '(' || (c) == ';' || (c) == '"')
 227
 228 #define CHAR_IS_DELIMITER(c)                                    \
 229   (CHAR_IS_R5RS_DELIMITER (c)                                   \
 230    || (((c) == ']' || (c) == '[') && (opts->square_brackets_p   \
 231                                       || opts->curly_infix_p))  \
 232    || (((c) == '}' || (c) == '{') && opts->curly_infix_p))
 233
 234 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
 235    Structure''.  */
 236 #define CHAR_IS_EXPONENT_MARKER(_chr)                           \
 237   (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f')        \
 238    || ((_chr) == 'd') || ((_chr) == 'l'))
 239
 240 /* Read an SCSH block comment.  */
 241 static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
 242 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
 243 static SCM scm_read_commented_expression (scm_t_wchar, SCM, scm_t_read_opts *);
 244 static SCM scm_read_shebang (scm_t_wchar, SCM, scm_t_read_opts *);
 245 static SCM scm_get_hash_procedure (int);
 246
 247 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 248    result in the pre-allocated buffer BUF.  Return zero if the whole token has
 249    fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
 250    bytes actually read.  */
 251 static int
 252 read_token (SCM port, scm_t_read_opts *opts,
 253             char *buf, size_t buf_size, size_t *read)
 254 {
 255    *read = 0;
 256
 257    while (*read < buf_size)
 258      {
 259        int chr;
 260
 261        chr = scm_get_byte_or_eof_unlocked (port);
 262
 263        if (chr == EOF)
 264         return 0;
 265       else if (CHAR_IS_DELIMITER (chr))
 266         {
 267           scm_unget_byte_unlocked (chr, port);
 268           return 0;
 269         }
 270       else
 271         {
 272           *buf = (char) chr;
 273           buf++, (*read)++;
 274         }
 275      }
 276
 277    return 1;
 278  }
 279
 280 /* Like `read_token', but return either BUFFER, or a GC-allocated buffer
 281    if the token doesn't fit in BUFFER_SIZE bytes.  */
 282 static char *
 283 read_complete_token (SCM port, scm_t_read_opts *opts,
 284                      char *buffer, size_t buffer_size, size_t *read)
 285 {
 286   int overflow = 0;
 287   size_t bytes_read, overflow_size = 0;
 288   char *overflow_buffer = NULL;
 289
 290   do
 291     {
 292       overflow = read_token (port, opts, buffer, buffer_size, &bytes_read);
 293       if (bytes_read == 0)
 294         break;
 295       if (overflow || overflow_size != 0)
 296         {
 297           if (overflow_size == 0)
 298             {
 299               overflow_buffer = scm_gc_malloc_pointerless (bytes_read, "read");
 300               memcpy (overflow_buffer, buffer, bytes_read);
 301               overflow_size = bytes_read;
 302             }
 303           else
 304             {
 305               char *new_buf =
 306                 scm_gc_malloc_pointerless (overflow_size + bytes_read, "read");
 307
 308               memcpy (new_buf, overflow_buffer, overflow_size);
 309               memcpy (new_buf + overflow_size, buffer, bytes_read);
 310
 311               overflow_buffer = new_buf;
 312               overflow_size += bytes_read;
 313             }
 314         }
 315     }
 316   while (overflow);
 317
 318   if (overflow_size)
 319     *read = overflow_size;
 320   else
 321     *read = bytes_read;
 322
 323   return (overflow_size > 0 ? overflow_buffer : buffer);
 324 }
 325
 326 /* Skip whitespace from PORT and return the first non-whitespace character
 327    read.  Raise an error on end-of-file.  */
 328 static int
 329 flush_ws (SCM port, scm_t_read_opts *opts, const char *eoferr)
 330 {
 331   scm_t_wchar c;
 332   while (1)
 333     switch (c = scm_getc_unlocked (port))
 334       {
 335       case EOF:
 336       goteof:
 337         if (eoferr)
 338           {
 339             scm_i_input_error (eoferr,
 340                                port,
 341                                "end of file",
 342                                SCM_EOL);
 343           }
 344         return c;
 345
 346       case ';':
 347       lp:
 348         switch (c = scm_getc_unlocked (port))
 349           {
 350           case EOF:
 351             goto goteof;
 352           default:
 353             goto lp;
 354           case SCM_LINE_INCREMENTORS:
 355             break;
 356           }
 357         break;
 358
 359       case '#':
 360         switch (c = scm_getc_unlocked (port))
 361           {
 362           case EOF:
 363             eoferr = "read_sharp";
 364             goto goteof;
 365           case '!':
 366             scm_read_shebang (c, port, opts);
 367             break;
 368           case ';':
 369             scm_read_commented_expression (c, port, opts);
 370             break;
 371           case '|':
 372             if (scm_is_false (scm_get_hash_procedure (c)))
 373               {
 374                 scm_read_r6rs_block_comment (c, port);
 375                 break;
 376               }
 377             /* fall through */
 378           default:
 379             scm_ungetc_unlocked (c, port);
 380             return '#';
 381           }
 382         break;
 383
 384       case SCM_LINE_INCREMENTORS:
 385       case SCM_SINGLE_SPACES:
 386       case '\t':
 387         break;
 388
 389       default:
 390         return c;
 391       }
 392
 393   return 0;
 394 }
 395
 396
 397 \f
 398 /* Token readers.  */
 399
 400 static SCM scm_read_expression (SCM port, scm_t_read_opts *opts);
 401 static SCM scm_read_sharp (int chr, SCM port, scm_t_read_opts *opts,
 402                            long line, int column);
 403
 404
 405 static SCM
 406 maybe_annotate_source (SCM x, SCM port, scm_t_read_opts *opts,
 407                        long line, int column)
 408 {
 409   if (opts->record_positions_p)
 410     scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
 411   return x;
 412 }
 413
 414 static SCM
 415 scm_read_sexp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 416 #define FUNC_NAME "scm_i_lreadparen"
 417 {
 418   int c;
 419   SCM tmp, tl, ans = SCM_EOL;
 420   const int curly_list_p = (chr == '{') && opts->curly_infix_p;
 421   const int terminating_char = ((chr == '{') ? '}'
 422                                 : ((chr == '[') ? ']'
 423                                    : ')'));
 424
 425   /* Need to capture line and column numbers here. */
 426   long line = SCM_LINUM (port);
 427   int column = SCM_COL (port) - 1;
 428
 429   c = flush_ws (port, opts, FUNC_NAME);
 430   if (terminating_char == c)
 431     return SCM_EOL;
 432
 433   scm_ungetc_unlocked (c, port);
 434   tmp = scm_read_expression (port, opts);
 435
 436   /* Note that it is possible for scm_read_expression to return
 437      scm_sym_dot, but not as part of a dotted pair: as in #{.}#.  So
 438      check that it's a real dot by checking `c'.  */
 439   if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
 440     {
 441       ans = scm_read_expression (port, opts);
 442       if (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
 443         scm_i_input_error (FUNC_NAME, port, "missing close paren",
 444                            SCM_EOL);
 445       return ans;
 446     }
 447
 448   /* Build the head of the list structure. */
 449   ans = tl = scm_cons (tmp, SCM_EOL);
 450
 451   while (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
 452     {
 453       SCM new_tail;
 454
 455       if (c == ')' || (c == ']' && opts->square_brackets_p)
 456           || ((c == '}' || c == ']') && opts->curly_infix_p))
 457         scm_i_input_error (FUNC_NAME, port,
 458                            "in pair: mismatched close paren: ~A",
 459                            scm_list_1 (SCM_MAKE_CHAR (c)));
 460
 461       scm_ungetc_unlocked (c, port);
 462       tmp = scm_read_expression (port, opts);
 463
 464       /* See above note about scm_sym_dot.  */
 465       if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
 466         {
 467           SCM_SETCDR (tl, scm_read_expression (port, opts));
 468
 469           c = flush_ws (port, opts, FUNC_NAME);
 470           if (terminating_char != c)
 471             scm_i_input_error (FUNC_NAME, port,
 472                                "in pair: missing close paren", SCM_EOL);
 473           break;
 474         }
 475
 476       new_tail = scm_cons (tmp, SCM_EOL);
 477       SCM_SETCDR (tl, new_tail);
 478       tl = new_tail;
 479     }
 480
 481   if (curly_list_p)
 482     {
 483       /* In addition to finding the length, 'scm_ilength' checks for
 484          improper or circular lists, in which case it returns -1. */
 485       int len = scm_ilength (ans);
 486
 487       /* The (len == 0) case is handled above */
 488       if (len == 1)
 489         /* Return directly to avoid re-annotating the element's source
 490            location with the position of the outer brace.  Also, it
 491            might not be possible to annotate the element. */
 492         return scm_car (ans);  /* {e} => e */
 493       else if (len == 2)
 494         ;  /* Leave the list unchanged: {e1 e2} => (e1 e2) */
 495       else if (len >= 3 && (len & 1))
 496         {
 497           /* It's a proper list whose length is odd and at least 3.  If
 498              the elements at odd indices (the infix operator positions)
 499              are all 'equal?', then it's a simple curly-infix list.
 500              Otherwise it's a mixed curly-infix list. */
 501           SCM op = scm_cadr (ans);
 502
 503           /* Check to see if the elements at odd indices are 'equal?' */
 504           for (tl = scm_cdddr (ans); ; tl = scm_cddr (tl))
 505             {
 506               if (scm_is_null (tl))
 507                 {
 508                   /* Convert simple curly-infix list to prefix:
 509                      {a <op> b <op> ...} => (<op> a b ...) */
 510                   tl = ans;
 511                   while (scm_is_pair (scm_cdr (tl)))
 512                     {
 513                       tmp = scm_cddr (tl);
 514                       SCM_SETCDR (tl, tmp);
 515                       tl = tmp;
 516                     }
 517                   ans = scm_cons (op, ans);
 518                   break;
 519                 }
 520               else if (scm_is_false (scm_equal_p (op, scm_car (tl))))
 521                 {
 522                   /* Mixed curly-infix list: {e ...} => ($nfx$ e ...) */
 523                   ans = scm_cons (sym_nfx, ans);
 524                   break;
 525                 }
 526             }
 527         }
 528       else
 529         /* Mixed curly-infix (possibly improper) list:
 530            {e . tail} => ($nfx$ e . tail) */
 531         ans = scm_cons (sym_nfx, ans);
 532     }
 533
 534   return maybe_annotate_source (ans, port, opts, line, column);
 535 }
 536 #undef FUNC_NAME
 537
 538
 539 /* Read a hexadecimal number NDIGITS in length.  Put its value into the variable
 540    C.  If TERMINATOR is non-null, terminate early if the TERMINATOR character is
 541    found.  */
 542 #define SCM_READ_HEX_ESCAPE(ndigits, terminator)                   \
 543   do                                                               \
 544     {                                                              \
 545       scm_t_wchar a;                                               \
 546       size_t i = 0;                                                \
 547       c = 0;                                                       \
 548       while (i < ndigits)                                          \
 549         {                                                          \
 550           a = scm_getc_unlocked (port);                                     \
 551           if (a == EOF)                                            \
 552             goto str_eof;                                          \
 553           if (terminator                                           \
 554               && (a == (scm_t_wchar) terminator)                   \
 555               && (i > 0))                                          \
 556             break;                                                 \
 557           if ('0' <= a && a <= '9')                                \
 558             a -= '0';                                              \
 559           else if ('A' <= a && a <= 'F')                           \
 560             a = a - 'A' + 10;                                      \
 561           else if ('a' <= a && a <= 'f')                           \
 562             a = a - 'a' + 10;                                      \
 563           else                                                     \
 564             {                                                      \
 565               c = a;                                               \
 566               goto bad_escaped;                                    \
 567             }                                                      \
 568           c = c * 16 + a;                                          \
 569           i ++;                                                    \
 570         }                                                          \
 571     } while (0)
 572
 573 static void
 574 skip_intraline_whitespace (SCM port)
 575 {
 576   scm_t_wchar c;
 577
 578   do
 579     {
 580       c = scm_getc_unlocked (port);
 581       if (c == EOF)
 582         return;
 583     }
 584   while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
 585
 586   scm_ungetc_unlocked (c, port);
 587 }
 588
 589 static SCM
 590 scm_read_string (int chr, SCM port, scm_t_read_opts *opts)
 591 #define FUNC_NAME "scm_lreadr"
 592 {
 593   /* For strings smaller than C_STR, this function creates only one Scheme
 594      object (the string returned).  */
 595
 596   SCM str = SCM_EOL;
 597   size_t c_str_len = 0;
 598   scm_t_wchar c, c_str[READER_STRING_BUFFER_SIZE];
 599
 600   /* Need to capture line and column numbers here. */
 601   long line = SCM_LINUM (port);
 602   int column = SCM_COL (port) - 1;
 603
 604   while ('"' != (c = scm_getc_unlocked (port)))
 605     {
 606       if (c == EOF)
 607         {
 608         str_eof:
 609           scm_i_input_error (FUNC_NAME, port,
 610                              "end of file in string constant", SCM_EOL);
 611         }
 612
 613       if (c_str_len + 1 >= READER_STRING_BUFFER_SIZE)
 614         {
 615           str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
 616           c_str_len = 0;
 617         }
 618
 619       if (c == '\\')
 620         {
 621           switch (c = scm_getc_unlocked (port))
 622             {
 623             case EOF:
 624               goto str_eof;
 625             case '"':
 626             case '\\':
 627               break;
 628             case '\n':
 629               if (opts->hungry_eol_escapes_p)
 630                 skip_intraline_whitespace (port);
 631               continue;
 632             case '0':
 633               c = '\0';
 634               break;
 635             case 'f':
 636               c = '\f';
 637               break;
 638             case 'n':
 639               c = '\n';
 640               break;
 641             case 'r':
 642               c = '\r';
 643               break;
 644             case 't':
 645               c = '\t';
 646               break;
 647             case 'a':
 648               c = '\007';
 649               break;
 650             case 'v':
 651               c = '\v';
 652               break;
 653             case 'b':
 654               c = '\010';
 655               break;
 656             case 'x':
 657               if (opts->r6rs_escapes_p)
 658                 SCM_READ_HEX_ESCAPE (10, ';');
 659               else
 660                 SCM_READ_HEX_ESCAPE (2, '\0');
 661               break;
 662             case 'u':
 663               if (!opts->r6rs_escapes_p)
 664                 {
 665                   SCM_READ_HEX_ESCAPE (4, '\0');
 666                   break;
 667                 }
 668             case 'U':
 669               if (!opts->r6rs_escapes_p)
 670                 {
 671                   SCM_READ_HEX_ESCAPE (6, '\0');
 672                   break;
 673                 }
 674             default:
 675             bad_escaped:
 676               scm_i_input_error (FUNC_NAME, port,
 677                                  "illegal character in escape sequence: ~S",
 678                                  scm_list_1 (SCM_MAKE_CHAR (c)));
 679             }
 680         }
 681
 682       c_str[c_str_len++] = c;
 683     }
 684
 685   if (scm_is_null (str))
 686     /* Fast path: we got a string that fits in C_STR.  */
 687     str = scm_from_utf32_stringn (c_str, c_str_len);
 688   else
 689     {
 690       if (c_str_len > 0)
 691         str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
 692
 693       str = scm_string_concatenate_reverse (str, SCM_UNDEFINED, SCM_UNDEFINED);
 694     }
 695
 696   return maybe_annotate_source (str, port, opts, line, column);
 697 }
 698 #undef FUNC_NAME
 699
 700
 701 static SCM
 702 scm_read_number (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 703 {
 704   SCM result, str = SCM_EOL;
 705   char local_buffer[READER_BUFFER_SIZE], *buffer;
 706   size_t bytes_read;
 707   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 708
 709   /* Need to capture line and column numbers here. */
 710   long line = SCM_LINUM (port);
 711   int column = SCM_COL (port) - 1;
 712
 713   scm_ungetc_unlocked (chr, port);
 714   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 715                                 &bytes_read);
 716
 717   str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
 718
 719   result = scm_string_to_number (str, SCM_UNDEFINED);
 720   if (scm_is_false (result))
 721     {
 722       /* Return a symbol instead of a number */
 723       if (opts->case_insensitive_p)
 724         str = scm_string_downcase_x (str);
 725       result = scm_string_to_symbol (str);
 726     }
 727   else if (SCM_NIMP (result))
 728     result = maybe_annotate_source (result, port, opts, line, column);
 729
 730   SCM_COL (port) += scm_i_string_length (str);
 731   return result;
 732 }
 733
 734 static SCM
 735 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 736 {
 737   SCM result;
 738   int ends_with_colon = 0;
 739   size_t bytes_read;
 740   int postfix = (opts->keyword_style == KEYWORD_STYLE_POSTFIX);
 741   char local_buffer[READER_BUFFER_SIZE], *buffer;
 742   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 743   SCM str;
 744
 745   scm_ungetc_unlocked (chr, port);
 746   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 747                                 &bytes_read);
 748   if (bytes_read > 0)
 749     ends_with_colon = buffer[bytes_read - 1] == ':';
 750
 751   if (postfix && ends_with_colon && (bytes_read > 1))
 752     {
 753       str = scm_from_stringn (buffer, bytes_read - 1,
 754                               pt->encoding, pt->ilseq_handler);
 755
 756       if (opts->case_insensitive_p)
 757         str = scm_string_downcase_x (str);
 758       result = scm_symbol_to_keyword (scm_string_to_symbol (str));
 759     }
 760   else
 761     {
 762       str = scm_from_stringn (buffer, bytes_read,
 763                               pt->encoding, pt->ilseq_handler);
 764
 765       if (opts->case_insensitive_p)
 766         str = scm_string_downcase_x (str);
 767       result = scm_string_to_symbol (str);
 768     }
 769
 770   SCM_COL (port) += scm_i_string_length (str);
 771   return result;
 772 }
 773
 774 static SCM
 775 scm_read_number_and_radix (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 776 #define FUNC_NAME "scm_lreadr"
 777 {
 778   SCM result;
 779   size_t read;
 780   char local_buffer[READER_BUFFER_SIZE], *buffer;
 781   unsigned int radix;
 782   SCM str;
 783   scm_t_port *pt;
 784
 785   switch (chr)
 786     {
 787     case 'B':
 788     case 'b':
 789       radix = 2;
 790       break;
 791
 792     case 'o':
 793     case 'O':
 794       radix = 8;
 795       break;
 796
 797     case 'd':
 798     case 'D':
 799       radix = 10;
 800       break;
 801
 802     case 'x':
 803     case 'X':
 804       radix = 16;
 805       break;
 806
 807     default:
 808       scm_ungetc_unlocked (chr, port);
 809       scm_ungetc_unlocked ('#', port);
 810       radix = 10;
 811     }
 812
 813   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 814                                 &read);
 815
 816   pt = SCM_PTAB_ENTRY (port);
 817   str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
 818
 819   result = scm_string_to_number (str, scm_from_uint (radix));
 820
 821   SCM_COL (port) += scm_i_string_length (str);
 822
 823   if (scm_is_true (result))
 824     return result;
 825
 826   scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
 827
 828   return SCM_BOOL_F;
 829 }
 830 #undef FUNC_NAME
 831
 832 static SCM
 833 scm_read_quote (int chr, SCM port, scm_t_read_opts *opts)
 834 {
 835   SCM p;
 836   long line = SCM_LINUM (port);
 837   int column = SCM_COL (port) - 1;
 838
 839   switch (chr)
 840     {
 841     case '`':
 842       p = scm_sym_quasiquote;
 843       break;
 844
 845     case '\'':
 846       p = scm_sym_quote;
 847       break;
 848
 849     case ',':
 850       {
 851         scm_t_wchar c;
 852
 853         c = scm_getc_unlocked (port);
 854         if ('@' == c)
 855           p = scm_sym_uq_splicing;
 856         else
 857           {
 858             scm_ungetc_unlocked (c, port);
 859             p = scm_sym_unquote;
 860           }
 861         break;
 862       }
 863
 864     default:
 865       fprintf (stderr, "%s: unhandled quote character (%i)\n",
 866                "scm_read_quote", chr);
 867       abort ();
 868     }
 869
 870   p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
 871   return maybe_annotate_source (p, port, opts, line, column);
 872 }
 873
 874 SCM_SYMBOL (sym_syntax, "syntax");
 875 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
 876 SCM_SYMBOL (sym_unsyntax, "unsyntax");
 877 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
 878
 879 static SCM
 880 scm_read_syntax (int chr, SCM port, scm_t_read_opts *opts)
 881 {
 882   SCM p;
 883   long line = SCM_LINUM (port);
 884   int column = SCM_COL (port) - 1;
 885
 886   switch (chr)
 887     {
 888     case '`':
 889       p = sym_quasisyntax;
 890       break;
 891
 892     case '\'':
 893       p = sym_syntax;
 894       break;
 895
 896     case ',':
 897       {
 898         int c;
 899
 900         c = scm_getc_unlocked (port);
 901         if ('@' == c)
 902           p = sym_unsyntax_splicing;
 903         else
 904           {
 905             scm_ungetc_unlocked (c, port);
 906             p = sym_unsyntax;
 907           }
 908         break;
 909       }
 910
 911     default:
 912       fprintf (stderr, "%s: unhandled syntax character (%i)\n",
 913                "scm_read_syntax", chr);
 914       abort ();
 915     }
 916
 917   p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
 918   return maybe_annotate_source (p, port, opts, line, column);
 919 }
 920
 921 static SCM
 922 scm_read_nil (int chr, SCM port, scm_t_read_opts *opts)
 923 {
 924   SCM id = scm_read_mixed_case_symbol (chr, port, opts);
 925
 926   if (!scm_is_eq (id, sym_nil))
 927     scm_i_input_error ("scm_read_nil", port,
 928                        "unexpected input while reading #nil: ~a",
 929                        scm_list_1 (id));
 930
 931   return SCM_ELISP_NIL;
 932 }
 933
 934 static SCM
 935 scm_read_semicolon_comment (int chr, SCM port)
 936 {
 937   int c;
 938
 939   /* We use the get_byte here because there is no need to get the
 940      locale correct with comment input. This presumes that newline
 941      always represents itself no matter what the encoding is.  */
 942   for (c = scm_get_byte_or_eof_unlocked (port);
 943        (c != EOF) && (c != '\n');
 944        c = scm_get_byte_or_eof_unlocked (port));
 945
 946   return SCM_UNSPECIFIED;
 947 }
 948
 949 \f
 950 /* Sharp readers, i.e. readers called after a `#' sign has been read.  */
 951
 952 static SCM
 953 scm_read_boolean (int chr, SCM port)
 954 {
 955   switch (chr)
 956     {
 957     case 't':
 958     case 'T':
 959       return SCM_BOOL_T;
 960
 961     case 'f':
 962     case 'F':
 963       return SCM_BOOL_F;
 964     }
 965
 966   return SCM_UNSPECIFIED;
 967 }
 968
 969 static SCM
 970 scm_read_character (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 971 #define FUNC_NAME "scm_lreadr"
 972 {
 973   char buffer[READER_CHAR_NAME_MAX_SIZE];
 974   SCM charname;
 975   size_t charname_len, bytes_read;
 976   scm_t_wchar cp;
 977   int overflow;
 978   scm_t_port *pt;
 979
 980   overflow = read_token (port, opts, buffer, READER_CHAR_NAME_MAX_SIZE,
 981                          &bytes_read);
 982   if (overflow)
 983     scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
 984
 985   if (bytes_read == 0)
 986     {
 987       chr = scm_getc_unlocked (port);
 988       if (chr == EOF)
 989         scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
 990                            "while reading character", SCM_EOL);
 991
 992       /* CHR must be a token delimiter, like a whitespace.  */
 993       return (SCM_MAKE_CHAR (chr));
 994     }
 995
 996   pt = SCM_PTAB_ENTRY (port);
 997
 998   /* Simple ASCII characters can be processed immediately.  Also, simple
 999      ISO-8859-1 characters can be processed immediately if the encoding for this
1000      port is ISO-8859-1.  */
1001   if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
1002     {
1003       SCM_COL (port) += 1;
1004       return SCM_MAKE_CHAR (buffer[0]);
1005     }
1006
1007   /* Otherwise, convert the buffer into a proper scheme string for
1008      processing.  */
1009   charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
1010                                pt->ilseq_handler);
1011   charname_len = scm_i_string_length (charname);
1012   SCM_COL (port) += charname_len;
1013   cp = scm_i_string_ref (charname, 0);
1014   if (charname_len == 1)
1015     return SCM_MAKE_CHAR (cp);
1016
1017   /* Ignore dotted circles, which may be used to keep combining characters from
1018      combining with the backslash in #\charname.  */
1019   if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
1020     return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
1021
1022   if (cp >= '0' && cp < '8')
1023     {
1024       /* Dirk:FIXME::  This type of character syntax is not R5RS
1025        * compliant.  Further, it should be verified that the constant
1026        * does only consist of octal digits.  */
1027       SCM p = scm_string_to_number (charname, scm_from_uint (8));
1028       if (SCM_I_INUMP (p))
1029         {
1030           scm_t_wchar c = scm_to_uint32 (p);
1031           if (SCM_IS_UNICODE_CHAR (c))
1032             return SCM_MAKE_CHAR (c);
1033           else
1034             scm_i_input_error (FUNC_NAME, port,
1035                                "out-of-range octal character escape: ~a",
1036                                scm_list_1 (charname));
1037         }
1038     }
1039
1040   if (cp == 'x' && (charname_len > 1))
1041     {
1042       SCM p;
1043
1044       /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1045       p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
1046                                 scm_from_uint (16));
1047       if (SCM_I_INUMP (p))
1048         {
1049           scm_t_wchar c = scm_to_uint32 (p);
1050           if (SCM_IS_UNICODE_CHAR (c))
1051             return SCM_MAKE_CHAR (c);
1052           else
1053             scm_i_input_error (FUNC_NAME, port,
1054                                "out-of-range hex character escape: ~a",
1055                                scm_list_1 (charname));
1056         }
1057     }
1058
1059   /* The names of characters should never have non-Latin1
1060      characters.  */
1061   if (scm_i_is_narrow_string (charname)
1062       || scm_i_try_narrow_string (charname))
1063     { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
1064                                        charname_len);
1065       if (scm_is_true (ch))
1066         return ch;
1067     }
1068
1069   scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
1070                      scm_list_1 (charname));
1071
1072   return SCM_UNSPECIFIED;
1073 }
1074 #undef FUNC_NAME
1075
1076 static SCM
1077 scm_read_keyword (int chr, SCM port, scm_t_read_opts *opts)
1078 {
1079   SCM symbol;
1080
1081   /* Read the symbol that comprises the keyword.  Doing this instead of
1082      invoking a specific symbol reader function allows `scm_read_keyword ()'
1083      to adapt to the delimiters currently valid of symbols.
1084
1085      XXX: This implementation allows sloppy syntaxes like `#:  key'.  */
1086   symbol = scm_read_expression (port, opts);
1087   if (!scm_is_symbol (symbol))
1088     scm_i_input_error ("scm_read_keyword", port,
1089                        "keyword prefix `~a' not followed by a symbol: ~s",
1090                        scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1091
1092   return (scm_symbol_to_keyword (symbol));
1093 }
1094
1095 static SCM
1096 scm_read_vector (int chr, SCM port, scm_t_read_opts *opts,
1097                  long line, int column)
1098 {
1099   /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1100      guarantee that it's going to do what we want.  After all, this is an
1101      implementation detail of `scm_read_vector ()', not a desirable
1102      property.  */
1103   return maybe_annotate_source (scm_vector (scm_read_sexp (chr, port, opts)),
1104                                 port, opts, line, column);
1105 }
1106
1107 /* Helper used by scm_read_array */
1108 static int
1109 read_decimal_integer (SCM port, int c, ssize_t *resp)
1110 {
1111   ssize_t sign = 1;
1112   ssize_t res = 0;
1113   int got_it = 0;
1114
1115   if (c == '-')
1116     {
1117       sign = -1;
1118       c = scm_getc_unlocked (port);
1119     }
1120
1121   while ('0' <= c && c <= '9')
1122     {
1123       res = 10*res + c-'0';
1124       got_it = 1;
1125       c = scm_getc_unlocked (port);
1126     }
1127
1128   if (got_it)
1129     *resp = sign * res;
1130   return c;
1131 }
1132
1133 /* Read an array.  This function can also read vectors and uniform
1134    vectors.  Also, the conflict between '#f' and '#f32' and '#f64' is
1135    handled here.
1136
1137    C is the first character read after the '#'. */
1138 static SCM
1139 scm_read_array (int c, SCM port, scm_t_read_opts *opts, long line, int column)
1140 {
1141   ssize_t rank;
1142   scm_t_wchar tag_buf[8];
1143   int tag_len;
1144
1145   SCM tag, shape = SCM_BOOL_F, elements, array;
1146
1147   /* XXX - shortcut for ordinary vectors.  Shouldn't be necessary but
1148      the array code can not deal with zero-length dimensions yet, and
1149      we want to allow zero-length vectors, of course. */
1150   if (c == '(')
1151     return scm_read_vector (c, port, opts, line, column);
1152
1153   /* Disambiguate between '#f' and uniform floating point vectors. */
1154   if (c == 'f')
1155     {
1156       c = scm_getc_unlocked (port);
1157       if (c != '3' && c != '6')
1158         {
1159           if (c != EOF)
1160             scm_ungetc_unlocked (c, port);
1161           return SCM_BOOL_F;
1162         }
1163       rank = 1;
1164       tag_buf[0] = 'f';
1165       tag_len = 1;
1166       goto continue_reading_tag;
1167     }
1168
1169   /* Read rank. */
1170   rank = 1;
1171   c = read_decimal_integer (port, c, &rank);
1172   if (rank < 0)
1173     scm_i_input_error (NULL, port, "array rank must be non-negative",
1174                        SCM_EOL);
1175
1176   /* Read tag. */
1177   tag_len = 0;
1178  continue_reading_tag:
1179   while (c != EOF && c != '(' && c != '@' && c != ':'
1180          && tag_len < sizeof tag_buf / sizeof tag_buf[0])
1181     {
1182       tag_buf[tag_len++] = c;
1183       c = scm_getc_unlocked (port);
1184     }
1185   if (tag_len == 0)
1186     tag = SCM_BOOL_T;
1187   else
1188     {
1189       tag = scm_string_to_symbol (scm_from_utf32_stringn (tag_buf, tag_len));
1190       if (tag_len == sizeof tag_buf / sizeof tag_buf[0])
1191         scm_i_input_error (NULL, port, "invalid array tag, starting with: ~a",
1192                            scm_list_1 (tag));
1193     }
1194
1195   /* Read shape. */
1196   if (c == '@' || c == ':')
1197     {
1198       shape = SCM_EOL;
1199
1200       do
1201         {
1202           ssize_t lbnd = 0, len = 0;
1203           SCM s;
1204
1205           if (c == '@')
1206             {
1207               c = scm_getc_unlocked (port);
1208               c = read_decimal_integer (port, c, &lbnd);
1209             }
1210
1211           s = scm_from_ssize_t (lbnd);
1212
1213           if (c == ':')
1214             {
1215               c = scm_getc_unlocked (port);
1216               c = read_decimal_integer (port, c, &len);
1217               if (len < 0)
1218                 scm_i_input_error (NULL, port,
1219                                    "array length must be non-negative",
1220                                    SCM_EOL);
1221
1222               s = scm_list_2 (s, scm_from_ssize_t (lbnd+len-1));
1223             }
1224
1225           shape = scm_cons (s, shape);
1226         } while (c == '@' || c == ':');
1227
1228       shape = scm_reverse_x (shape, SCM_EOL);
1229     }
1230
1231   /* Read nested lists of elements. */
1232   if (c != '(')
1233     scm_i_input_error (NULL, port,
1234                        "missing '(' in vector or array literal",
1235                        SCM_EOL);
1236   elements = scm_read_sexp (c, port, opts);
1237
1238   if (scm_is_false (shape))
1239     shape = scm_from_ssize_t (rank);
1240   else if (scm_ilength (shape) != rank)
1241     scm_i_input_error
1242       (NULL, port,
1243        "the number of shape specifications must match the array rank",
1244        SCM_EOL);
1245
1246   /* Handle special print syntax of rank zero arrays; see
1247      scm_i_print_array for a rationale. */
1248   if (rank == 0)
1249     {
1250       if (!scm_is_pair (elements))
1251         scm_i_input_error (NULL, port,
1252                            "too few elements in array literal, need 1",
1253                            SCM_EOL);
1254       if (!scm_is_null (SCM_CDR (elements)))
1255         scm_i_input_error (NULL, port,
1256                            "too many elements in array literal, want 1",
1257                            SCM_EOL);
1258       elements = SCM_CAR (elements);
1259     }
1260
1261   /* Construct array, annotate with source location, and return. */
1262   array = scm_list_to_typed_array (tag, shape, elements);
1263   return maybe_annotate_source (array, port, opts, line, column);
1264 }
1265
1266 static SCM
1267 scm_read_srfi4_vector (int chr, SCM port, scm_t_read_opts *opts,
1268                        long line, int column)
1269 {
1270   return scm_read_array (chr, port, opts, line, column);
1271 }
1272
1273 static SCM
1274 scm_read_bytevector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1275                      long line, int column)
1276 {
1277   chr = scm_getc_unlocked (port);
1278   if (chr != 'u')
1279     goto syntax;
1280
1281   chr = scm_getc_unlocked (port);
1282   if (chr != '8')
1283     goto syntax;
1284
1285   chr = scm_getc_unlocked (port);
1286   if (chr != '(')
1287     goto syntax;
1288
1289   return maybe_annotate_source
1290     (scm_u8_list_to_bytevector (scm_read_sexp (chr, port, opts)),
1291      port, opts, line, column);
1292
1293  syntax:
1294   scm_i_input_error ("read_bytevector", port,
1295                      "invalid bytevector prefix",
1296                      SCM_MAKE_CHAR (chr));
1297   return SCM_UNSPECIFIED;
1298 }
1299
1300 static SCM
1301 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1302                            long line, int column)
1303 {
1304   /* Read the `#*10101'-style read syntax for bit vectors in Guile.  This is
1305      terribly inefficient but who cares?  */
1306   SCM s_bits = SCM_EOL;
1307
1308   for (chr = scm_getc_unlocked (port);
1309        (chr != EOF) && ((chr == '0') || (chr == '1'));
1310        chr = scm_getc_unlocked (port))
1311     {
1312       s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1313     }
1314
1315   if (chr != EOF)
1316     scm_ungetc_unlocked (chr, port);
1317
1318   return maybe_annotate_source
1319     (scm_bitvector (scm_reverse_x (s_bits, SCM_EOL)),
1320      port, opts, line, column);
1321 }
1322
1323 static SCM
1324 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1325 {
1326   int bang_seen = 0;
1327
1328   for (;;)
1329     {
1330       int c = scm_getc_unlocked (port);
1331
1332       if (c == EOF)
1333         scm_i_input_error ("skip_block_comment", port,
1334                            "unterminated `#! ... !#' comment", SCM_EOL);
1335
1336       if (c == '!')
1337         bang_seen = 1;
1338       else if (c == '#' && bang_seen)
1339         break;
1340       else
1341         bang_seen = 0;
1342     }
1343
1344   return SCM_UNSPECIFIED;
1345 }
1346
1347 static void set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts,
1348                                          int value);
1349 static void set_port_square_brackets_p (SCM port, scm_t_read_opts *opts,
1350                                         int value);
1351 static void set_port_curly_infix_p (SCM port, scm_t_read_opts *opts,
1352                                     int value);
1353
1354 static SCM
1355 scm_read_shebang (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
1356 {
1357   char name[READER_DIRECTIVE_NAME_MAX_SIZE + 1];
1358   int c;
1359   int i = 0;
1360
1361   while (i <= READER_DIRECTIVE_NAME_MAX_SIZE)
1362     {
1363       c = scm_getc_unlocked (port);
1364       if (c == EOF)
1365         scm_i_input_error ("skip_block_comment", port,
1366                            "unterminated `#! ... !#' comment", SCM_EOL);
1367       else if (('a' <= c && c <= 'z') || ('0' <= c && c <= '9') || c == '-')
1368         name[i++] = c;
1369       else if (CHAR_IS_DELIMITER (c))
1370         {
1371           scm_ungetc_unlocked (c, port);
1372           name[i] = '\0';
1373           if (0 == strcmp ("r6rs", name))
1374             ;  /* Silently ignore */
1375           else if (0 == strcmp ("fold-case", name))
1376             set_port_case_insensitive_p (port, opts, 1);
1377           else if (0 == strcmp ("no-fold-case", name))
1378             set_port_case_insensitive_p (port, opts, 0);
1379           else if (0 == strcmp ("curly-infix", name))
1380             set_port_curly_infix_p (port, opts, 1);
1381           else if (0 == strcmp ("curly-infix-and-bracket-lists", name))
1382             {
1383               set_port_curly_infix_p (port, opts, 1);
1384               set_port_square_brackets_p (port, opts, 0);
1385             }
1386           else
1387             break;
1388
1389           return SCM_UNSPECIFIED;
1390         }
1391       else
1392         {
1393           scm_ungetc_unlocked (c, port);
1394           break;
1395         }
1396     }
1397   while (i > 0)
1398     scm_ungetc_unlocked (name[--i], port);
1399   return scm_read_scsh_block_comment (chr, port);
1400 }
1401
1402 static SCM
1403 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1404 {
1405   /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1406      nested.  So care must be taken.  */
1407   int nesting_level = 1;
1408
1409   int a = scm_getc_unlocked (port);
1410
1411   if (a == EOF)
1412     scm_i_input_error ("scm_read_r6rs_block_comment", port,
1413                        "unterminated `#| ... |#' comment", SCM_EOL);
1414
1415   while (nesting_level > 0)
1416     {
1417       int b = scm_getc_unlocked (port);
1418
1419       if (b == EOF)
1420         scm_i_input_error ("scm_read_r6rs_block_comment", port,
1421                            "unterminated `#| ... |#' comment", SCM_EOL);
1422
1423       if (a == '|' && b == '#')
1424         {
1425           nesting_level--;
1426           b = EOF;
1427         }
1428       else if (a == '#' && b == '|')
1429         {
1430           nesting_level++;
1431           b = EOF;
1432         }
1433
1434       a = b;
1435     }
1436
1437   return SCM_UNSPECIFIED;
1438 }
1439
1440 static SCM
1441 scm_read_commented_expression (scm_t_wchar chr, SCM port,
1442                                scm_t_read_opts *opts)
1443 {
1444   scm_t_wchar c;
1445
1446   c = flush_ws (port, opts, (char *) NULL);
1447   if (EOF == c)
1448     scm_i_input_error ("read_commented_expression", port,
1449                        "no expression after #; comment", SCM_EOL);
1450   scm_ungetc_unlocked (c, port);
1451   scm_read_expression (port, opts);
1452   return SCM_UNSPECIFIED;
1453 }
1454
1455 static SCM
1456 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1457 {
1458   /* Guile's extended symbol read syntax looks like this:
1459
1460        #{This is all a symbol name}#
1461
1462      So here, CHR is expected to be `{'.  */
1463   int saw_brace = 0;
1464   size_t len = 0;
1465   SCM buf = scm_i_make_string (1024, NULL, 0);
1466
1467   buf = scm_i_string_start_writing (buf);
1468
1469   while ((chr = scm_getc_unlocked (port)) != EOF)
1470     {
1471       if (saw_brace)
1472         {
1473           if (chr == '#')
1474             {
1475               break;
1476             }
1477           else
1478             {
1479               saw_brace = 0;
1480               scm_i_string_set_x (buf, len++, '}');
1481             }
1482         }
1483
1484       if (chr == '}')
1485         saw_brace = 1;
1486       else if (chr == '\\')
1487         {
1488           /* It used to be that print.c would print extended-read-syntax
1489              symbols with backslashes before "non-standard" chars, but
1490              this routine wouldn't do anything with those escapes.
1491              Bummer.  What we've done is to change print.c to output
1492              R6RS hex escapes for those characters, relying on the fact
1493              that the extended read syntax would never put a `\' before
1494              an `x'.  For now, we just ignore other instances of
1495              backslash in the string.  */
1496           switch ((chr = scm_getc_unlocked (port)))
1497             {
1498             case EOF:
1499               goto done;
1500             case 'x':
1501               {
1502                 scm_t_wchar c;
1503
1504                 SCM_READ_HEX_ESCAPE (10, ';');
1505                 scm_i_string_set_x (buf, len++, c);
1506                 break;
1507
1508               str_eof:
1509                 chr = EOF;
1510                 goto done;
1511
1512               bad_escaped:
1513                 scm_i_string_stop_writing ();
1514                 scm_i_input_error ("scm_read_extended_symbol", port,
1515                                    "illegal character in escape sequence: ~S",
1516                                    scm_list_1 (SCM_MAKE_CHAR (c)));
1517                 break;
1518               }
1519             default:
1520               scm_i_string_set_x (buf, len++, chr);
1521               break;
1522             }
1523         }
1524       else
1525         scm_i_string_set_x (buf, len++, chr);
1526
1527       if (len >= scm_i_string_length (buf) - 2)
1528         {
1529           SCM addy;
1530
1531           scm_i_string_stop_writing ();
1532           addy = scm_i_make_string (1024, NULL, 0);
1533           buf = scm_string_append (scm_list_2 (buf, addy));
1534           len = 0;
1535           buf = scm_i_string_start_writing (buf);
1536         }
1537     }
1538
1539  done:
1540   scm_i_string_stop_writing ();
1541   if (chr == EOF)
1542     scm_i_input_error ("scm_read_extended_symbol", port,
1543                        "end of file while reading symbol", SCM_EOL);
1544
1545   return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1546 }
1547
1548
1549 \f
1550 /* Top-level token readers, i.e., dispatchers.  */
1551
1552 static SCM
1553 scm_read_sharp_extension (int chr, SCM port, scm_t_read_opts *opts)
1554 {
1555   SCM proc;
1556
1557   proc = scm_get_hash_procedure (chr);
1558   if (scm_is_true (scm_procedure_p (proc)))
1559     {
1560       long line = SCM_LINUM (port);
1561       int column = SCM_COL (port) - 2;
1562       SCM got;
1563
1564       got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1565
1566       if (opts->record_positions_p && SCM_NIMP (got)
1567           && !scm_i_has_source_properties (got))
1568         scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1569
1570       return got;
1571     }
1572
1573   return SCM_UNSPECIFIED;
1574 }
1575
1576 /* The reader for the sharp `#' character.  It basically dispatches reads
1577    among the above token readers.   */
1578 static SCM
1579 scm_read_sharp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1580                 long line, int column)
1581 #define FUNC_NAME "scm_lreadr"
1582 {
1583   SCM result;
1584
1585   chr = scm_getc_unlocked (port);
1586
1587   result = scm_read_sharp_extension (chr, port, opts);
1588   if (!scm_is_eq (result, SCM_UNSPECIFIED))
1589     return result;
1590
1591   switch (chr)
1592     {
1593     case '\\':
1594       return (scm_read_character (chr, port, opts));
1595     case '(':
1596       return (scm_read_vector (chr, port, opts, line, column));
1597     case 's':
1598     case 'u':
1599     case 'f':
1600     case 'c':
1601       /* This one may return either a boolean or an SRFI-4 vector.  */
1602       return (scm_read_srfi4_vector (chr, port, opts, line, column));
1603     case 'v':
1604       return (scm_read_bytevector (chr, port, opts, line, column));
1605     case '*':
1606       return (scm_read_guile_bit_vector (chr, port, opts, line, column));
1607     case 't':
1608     case 'T':
1609     case 'F':
1610       return (scm_read_boolean (chr, port));
1611     case ':':
1612       return (scm_read_keyword (chr, port, opts));
1613     case '0': case '1': case '2': case '3': case '4':
1614     case '5': case '6': case '7': case '8': case '9':
1615     case '@':
1616       return (scm_read_array (chr, port, opts, line, column));
1617
1618     case 'i':
1619     case 'e':
1620     case 'b':
1621     case 'B':
1622     case 'o':
1623     case 'O':
1624     case 'd':
1625     case 'D':
1626     case 'x':
1627     case 'X':
1628     case 'I':
1629     case 'E':
1630       return (scm_read_number_and_radix (chr, port, opts));
1631     case '{':
1632       return (scm_read_extended_symbol (chr, port));
1633     case '!':
1634       return (scm_read_shebang (chr, port, opts));
1635     case ';':
1636       return (scm_read_commented_expression (chr, port, opts));
1637     case '`':
1638     case '\'':
1639     case ',':
1640       return (scm_read_syntax (chr, port, opts));
1641     case 'n':
1642       return (scm_read_nil (chr, port, opts));
1643     default:
1644       result = scm_read_sharp_extension (chr, port, opts);
1645       if (scm_is_eq (result, SCM_UNSPECIFIED))
1646         {
1647           /* To remain compatible with 1.8 and earlier, the following
1648              characters have lower precedence than `read-hash-extend'
1649              characters.  */
1650           switch (chr)
1651             {
1652             case '|':
1653               return scm_read_r6rs_block_comment (chr, port);
1654             default:
1655               scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1656                                  scm_list_1 (SCM_MAKE_CHAR (chr)));
1657             }
1658         }
1659       else
1660         return result;
1661     }
1662
1663   return SCM_UNSPECIFIED;
1664 }
1665 #undef FUNC_NAME
1666
1667 static SCM
1668 read_inner_expression (SCM port, scm_t_read_opts *opts)
1669 #define FUNC_NAME "read_inner_expression"
1670 {
1671   while (1)
1672     {
1673       scm_t_wchar chr;
1674
1675       chr = scm_getc_unlocked (port);
1676
1677       switch (chr)
1678         {
1679         case SCM_WHITE_SPACES:
1680         case SCM_LINE_INCREMENTORS:
1681           break;
1682         case ';':
1683           (void) scm_read_semicolon_comment (chr, port);
1684           break;
1685         case '{':
1686           if (opts->curly_infix_p)
1687             {
1688               if (opts->neoteric_p)
1689                 return scm_read_sexp (chr, port, opts);
1690               else
1691                 {
1692                   SCM expr;
1693
1694                   /* Enable neoteric expressions within curly braces */
1695                   opts->neoteric_p = 1;
1696                   expr = scm_read_sexp (chr, port, opts);
1697                   opts->neoteric_p = 0;
1698                   return expr;
1699                 }
1700             }
1701           else
1702             return scm_read_mixed_case_symbol (chr, port, opts);
1703         case '[':
1704           if (opts->square_brackets_p)
1705             return scm_read_sexp (chr, port, opts);
1706           else if (opts->curly_infix_p)
1707             {
1708               /* The syntax of neoteric expressions requires that '[' be
1709                  a delimiter when curly-infix is enabled, so it cannot
1710                  be part of an unescaped symbol.  We might as well do
1711                  something useful with it, so we adopt Kawa's convention:
1712                  [...] => ($bracket-list$ ...) */
1713               long line = SCM_LINUM (port);
1714               int column = SCM_COL (port) - 1;
1715               return maybe_annotate_source
1716                 (scm_cons (sym_bracket_list, scm_read_sexp (chr, port, opts)),
1717                  port, opts, line, column);
1718             }
1719           else
1720             return scm_read_mixed_case_symbol (chr, port, opts);
1721         case '(':
1722           return (scm_read_sexp (chr, port, opts));
1723         case '"':
1724           return (scm_read_string (chr, port, opts));
1725         case '\'':
1726         case '`':
1727         case ',':
1728           return (scm_read_quote (chr, port, opts));
1729         case '#':
1730           {
1731             long line  = SCM_LINUM (port);
1732             int column = SCM_COL (port) - 1;
1733             SCM result = scm_read_sharp (chr, port, opts, line, column);
1734             if (scm_is_eq (result, SCM_UNSPECIFIED))
1735               /* We read a comment or some such.  */
1736               break;
1737             else
1738               return result;
1739           }
1740         case ')':
1741           scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1742           break;
1743         case '}':
1744           if (opts->curly_infix_p)
1745             scm_i_input_error (FUNC_NAME, port, "unexpected \"}\"", SCM_EOL);
1746           else
1747             return scm_read_mixed_case_symbol (chr, port, opts);
1748         case ']':
1749           if (opts->square_brackets_p)
1750             scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1751           /* otherwise fall through */
1752         case EOF:
1753           return SCM_EOF_VAL;
1754         case ':':
1755           if (opts->keyword_style == KEYWORD_STYLE_PREFIX)
1756             return scm_symbol_to_keyword (scm_read_expression (port, opts));
1757           /* Fall through.  */
1758
1759         default:
1760           {
1761             if (((chr >= '0') && (chr <= '9'))
1762                 || (strchr ("+-.", chr)))
1763               return (scm_read_number (chr, port, opts));
1764             else
1765               return (scm_read_mixed_case_symbol (chr, port, opts));
1766           }
1767         }
1768     }
1769 }
1770 #undef FUNC_NAME
1771
1772 static SCM
1773 scm_read_expression (SCM port, scm_t_read_opts *opts)
1774 #define FUNC_NAME "scm_read_expression"
1775 {
1776   if (!opts->neoteric_p)
1777     return read_inner_expression (port, opts);
1778   else
1779     {
1780       long line = 0;
1781       int column = 0;
1782       SCM expr;
1783
1784       if (opts->record_positions_p)
1785         {
1786           /* We need to get the position of the first non-whitespace
1787              character in order to correctly annotate neoteric
1788              expressions.  For example, for the expression 'f(x)', the
1789              first call to 'read_inner_expression' reads the 'f' (which
1790              cannot be annotated), and then we later read the '(x)' and
1791              use it to construct the new list (f x). */
1792           int c = flush_ws (port, opts, (char *) NULL);
1793           if (c == EOF)
1794             return SCM_EOF_VAL;
1795           scm_ungetc_unlocked (c, port);
1796           line = SCM_LINUM (port);
1797           column = SCM_COL (port);
1798         }
1799
1800       expr = read_inner_expression (port, opts);
1801
1802       /* 'expr' is the first component of the neoteric expression.  Now
1803          we loop, and as long as the next character is '(', '[', or '{',
1804          (without any intervening whitespace), we use it to construct a
1805          new expression.  For example, f{n - 1}(x) => ((f (- n 1)) x). */
1806       for (;;)
1807         {
1808           int chr = scm_getc_unlocked (port);
1809
1810           if (chr == '(')
1811             /* e(...) => (e ...) */
1812             expr = scm_cons (expr, scm_read_sexp (chr, port, opts));
1813           else if (chr == '[')
1814             /* e[...] => ($bracket-apply$ e ...) */
1815             expr = scm_cons (sym_bracket_apply,
1816                              scm_cons (expr,
1817                                        scm_read_sexp (chr, port, opts)));
1818           else if (chr == '{')
1819             {
1820               SCM arg = scm_read_sexp (chr, port, opts);
1821
1822               if (scm_is_null (arg))
1823                 expr = scm_list_1 (expr);       /* e{} => (e) */
1824               else
1825                 expr = scm_list_2 (expr, arg);  /* e{...} => (e {...}) */
1826             }
1827           else
1828             {
1829               if (chr != EOF)
1830                 scm_ungetc_unlocked (chr, port);
1831               break;
1832             }
1833           maybe_annotate_source (expr, port, opts, line, column);
1834         }
1835       return expr;
1836     }
1837 }
1838 #undef FUNC_NAME
1839
1840 \f
1841 /* Actual reader.  */
1842
1843 static void init_read_options (SCM port, scm_t_read_opts *opts);
1844
1845 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1846             (SCM port),
1847             "Read an s-expression from the input port @var{port}, or from\n"
1848             "the current input port if @var{port} is not specified.\n"
1849             "Any whitespace before the next token is discarded.")
1850 #define FUNC_NAME s_scm_read
1851 {
1852   scm_t_read_opts opts;
1853   int c;
1854
1855   if (SCM_UNBNDP (port))
1856     port = scm_current_input_port ();
1857   SCM_VALIDATE_OPINPORT (1, port);
1858
1859   init_read_options (port, &opts);
1860
1861   c = flush_ws (port, &opts, (char *) NULL);
1862   if (EOF == c)
1863     return SCM_EOF_VAL;
1864   scm_ungetc_unlocked (c, port);
1865
1866   return (scm_read_expression (port, &opts));
1867 }
1868 #undef FUNC_NAME
1869
1870
1871 \f
1872
1873 /* Manipulate the read-hash-procedures alist.  This could be written in
1874    Scheme, but maybe it will also be used by C code during initialisation.  */
1875 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1876             (SCM chr, SCM proc),
1877             "Install the procedure @var{proc} for reading expressions\n"
1878             "starting with the character sequence @code{#} and @var{chr}.\n"
1879             "@var{proc} will be called with two arguments:  the character\n"
1880             "@var{chr} and the port to read further data from. The object\n"
1881             "returned will be the return value of @code{read}. \n"
1882             "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1883             )
1884 #define FUNC_NAME s_scm_read_hash_extend
1885 {
1886   SCM this;
1887   SCM prev;
1888
1889   SCM_VALIDATE_CHAR (1, chr);
1890   SCM_ASSERT (scm_is_false (proc)
1891               || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1892               proc, SCM_ARG2, FUNC_NAME);
1893
1894   /* Check if chr is already in the alist.  */
1895   this = scm_i_read_hash_procedures_ref ();
1896   prev = SCM_BOOL_F;
1897   while (1)
1898     {
1899       if (scm_is_null (this))
1900         {
1901           /* not found, so add it to the beginning.  */
1902           if (scm_is_true (proc))
1903             {
1904               SCM new = scm_cons (scm_cons (chr, proc),
1905                                   scm_i_read_hash_procedures_ref ());
1906               scm_i_read_hash_procedures_set_x (new);
1907             }
1908           break;
1909         }
1910       if (scm_is_eq (chr, SCM_CAAR (this)))
1911         {
1912           /* already in the alist.  */
1913           if (scm_is_false (proc))
1914             {
1915               /* remove it.  */
1916               if (scm_is_false (prev))
1917                 {
1918                   SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1919                   scm_i_read_hash_procedures_set_x (rest);
1920                 }
1921               else
1922                 scm_set_cdr_x (prev, SCM_CDR (this));
1923             }
1924           else
1925             {
1926               /* replace it.  */
1927               scm_set_cdr_x (SCM_CAR (this), proc);
1928             }
1929           break;
1930         }
1931       prev = this;
1932       this = SCM_CDR (this);
1933     }
1934
1935   return SCM_UNSPECIFIED;
1936 }
1937 #undef FUNC_NAME
1938
1939 /* Recover the read-hash procedure corresponding to char c.  */
1940 static SCM
1941 scm_get_hash_procedure (int c)
1942 {
1943   SCM rest = scm_i_read_hash_procedures_ref ();
1944
1945   while (1)
1946     {
1947       if (scm_is_null (rest))
1948         return SCM_BOOL_F;
1949
1950       if (SCM_CHAR (SCM_CAAR (rest)) == c)
1951         return SCM_CDAR (rest);
1952
1953       rest = SCM_CDR (rest);
1954     }
1955 }
1956
1957 #define SCM_ENCODING_SEARCH_SIZE (500)
1958
1959 /* Search the first few hundred characters of a file for an Emacs-like coding
1960    declaration.  Returns either NULL or a string whose storage has been
1961    allocated with `scm_gc_malloc ()'.  */
1962 char *
1963 scm_i_scan_for_encoding (SCM port)
1964 {
1965   scm_t_port *pt;
1966   char header[SCM_ENCODING_SEARCH_SIZE+1];
1967   size_t bytes_read, encoding_length, i;
1968   char *encoding = NULL;
1969   int utf8_bom = 0;
1970   char *pos, *encoding_start;
1971   int in_comment;
1972
1973   pt = SCM_PTAB_ENTRY (port);
1974
1975   if (pt->rw_active == SCM_PORT_WRITE)
1976     scm_flush_unlocked (port);
1977
1978   if (pt->rw_random)
1979     pt->rw_active = SCM_PORT_READ;
1980
1981   if (pt->read_pos == pt->read_end)
1982     {
1983       /* We can use the read buffer, and thus avoid a seek. */
1984       if (scm_fill_input_unlocked (port) == EOF)
1985         return NULL;
1986
1987       bytes_read = pt->read_end - pt->read_pos;
1988       if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
1989         bytes_read = SCM_ENCODING_SEARCH_SIZE;
1990
1991       if (bytes_read <= 1)
1992         /* An unbuffered port -- don't scan.  */
1993         return NULL;
1994
1995       memcpy (header, pt->read_pos, bytes_read);
1996       header[bytes_read] = '\0';
1997     }
1998   else
1999     {
2000       /* Try to read some bytes and then seek back.  Not all ports
2001          support seeking back; and indeed some file ports (like
2002          /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
2003          check performed by SCM_FPORT_FDES---but fail to seek
2004          backwards.  Hence this block comes second.  We prefer to use
2005          the read buffer in-place.  */
2006       if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
2007         return NULL;
2008
2009       bytes_read = scm_c_read_unlocked (port, header, SCM_ENCODING_SEARCH_SIZE);
2010       header[bytes_read] = '\0';
2011       scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
2012     }
2013
2014   if (bytes_read > 3
2015       && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
2016     utf8_bom = 1;
2017
2018   /* search past "coding[:=]" */
2019   pos = header;
2020   while (1)
2021     {
2022       if ((pos = strstr(pos, "coding")) == NULL)
2023         return NULL;
2024
2025       pos += strlen("coding");
2026       if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
2027           (*pos == ':' || *pos == '='))
2028         {
2029           pos ++;
2030           break;
2031         }
2032     }
2033
2034   /* skip spaces */
2035   while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
2036          (*pos == ' ' || *pos == '\t'))
2037     pos ++;
2038
2039   /* grab the next token */
2040   encoding_start = pos;
2041   i = 0;
2042   while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
2043          && encoding_start + i - header < bytes_read
2044          && (isalnum ((int) encoding_start[i])
2045              || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
2046     i++;
2047
2048   encoding_length = i;
2049   if (encoding_length == 0)
2050     return NULL;
2051
2052   encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
2053   for (i = 0; i < encoding_length; i++)
2054     encoding[i] = toupper ((int) encoding[i]);
2055
2056   /* push backwards to make sure we were in a comment */
2057   in_comment = 0;
2058   pos = encoding_start;
2059   while (pos >= header)
2060     {
2061       if (*pos == ';')
2062         {
2063           in_comment = 1;
2064           break;
2065         }
2066       else if (*pos == '\n' || pos == header)
2067         {
2068           /* This wasn't in a semicolon comment. Check for a
2069            hash-bang comment. */
2070           char *beg = strstr (header, "#!");
2071           char *end = strstr (header, "!#");
2072           if (beg < encoding_start && encoding_start + encoding_length <= end)
2073             in_comment = 1;
2074           break;
2075         }
2076       else
2077         {
2078           pos --;
2079           continue;
2080         }
2081     }
2082   if (!in_comment)
2083     /* This wasn't in a comment */
2084     return NULL;
2085
2086   if (utf8_bom && strcmp(encoding, "UTF-8"))
2087     scm_misc_error (NULL,
2088                     "the port input declares the encoding ~s but is encoded as UTF-8",
2089                     scm_list_1 (scm_from_locale_string (encoding)));
2090
2091   return encoding;
2092 }
2093
2094 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
2095             (SCM port),
2096             "Scans the port for an Emacs-like character coding declaration\n"
2097             "near the top of the contents of a port with random-accessible contents.\n"
2098             "The coding declaration is of the form\n"
2099             "@code{coding: XXXXX} and must appear in a scheme comment.\n"
2100             "\n"
2101             "Returns a string containing the character encoding of the file\n"
2102             "if a declaration was found, or @code{#f} otherwise.\n")
2103 #define FUNC_NAME s_scm_file_encoding
2104 {
2105   char *enc;
2106   SCM s_enc;
2107
2108   SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
2109
2110   enc = scm_i_scan_for_encoding (port);
2111   if (enc == NULL)
2112     return SCM_BOOL_F;
2113   else
2114     {
2115       s_enc = scm_from_locale_string (enc);
2116       return s_enc;
2117     }
2118
2119   return SCM_BOOL_F;
2120 }
2121 #undef FUNC_NAME
2122
2123 \f
2124 /* Per-port read options.
2125
2126    We store per-port read options in the 'port-read-options' key of the
2127    port's alist.  The value stored in the alist is a single integer that
2128    contains a two-bit field for each read option.
2129
2130    If a bit field contains READ_OPTION_INHERIT (3), that indicates that
2131    the applicable value should be inherited from the corresponding
2132    global read option.  Otherwise, the bit field contains the value of
2133    the read option.  For boolean read options that have been set
2134    per-port, the possible values are 0 or 1.  If the 'keyword_style'
2135    read option has been set per-port, its possible values are those in
2136    'enum t_keyword_style'. */
2137
2138 /* Key to read options in per-port alists. */
2139 SCM_SYMBOL (sym_port_read_options, "port-read-options");
2140
2141 /* Offsets of bit fields for each per-port override */
2142 #define READ_OPTION_COPY_SOURCE_P          0
2143 #define READ_OPTION_RECORD_POSITIONS_P     2
2144 #define READ_OPTION_CASE_INSENSITIVE_P     4
2145 #define READ_OPTION_KEYWORD_STYLE          6
2146 #define READ_OPTION_R6RS_ESCAPES_P         8
2147 #define READ_OPTION_SQUARE_BRACKETS_P     10
2148 #define READ_OPTION_HUNGRY_EOL_ESCAPES_P  12
2149 #define READ_OPTION_CURLY_INFIX_P         14
2150
2151 /* The total width in bits of the per-port overrides */
2152 #define READ_OPTIONS_NUM_BITS             16
2153
2154 #define READ_OPTIONS_INHERIT_ALL  ((1UL << READ_OPTIONS_NUM_BITS) - 1)
2155 #define READ_OPTIONS_MAX_VALUE    READ_OPTIONS_INHERIT_ALL
2156
2157 #define READ_OPTION_MASK     3
2158 #define READ_OPTION_INHERIT  3
2159
2160 static void
2161 set_port_read_option (SCM port, int option, int new_value)
2162 {
2163   SCM scm_read_options;
2164   unsigned int read_options;
2165
2166   new_value &= READ_OPTION_MASK;
2167   scm_read_options = scm_assq_ref (SCM_PTAB_ENTRY(port)->alist,
2168                                    sym_port_read_options);
2169   if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2170     read_options = scm_to_uint (scm_read_options);
2171   else
2172     read_options = READ_OPTIONS_INHERIT_ALL;
2173   read_options &= ~(READ_OPTION_MASK << option);
2174   read_options |= new_value << option;
2175   scm_read_options = scm_from_uint (read_options);
2176   SCM_PTAB_ENTRY(port)->alist = scm_assq_set_x (SCM_PTAB_ENTRY(port)->alist,
2177                                                 sym_port_read_options,
2178                                                 scm_read_options);
2179 }
2180
2181 /* Set OPTS and PORT's case-insensitivity according to VALUE. */
2182 static void
2183 set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts, int value)
2184 {
2185   value = !!value;
2186   opts->case_insensitive_p = value;
2187   set_port_read_option (port, READ_OPTION_CASE_INSENSITIVE_P, value);
2188 }
2189
2190 /* Set OPTS and PORT's square_brackets_p option according to VALUE. */
2191 static void
2192 set_port_square_brackets_p (SCM port, scm_t_read_opts *opts, int value)
2193 {
2194   value = !!value;
2195   opts->square_brackets_p = value;
2196   set_port_read_option (port, READ_OPTION_SQUARE_BRACKETS_P, value);
2197 }
2198
2199 /* Set OPTS and PORT's curly_infix_p option according to VALUE. */
2200 static void
2201 set_port_curly_infix_p (SCM port, scm_t_read_opts *opts, int value)
2202 {
2203   value = !!value;
2204   opts->curly_infix_p = value;
2205   set_port_read_option (port, READ_OPTION_CURLY_INFIX_P, value);
2206 }
2207
2208 /* Initialize OPTS based on PORT's read options and the global read
2209    options. */
2210 static void
2211 init_read_options (SCM port, scm_t_read_opts *opts)
2212 {
2213   SCM val, scm_read_options;
2214   unsigned int read_options, x;
2215
2216   scm_read_options = scm_assq_ref (SCM_PTAB_ENTRY(port)->alist,
2217                                    sym_port_read_options);
2218
2219   if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2220     read_options = scm_to_uint (scm_read_options);
2221   else
2222     read_options = READ_OPTIONS_INHERIT_ALL;
2223
2224   x = READ_OPTION_MASK & (read_options >> READ_OPTION_KEYWORD_STYLE);
2225   if (x == READ_OPTION_INHERIT)
2226     {
2227       val = SCM_PACK (SCM_KEYWORD_STYLE);
2228       if (scm_is_eq (val, scm_keyword_prefix))
2229         x = KEYWORD_STYLE_PREFIX;
2230       else if (scm_is_eq (val, scm_keyword_postfix))
2231         x = KEYWORD_STYLE_POSTFIX;
2232       else
2233         x = KEYWORD_STYLE_HASH_PREFIX;
2234     }
2235   opts->keyword_style = x;
2236
2237 #define RESOLVE_BOOLEAN_OPTION(NAME, name)                              \
2238   do                                                                    \
2239     {                                                                   \
2240       x = READ_OPTION_MASK & (read_options >> READ_OPTION_ ## NAME);    \
2241       if (x == READ_OPTION_INHERIT)                                     \
2242         x = !!SCM_ ## NAME;                                             \
2243           opts->name = x;                                               \
2244     }                                                                   \
2245   while (0)
2246
2247   RESOLVE_BOOLEAN_OPTION (COPY_SOURCE_P,        copy_source_p);
2248   RESOLVE_BOOLEAN_OPTION (RECORD_POSITIONS_P,   record_positions_p);
2249   RESOLVE_BOOLEAN_OPTION (CASE_INSENSITIVE_P,   case_insensitive_p);
2250   RESOLVE_BOOLEAN_OPTION (R6RS_ESCAPES_P,       r6rs_escapes_p);
2251   RESOLVE_BOOLEAN_OPTION (SQUARE_BRACKETS_P,    square_brackets_p);
2252   RESOLVE_BOOLEAN_OPTION (HUNGRY_EOL_ESCAPES_P, hungry_eol_escapes_p);
2253   RESOLVE_BOOLEAN_OPTION (CURLY_INFIX_P,        curly_infix_p);
2254
2255 #undef RESOLVE_BOOLEAN_OPTION
2256
2257   opts->neoteric_p = 0;
2258 }
2259
2260 void
2261 scm_init_read ()
2262 {
2263   SCM read_hash_procs;
2264
2265   read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
2266
2267   scm_i_read_hash_procedures =
2268     SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
2269
2270   scm_init_opts (scm_read_options, scm_read_opts);
2271 #include "libguile/read.x"
2272 }
2273
2274 /*
2275   Local Variables:
2276   c-file-style: "gnu"
2277   End:
2278 */