libguile/read.c

   1 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
   2  *   2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public License
   6  * as published by the Free Software Foundation; either version 3 of
   7  * the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful, but
  10  * WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  17  * 02110-1301 USA
  18  */
  19
  20
  21 \f
  22
  23 #ifdef HAVE_CONFIG_H
  24 # include <config.h>
  25 #endif
  26
  27 #include <stdio.h>
  28 #include <ctype.h>
  29 #include <string.h>
  30 #include <unistd.h>
  31 #include <unicase.h>
  32 #include <unictype.h>
  33
  34 #include "libguile/_scm.h"
  35 #include "libguile/bytevectors.h"
  36 #include "libguile/chars.h"
  37 #include "libguile/eval.h"
  38 #include "libguile/arrays.h"
  39 #include "libguile/bitvectors.h"
  40 #include "libguile/keywords.h"
  41 #include "libguile/alist.h"
  42 #include "libguile/srcprop.h"
  43 #include "libguile/hashtab.h"
  44 #include "libguile/hash.h"
  45 #include "libguile/ports.h"
  46 #include "libguile/fports.h"
  47 #include "libguile/root.h"
  48 #include "libguile/strings.h"
  49 #include "libguile/strports.h"
  50 #include "libguile/vectors.h"
  51 #include "libguile/validate.h"
  52 #include "libguile/srfi-4.h"
  53 #include "libguile/srfi-13.h"
  54
  55 #include "libguile/read.h"
  56 #include "libguile/private-options.h"
  57
  58
  59 \f
  60
  /* Symbols used by the reader.  scm_sym_dot is the `.' symbol that
     scm_read_sexp compares against to recognize dotted pairs, and
     sym_nil is the symbol scm_read_nil requires when reading `#nil'.
     NOTE(review): scm_keyword_prefix and scm_keyword_postfix are
     presumably the symbols accepted as values of the `keywords' read
     option -- confirm against the code that sets up the read options.  */
  61 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
  62 SCM_SYMBOL (scm_keyword_prefix, "prefix");
  63 SCM_SYMBOL (scm_keyword_postfix, "postfix");
  64 SCM_SYMBOL (sym_nil, "nil");
  65
  66 /* SRFI-105 curly infix expression support.  scm_read_sexp conses
        sym_nfx onto curly-infix lists that are not simple infix lists.
        NOTE(review): sym_bracket_list and sym_bracket_apply are not used
        in this part of the file -- confirm their role at their uses.  */
  67 SCM_SYMBOL (sym_nfx, "$nfx$");
  68 SCM_SYMBOL (sym_bracket_list, "$bracket-list$");
  69 SCM_SYMBOL (sym_bracket_apply, "$bracket-apply$");
  70
  /* Table of the user-visible read options; `read-options-interface'
     passes it to scm_options.  Each entry lists an option type, the
     option name, a value and a documentation string, and the table is
     terminated by a zeroed entry.
     NOTE(review): the third field looks like the option's initial
     value, and SCM_COPY_SOURCE_P / SCM_RECORD_POSITIONS_P presumably
     index into this table by position -- confirm before reordering or
     inserting entries.  */
  71 scm_t_option scm_read_opts[] =
  72   {
  73     { SCM_OPTION_BOOLEAN, "copy", 0,
  74       "Copy source code expressions." },
  75     { SCM_OPTION_BOOLEAN, "positions", 1,
  76       "Record positions of source code expressions." },
  77     { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
  78       "Convert symbols to lower case."},
  79     { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
  80       "Style of keyword recognition: #f, 'prefix or 'postfix."},
  81     { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
  82       "Use R6RS variable-length character and string hex escapes."},
  83     { SCM_OPTION_BOOLEAN, "square-brackets", 1,
  84       "Treat `[' and `]' as parentheses, for R6RS compatibility."},
  85     { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
  86       "In strings, consume leading whitespace after an escaped end-of-line."},
  87     { SCM_OPTION_BOOLEAN, "curly-infix", 0,
  88       "Support SRFI-105 curly infix expressions."},
  89     { 0, },
  90   };
  91
  /* Reader options in internal form.  `scm_read' initializes one of
     these from the global and per-port read options, and a pointer to
     it is handed down to every helper function.  */

/* Keyword recognition styles.  KEYWORD_STYLE_POSTFIX makes
   scm_read_mixed_case_symbol read `name:' as a keyword.
   NOTE(review): the other two presumably correspond to `#:name' only
   and to `:name' -- confirm where keyword_style is assigned.  */
enum t_keyword_style
  {
    KEYWORD_STYLE_HASH_PREFIX = 0,
    KEYWORD_STYLE_PREFIX      = 1,
    KEYWORD_STYLE_POSTFIX     = 2
  };

typedef struct t_read_opts
{
  enum t_keyword_style keyword_style;

  /* NOTE(review): not consulted in this part of the file; presumably
     mirrors the "copy" read option -- confirm.  */
  unsigned int copy_source_p        : 1;

  /* When set, maybe_annotate_source attaches source properties.  */
  unsigned int record_positions_p   : 1;

  /* When set, symbol names are downcased before being interned.  */
  unsigned int case_insensitive_p   : 1;

  /* When set, `\x' string escapes take up to ten hex digits ended by
     `;', and `\u' and `\U' are rejected.  */
  unsigned int r6rs_escapes_p       : 1;

  /* When set, `[' and `]' delimit tokens.  */
  unsigned int square_brackets_p    : 1;

  /* When set, whitespace following an escaped newline inside a string
     is skipped.  */
  unsigned int hungry_eol_escapes_p : 1;

  /* When set, `{', `}', `[' and `]' delimit tokens and `{...}' lists
     are read as curly-infix lists.  */
  unsigned int curly_infix_p        : 1;

  /* NOTE(review): not consulted in this part of the file -- confirm
     its meaning at its uses.  */
  unsigned int neoteric_p           : 1;
} scm_t_read_opts;
 117
 118
 119 /*
 120   Give meaningful error messages for errors
 121
 122   We use the format
 123
 124   FILE:LINE:COL: MESSAGE
 125   This happened in ....
 126
 127   This is not standard GNU format, but the test-suite likes the real
 128   message to be in front.
 129
 130  */
 131
 132
 133 void
 134 scm_i_input_error (char const *function,
 135                    SCM port, const char *message, SCM arg)
 136 {
 137   SCM fn = (scm_is_string (SCM_FILENAME(port))
 138             ? SCM_FILENAME(port)
 139             : scm_from_locale_string ("#<unknown port>"));
 140
 141   SCM string_port = scm_open_output_string ();
 142   SCM string = SCM_EOL;
 143   scm_simple_format (string_port,
 144                      scm_from_locale_string ("~A:~S:~S: ~A"),
 145                      scm_list_4 (fn,
 146                                  scm_from_long (SCM_LINUM (port) + 1),
 147                                  scm_from_int (SCM_COL (port) + 1),
 148                                  scm_from_locale_string (message)));
 149
 150   string = scm_get_output_string (string_port);
 151   scm_close_output_port (string_port);
 152   scm_error_scm (scm_from_latin1_symbol ("read-error"),
 153                  function? scm_from_locale_string (function) : SCM_BOOL_F,
 154                  string,
 155                  arg,
 156                  SCM_BOOL_F);
 157 }
 158
 159
 160 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
 161             (SCM setting),
 162             "Option interface for the read options. Instead of using\n"
 163             "this procedure directly, use the procedures @code{read-enable},\n"
 164             "@code{read-disable}, @code{read-set!} and @code{read-options}.")
 165 #define FUNC_NAME s_scm_read_options
 166 {
 167   SCM ans = scm_options (setting,
 168                          scm_read_opts,
 169                          FUNC_NAME);
 170   if (SCM_COPY_SOURCE_P)
 171     SCM_RECORD_POSITIONS_P = 1;
 172   return ans;
 173 }
 174 #undef FUNC_NAME
 175
 176 /* A fluid referring to an association list mapping extra hash
 177    characters to procedures.  */
 178 static SCM *scm_i_read_hash_procedures;
 179
 180 static SCM
 181 scm_i_read_hash_procedures_ref (void)
 182 {
 183   return scm_fluid_ref (*scm_i_read_hash_procedures);
 184 }
 185
 186 static void
 187 scm_i_read_hash_procedures_set_x (SCM value)
 188 {
 189   scm_fluid_set_x (*scm_i_read_hash_procedures, value);
 190 }
 191
 192 \f
 193 /* Token readers.  */
 194
 195
 196 /* Size, in bytes, of the stack buffer used to read symbols and
        numbers.  Longer tokens are collected by read_complete_token in
        a GC-allocated buffer instead.  */
 197 #define READER_BUFFER_SIZE            128
 198
 199 /* Number of 32-bit codepoints (scm_t_wchar elements) in the stack
        buffer scm_read_string fills before turning it into a string
        chunk.  */
 200 #define READER_STRING_BUFFER_SIZE     128
 201
 202 /* The maximum size, in bytes, of Scheme character names;
        scm_read_character reads names into a buffer of this size and
        signals an error when the name does not fit.  */
 203 #define READER_CHAR_NAME_MAX_SIZE      50
 204
 205 /* The maximum size of reader directive names.  */
 206 #define READER_DIRECTIVE_NAME_MAX_SIZE 50
 207
 208
 /* `isblank' is only in C99.  CHAR_IS_BLANK_ is true for space, tab,
    newline, form feed and carriage return.  The argument is evaluated
    several times, so it must not have side effects.  */
#define CHAR_IS_BLANK_(_chr)                                    \
  (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n')      \
   || ((_chr) == '\f') || ((_chr) == '\r'))

/* On MSDOS, character 26 (Ctrl-Z) is treated as blank too.  The inner
   macro must be given the macro parameter `_chr': naming `chr' there
   would silently capture whatever variable called `chr' happens to be
   in scope at the use site, or fail to compile when there is none.  */
#ifdef MSDOS
# define CHAR_IS_BLANK(_chr)                    \
  ((CHAR_IS_BLANK_ (_chr)) || ((_chr) == 26))
#else
# define CHAR_IS_BLANK CHAR_IS_BLANK_
#endif
 220
 221
 222 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
        structure''): blanks, parentheses, `;' and `"'.  The argument is
        evaluated several times.  */
 224 #define CHAR_IS_R5RS_DELIMITER(c)                               \
 225   (CHAR_IS_BLANK (c)                                            \
 226    || (c) == ')' || (c) == '(' || (c) == ';' || (c) == '"')
 227
 /* Delimiters under the current read options: the R5RS delimiters,
    plus `[' and `]' when square brackets or curly-infix are enabled,
    plus `{' and `}' when curly-infix is enabled.  This macro refers to
    a variable named `opts' that must be in scope at the use site and
    point to the read options.  The argument is evaluated several
    times.  */
 228 #define CHAR_IS_DELIMITER(c)                                    \
 229   (CHAR_IS_R5RS_DELIMITER (c)                                   \
 230    || (((c) == ']' || (c) == '[') && (opts->square_brackets_p   \
 231                                       || opts->curly_infix_p))  \
 232    || (((c) == '}' || (c) == '{') && opts->curly_infix_p))
 233
 234 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
        Structure''.  Only the lower-case letters are matched.  */
 236 #define CHAR_IS_EXPONENT_MARKER(_chr)                           \
 237   (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f')        \
 238    || ((_chr) == 'd') || ((_chr) == 'l'))
 239
 240 /* Forward declarations of helpers whose definitions are not in this
        part of the file.  flush_ws calls scm_read_shebang after `#!',
        scm_read_commented_expression after `#;' and, after `#|',
        scm_read_r6rs_block_comment unless scm_get_hash_procedure returns
        a true value for `|'.
        NOTE(review): scm_read_scsh_block_comment presumably reads an
        SCSH block comment, as the comment previously here said --
        confirm at its definition.  */
 241 static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
 242 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
 243 static SCM scm_read_commented_expression (scm_t_wchar, SCM, scm_t_read_opts *);
 244 static SCM scm_read_shebang (scm_t_wchar, SCM, scm_t_read_opts *);
 245 static SCM scm_get_hash_procedure (int);
 246
 247 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 248    result in the pre-allocated buffer BUF.  Return zero if the whole token has
 249    fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
 250    bytes actually read.  */
 251 static int
 252 read_token (SCM port, scm_t_read_opts *opts,
 253             char *buf, size_t buf_size, size_t *read)
 254 {
 255    *read = 0;
 256
 257    while (*read < buf_size)
 258      {
 259        int chr;
 260
 261        chr = scm_get_byte_or_eof (port);
 262
 263        if (chr == EOF)
 264         return 0;
 265       else if (CHAR_IS_DELIMITER (chr))
 266         {
 267           scm_unget_byte (chr, port);
 268           return 0;
 269         }
 270       else
 271         {
 272           *buf = (char) chr;
 273           buf++, (*read)++;
 274         }
 275      }
 276
 277    return 1;
 278  }
 279
 280 /* Like `read_token', but return either BUFFER, or a GC-allocated buffer
 281    if the token doesn't fit in BUFFER_SIZE bytes.  */
 282 static char *
 283 read_complete_token (SCM port, scm_t_read_opts *opts,
 284                      char *buffer, size_t buffer_size, size_t *read)
 285 {
 286   int overflow = 0;
 287   size_t bytes_read, overflow_size = 0;
 288   char *overflow_buffer = NULL;
 289
 290   do
 291     {
 292       overflow = read_token (port, opts, buffer, buffer_size, &bytes_read);
 293       if (bytes_read == 0)
 294         break;
 295       if (overflow || overflow_size != 0)
 296         {
 297           if (overflow_size == 0)
 298             {
 299               overflow_buffer = scm_gc_malloc_pointerless (bytes_read, "read");
 300               memcpy (overflow_buffer, buffer, bytes_read);
 301               overflow_size = bytes_read;
 302             }
 303           else
 304             {
 305               char *new_buf =
 306                 scm_gc_malloc_pointerless (overflow_size + bytes_read, "read");
 307
 308               memcpy (new_buf, overflow_buffer, overflow_size);
 309               memcpy (new_buf + overflow_size, buffer, bytes_read);
 310
 311               overflow_buffer = new_buf;
 312               overflow_size += bytes_read;
 313             }
 314         }
 315     }
 316   while (overflow);
 317
 318   if (overflow_size)
 319     *read = overflow_size;
 320   else
 321     *read = bytes_read;
 322
 323   return (overflow_size > 0 ? overflow_buffer : buffer);
 324 }
 325
 326 /* Skip whitespace from PORT and return the first non-whitespace character
 327    read.  Raise an error on end-of-file.  */
 328 static int
 329 flush_ws (SCM port, scm_t_read_opts *opts, const char *eoferr)
 330 {
 331   scm_t_wchar c;
 332   while (1)
 333     switch (c = scm_getc (port))
 334       {
 335       case EOF:
 336       goteof:
 337         if (eoferr)
 338           {
 339             scm_i_input_error (eoferr,
 340                                port,
 341                                "end of file",
 342                                SCM_EOL);
 343           }
 344         return c;
 345
 346       case ';':
 347       lp:
 348         switch (c = scm_getc (port))
 349           {
 350           case EOF:
 351             goto goteof;
 352           default:
 353             goto lp;
 354           case SCM_LINE_INCREMENTORS:
 355             break;
 356           }
 357         break;
 358
 359       case '#':
 360         switch (c = scm_getc (port))
 361           {
 362           case EOF:
 363             eoferr = "read_sharp";
 364             goto goteof;
 365           case '!':
 366             scm_read_shebang (c, port, opts);
 367             break;
 368           case ';':
 369             scm_read_commented_expression (c, port, opts);
 370             break;
 371           case '|':
 372             if (scm_is_false (scm_get_hash_procedure (c)))
 373               {
 374                 scm_read_r6rs_block_comment (c, port);
 375                 break;
 376               }
 377             /* fall through */
 378           default:
 379             scm_ungetc (c, port);
 380             return '#';
 381           }
 382         break;
 383
 384       case SCM_LINE_INCREMENTORS:
 385       case SCM_SINGLE_SPACES:
 386       case '\t':
 387         break;
 388
 389       default:
 390         return c;
 391       }
 392
 393   return 0;
 394 }
 395
 396
 397 \f
 398 /* Token readers.  scm_read_expression is declared ahead of its
        definition because the readers below call it to read nested
        data.  NOTE(review): scm_read_sharp is not called in this part
        of the file -- confirm its callers.  */
 399
 400 static SCM scm_read_expression (SCM port, scm_t_read_opts *opts);
 401 static SCM scm_read_sharp (int chr, SCM port, scm_t_read_opts *opts,
 402                            long line, int column);
 403
 404
 405 static SCM
 406 maybe_annotate_source (SCM x, SCM port, scm_t_read_opts *opts,
 407                        long line, int column)
 408 {
 409   if (opts->record_positions_p)
 410     scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
 411   return x;
 412 }
 413
 414 static SCM
 415 scm_read_sexp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 416 #define FUNC_NAME "scm_i_lreadparen"
 417 {
 418   int c;
 419   SCM tmp, tl, ans = SCM_EOL;
 420   const int curly_list_p = (chr == '{') && opts->curly_infix_p;
 421   const int terminating_char = ((chr == '{') ? '}'
 422                                 : ((chr == '[') ? ']'
 423                                    : ')'));
 424
 425   /* Need to capture line and column numbers here. */
 426   long line = SCM_LINUM (port);
 427   int column = SCM_COL (port) - 1;
 428
 429   c = flush_ws (port, opts, FUNC_NAME);
 430   if (terminating_char == c)
 431     return SCM_EOL;
 432
 433   scm_ungetc (c, port);
 434   tmp = scm_read_expression (port, opts);
 435
 436   /* Note that it is possible for scm_read_expression to return
 437      scm_sym_dot, but not as part of a dotted pair: as in #{.}#.  So
 438      check that it's a real dot by checking `c'.  */
 439   if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
 440     {
 441       ans = scm_read_expression (port, opts);
 442       if (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
 443         scm_i_input_error (FUNC_NAME, port, "missing close paren",
 444                            SCM_EOL);
 445       return ans;
 446     }
 447
 448   /* Build the head of the list structure. */
 449   ans = tl = scm_cons (tmp, SCM_EOL);
 450
 451   while (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
 452     {
 453       SCM new_tail;
 454
 455       if (c == ')' || (c == ']' && opts->square_brackets_p)
 456           || ((c == '}' || c == ']') && opts->curly_infix_p))
 457         scm_i_input_error (FUNC_NAME, port,
 458                            "in pair: mismatched close paren: ~A",
 459                            scm_list_1 (SCM_MAKE_CHAR (c)));
 460
 461       scm_ungetc (c, port);
 462       tmp = scm_read_expression (port, opts);
 463
 464       /* See above note about scm_sym_dot.  */
 465       if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
 466         {
 467           SCM_SETCDR (tl, scm_read_expression (port, opts));
 468
 469           c = flush_ws (port, opts, FUNC_NAME);
 470           if (terminating_char != c)
 471             scm_i_input_error (FUNC_NAME, port,
 472                                "in pair: missing close paren", SCM_EOL);
 473           break;
 474         }
 475
 476       new_tail = scm_cons (tmp, SCM_EOL);
 477       SCM_SETCDR (tl, new_tail);
 478       tl = new_tail;
 479     }
 480
 481   if (curly_list_p)
 482     {
 483       /* In addition to finding the length, 'scm_ilength' checks for
 484          improper or circular lists, in which case it returns -1. */
 485       int len = scm_ilength (ans);
 486
 487       /* The (len == 0) case is handled above */
 488       if (len == 1)
 489         /* Return directly to avoid re-annotating the element's source
 490            location with the position of the outer brace.  Also, it
 491            might not be possible to annotate the element. */
 492         return scm_car (ans);  /* {e} => e */
 493       else if (len == 2)
 494         ;  /* Leave the list unchanged: {e1 e2} => (e1 e2) */
 495       else if (len >= 3 && (len & 1))
 496         {
 497           /* It's a proper list whose length is odd and at least 3.  If
 498              the elements at odd indices (the infix operator positions)
 499              are all 'equal?', then it's a simple curly-infix list.
 500              Otherwise it's a mixed curly-infix list. */
 501           SCM op = scm_cadr (ans);
 502
 503           /* Check to see if the elements at odd indices are 'equal?' */
 504           for (tl = scm_cdddr (ans); ; tl = scm_cddr (tl))
 505             {
 506               if (scm_is_null (tl))
 507                 {
 508                   /* Convert simple curly-infix list to prefix:
 509                      {a <op> b <op> ...} => (<op> a b ...) */
 510                   tl = ans;
 511                   while (scm_is_pair (scm_cdr (tl)))
 512                     {
 513                       tmp = scm_cddr (tl);
 514                       SCM_SETCDR (tl, tmp);
 515                       tl = tmp;
 516                     }
 517                   ans = scm_cons (op, ans);
 518                   break;
 519                 }
 520               else if (scm_is_false (scm_equal_p (op, scm_car (tl))))
 521                 {
 522                   /* Mixed curly-infix list: {e ...} => ($nfx$ e ...) */
 523                   ans = scm_cons (sym_nfx, ans);
 524                   break;
 525                 }
 526             }
 527         }
 528       else
 529         /* Mixed curly-infix (possibly improper) list:
 530            {e . tail} => ($nfx$ e . tail) */
 531         ans = scm_cons (sym_nfx, ans);
 532     }
 533
 534   return maybe_annotate_source (ans, port, opts, line, column);
 535 }
 536 #undef FUNC_NAME
 537
 538
 539 /* Read a hexadecimal number of at most NDIGITS digits with scm_getc
        and put its value into the variable C.  If TERMINATOR is non-null,
        stop early when the TERMINATOR character is read after at least
        one digit; the terminator is consumed.  When NDIGITS digits have
        been read, no further character is consumed.

        This macro relies on its expansion site: `c' and `port' must be
        variables in scope, and the labels `str_eof' and `bad_escaped'
        must exist.  It jumps to `str_eof' on end of file, and to
        `bad_escaped', with C set to the offending character, when a
        character is not a hexadecimal digit.  */
 542 #define SCM_READ_HEX_ESCAPE(ndigits, terminator)                   \
 543   do                                                               \
 544     {                                                              \
 545       scm_t_wchar a;                                               \
 546       size_t i = 0;                                                \
 547       c = 0;                                                       \
 548       while (i < ndigits)                                          \
 549         {                                                          \
 550           a = scm_getc (port);                                     \
 551           if (a == EOF)                                            \
 552             goto str_eof;                                          \
 553           if (terminator                                           \
 554               && (a == (scm_t_wchar) terminator)                   \
 555               && (i > 0))                                          \
 556             break;                                                 \
 557           if ('0' <= a && a <= '9')                                \
 558             a -= '0';                                              \
 559           else if ('A' <= a && a <= 'F')                           \
 560             a = a - 'A' + 10;                                      \
 561           else if ('a' <= a && a <= 'f')                           \
 562             a = a - 'a' + 10;                                      \
 563           else                                                     \
 564             {                                                      \
 565               c = a;                                               \
 566               goto bad_escaped;                                    \
 567             }                                                      \
 568           c = c * 16 + a;                                          \
 569           i ++;                                                    \
 570         }                                                          \
 571     } while (0)
 572
 573 static void
 574 skip_intraline_whitespace (SCM port)
 575 {
 576   scm_t_wchar c;
 577
 578   do
 579     {
 580       c = scm_getc (port);
 581       if (c == EOF)
 582         return;
 583     }
 584   while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
 585
 586   scm_ungetc (c, port);
 587 }
 588
 /* Read characters from PORT up to the next unescaped `"' and return
    them as a string, annotated with its source position when positions
    are recorded.  CHR is not used.
    NOTE(review): the opening `"' has presumably been consumed by the
    caller already -- confirm at the call site.

    Backslash escapes are decoded as listed in the switch below.  An
    error is signalled on end of file inside the string and on an
    unknown escape or malformed hexadecimal escape.  */
 589 static SCM
 590 scm_read_string (int chr, SCM port, scm_t_read_opts *opts)
 591 #define FUNC_NAME "scm_lreadr"
 592 {
 593   /* For strings smaller than C_STR, this function creates only one Scheme
 594      object (the string returned).  */
 595
 /* STR collects, in reverse order, the string chunks flushed from
    C_STR so far.  */
 596   SCM str = SCM_EOL;
 597   size_t c_str_len = 0;
 598   scm_t_wchar c, c_str[READER_STRING_BUFFER_SIZE];
 599
 600   /* Need to capture line and column numbers here. */
 601   long line = SCM_LINUM (port);
 602   int column = SCM_COL (port) - 1;
 603
 604   while ('"' != (c = scm_getc (port)))
 605     {
 606       if (c == EOF)
 607         {
 /* Also the jump target of `case EOF' below and of
    SCM_READ_HEX_ESCAPE.  */
 608         str_eof:
 609           scm_i_input_error (FUNC_NAME, port,
 610                              "end of file in string constant", SCM_EOL);
 611         }
 612
 /* Flush C_STR to a new chunk before it fills up.  */
 613       if (c_str_len + 1 >= READER_STRING_BUFFER_SIZE)
 614         {
 615           str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
 616           c_str_len = 0;
 617         }
 618
 619       if (c == '\\')
 620         {
 621           switch (c = scm_getc (port))
 622             {
 623             case EOF:
 624               goto str_eof;
 625             case '"':
 626             case '\\':
 627               break;
 /* An escaped newline adds nothing to the string; with
    `hungry-eol-escapes', the whitespace that follows is skipped
    as well.  */
 628             case '\n':
 629               if (opts->hungry_eol_escapes_p)
 630                 skip_intraline_whitespace (port);
 631               continue;
 632             case '0':
 633               c = '\0';
 634               break;
 635             case 'f':
 636               c = '\f';
 637               break;
 638             case 'n':
 639               c = '\n';
 640               break;
 641             case 'r':
 642               c = '\r';
 643               break;
 644             case 't':
 645               c = '\t';
 646               break;
 647             case 'a':
 648               c = '\007';
 649               break;
 650             case 'v':
 651               c = '\v';
 652               break;
 653             case 'b':
 654               c = '\010';
 655               break;
 /* `\x': up to ten hex digits ended by `;' with R6RS escapes,
    exactly two hex digits otherwise.  */
 656             case 'x':
 657               if (opts->r6rs_escapes_p)
 658                 SCM_READ_HEX_ESCAPE (10, ';');
 659               else
 660                 SCM_READ_HEX_ESCAPE (2, '\0');
 661               break;
 /* `\u' (four hex digits) and `\U' (six hex digits) are only
    accepted without R6RS escapes; with them, both cases fall
    through to the error below.  */
 662             case 'u':
 663               if (!opts->r6rs_escapes_p)
 664                 {
 665                   SCM_READ_HEX_ESCAPE (4, '\0');
 666                   break;
 667                 }
 /* fall through */
 668             case 'U':
 669               if (!opts->r6rs_escapes_p)
 670                 {
 671                   SCM_READ_HEX_ESCAPE (6, '\0');
 672                   break;
 673                 }
 /* fall through */
 674             default:
 /* Also the jump target of SCM_READ_HEX_ESCAPE, which sets C to
    the offending character first.  */
 675             bad_escaped:
 676               scm_i_input_error (FUNC_NAME, port,
 677                                  "illegal character in escape sequence: ~S",
 678                                  scm_list_1 (SCM_MAKE_CHAR (c)));
 679             }
 680         }
 681
 682       c_str[c_str_len++] = c;
 683     }
 684
 685   if (scm_is_null (str))
 686     /* Fast path: we got a string that fits in C_STR.  */
 687     str = scm_from_utf32_stringn (c_str, c_str_len);
 688   else
 689     {
 /* Add the last partial chunk, then join the reversed chunk list.  */
 690       if (c_str_len > 0)
 691         str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
 692
 693       str = scm_string_concatenate_reverse (str, SCM_UNDEFINED, SCM_UNDEFINED);
 694     }
 695
 696   return maybe_annotate_source (str, port, opts, line, column);
 697 }
 698 #undef FUNC_NAME
 699
 700
 701 static SCM
 702 scm_read_number (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 703 {
 704   SCM result, str = SCM_EOL;
 705   char local_buffer[READER_BUFFER_SIZE], *buffer;
 706   size_t bytes_read;
 707   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 708
 709   /* Need to capture line and column numbers here. */
 710   long line = SCM_LINUM (port);
 711   int column = SCM_COL (port) - 1;
 712
 713   scm_ungetc (chr, port);
 714   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 715                                 &bytes_read);
 716
 717   str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
 718
 719   result = scm_string_to_number (str, SCM_UNDEFINED);
 720   if (scm_is_false (result))
 721     {
 722       /* Return a symbol instead of a number */
 723       if (opts->case_insensitive_p)
 724         str = scm_string_downcase_x (str);
 725       result = scm_string_to_symbol (str);
 726     }
 727   else if (SCM_NIMP (result))
 728     result = maybe_annotate_source (result, port, opts, line, column);
 729
 730   SCM_COL (port) += scm_i_string_length (str);
 731   return result;
 732 }
 733
 734 static SCM
 735 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 736 {
 737   SCM result;
 738   int ends_with_colon = 0;
 739   size_t bytes_read;
 740   int postfix = (opts->keyword_style == KEYWORD_STYLE_POSTFIX);
 741   char local_buffer[READER_BUFFER_SIZE], *buffer;
 742   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 743   SCM str;
 744
 745   scm_ungetc (chr, port);
 746   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 747                                 &bytes_read);
 748   if (bytes_read > 0)
 749     ends_with_colon = buffer[bytes_read - 1] == ':';
 750
 751   if (postfix && ends_with_colon && (bytes_read > 1))
 752     {
 753       str = scm_from_stringn (buffer, bytes_read - 1,
 754                               pt->encoding, pt->ilseq_handler);
 755
 756       if (opts->case_insensitive_p)
 757         str = scm_string_downcase_x (str);
 758       result = scm_symbol_to_keyword (scm_string_to_symbol (str));
 759     }
 760   else
 761     {
 762       str = scm_from_stringn (buffer, bytes_read,
 763                               pt->encoding, pt->ilseq_handler);
 764
 765       if (opts->case_insensitive_p)
 766         str = scm_string_downcase_x (str);
 767       result = scm_string_to_symbol (str);
 768     }
 769
 770   SCM_COL (port) += scm_i_string_length (str);
 771   return result;
 772 }
 773
 774 static SCM
 775 scm_read_number_and_radix (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 776 #define FUNC_NAME "scm_lreadr"
 777 {
 778   SCM result;
 779   size_t read;
 780   char local_buffer[READER_BUFFER_SIZE], *buffer;
 781   unsigned int radix;
 782   SCM str;
 783   scm_t_port *pt;
 784
 785   switch (chr)
 786     {
 787     case 'B':
 788     case 'b':
 789       radix = 2;
 790       break;
 791
 792     case 'o':
 793     case 'O':
 794       radix = 8;
 795       break;
 796
 797     case 'd':
 798     case 'D':
 799       radix = 10;
 800       break;
 801
 802     case 'x':
 803     case 'X':
 804       radix = 16;
 805       break;
 806
 807     default:
 808       scm_ungetc (chr, port);
 809       scm_ungetc ('#', port);
 810       radix = 10;
 811     }
 812
 813   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 814                                 &read);
 815
 816   pt = SCM_PTAB_ENTRY (port);
 817   str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
 818
 819   result = scm_string_to_number (str, scm_from_uint (radix));
 820
 821   SCM_COL (port) += scm_i_string_length (str);
 822
 823   if (scm_is_true (result))
 824     return result;
 825
 826   scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
 827
 828   return SCM_BOOL_F;
 829 }
 830 #undef FUNC_NAME
 831
 832 static SCM
 833 scm_read_quote (int chr, SCM port, scm_t_read_opts *opts)
 834 {
 835   SCM p;
 836   long line = SCM_LINUM (port);
 837   int column = SCM_COL (port) - 1;
 838
 839   switch (chr)
 840     {
 841     case '`':
 842       p = scm_sym_quasiquote;
 843       break;
 844
 845     case '\'':
 846       p = scm_sym_quote;
 847       break;
 848
 849     case ',':
 850       {
 851         scm_t_wchar c;
 852
 853         c = scm_getc (port);
 854         if ('@' == c)
 855           p = scm_sym_uq_splicing;
 856         else
 857           {
 858             scm_ungetc (c, port);
 859             p = scm_sym_unquote;
 860           }
 861         break;
 862       }
 863
 864     default:
 865       fprintf (stderr, "%s: unhandled quote character (%i)\n",
 866                "scm_read_quote", chr);
 867       abort ();
 868     }
 869
 870   p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
 871   return maybe_annotate_source (p, port, opts, line, column);
 872 }
 873
 /* Heads of the two-element lists built by scm_read_syntax: `syntax'
    for ', `quasisyntax' for `, `unsyntax' for a comma and
    `unsyntax-splicing' for a comma followed by `@'.  */
 874 SCM_SYMBOL (sym_syntax, "syntax");
 875 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
 876 SCM_SYMBOL (sym_unsyntax, "unsyntax");
 877 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
 878
 879 static SCM
 880 scm_read_syntax (int chr, SCM port, scm_t_read_opts *opts)
 881 {
 882   SCM p;
 883   long line = SCM_LINUM (port);
 884   int column = SCM_COL (port) - 1;
 885
 886   switch (chr)
 887     {
 888     case '`':
 889       p = sym_quasisyntax;
 890       break;
 891
 892     case '\'':
 893       p = sym_syntax;
 894       break;
 895
 896     case ',':
 897       {
 898         int c;
 899
 900         c = scm_getc (port);
 901         if ('@' == c)
 902           p = sym_unsyntax_splicing;
 903         else
 904           {
 905             scm_ungetc (c, port);
 906             p = sym_unsyntax;
 907           }
 908         break;
 909       }
 910
 911     default:
 912       fprintf (stderr, "%s: unhandled syntax character (%i)\n",
 913                "scm_read_syntax", chr);
 914       abort ();
 915     }
 916
 917   p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
 918   return maybe_annotate_source (p, port, opts, line, column);
 919 }
 920
 921 static SCM
 922 scm_read_nil (int chr, SCM port, scm_t_read_opts *opts)
 923 {
 924   SCM id = scm_read_mixed_case_symbol (chr, port, opts);
 925
 926   if (!scm_is_eq (id, sym_nil))
 927     scm_i_input_error ("scm_read_nil", port,
 928                        "unexpected input while reading #nil: ~a",
 929                        scm_list_1 (id));
 930
 931   return SCM_ELISP_NIL;
 932 }
 933
 934 static SCM
 935 scm_read_semicolon_comment (int chr, SCM port)
 936 {
 937   int c;
 938
 939   /* We use the get_byte here because there is no need to get the
 940      locale correct with comment input. This presumes that newline
 941      always represents itself no matter what the encoding is.  */
 942   for (c = scm_get_byte_or_eof (port);
 943        (c != EOF) && (c != '\n');
 944        c = scm_get_byte_or_eof (port));
 945
 946   return SCM_UNSPECIFIED;
 947 }
 948
 949 \f
 950 /* Sharp readers, i.e. readers called after a `#' sign has been read.  */
 951
 952 static SCM
 953 scm_read_boolean (int chr, SCM port)
 954 {
 955   switch (chr)
 956     {
 957     case 't':
 958     case 'T':
 959       return SCM_BOOL_T;
 960
 961     case 'f':
 962     case 'F':
 963       return SCM_BOOL_F;
 964     }
 965
 966   return SCM_UNSPECIFIED;
 967 }
 968
 969 static SCM
 970 scm_read_character (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 971 #define FUNC_NAME "scm_lreadr"
 972 {
 973   char buffer[READER_CHAR_NAME_MAX_SIZE];
 974   SCM charname;
 975   size_t charname_len, bytes_read;
 976   scm_t_wchar cp;
 977   int overflow;
 978   scm_t_port *pt;
 979
 980   overflow = read_token (port, opts, buffer, READER_CHAR_NAME_MAX_SIZE,
 981                          &bytes_read);
 982   if (overflow)
 983     scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
 984
 985   if (bytes_read == 0)
 986     {
 987       chr = scm_getc (port);
 988       if (chr == EOF)
 989         scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
 990                            "while reading character", SCM_EOL);
 991
 992       /* CHR must be a token delimiter, like a whitespace.  */
 993       return (SCM_MAKE_CHAR (chr));
 994     }
 995
 996   pt = SCM_PTAB_ENTRY (port);
 997
 998   /* Simple ASCII characters can be processed immediately.  Also, simple
 999      ISO-8859-1 characters can be processed immediately if the encoding for this
1000      port is ISO-8859-1.  */
1001   if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
1002     {
1003       SCM_COL (port) += 1;
1004       return SCM_MAKE_CHAR (buffer[0]);
1005     }
1006
1007   /* Otherwise, convert the buffer into a proper scheme string for
1008      processing.  */
1009   charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
1010                                pt->ilseq_handler);
1011   charname_len = scm_i_string_length (charname);
1012   SCM_COL (port) += charname_len;
1013   cp = scm_i_string_ref (charname, 0);
1014   if (charname_len == 1)
1015     return SCM_MAKE_CHAR (cp);
1016
1017   /* Ignore dotted circles, which may be used to keep combining characters from
1018      combining with the backslash in #\charname.  */
1019   if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
1020     return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
1021
1022   if (cp >= '0' && cp < '8')
1023     {
1024       /* Dirk:FIXME::  This type of character syntax is not R5RS
1025        * compliant.  Further, it should be verified that the constant
1026        * does only consist of octal digits.  */
1027       SCM p = scm_string_to_number (charname, scm_from_uint (8));
1028       if (SCM_I_INUMP (p))
1029         {
1030           scm_t_wchar c = scm_to_uint32 (p);
1031           if (SCM_IS_UNICODE_CHAR (c))
1032             return SCM_MAKE_CHAR (c);
1033           else
1034             scm_i_input_error (FUNC_NAME, port,
1035                                "out-of-range octal character escape: ~a",
1036                                scm_list_1 (charname));
1037         }
1038     }
1039
1040   if (cp == 'x' && (charname_len > 1))
1041     {
1042       SCM p;
1043
1044       /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1045       p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
1046                                 scm_from_uint (16));
1047       if (SCM_I_INUMP (p))
1048         {
1049           scm_t_wchar c = scm_to_uint32 (p);
1050           if (SCM_IS_UNICODE_CHAR (c))
1051             return SCM_MAKE_CHAR (c);
1052           else
1053             scm_i_input_error (FUNC_NAME, port,
1054                                "out-of-range hex character escape: ~a",
1055                                scm_list_1 (charname));
1056         }
1057     }
1058
1059   /* The names of characters should never have non-Latin1
1060      characters.  */
1061   if (scm_i_is_narrow_string (charname)
1062       || scm_i_try_narrow_string (charname))
1063     { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
1064                                        charname_len);
1065       if (scm_is_true (ch))
1066         return ch;
1067     }
1068
1069   scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
1070                      scm_list_1 (charname));
1071
1072   return SCM_UNSPECIFIED;
1073 }
1074 #undef FUNC_NAME
1075
1076 static SCM
1077 scm_read_keyword (int chr, SCM port, scm_t_read_opts *opts)
1078 {
1079   SCM symbol;
1080
1081   /* Read the symbol that comprises the keyword.  Doing this instead of
1082      invoking a specific symbol reader function allows `scm_read_keyword ()'
1083      to adapt to the delimiters currently valid of symbols.
1084
1085      XXX: This implementation allows sloppy syntaxes like `#:  key'.  */
1086   symbol = scm_read_expression (port, opts);
1087   if (!scm_is_symbol (symbol))
1088     scm_i_input_error ("scm_read_keyword", port,
1089                        "keyword prefix `~a' not followed by a symbol: ~s",
1090                        scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1091
1092   return (scm_symbol_to_keyword (symbol));
1093 }
1094
1095 static SCM
1096 scm_read_vector (int chr, SCM port, scm_t_read_opts *opts,
1097                  long line, int column)
1098 {
1099   /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1100      guarantee that it's going to do what we want.  After all, this is an
1101      implementation detail of `scm_read_vector ()', not a desirable
1102      property.  */
1103   return maybe_annotate_source (scm_vector (scm_read_sexp (chr, port, opts)),
1104                                 port, opts, line, column);
1105 }
1106
1107 /* Helper used by scm_read_array */
1108 static int
1109 read_decimal_integer (SCM port, int c, ssize_t *resp)
1110 {
1111   ssize_t sign = 1;
1112   ssize_t res = 0;
1113   int got_it = 0;
1114
1115   if (c == '-')
1116     {
1117       sign = -1;
1118       c = scm_getc (port);
1119     }
1120
1121   while ('0' <= c && c <= '9')
1122     {
1123       res = 10*res + c-'0';
1124       got_it = 1;
1125       c = scm_getc (port);
1126     }
1127
1128   if (got_it)
1129     *resp = sign * res;
1130   return c;
1131 }
1132
1133 /* Read an array.  This function can also read vectors and uniform
1134    vectors.  Also, the conflict between '#f' and '#f32' and '#f64' is
1135    handled here.
1136
1137    C is the first character read after the '#'.
1138 */
1139 static SCM
1140 scm_read_array (int c, SCM port, scm_t_read_opts *opts, long line, int column)
1141 {
1142   ssize_t rank;
1143   scm_t_wchar tag_buf[8];
1144   int tag_len;
1145
1146   SCM tag, shape = SCM_BOOL_F, elements, array;
1147
1148   /* XXX - shortcut for ordinary vectors.  Shouldn't be necessary but
1149      the array code can not deal with zero-length dimensions yet, and
1150      we want to allow zero-length vectors, of course.
1151   */
1152   if (c == '(')
1153     return scm_read_vector (c, port, opts, line, column);
1154
1155   /* Disambiguate between '#f' and uniform floating point vectors.
1156    */
1157   if (c == 'f')
1158     {
1159       c = scm_getc (port);
1160       if (c != '3' && c != '6')
1161         {
1162           if (c != EOF)
1163             scm_ungetc (c, port);
1164           return SCM_BOOL_F;
1165         }
1166       rank = 1;
1167       tag_buf[0] = 'f';
1168       tag_len = 1;
1169       goto continue_reading_tag;
1170     }
1171
1172   /* Read rank. */
1173   rank = 1;
1174   c = read_decimal_integer (port, c, &rank);
1175   if (rank < 0)
1176     scm_i_input_error (NULL, port, "array rank must be non-negative",
1177                        SCM_EOL);
1178
1179   /* Read tag. */
1180   tag_len = 0;
1181  continue_reading_tag:
1182   while (c != EOF && c != '(' && c != '@' && c != ':'
1183          && tag_len < sizeof tag_buf / sizeof tag_buf[0])
1184     {
1185       tag_buf[tag_len++] = c;
1186       c = scm_getc (port);
1187     }
1188   if (tag_len == 0)
1189     tag = SCM_BOOL_T;
1190   else
1191     {
1192       tag = scm_string_to_symbol (scm_from_utf32_stringn (tag_buf, tag_len));
1193       if (tag_len == sizeof tag_buf / sizeof tag_buf[0])
1194         scm_i_input_error (NULL, port, "invalid array tag, starting with: ~a",
1195                            scm_list_1 (tag));
1196     }
1197
1198   /* Read shape. */
1199   if (c == '@' || c == ':')
1200     {
1201       shape = SCM_EOL;
1202
1203       do
1204         {
1205           ssize_t lbnd = 0, len = 0;
1206           SCM s;
1207
1208           if (c == '@')
1209             {
1210               c = scm_getc (port);
1211               c = read_decimal_integer (port, c, &lbnd);
1212             }
1213
1214           s = scm_from_ssize_t (lbnd);
1215
1216           if (c == ':')
1217             {
1218               c = scm_getc (port);
1219               c = read_decimal_integer (port, c, &len);
1220               if (len < 0)
1221                 scm_i_input_error (NULL, port,
1222                                    "array length must be non-negative",
1223                                    SCM_EOL);
1224
1225               s = scm_list_2 (s, scm_from_ssize_t (lbnd+len-1));
1226             }
1227
1228           shape = scm_cons (s, shape);
1229         } while (c == '@' || c == ':');
1230
1231       shape = scm_reverse_x (shape, SCM_EOL);
1232     }
1233
1234   /* Read nested lists of elements. */
1235   if (c != '(')
1236     scm_i_input_error (NULL, port,
1237                        "missing '(' in vector or array literal",
1238                        SCM_EOL);
1239   elements = scm_read_sexp (c, port, opts);
1240
1241   if (scm_is_false (shape))
1242     shape = scm_from_ssize_t (rank);
1243   else if (scm_ilength (shape) != rank)
1244     scm_i_input_error
1245       (NULL, port,
1246        "the number of shape specifications must match the array rank",
1247        SCM_EOL);
1248
1249   /* Handle special print syntax of rank zero arrays; see
1250      scm_i_print_array for a rationale. */
1251   if (rank == 0)
1252     {
1253       if (!scm_is_pair (elements))
1254         scm_i_input_error (NULL, port,
1255                            "too few elements in array literal, need 1",
1256                            SCM_EOL);
1257       if (!scm_is_null (SCM_CDR (elements)))
1258         scm_i_input_error (NULL, port,
1259                            "too many elements in array literal, want 1",
1260                            SCM_EOL);
1261       elements = SCM_CAR (elements);
1262     }
1263
1264   /* Construct array, annotate with source location, and return. */
1265   array = scm_list_to_typed_array (tag, shape, elements);
1266   return maybe_annotate_source (array, port, opts, line, column);
1267 }
1268
1269 static SCM
1270 scm_read_srfi4_vector (int chr, SCM port, scm_t_read_opts *opts,
1271                        long line, int column)
1272 {
1273   return scm_read_array (chr, port, opts, line, column);
1274 }
1275
1276 static SCM
1277 scm_read_bytevector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1278                      long line, int column)
1279 {
1280   chr = scm_getc (port);
1281   if (chr != 'u')
1282     goto syntax;
1283
1284   chr = scm_getc (port);
1285   if (chr != '8')
1286     goto syntax;
1287
1288   chr = scm_getc (port);
1289   if (chr != '(')
1290     goto syntax;
1291
1292   return maybe_annotate_source
1293     (scm_u8_list_to_bytevector (scm_read_sexp (chr, port, opts)),
1294      port, opts, line, column);
1295
1296  syntax:
1297   scm_i_input_error ("read_bytevector", port,
1298                      "invalid bytevector prefix",
1299                      SCM_MAKE_CHAR (chr));
1300   return SCM_UNSPECIFIED;
1301 }
1302
1303 static SCM
1304 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1305                            long line, int column)
1306 {
1307   /* Read the `#*10101'-style read syntax for bit vectors in Guile.  This is
1308      terribly inefficient but who cares?  */
1309   SCM s_bits = SCM_EOL;
1310
1311   for (chr = scm_getc (port);
1312        (chr != EOF) && ((chr == '0') || (chr == '1'));
1313        chr = scm_getc (port))
1314     {
1315       s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1316     }
1317
1318   if (chr != EOF)
1319     scm_ungetc (chr, port);
1320
1321   return maybe_annotate_source
1322     (scm_bitvector (scm_reverse_x (s_bits, SCM_EOL)),
1323      port, opts, line, column);
1324 }
1325
1326 static SCM
1327 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1328 {
1329   int bang_seen = 0;
1330
1331   for (;;)
1332     {
1333       int c = scm_getc (port);
1334
1335       if (c == EOF)
1336         scm_i_input_error ("skip_block_comment", port,
1337                            "unterminated `#! ... !#' comment", SCM_EOL);
1338
1339       if (c == '!')
1340         bang_seen = 1;
1341       else if (c == '#' && bang_seen)
1342         break;
1343       else
1344         bang_seen = 0;
1345     }
1346
1347   return SCM_UNSPECIFIED;
1348 }
1349
1350 static void set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts,
1351                                          int value);
1352 static void set_port_square_brackets_p (SCM port, scm_t_read_opts *opts,
1353                                         int value);
1354 static void set_port_curly_infix_p (SCM port, scm_t_read_opts *opts,
1355                                     int value);
1356
1357 static SCM
1358 scm_read_shebang (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
1359 {
1360   char name[READER_DIRECTIVE_NAME_MAX_SIZE + 1];
1361   int c;
1362   int i = 0;
1363
1364   while (i <= READER_DIRECTIVE_NAME_MAX_SIZE)
1365     {
1366       c = scm_getc (port);
1367       if (c == EOF)
1368         scm_i_input_error ("skip_block_comment", port,
1369                            "unterminated `#! ... !#' comment", SCM_EOL);
1370       else if (('a' <= c && c <= 'z') || ('0' <= c && c <= '9') || c == '-')
1371         name[i++] = c;
1372       else if (CHAR_IS_DELIMITER (c))
1373         {
1374           scm_ungetc (c, port);
1375           name[i] = '\0';
1376           if (0 == strcmp ("r6rs", name))
1377             ;  /* Silently ignore */
1378           else if (0 == strcmp ("fold-case", name))
1379             set_port_case_insensitive_p (port, opts, 1);
1380           else if (0 == strcmp ("no-fold-case", name))
1381             set_port_case_insensitive_p (port, opts, 0);
1382           else if (0 == strcmp ("curly-infix", name))
1383             set_port_curly_infix_p (port, opts, 1);
1384           else if (0 == strcmp ("curly-infix-and-bracket-lists", name))
1385             {
1386               set_port_curly_infix_p (port, opts, 1);
1387               set_port_square_brackets_p (port, opts, 0);
1388             }
1389           else
1390             break;
1391
1392           return SCM_UNSPECIFIED;
1393         }
1394       else
1395         {
1396           scm_ungetc (c, port);
1397           break;
1398         }
1399     }
1400   while (i > 0)
1401     scm_ungetc (name[--i], port);
1402   return scm_read_scsh_block_comment (chr, port);
1403 }
1404
1405 static SCM
1406 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1407 {
1408   /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1409      nested.  So care must be taken.  */
1410   int nesting_level = 1;
1411
1412   int a = scm_getc (port);
1413
1414   if (a == EOF)
1415     scm_i_input_error ("scm_read_r6rs_block_comment", port,
1416                        "unterminated `#| ... |#' comment", SCM_EOL);
1417
1418   while (nesting_level > 0)
1419     {
1420       int b = scm_getc (port);
1421
1422       if (b == EOF)
1423         scm_i_input_error ("scm_read_r6rs_block_comment", port,
1424                            "unterminated `#| ... |#' comment", SCM_EOL);
1425
1426       if (a == '|' && b == '#')
1427         {
1428           nesting_level--;
1429           b = EOF;
1430         }
1431       else if (a == '#' && b == '|')
1432         {
1433           nesting_level++;
1434           b = EOF;
1435         }
1436
1437       a = b;
1438     }
1439
1440   return SCM_UNSPECIFIED;
1441 }
1442
1443 static SCM
1444 scm_read_commented_expression (scm_t_wchar chr, SCM port,
1445                                scm_t_read_opts *opts)
1446 {
1447   scm_t_wchar c;
1448
1449   c = flush_ws (port, opts, (char *) NULL);
1450   if (EOF == c)
1451     scm_i_input_error ("read_commented_expression", port,
1452                        "no expression after #; comment", SCM_EOL);
1453   scm_ungetc (c, port);
1454   scm_read_expression (port, opts);
1455   return SCM_UNSPECIFIED;
1456 }
1457
1458 static SCM
1459 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1460 {
1461   /* Guile's extended symbol read syntax looks like this:
1462
1463        #{This is all a symbol name}#
1464
1465      So here, CHR is expected to be `{'.  */
1466   int saw_brace = 0;
1467   size_t len = 0;
1468   SCM buf = scm_i_make_string (1024, NULL, 0);
1469
1470   buf = scm_i_string_start_writing (buf);
1471
1472   while ((chr = scm_getc (port)) != EOF)
1473     {
1474       if (saw_brace)
1475         {
1476           if (chr == '#')
1477             {
1478               break;
1479             }
1480           else
1481             {
1482               saw_brace = 0;
1483               scm_i_string_set_x (buf, len++, '}');
1484             }
1485         }
1486
1487       if (chr == '}')
1488         saw_brace = 1;
1489       else if (chr == '\\')
1490         {
1491           /* It used to be that print.c would print extended-read-syntax
1492              symbols with backslashes before "non-standard" chars, but
1493              this routine wouldn't do anything with those escapes.
1494              Bummer.  What we've done is to change print.c to output
1495              R6RS hex escapes for those characters, relying on the fact
1496              that the extended read syntax would never put a `\' before
1497              an `x'.  For now, we just ignore other instances of
1498              backslash in the string.  */
1499           switch ((chr = scm_getc (port)))
1500             {
1501             case EOF:
1502               goto done;
1503             case 'x':
1504               {
1505                 scm_t_wchar c;
1506
1507                 SCM_READ_HEX_ESCAPE (10, ';');
1508                 scm_i_string_set_x (buf, len++, c);
1509                 break;
1510
1511               str_eof:
1512                 chr = EOF;
1513                 goto done;
1514
1515               bad_escaped:
1516                 scm_i_string_stop_writing ();
1517                 scm_i_input_error ("scm_read_extended_symbol", port,
1518                                    "illegal character in escape sequence: ~S",
1519                                    scm_list_1 (SCM_MAKE_CHAR (c)));
1520                 break;
1521               }
1522             default:
1523               scm_i_string_set_x (buf, len++, chr);
1524               break;
1525             }
1526         }
1527       else
1528         scm_i_string_set_x (buf, len++, chr);
1529
1530       if (len >= scm_i_string_length (buf) - 2)
1531         {
1532           SCM addy;
1533
1534           scm_i_string_stop_writing ();
1535           addy = scm_i_make_string (1024, NULL, 0);
1536           buf = scm_string_append (scm_list_2 (buf, addy));
1537           len = 0;
1538           buf = scm_i_string_start_writing (buf);
1539         }
1540     }
1541
1542  done:
1543   scm_i_string_stop_writing ();
1544   if (chr == EOF)
1545     scm_i_input_error ("scm_read_extended_symbol", port,
1546                        "end of file while reading symbol", SCM_EOL);
1547
1548   return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1549 }
1550
1551
1552 \f
1553 /* Top-level token readers, i.e., dispatchers.  */
1554
1555 static SCM
1556 scm_read_sharp_extension (int chr, SCM port, scm_t_read_opts *opts)
1557 {
1558   SCM proc;
1559
1560   proc = scm_get_hash_procedure (chr);
1561   if (scm_is_true (scm_procedure_p (proc)))
1562     {
1563       long line = SCM_LINUM (port);
1564       int column = SCM_COL (port) - 2;
1565       SCM got;
1566
1567       got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1568
1569       if (opts->record_positions_p && SCM_NIMP (got)
1570           && !scm_i_has_source_properties (got))
1571         scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1572
1573       return got;
1574     }
1575
1576   return SCM_UNSPECIFIED;
1577 }
1578
1579 /* The reader for the sharp `#' character.  It basically dispatches reads
1580    among the above token readers.   */
1581 static SCM
1582 scm_read_sharp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1583                 long line, int column)
1584 #define FUNC_NAME "scm_lreadr"
1585 {
1586   SCM result;
1587
1588   chr = scm_getc (port);
1589
1590   result = scm_read_sharp_extension (chr, port, opts);
1591   if (!scm_is_eq (result, SCM_UNSPECIFIED))
1592     return result;
1593
1594   switch (chr)
1595     {
1596     case '\\':
1597       return (scm_read_character (chr, port, opts));
1598     case '(':
1599       return (scm_read_vector (chr, port, opts, line, column));
1600     case 's':
1601     case 'u':
1602     case 'f':
1603     case 'c':
1604       /* This one may return either a boolean or an SRFI-4 vector.  */
1605       return (scm_read_srfi4_vector (chr, port, opts, line, column));
1606     case 'v':
1607       return (scm_read_bytevector (chr, port, opts, line, column));
1608     case '*':
1609       return (scm_read_guile_bit_vector (chr, port, opts, line, column));
1610     case 't':
1611     case 'T':
1612     case 'F':
1613       return (scm_read_boolean (chr, port));
1614     case ':':
1615       return (scm_read_keyword (chr, port, opts));
1616     case '0': case '1': case '2': case '3': case '4':
1617     case '5': case '6': case '7': case '8': case '9':
1618     case '@':
1619 #if SCM_ENABLE_DEPRECATED
1620       /* See below for 'i' and 'e'. */
1621     case 'a':
1622     case 'y':
1623     case 'h':
1624     case 'l':
1625 #endif
1626       return (scm_read_array (chr, port, opts, line, column));
1627
1628     case 'i':
1629     case 'e':
1630 #if SCM_ENABLE_DEPRECATED
1631       {
1632         /* When next char is '(', it really is an old-style
1633            uniform array. */
1634         scm_t_wchar next_c = scm_getc (port);
1635         if (next_c != EOF)
1636           scm_ungetc (next_c, port);
1637         if (next_c == '(')
1638           return scm_read_array (chr, port, opts, line, column);
1639         /* Fall through. */
1640       }
1641 #endif
1642     case 'b':
1643     case 'B':
1644     case 'o':
1645     case 'O':
1646     case 'd':
1647     case 'D':
1648     case 'x':
1649     case 'X':
1650     case 'I':
1651     case 'E':
1652       return (scm_read_number_and_radix (chr, port, opts));
1653     case '{':
1654       return (scm_read_extended_symbol (chr, port));
1655     case '!':
1656       return (scm_read_shebang (chr, port, opts));
1657     case ';':
1658       return (scm_read_commented_expression (chr, port, opts));
1659     case '`':
1660     case '\'':
1661     case ',':
1662       return (scm_read_syntax (chr, port, opts));
1663     case 'n':
1664       return (scm_read_nil (chr, port, opts));
1665     default:
1666       result = scm_read_sharp_extension (chr, port, opts);
1667       if (scm_is_eq (result, SCM_UNSPECIFIED))
1668         {
1669           /* To remain compatible with 1.8 and earlier, the following
1670              characters have lower precedence than `read-hash-extend'
1671              characters.  */
1672           switch (chr)
1673             {
1674             case '|':
1675               return scm_read_r6rs_block_comment (chr, port);
1676             default:
1677               scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1678                                  scm_list_1 (SCM_MAKE_CHAR (chr)));
1679             }
1680         }
1681       else
1682         return result;
1683     }
1684
1685   return SCM_UNSPECIFIED;
1686 }
1687 #undef FUNC_NAME
1688
1689 static SCM
1690 read_inner_expression (SCM port, scm_t_read_opts *opts)
1691 #define FUNC_NAME "read_inner_expression"
1692 {
1693   while (1)
1694     {
1695       scm_t_wchar chr;
1696
1697       chr = scm_getc (port);
1698
1699       switch (chr)
1700         {
1701         case SCM_WHITE_SPACES:
1702         case SCM_LINE_INCREMENTORS:
1703           break;
1704         case ';':
1705           (void) scm_read_semicolon_comment (chr, port);
1706           break;
1707         case '{':
1708           if (opts->curly_infix_p)
1709             {
1710               if (opts->neoteric_p)
1711                 return scm_read_sexp (chr, port, opts);
1712               else
1713                 {
1714                   SCM expr;
1715
1716                   /* Enable neoteric expressions within curly braces */
1717                   opts->neoteric_p = 1;
1718                   expr = scm_read_sexp (chr, port, opts);
1719                   opts->neoteric_p = 0;
1720                   return expr;
1721                 }
1722             }
1723           else
1724             return scm_read_mixed_case_symbol (chr, port, opts);
1725         case '[':
1726           if (opts->square_brackets_p)
1727             return scm_read_sexp (chr, port, opts);
1728           else if (opts->curly_infix_p)
1729             {
1730               /* The syntax of neoteric expressions requires that '[' be
1731                  a delimiter when curly-infix is enabled, so it cannot
1732                  be part of an unescaped symbol.  We might as well do
1733                  something useful with it, so we adopt Kawa's convention:
1734                  [...] => ($bracket-list$ ...) */
1735               long line = SCM_LINUM (port);
1736               int column = SCM_COL (port) - 1;
1737               return maybe_annotate_source
1738                 (scm_cons (sym_bracket_list, scm_read_sexp (chr, port, opts)),
1739                  port, opts, line, column);
1740             }
1741           else
1742             return scm_read_mixed_case_symbol (chr, port, opts);
1743         case '(':
1744           return (scm_read_sexp (chr, port, opts));
1745         case '"':
1746           return (scm_read_string (chr, port, opts));
1747         case '\'':
1748         case '`':
1749         case ',':
1750           return (scm_read_quote (chr, port, opts));
1751         case '#':
1752           {
1753             long line  = SCM_LINUM (port);
1754             int column = SCM_COL (port) - 1;
1755             SCM result = scm_read_sharp (chr, port, opts, line, column);
1756             if (scm_is_eq (result, SCM_UNSPECIFIED))
1757               /* We read a comment or some such.  */
1758               break;
1759             else
1760               return result;
1761           }
1762         case ')':
1763           scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1764           break;
1765         case '}':
1766           if (opts->curly_infix_p)
1767             scm_i_input_error (FUNC_NAME, port, "unexpected \"}\"", SCM_EOL);
1768           else
1769             return scm_read_mixed_case_symbol (chr, port, opts);
1770         case ']':
1771           if (opts->square_brackets_p)
1772             scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1773           /* otherwise fall through */
1774         case EOF:
1775           return SCM_EOF_VAL;
1776         case ':':
1777           if (opts->keyword_style == KEYWORD_STYLE_PREFIX)
1778             return scm_symbol_to_keyword (scm_read_expression (port, opts));
1779           /* Fall through.  */
1780
1781         default:
1782           {
1783             if (((chr >= '0') && (chr <= '9'))
1784                 || (strchr ("+-.", chr)))
1785               return (scm_read_number (chr, port, opts));
1786             else
1787               return (scm_read_mixed_case_symbol (chr, port, opts));
1788           }
1789         }
1790     }
1791 }
1792 #undef FUNC_NAME
1793
1794 static SCM
1795 scm_read_expression (SCM port, scm_t_read_opts *opts)
1796 #define FUNC_NAME "scm_read_expression"
1797 {
1798   if (!opts->neoteric_p)
1799     return read_inner_expression (port, opts);
1800   else
1801     {
1802       long line = 0;
1803       int column = 0;
1804       SCM expr;
1805
1806       if (opts->record_positions_p)
1807         {
1808           /* We need to get the position of the first non-whitespace
1809              character in order to correctly annotate neoteric
1810              expressions.  For example, for the expression 'f(x)', the
1811              first call to 'read_inner_expression' reads the 'f' (which
1812              cannot be annotated), and then we later read the '(x)' and
1813              use it to construct the new list (f x). */
1814           int c = flush_ws (port, opts, (char *) NULL);
1815           if (c == EOF)
1816             return SCM_EOF_VAL;
1817           scm_ungetc (c, port);
1818           line = SCM_LINUM (port);
1819           column = SCM_COL (port);
1820         }
1821
1822       expr = read_inner_expression (port, opts);
1823
1824       /* 'expr' is the first component of the neoteric expression.  Now
1825          we loop, and as long as the next character is '(', '[', or '{',
1826          (without any intervening whitespace), we use it to construct a
1827          new expression.  For example, f{n - 1}(x) => ((f (- n 1)) x). */
1828       for (;;)
1829         {
1830           int chr = scm_getc (port);
1831
1832           if (chr == '(')
1833             /* e(...) => (e ...) */
1834             expr = scm_cons (expr, scm_read_sexp (chr, port, opts));
1835           else if (chr == '[')
1836             /* e[...] => ($bracket-apply$ e ...) */
1837             expr = scm_cons (sym_bracket_apply,
1838                              scm_cons (expr,
1839                                        scm_read_sexp (chr, port, opts)));
1840           else if (chr == '{')
1841             {
1842               SCM arg = scm_read_sexp (chr, port, opts);
1843
1844               if (scm_is_null (arg))
1845                 expr = scm_list_1 (expr);       /* e{} => (e) */
1846               else
1847                 expr = scm_list_2 (expr, arg);  /* e{...} => (e {...}) */
1848             }
1849           else
1850             {
1851               if (chr != EOF)
1852                 scm_ungetc (chr, port);
1853               break;
1854             }
1855           maybe_annotate_source (expr, port, opts, line, column);
1856         }
1857       return expr;
1858     }
1859 }
1860 #undef FUNC_NAME
1861
1862 \f
1863 /* Actual reader.  */
1864
1865 static void init_read_options (SCM port, scm_t_read_opts *opts);
1866
1867 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1868             (SCM port),
1869             "Read an s-expression from the input port @var{port}, or from\n"
1870             "the current input port if @var{port} is not specified.\n"
1871             "Any whitespace before the next token is discarded.")
1872 #define FUNC_NAME s_scm_read
1873 {
1874   scm_t_read_opts opts;
1875   int c;
1876
1877   if (SCM_UNBNDP (port))
1878     port = scm_current_input_port ();
1879   SCM_VALIDATE_OPINPORT (1, port);
1880
1881   init_read_options (port, &opts);
1882
1883   c = flush_ws (port, &opts, (char *) NULL);
1884   if (EOF == c)
1885     return SCM_EOF_VAL;
1886   scm_ungetc (c, port);
1887
1888   return (scm_read_expression (port, &opts));
1889 }
1890 #undef FUNC_NAME
1891
1892
1893 \f
1894
1895 /* Manipulate the read-hash-procedures alist.  This could be written in
1896    Scheme, but maybe it will also be used by C code during initialisation.  */
1897 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1898             (SCM chr, SCM proc),
1899             "Install the procedure @var{proc} for reading expressions\n"
1900             "starting with the character sequence @code{#} and @var{chr}.\n"
1901             "@var{proc} will be called with two arguments:  the character\n"
1902             "@var{chr} and the port to read further data from. The object\n"
1903             "returned will be the return value of @code{read}. \n"
1904             "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1905             )
1906 #define FUNC_NAME s_scm_read_hash_extend
1907 {
1908   SCM this;
1909   SCM prev;
1910
1911   SCM_VALIDATE_CHAR (1, chr);
1912   SCM_ASSERT (scm_is_false (proc)
1913               || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1914               proc, SCM_ARG2, FUNC_NAME);
1915
1916   /* Check if chr is already in the alist.  */
1917   this = scm_i_read_hash_procedures_ref ();
1918   prev = SCM_BOOL_F;
1919   while (1)
1920     {
1921       if (scm_is_null (this))
1922         {
1923           /* not found, so add it to the beginning.  */
1924           if (scm_is_true (proc))
1925             {
1926               SCM new = scm_cons (scm_cons (chr, proc),
1927                                   scm_i_read_hash_procedures_ref ());
1928               scm_i_read_hash_procedures_set_x (new);
1929             }
1930           break;
1931         }
1932       if (scm_is_eq (chr, SCM_CAAR (this)))
1933         {
1934           /* already in the alist.  */
1935           if (scm_is_false (proc))
1936             {
1937               /* remove it.  */
1938               if (scm_is_false (prev))
1939                 {
1940                   SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1941                   scm_i_read_hash_procedures_set_x (rest);
1942                 }
1943               else
1944                 scm_set_cdr_x (prev, SCM_CDR (this));
1945             }
1946           else
1947             {
1948               /* replace it.  */
1949               scm_set_cdr_x (SCM_CAR (this), proc);
1950             }
1951           break;
1952         }
1953       prev = this;
1954       this = SCM_CDR (this);
1955     }
1956
1957   return SCM_UNSPECIFIED;
1958 }
1959 #undef FUNC_NAME
1960
1961 /* Recover the read-hash procedure corresponding to char c.  */
1962 static SCM
1963 scm_get_hash_procedure (int c)
1964 {
1965   SCM rest = scm_i_read_hash_procedures_ref ();
1966
1967   while (1)
1968     {
1969       if (scm_is_null (rest))
1970         return SCM_BOOL_F;
1971
1972       if (SCM_CHAR (SCM_CAAR (rest)) == c)
1973         return SCM_CDAR (rest);
1974
1975       rest = SCM_CDR (rest);
1976     }
1977 }
1978
1979 #define SCM_ENCODING_SEARCH_SIZE (500)
1980
1981 /* Search the first few hundred characters of a file for an Emacs-like coding
1982    declaration.  Returns either NULL or a string whose storage has been
1983    allocated with `scm_gc_malloc ()'.  */
1984 char *
1985 scm_i_scan_for_encoding (SCM port)
1986 {
1987   scm_t_port *pt;
1988   char header[SCM_ENCODING_SEARCH_SIZE+1];
1989   size_t bytes_read, encoding_length, i;
1990   char *encoding = NULL;
1991   int utf8_bom = 0;
1992   char *pos, *encoding_start;
1993   int in_comment;
1994
1995   pt = SCM_PTAB_ENTRY (port);
1996
1997   if (pt->rw_active == SCM_PORT_WRITE)
1998     scm_flush (port);
1999
2000   if (pt->rw_random)
2001     pt->rw_active = SCM_PORT_READ;
2002
2003   if (pt->read_pos == pt->read_end)
2004     {
2005       /* We can use the read buffer, and thus avoid a seek. */
2006       if (scm_fill_input (port) == EOF)
2007         return NULL;
2008
2009       bytes_read = pt->read_end - pt->read_pos;
2010       if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
2011         bytes_read = SCM_ENCODING_SEARCH_SIZE;
2012
2013       if (bytes_read <= 1)
2014         /* An unbuffered port -- don't scan.  */
2015         return NULL;
2016
2017       memcpy (header, pt->read_pos, bytes_read);
2018       header[bytes_read] = '\0';
2019     }
2020   else
2021     {
2022       /* Try to read some bytes and then seek back.  Not all ports
2023          support seeking back; and indeed some file ports (like
2024          /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
2025          check performed by SCM_FPORT_FDES---but fail to seek
2026          backwards.  Hence this block comes second.  We prefer to use
2027          the read buffer in-place.  */
2028       if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
2029         return NULL;
2030
2031       bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
2032       header[bytes_read] = '\0';
2033       scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
2034     }
2035
2036   if (bytes_read > 3
2037       && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
2038     utf8_bom = 1;
2039
2040   /* search past "coding[:=]" */
2041   pos = header;
2042   while (1)
2043     {
2044       if ((pos = strstr(pos, "coding")) == NULL)
2045         return NULL;
2046
2047       pos += strlen("coding");
2048       if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
2049           (*pos == ':' || *pos == '='))
2050         {
2051           pos ++;
2052           break;
2053         }
2054     }
2055
2056   /* skip spaces */
2057   while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
2058          (*pos == ' ' || *pos == '\t'))
2059     pos ++;
2060
2061   /* grab the next token */
2062   encoding_start = pos;
2063   i = 0;
2064   while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
2065          && encoding_start + i - header < bytes_read
2066          && (isalnum ((int) encoding_start[i])
2067              || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
2068     i++;
2069
2070   encoding_length = i;
2071   if (encoding_length == 0)
2072     return NULL;
2073
2074   encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
2075   for (i = 0; i < encoding_length; i++)
2076     encoding[i] = toupper ((int) encoding[i]);
2077
2078   /* push backwards to make sure we were in a comment */
2079   in_comment = 0;
2080   pos = encoding_start;
2081   while (pos >= header)
2082     {
2083       if (*pos == ';')
2084         {
2085           in_comment = 1;
2086           break;
2087         }
2088       else if (*pos == '\n' || pos == header)
2089         {
2090           /* This wasn't in a semicolon comment. Check for a
2091            hash-bang comment. */
2092           char *beg = strstr (header, "#!");
2093           char *end = strstr (header, "!#");
2094           if (beg < encoding_start && encoding_start + encoding_length <= end)
2095             in_comment = 1;
2096           break;
2097         }
2098       else
2099         {
2100           pos --;
2101           continue;
2102         }
2103     }
2104   if (!in_comment)
2105     /* This wasn't in a comment */
2106     return NULL;
2107
2108   if (utf8_bom && strcmp(encoding, "UTF-8"))
2109     scm_misc_error (NULL,
2110                     "the port input declares the encoding ~s but is encoded as UTF-8",
2111                     scm_list_1 (scm_from_locale_string (encoding)));
2112
2113   return encoding;
2114 }
2115
2116 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
2117             (SCM port),
2118             "Scans the port for an Emacs-like character coding declaration\n"
2119             "near the top of the contents of a port with random-accessible contents.\n"
2120             "The coding declaration is of the form\n"
2121             "@code{coding: XXXXX} and must appear in a scheme comment.\n"
2122             "\n"
2123             "Returns a string containing the character encoding of the file\n"
2124             "if a declaration was found, or @code{#f} otherwise.\n")
2125 #define FUNC_NAME s_scm_file_encoding
2126 {
2127   char *enc;
2128   SCM s_enc;
2129
2130   SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
2131
2132   enc = scm_i_scan_for_encoding (port);
2133   if (enc == NULL)
2134     return SCM_BOOL_F;
2135   else
2136     {
2137       s_enc = scm_from_locale_string (enc);
2138       return s_enc;
2139     }
2140
2141   return SCM_BOOL_F;
2142 }
2143 #undef FUNC_NAME
2144
2145 \f
2146 /* Per-port read options.
2147
2148    We store per-port read options in the 'port-read-options' key of the
2149    port's alist, which is stored in 'scm_i_port_weak_hash'.  The value
2150    stored in the alist is a single integer that contains a two-bit field
2151    for each read option.
2152
2153    If a bit field contains READ_OPTION_INHERIT (3), that indicates that
2154    the applicable value should be inherited from the corresponding
2155    global read option.  Otherwise, the bit field contains the value of
2156    the read option.  For boolean read options that have been set
2157    per-port, the possible values are 0 or 1.  If the 'keyword_style'
2158    read option has been set per-port, its possible values are those in
2159    'enum t_keyword_style'. */
2160
2161 /* Key to read options in per-port alists. */
2162 SCM_SYMBOL (sym_port_read_options, "port-read-options");
2163
/* Every per-port override occupies a two-bit field.  READ_OPTION_MASK
   extracts one field; a field holding READ_OPTION_INHERIT defers to the
   global read option. */
#define READ_OPTION_MASK     3
#define READ_OPTION_INHERIT  3

/* Bit offset of the field belonging to each read option */
#define READ_OPTION_COPY_SOURCE_P          0
#define READ_OPTION_RECORD_POSITIONS_P     2
#define READ_OPTION_CASE_INSENSITIVE_P     4
#define READ_OPTION_KEYWORD_STYLE          6
#define READ_OPTION_R6RS_ESCAPES_P         8
#define READ_OPTION_SQUARE_BRACKETS_P     10
#define READ_OPTION_HUNGRY_EOL_ESCAPES_P  12
#define READ_OPTION_CURLY_INFIX_P         14

/* Total number of bits taken up by all of the fields above */
#define READ_OPTIONS_NUM_BITS             16

/* All fields set to READ_OPTION_INHERIT; also the largest value the
   packed options integer can take */
#define READ_OPTIONS_INHERIT_ALL  ((1UL << READ_OPTIONS_NUM_BITS) - 1)
#define READ_OPTIONS_MAX_VALUE    READ_OPTIONS_INHERIT_ALL
2182
2183 static void
2184 set_port_read_option (SCM port, int option, int new_value)
2185 {
2186   SCM alist, scm_read_options;
2187   unsigned int read_options;
2188
2189   new_value &= READ_OPTION_MASK;
2190   scm_i_scm_pthread_mutex_lock (&scm_i_port_table_mutex);
2191   alist = scm_hashq_ref (scm_i_port_weak_hash, port, SCM_BOOL_F);
2192   scm_read_options = scm_assq_ref (alist, sym_port_read_options);
2193   if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2194     read_options = scm_to_uint (scm_read_options);
2195   else
2196     read_options = READ_OPTIONS_INHERIT_ALL;
2197   read_options &= ~(READ_OPTION_MASK << option);
2198   read_options |= new_value << option;
2199   scm_read_options = scm_from_uint (read_options);
2200   alist = scm_assq_set_x (alist, sym_port_read_options, scm_read_options);
2201   scm_hashq_set_x (scm_i_port_weak_hash, port, alist);
2202   scm_i_pthread_mutex_unlock (&scm_i_port_table_mutex);
2203 }
2204
2205 /* Set OPTS and PORT's case-insensitivity according to VALUE. */
2206 static void
2207 set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts, int value)
2208 {
2209   value = !!value;
2210   opts->case_insensitive_p = value;
2211   set_port_read_option (port, READ_OPTION_CASE_INSENSITIVE_P, value);
2212 }
2213
2214 /* Set OPTS and PORT's square_brackets_p option according to VALUE. */
2215 static void
2216 set_port_square_brackets_p (SCM port, scm_t_read_opts *opts, int value)
2217 {
2218   value = !!value;
2219   opts->square_brackets_p = value;
2220   set_port_read_option (port, READ_OPTION_SQUARE_BRACKETS_P, value);
2221 }
2222
2223 /* Set OPTS and PORT's curly_infix_p option according to VALUE. */
2224 static void
2225 set_port_curly_infix_p (SCM port, scm_t_read_opts *opts, int value)
2226 {
2227   value = !!value;
2228   opts->curly_infix_p = value;
2229   set_port_read_option (port, READ_OPTION_CURLY_INFIX_P, value);
2230 }
2231
2232 /* Initialize OPTS based on PORT's read options and the global read
2233    options. */
2234 static void
2235 init_read_options (SCM port, scm_t_read_opts *opts)
2236 {
2237   SCM alist, val, scm_read_options;
2238   unsigned int read_options, x;
2239
2240   scm_i_scm_pthread_mutex_lock (&scm_i_port_table_mutex);
2241   alist = scm_hashq_ref (scm_i_port_weak_hash, port, SCM_BOOL_F);
2242   scm_read_options = scm_assq_ref (alist, sym_port_read_options);
2243   scm_i_pthread_mutex_unlock (&scm_i_port_table_mutex);
2244
2245   if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2246     read_options = scm_to_uint (scm_read_options);
2247   else
2248     read_options = READ_OPTIONS_INHERIT_ALL;
2249
2250   x = READ_OPTION_MASK & (read_options >> READ_OPTION_KEYWORD_STYLE);
2251   if (x == READ_OPTION_INHERIT)
2252     {
2253       val = SCM_PACK (SCM_KEYWORD_STYLE);
2254       if (scm_is_eq (val, scm_keyword_prefix))
2255         x = KEYWORD_STYLE_PREFIX;
2256       else if (scm_is_eq (val, scm_keyword_postfix))
2257         x = KEYWORD_STYLE_POSTFIX;
2258       else
2259         x = KEYWORD_STYLE_HASH_PREFIX;
2260     }
2261   opts->keyword_style = x;
2262
2263 #define RESOLVE_BOOLEAN_OPTION(NAME, name)                              \
2264   do                                                                    \
2265     {                                                                   \
2266       x = READ_OPTION_MASK & (read_options >> READ_OPTION_ ## NAME);    \
2267       if (x == READ_OPTION_INHERIT)                                     \
2268         x = !!SCM_ ## NAME;                                             \
2269           opts->name = x;                                               \
2270     }                                                                   \
2271   while (0)
2272
2273   RESOLVE_BOOLEAN_OPTION (COPY_SOURCE_P,        copy_source_p);
2274   RESOLVE_BOOLEAN_OPTION (RECORD_POSITIONS_P,   record_positions_p);
2275   RESOLVE_BOOLEAN_OPTION (CASE_INSENSITIVE_P,   case_insensitive_p);
2276   RESOLVE_BOOLEAN_OPTION (R6RS_ESCAPES_P,       r6rs_escapes_p);
2277   RESOLVE_BOOLEAN_OPTION (SQUARE_BRACKETS_P,    square_brackets_p);
2278   RESOLVE_BOOLEAN_OPTION (HUNGRY_EOL_ESCAPES_P, hungry_eol_escapes_p);
2279   RESOLVE_BOOLEAN_OPTION (CURLY_INFIX_P,        curly_infix_p);
2280
2281 #undef RESOLVE_BOOLEAN_OPTION
2282
2283   opts->neoteric_p = 0;
2284 }
2285
2286 void
2287 scm_init_read ()
2288 {
2289   SCM read_hash_procs;
2290
2291   read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
2292
2293   scm_i_read_hash_procedures =
2294     SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
2295
2296   scm_init_opts (scm_read_options, scm_read_opts);
2297 #include "libguile/read.x"
2298 }
2299
2300 /*
2301   Local Variables:
2302   c-file-style: "gnu"
2303   End:
2304 */