libguile/read.c

   1 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
   2  *   2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public License
   6  * as published by the Free Software Foundation; either version 3 of
   7  * the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful, but
  10  * WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  17  * 02110-1301 USA
  18  */
  19
  20
  21 \f
  22
  23 #ifdef HAVE_CONFIG_H
  24 # include <config.h>
  25 #endif
  26
  27 #include <stdio.h>
  28 #include <string.h>
  29 #include <unistd.h>
  30 #include <unicase.h>
  31 #include <unictype.h>
  32
  33 #include "libguile/_scm.h"
  34 #include "libguile/bytevectors.h"
  35 #include "libguile/chars.h"
  36 #include "libguile/eval.h"
  37 #include "libguile/arrays.h"
  38 #include "libguile/bitvectors.h"
  39 #include "libguile/keywords.h"
  40 #include "libguile/alist.h"
  41 #include "libguile/srcprop.h"
  42 #include "libguile/hashtab.h"
  43 #include "libguile/hash.h"
  44 #include "libguile/ports.h"
  45 #include "libguile/fports.h"
  46 #include "libguile/root.h"
  47 #include "libguile/strings.h"
  48 #include "libguile/strports.h"
  49 #include "libguile/vectors.h"
  50 #include "libguile/validate.h"
  51 #include "libguile/srfi-4.h"
  52 #include "libguile/srfi-13.h"
  53
  54 #include "libguile/read.h"
  55 #include "libguile/private-options.h"
  56
  57
  58 \f
  59
  60 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
  61 SCM_SYMBOL (scm_keyword_prefix, "prefix");
  62 SCM_SYMBOL (scm_keyword_postfix, "postfix");
  63 SCM_SYMBOL (sym_nil, "nil");
  64
  65 /* SRFI-105 curly infix expression support */
  66 SCM_SYMBOL (sym_nfx, "$nfx$");
  67 SCM_SYMBOL (sym_bracket_list, "$bracket-list$");
  68 SCM_SYMBOL (sym_bracket_apply, "$bracket-apply$");
  69
  70 scm_t_option scm_read_opts[] =
  71   {
  72     { SCM_OPTION_BOOLEAN, "copy", 0,
  73       "Copy source code expressions." },
  74     { SCM_OPTION_BOOLEAN, "positions", 1,
  75       "Record positions of source code expressions." },
  76     { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
  77       "Convert symbols to lower case."},
  78     { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
  79       "Style of keyword recognition: #f, 'prefix or 'postfix."},
  80     { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
  81       "Use R6RS variable-length character and string hex escapes."},
  82     { SCM_OPTION_BOOLEAN, "square-brackets", 1,
  83       "Treat `[' and `]' as parentheses, for R6RS compatibility."},
  84     { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
  85       "In strings, consume leading whitespace after an escaped end-of-line."},
  86     { SCM_OPTION_BOOLEAN, "curly-infix", 0,
  87       "Support SRFI-105 curly infix expressions."},
  88     { 0, },
  89   };
  90
  91 /* Internal read options structure.  This is initialized by 'scm_read'
  92    from the global and per-port read options, and a pointer is passed
  93    down to all helper functions. */
  94
  95 enum t_keyword_style
  96   {
  97     KEYWORD_STYLE_HASH_PREFIX,
  98     KEYWORD_STYLE_PREFIX,
  99     KEYWORD_STYLE_POSTFIX
 100   };
 101
 102 struct t_read_opts
 103 {
 104   enum t_keyword_style keyword_style;
 105   unsigned int copy_source_p        : 1;
 106   unsigned int record_positions_p   : 1;
 107   unsigned int case_insensitive_p   : 1;
 108   unsigned int r6rs_escapes_p       : 1;
 109   unsigned int square_brackets_p    : 1;
 110   unsigned int hungry_eol_escapes_p : 1;
 111   unsigned int curly_infix_p        : 1;
 112   unsigned int neoteric_p           : 1;
 113 };
 114
 115 typedef struct t_read_opts scm_t_read_opts;
 116
 117
 118 /*
 119   Give meaningful error messages for errors
 120
 121   We use the format
 122
 123   FILE:LINE:COL: MESSAGE
 124   This happened in ....
 125
 126   This is not standard GNU format, but the test-suite likes the real
 127   message to be in front.
 128
 129  */
 130
 131
 132 void
 133 scm_i_input_error (char const *function,
 134                    SCM port, const char *message, SCM arg)
 135 {
 136   SCM fn = (scm_is_string (SCM_FILENAME(port))
 137             ? SCM_FILENAME(port)
 138             : scm_from_locale_string ("#<unknown port>"));
 139
 140   SCM string_port = scm_open_output_string ();
 141   SCM string = SCM_EOL;
 142   scm_simple_format (string_port,
 143                      scm_from_locale_string ("~A:~S:~S: ~A"),
 144                      scm_list_4 (fn,
 145                                  scm_from_long (SCM_LINUM (port) + 1),
 146                                  scm_from_int (SCM_COL (port) + 1),
 147                                  scm_from_locale_string (message)));
 148
 149   string = scm_get_output_string (string_port);
 150   scm_close_output_port (string_port);
 151   scm_error_scm (scm_from_latin1_symbol ("read-error"),
 152                  function? scm_from_locale_string (function) : SCM_BOOL_F,
 153                  string,
 154                  arg,
 155                  SCM_BOOL_F);
 156 }
 157
 158
 159 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
 160             (SCM setting),
 161             "Option interface for the read options. Instead of using\n"
 162             "this procedure directly, use the procedures @code{read-enable},\n"
 163             "@code{read-disable}, @code{read-set!} and @code{read-options}.")
 164 #define FUNC_NAME s_scm_read_options
 165 {
 166   SCM ans = scm_options (setting,
 167                          scm_read_opts,
 168                          FUNC_NAME);
 169   if (SCM_COPY_SOURCE_P)
 170     SCM_RECORD_POSITIONS_P = 1;
 171   return ans;
 172 }
 173 #undef FUNC_NAME
 174
 175 /* A fluid referring to an association list mapping extra hash
 176    characters to procedures.  */
 177 static SCM *scm_i_read_hash_procedures;
 178
 179 static SCM
 180 scm_i_read_hash_procedures_ref (void)
 181 {
 182   return scm_fluid_ref (*scm_i_read_hash_procedures);
 183 }
 184
 185 static void
 186 scm_i_read_hash_procedures_set_x (SCM value)
 187 {
 188   scm_fluid_set_x (*scm_i_read_hash_procedures, value);
 189 }
 190
 191 \f
 192 /* Token readers.  */
 193
 194
 195 /* Size of the C buffer used to read symbols and numbers.  */
 196 #define READER_BUFFER_SIZE            128
 197
 198 /* Number of 32-bit codepoints in the buffer used to read strings.  */
 199 #define READER_STRING_BUFFER_SIZE     128
 200
 201 /* The maximum size of Scheme character names.  */
 202 #define READER_CHAR_NAME_MAX_SIZE      50
 203
 204 /* The maximum size of reader directive names.  */
 205 #define READER_DIRECTIVE_NAME_MAX_SIZE 50
 206
 207
 208 /* `isblank' is only in C99.  */
 209 #define CHAR_IS_BLANK_(_chr)                                    \
 210   (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n')      \
 211    || ((_chr) == '\f') || ((_chr) == '\r'))
 212
 213 #ifdef MSDOS
 214 # define CHAR_IS_BLANK(_chr)                    \
 215   ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
 216 #else
 217 # define CHAR_IS_BLANK CHAR_IS_BLANK_
 218 #endif
 219
 220
 221 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
 222    structure'').  */
 223 #define CHAR_IS_R5RS_DELIMITER(c)                               \
 224   (CHAR_IS_BLANK (c)                                            \
 225    || (c) == ')' || (c) == '(' || (c) == ';' || (c) == '"')
 226
 227 #define CHAR_IS_DELIMITER(c)                                    \
 228   (CHAR_IS_R5RS_DELIMITER (c)                                   \
 229    || (((c) == ']' || (c) == '[') && (opts->square_brackets_p   \
 230                                       || opts->curly_infix_p))  \
 231    || (((c) == '}' || (c) == '{') && opts->curly_infix_p))
 232
 233 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
 234    Structure''.  */
 235 #define CHAR_IS_EXPONENT_MARKER(_chr)                           \
 236   (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f')        \
 237    || ((_chr) == 'd') || ((_chr) == 'l'))
 238
 239 /* Read an SCSH block comment.  */
 240 static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
 241 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
 242 static SCM scm_read_commented_expression (scm_t_wchar, SCM, scm_t_read_opts *);
 243 static SCM scm_read_shebang (scm_t_wchar, SCM, scm_t_read_opts *);
 244 static SCM scm_get_hash_procedure (int);
 245
 246 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 247    result in the pre-allocated buffer BUF.  Return zero if the whole token has
 248    fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
 249    bytes actually read.  */
 250 static int
 251 read_token (SCM port, scm_t_read_opts *opts,
 252             char *buf, size_t buf_size, size_t *read)
 253 {
 254    *read = 0;
 255
 256    while (*read < buf_size)
 257      {
 258        int chr;
 259
 260        chr = scm_get_byte_or_eof_unlocked (port);
 261
 262        if (chr == EOF)
 263         return 0;
 264       else if (CHAR_IS_DELIMITER (chr))
 265         {
 266           scm_unget_byte_unlocked (chr, port);
 267           return 0;
 268         }
 269       else
 270         {
 271           *buf = (char) chr;
 272           buf++, (*read)++;
 273         }
 274      }
 275
 276    return 1;
 277  }
 278
 279 /* Like `read_token', but return either BUFFER, or a GC-allocated buffer
 280    if the token doesn't fit in BUFFER_SIZE bytes.  */
 281 static char *
 282 read_complete_token (SCM port, scm_t_read_opts *opts,
 283                      char *buffer, size_t buffer_size, size_t *read)
 284 {
 285   int overflow = 0;
 286   size_t bytes_read, overflow_size = 0;
 287   char *overflow_buffer = NULL;
 288
 289   do
 290     {
 291       overflow = read_token (port, opts, buffer, buffer_size, &bytes_read);
 292       if (bytes_read == 0)
 293         break;
 294       if (overflow || overflow_size != 0)
 295         {
 296           if (overflow_size == 0)
 297             {
 298               overflow_buffer = scm_gc_malloc_pointerless (bytes_read, "read");
 299               memcpy (overflow_buffer, buffer, bytes_read);
 300               overflow_size = bytes_read;
 301             }
 302           else
 303             {
 304               char *new_buf =
 305                 scm_gc_malloc_pointerless (overflow_size + bytes_read, "read");
 306
 307               memcpy (new_buf, overflow_buffer, overflow_size);
 308               memcpy (new_buf + overflow_size, buffer, bytes_read);
 309
 310               overflow_buffer = new_buf;
 311               overflow_size += bytes_read;
 312             }
 313         }
 314     }
 315   while (overflow);
 316
 317   if (overflow_size)
 318     *read = overflow_size;
 319   else
 320     *read = bytes_read;
 321
 322   return (overflow_size > 0 ? overflow_buffer : buffer);
 323 }
 324
 325 /* Skip whitespace from PORT and return the first non-whitespace character
 326    read.  Raise an error on end-of-file.  */
 327 static int
 328 flush_ws (SCM port, scm_t_read_opts *opts, const char *eoferr)
 329 {
 330   scm_t_wchar c;
 331   while (1)
 332     switch (c = scm_getc_unlocked (port))
 333       {
 334       case EOF:
 335       goteof:
 336         if (eoferr)
 337           {
 338             scm_i_input_error (eoferr,
 339                                port,
 340                                "end of file",
 341                                SCM_EOL);
 342           }
 343         return c;
 344
 345       case ';':
 346       lp:
 347         switch (c = scm_getc_unlocked (port))
 348           {
 349           case EOF:
 350             goto goteof;
 351           default:
 352             goto lp;
 353           case SCM_LINE_INCREMENTORS:
 354             break;
 355           }
 356         break;
 357
 358       case '#':
 359         switch (c = scm_getc_unlocked (port))
 360           {
 361           case EOF:
 362             eoferr = "read_sharp";
 363             goto goteof;
 364           case '!':
 365             scm_read_shebang (c, port, opts);
 366             break;
 367           case ';':
 368             scm_read_commented_expression (c, port, opts);
 369             break;
 370           case '|':
 371             if (scm_is_false (scm_get_hash_procedure (c)))
 372               {
 373                 scm_read_r6rs_block_comment (c, port);
 374                 break;
 375               }
 376             /* fall through */
 377           default:
 378             scm_ungetc_unlocked (c, port);
 379             return '#';
 380           }
 381         break;
 382
 383       case SCM_LINE_INCREMENTORS:
 384       case SCM_SINGLE_SPACES:
 385       case '\t':
 386         break;
 387
 388       default:
 389         return c;
 390       }
 391
 392   return 0;
 393 }
 394
 395
 396 \f
 397 /* Token readers.  */
 398
 399 static SCM scm_read_expression (SCM port, scm_t_read_opts *opts);
 400 static SCM scm_read_sharp (int chr, SCM port, scm_t_read_opts *opts,
 401                            long line, int column);
 402
 403
 404 static SCM
 405 maybe_annotate_source (SCM x, SCM port, scm_t_read_opts *opts,
 406                        long line, int column)
 407 {
 408   if (opts->record_positions_p)
 409     scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
 410   return x;
 411 }
 412
 413 static SCM
 414 scm_read_sexp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 415 #define FUNC_NAME "scm_i_lreadparen"
 416 {
 417   int c;
 418   SCM tmp, tl, ans = SCM_EOL;
 419   const int curly_list_p = (chr == '{') && opts->curly_infix_p;
 420   const int terminating_char = ((chr == '{') ? '}'
 421                                 : ((chr == '[') ? ']'
 422                                    : ')'));
 423
 424   /* Need to capture line and column numbers here. */
 425   long line = SCM_LINUM (port);
 426   int column = SCM_COL (port) - 1;
 427
 428   c = flush_ws (port, opts, FUNC_NAME);
 429   if (terminating_char == c)
 430     return SCM_EOL;
 431
 432   scm_ungetc_unlocked (c, port);
 433   tmp = scm_read_expression (port, opts);
 434
 435   /* Note that it is possible for scm_read_expression to return
 436      scm_sym_dot, but not as part of a dotted pair: as in #{.}#.  So
 437      check that it's a real dot by checking `c'.  */
 438   if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
 439     {
 440       ans = scm_read_expression (port, opts);
 441       if (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
 442         scm_i_input_error (FUNC_NAME, port, "missing close paren",
 443                            SCM_EOL);
 444       return ans;
 445     }
 446
 447   /* Build the head of the list structure. */
 448   ans = tl = scm_cons (tmp, SCM_EOL);
 449
 450   while (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
 451     {
 452       SCM new_tail;
 453
 454       if (c == ')' || (c == ']' && opts->square_brackets_p)
 455           || ((c == '}' || c == ']') && opts->curly_infix_p))
 456         scm_i_input_error (FUNC_NAME, port,
 457                            "in pair: mismatched close paren: ~A",
 458                            scm_list_1 (SCM_MAKE_CHAR (c)));
 459
 460       scm_ungetc_unlocked (c, port);
 461       tmp = scm_read_expression (port, opts);
 462
 463       /* See above note about scm_sym_dot.  */
 464       if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
 465         {
 466           SCM_SETCDR (tl, scm_read_expression (port, opts));
 467
 468           c = flush_ws (port, opts, FUNC_NAME);
 469           if (terminating_char != c)
 470             scm_i_input_error (FUNC_NAME, port,
 471                                "in pair: missing close paren", SCM_EOL);
 472           break;
 473         }
 474
 475       new_tail = scm_cons (tmp, SCM_EOL);
 476       SCM_SETCDR (tl, new_tail);
 477       tl = new_tail;
 478     }
 479
 480   if (curly_list_p)
 481     {
 482       /* In addition to finding the length, 'scm_ilength' checks for
 483          improper or circular lists, in which case it returns -1. */
 484       int len = scm_ilength (ans);
 485
 486       /* The (len == 0) case is handled above */
 487       if (len == 1)
 488         /* Return directly to avoid re-annotating the element's source
 489            location with the position of the outer brace.  Also, it
 490            might not be possible to annotate the element. */
 491         return scm_car (ans);  /* {e} => e */
 492       else if (len == 2)
 493         ;  /* Leave the list unchanged: {e1 e2} => (e1 e2) */
 494       else if (len >= 3 && (len & 1))
 495         {
 496           /* It's a proper list whose length is odd and at least 3.  If
 497              the elements at odd indices (the infix operator positions)
 498              are all 'equal?', then it's a simple curly-infix list.
 499              Otherwise it's a mixed curly-infix list. */
 500           SCM op = scm_cadr (ans);
 501
 502           /* Check to see if the elements at odd indices are 'equal?' */
 503           for (tl = scm_cdddr (ans); ; tl = scm_cddr (tl))
 504             {
 505               if (scm_is_null (tl))
 506                 {
 507                   /* Convert simple curly-infix list to prefix:
 508                      {a <op> b <op> ...} => (<op> a b ...) */
 509                   tl = ans;
 510                   while (scm_is_pair (scm_cdr (tl)))
 511                     {
 512                       tmp = scm_cddr (tl);
 513                       SCM_SETCDR (tl, tmp);
 514                       tl = tmp;
 515                     }
 516                   ans = scm_cons (op, ans);
 517                   break;
 518                 }
 519               else if (scm_is_false (scm_equal_p (op, scm_car (tl))))
 520                 {
 521                   /* Mixed curly-infix list: {e ...} => ($nfx$ e ...) */
 522                   ans = scm_cons (sym_nfx, ans);
 523                   break;
 524                 }
 525             }
 526         }
 527       else
 528         /* Mixed curly-infix (possibly improper) list:
 529            {e . tail} => ($nfx$ e . tail) */
 530         ans = scm_cons (sym_nfx, ans);
 531     }
 532
 533   return maybe_annotate_source (ans, port, opts, line, column);
 534 }
 535 #undef FUNC_NAME
 536
 537
 538 /* Read a hexadecimal number NDIGITS in length.  Put its value into the variable
 539    C.  If TERMINATOR is non-null, terminate early if the TERMINATOR character is
 540    found.  */
 541 #define SCM_READ_HEX_ESCAPE(ndigits, terminator)                   \
 542   do                                                               \
 543     {                                                              \
 544       scm_t_wchar a;                                               \
 545       size_t i = 0;                                                \
 546       c = 0;                                                       \
 547       while (i < ndigits)                                          \
 548         {                                                          \
 549           a = scm_getc_unlocked (port);                                     \
 550           if (a == EOF)                                            \
 551             goto str_eof;                                          \
 552           if (terminator                                           \
 553               && (a == (scm_t_wchar) terminator)                   \
 554               && (i > 0))                                          \
 555             break;                                                 \
 556           if ('0' <= a && a <= '9')                                \
 557             a -= '0';                                              \
 558           else if ('A' <= a && a <= 'F')                           \
 559             a = a - 'A' + 10;                                      \
 560           else if ('a' <= a && a <= 'f')                           \
 561             a = a - 'a' + 10;                                      \
 562           else                                                     \
 563             {                                                      \
 564               c = a;                                               \
 565               goto bad_escaped;                                    \
 566             }                                                      \
 567           c = c * 16 + a;                                          \
 568           i ++;                                                    \
 569         }                                                          \
 570     } while (0)
 571
 572 static void
 573 skip_intraline_whitespace (SCM port)
 574 {
 575   scm_t_wchar c;
 576
 577   do
 578     {
 579       c = scm_getc_unlocked (port);
 580       if (c == EOF)
 581         return;
 582     }
 583   while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
 584
 585   scm_ungetc_unlocked (c, port);
 586 }
 587
 588 static SCM
 589 scm_read_string (int chr, SCM port, scm_t_read_opts *opts)
 590 #define FUNC_NAME "scm_lreadr"
 591 {
 592   /* For strings smaller than C_STR, this function creates only one Scheme
 593      object (the string returned).  */
 594
 595   SCM str = SCM_EOL;
 596   size_t c_str_len = 0;
 597   scm_t_wchar c, c_str[READER_STRING_BUFFER_SIZE];
 598
 599   /* Need to capture line and column numbers here. */
 600   long line = SCM_LINUM (port);
 601   int column = SCM_COL (port) - 1;
 602
 603   while ('"' != (c = scm_getc_unlocked (port)))
 604     {
 605       if (c == EOF)
 606         {
 607         str_eof:
 608           scm_i_input_error (FUNC_NAME, port,
 609                              "end of file in string constant", SCM_EOL);
 610         }
 611
 612       if (c_str_len + 1 >= READER_STRING_BUFFER_SIZE)
 613         {
 614           str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
 615           c_str_len = 0;
 616         }
 617
 618       if (c == '\\')
 619         {
 620           switch (c = scm_getc_unlocked (port))
 621             {
 622             case EOF:
 623               goto str_eof;
 624             case '"':
 625             case '\\':
 626               break;
 627             case '\n':
 628               if (opts->hungry_eol_escapes_p)
 629                 skip_intraline_whitespace (port);
 630               continue;
 631             case '0':
 632               c = '\0';
 633               break;
 634             case 'f':
 635               c = '\f';
 636               break;
 637             case 'n':
 638               c = '\n';
 639               break;
 640             case 'r':
 641               c = '\r';
 642               break;
 643             case 't':
 644               c = '\t';
 645               break;
 646             case 'a':
 647               c = '\007';
 648               break;
 649             case 'v':
 650               c = '\v';
 651               break;
 652             case 'b':
 653               c = '\010';
 654               break;
 655             case 'x':
 656               if (opts->r6rs_escapes_p)
 657                 SCM_READ_HEX_ESCAPE (10, ';');
 658               else
 659                 SCM_READ_HEX_ESCAPE (2, '\0');
 660               break;
 661             case 'u':
 662               if (!opts->r6rs_escapes_p)
 663                 {
 664                   SCM_READ_HEX_ESCAPE (4, '\0');
 665                   break;
 666                 }
 667             case 'U':
 668               if (!opts->r6rs_escapes_p)
 669                 {
 670                   SCM_READ_HEX_ESCAPE (6, '\0');
 671                   break;
 672                 }
 673             default:
 674             bad_escaped:
 675               scm_i_input_error (FUNC_NAME, port,
 676                                  "illegal character in escape sequence: ~S",
 677                                  scm_list_1 (SCM_MAKE_CHAR (c)));
 678             }
 679         }
 680
 681       c_str[c_str_len++] = c;
 682     }
 683
 684   if (scm_is_null (str))
 685     /* Fast path: we got a string that fits in C_STR.  */
 686     str = scm_from_utf32_stringn (c_str, c_str_len);
 687   else
 688     {
 689       if (c_str_len > 0)
 690         str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
 691
 692       str = scm_string_concatenate_reverse (str, SCM_UNDEFINED, SCM_UNDEFINED);
 693     }
 694
 695   return maybe_annotate_source (str, port, opts, line, column);
 696 }
 697 #undef FUNC_NAME
 698
 699
 700 static SCM
 701 scm_read_number (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 702 {
 703   SCM result, str = SCM_EOL;
 704   char local_buffer[READER_BUFFER_SIZE], *buffer;
 705   size_t bytes_read;
 706
 707   /* Need to capture line and column numbers here. */
 708   long line = SCM_LINUM (port);
 709   int column = SCM_COL (port) - 1;
 710
 711   scm_ungetc_unlocked (chr, port);
 712   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 713                                 &bytes_read);
 714
 715   str = scm_from_port_stringn (buffer, bytes_read, port);
 716
 717   result = scm_string_to_number (str, SCM_UNDEFINED);
 718   if (scm_is_false (result))
 719     {
 720       /* Return a symbol instead of a number */
 721       if (opts->case_insensitive_p)
 722         str = scm_string_downcase_x (str);
 723       result = scm_string_to_symbol (str);
 724     }
 725   else if (SCM_NIMP (result))
 726     result = maybe_annotate_source (result, port, opts, line, column);
 727
 728   SCM_COL (port) += scm_i_string_length (str);
 729   return result;
 730 }
 731
 732 static SCM
 733 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 734 {
 735   SCM result;
 736   int ends_with_colon = 0;
 737   size_t bytes_read;
 738   int postfix = (opts->keyword_style == KEYWORD_STYLE_POSTFIX);
 739   char local_buffer[READER_BUFFER_SIZE], *buffer;
 740   SCM str;
 741
 742   scm_ungetc_unlocked (chr, port);
 743   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 744                                 &bytes_read);
 745   if (bytes_read > 0)
 746     ends_with_colon = buffer[bytes_read - 1] == ':';
 747
 748   if (postfix && ends_with_colon && (bytes_read > 1))
 749     {
 750       str = scm_from_port_stringn (buffer, bytes_read - 1, port);
 751
 752       if (opts->case_insensitive_p)
 753         str = scm_string_downcase_x (str);
 754       result = scm_symbol_to_keyword (scm_string_to_symbol (str));
 755     }
 756   else
 757     {
 758       str = scm_from_port_stringn (buffer, bytes_read, port);
 759
 760       if (opts->case_insensitive_p)
 761         str = scm_string_downcase_x (str);
 762       result = scm_string_to_symbol (str);
 763     }
 764
 765   SCM_COL (port) += scm_i_string_length (str);
 766   return result;
 767 }
 768
 769 static SCM
 770 scm_read_number_and_radix (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 771 #define FUNC_NAME "scm_lreadr"
 772 {
 773   SCM result;
 774   size_t read;
 775   char local_buffer[READER_BUFFER_SIZE], *buffer;
 776   unsigned int radix;
 777   SCM str;
 778
 779   switch (chr)
 780     {
 781     case 'B':
 782     case 'b':
 783       radix = 2;
 784       break;
 785
 786     case 'o':
 787     case 'O':
 788       radix = 8;
 789       break;
 790
 791     case 'd':
 792     case 'D':
 793       radix = 10;
 794       break;
 795
 796     case 'x':
 797     case 'X':
 798       radix = 16;
 799       break;
 800
 801     default:
 802       scm_ungetc_unlocked (chr, port);
 803       scm_ungetc_unlocked ('#', port);
 804       radix = 10;
 805     }
 806
 807   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 808                                 &read);
 809
 810   str = scm_from_port_stringn (buffer, read, port);
 811
 812   result = scm_string_to_number (str, scm_from_uint (radix));
 813
 814   SCM_COL (port) += scm_i_string_length (str);
 815
 816   if (scm_is_true (result))
 817     return result;
 818
 819   scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
 820
 821   return SCM_BOOL_F;
 822 }
 823 #undef FUNC_NAME
 824
 825 static SCM
 826 scm_read_quote (int chr, SCM port, scm_t_read_opts *opts)
 827 {
 828   SCM p;
 829   long line = SCM_LINUM (port);
 830   int column = SCM_COL (port) - 1;
 831
 832   switch (chr)
 833     {
 834     case '`':
 835       p = scm_sym_quasiquote;
 836       break;
 837
 838     case '\'':
 839       p = scm_sym_quote;
 840       break;
 841
 842     case ',':
 843       {
 844         scm_t_wchar c;
 845
 846         c = scm_getc_unlocked (port);
 847         if ('@' == c)
 848           p = scm_sym_uq_splicing;
 849         else
 850           {
 851             scm_ungetc_unlocked (c, port);
 852             p = scm_sym_unquote;
 853           }
 854         break;
 855       }
 856
 857     default:
 858       fprintf (stderr, "%s: unhandled quote character (%i)\n",
 859                "scm_read_quote", chr);
 860       abort ();
 861     }
 862
 863   p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
 864   return maybe_annotate_source (p, port, opts, line, column);
 865 }
 866
 867 SCM_SYMBOL (sym_syntax, "syntax");
 868 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
 869 SCM_SYMBOL (sym_unsyntax, "unsyntax");
 870 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
 871
 872 static SCM
 873 scm_read_syntax (int chr, SCM port, scm_t_read_opts *opts)
 874 {
 875   SCM p;
 876   long line = SCM_LINUM (port);
 877   int column = SCM_COL (port) - 1;
 878
 879   switch (chr)
 880     {
 881     case '`':
 882       p = sym_quasisyntax;
 883       break;
 884
 885     case '\'':
 886       p = sym_syntax;
 887       break;
 888
 889     case ',':
 890       {
 891         int c;
 892
 893         c = scm_getc_unlocked (port);
 894         if ('@' == c)
 895           p = sym_unsyntax_splicing;
 896         else
 897           {
 898             scm_ungetc_unlocked (c, port);
 899             p = sym_unsyntax;
 900           }
 901         break;
 902       }
 903
 904     default:
 905       fprintf (stderr, "%s: unhandled syntax character (%i)\n",
 906                "scm_read_syntax", chr);
 907       abort ();
 908     }
 909
 910   p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
 911   return maybe_annotate_source (p, port, opts, line, column);
 912 }
 913
 914 static SCM
 915 scm_read_nil (int chr, SCM port, scm_t_read_opts *opts)
 916 {
 917   SCM id = scm_read_mixed_case_symbol (chr, port, opts);
 918
 919   if (!scm_is_eq (id, sym_nil))
 920     scm_i_input_error ("scm_read_nil", port,
 921                        "unexpected input while reading #nil: ~a",
 922                        scm_list_1 (id));
 923
 924   return SCM_ELISP_NIL;
 925 }
 926
 927 static SCM
 928 scm_read_semicolon_comment (int chr, SCM port)
 929 {
 930   int c;
 931
 932   /* We use the get_byte here because there is no need to get the
 933      locale correct with comment input. This presumes that newline
 934      always represents itself no matter what the encoding is.  */
 935   for (c = scm_get_byte_or_eof_unlocked (port);
 936        (c != EOF) && (c != '\n');
 937        c = scm_get_byte_or_eof_unlocked (port));
 938
 939   return SCM_UNSPECIFIED;
 940 }
 941
 942 \f
 943 /* Sharp readers, i.e. readers called after a `#' sign has been read.  */
 944
 945 static SCM
 946 scm_read_boolean (int chr, SCM port)
 947 {
 948   switch (chr)
 949     {
 950     case 't':
 951     case 'T':
 952       return SCM_BOOL_T;
 953
 954     case 'f':
 955     case 'F':
 956       return SCM_BOOL_F;
 957     }
 958
 959   return SCM_UNSPECIFIED;
 960 }
 961
 962 static SCM
 963 scm_read_character (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 964 #define FUNC_NAME "scm_lreadr"
 965 {
 966   char buffer[READER_CHAR_NAME_MAX_SIZE];
 967   SCM charname;
 968   size_t charname_len, bytes_read;
 969   scm_t_wchar cp;
 970   int overflow;
 971   scm_t_port *pt;
 972
 973   overflow = read_token (port, opts, buffer, READER_CHAR_NAME_MAX_SIZE,
 974                          &bytes_read);
 975   if (overflow)
 976     scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
 977
 978   if (bytes_read == 0)
 979     {
 980       chr = scm_getc_unlocked (port);
 981       if (chr == EOF)
 982         scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
 983                            "while reading character", SCM_EOL);
 984
 985       /* CHR must be a token delimiter, like a whitespace.  */
 986       return (SCM_MAKE_CHAR (chr));
 987     }
 988
 989   pt = SCM_PTAB_ENTRY (port);
 990
 991   /* Simple ASCII characters can be processed immediately.  Also, simple
 992      ISO-8859-1 characters can be processed immediately if the encoding for this
 993      port is ISO-8859-1.  */
 994   if (bytes_read == 1 &&
 995       ((unsigned char) buffer[0] <= 127
 996        || pt->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1))
 997     {
 998       SCM_COL (port) += 1;
 999       return SCM_MAKE_CHAR (buffer[0]);
1000     }
1001
1002   /* Otherwise, convert the buffer into a proper scheme string for
1003      processing.  */
1004   charname = scm_from_port_stringn (buffer, bytes_read, port);
1005   charname_len = scm_i_string_length (charname);
1006   SCM_COL (port) += charname_len;
1007   cp = scm_i_string_ref (charname, 0);
1008   if (charname_len == 1)
1009     return SCM_MAKE_CHAR (cp);
1010
1011   /* Ignore dotted circles, which may be used to keep combining characters from
1012      combining with the backslash in #\charname.  */
1013   if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
1014     return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
1015
1016   if (cp >= '0' && cp < '8')
1017     {
1018       /* Dirk:FIXME::  This type of character syntax is not R5RS
1019        * compliant.  Further, it should be verified that the constant
1020        * does only consist of octal digits.  */
1021       SCM p = scm_string_to_number (charname, scm_from_uint (8));
1022       if (SCM_I_INUMP (p))
1023         {
1024           scm_t_wchar c = scm_to_uint32 (p);
1025           if (SCM_IS_UNICODE_CHAR (c))
1026             return SCM_MAKE_CHAR (c);
1027           else
1028             scm_i_input_error (FUNC_NAME, port,
1029                                "out-of-range octal character escape: ~a",
1030                                scm_list_1 (charname));
1031         }
1032     }
1033
1034   if (cp == 'x' && (charname_len > 1))
1035     {
1036       SCM p;
1037
1038       /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1039       p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
1040                                 scm_from_uint (16));
1041       if (SCM_I_INUMP (p))
1042         {
1043           scm_t_wchar c = scm_to_uint32 (p);
1044           if (SCM_IS_UNICODE_CHAR (c))
1045             return SCM_MAKE_CHAR (c);
1046           else
1047             scm_i_input_error (FUNC_NAME, port,
1048                                "out-of-range hex character escape: ~a",
1049                                scm_list_1 (charname));
1050         }
1051     }
1052
1053   /* The names of characters should never have non-Latin1
1054      characters.  */
1055   if (scm_i_is_narrow_string (charname)
1056       || scm_i_try_narrow_string (charname))
1057     { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
1058                                        charname_len);
1059       if (scm_is_true (ch))
1060         return ch;
1061     }
1062
1063   scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
1064                      scm_list_1 (charname));
1065
1066   return SCM_UNSPECIFIED;
1067 }
1068 #undef FUNC_NAME
1069
1070 static SCM
1071 scm_read_keyword (int chr, SCM port, scm_t_read_opts *opts)
1072 {
1073   SCM symbol;
1074
1075   /* Read the symbol that comprises the keyword.  Doing this instead of
1076      invoking a specific symbol reader function allows `scm_read_keyword ()'
1077      to adapt to the delimiters currently valid of symbols.
1078
1079      XXX: This implementation allows sloppy syntaxes like `#:  key'.  */
1080   symbol = scm_read_expression (port, opts);
1081   if (!scm_is_symbol (symbol))
1082     scm_i_input_error ("scm_read_keyword", port,
1083                        "keyword prefix `~a' not followed by a symbol: ~s",
1084                        scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1085
1086   return (scm_symbol_to_keyword (symbol));
1087 }
1088
1089 static SCM
1090 scm_read_vector (int chr, SCM port, scm_t_read_opts *opts,
1091                  long line, int column)
1092 {
1093   /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1094      guarantee that it's going to do what we want.  After all, this is an
1095      implementation detail of `scm_read_vector ()', not a desirable
1096      property.  */
1097   return maybe_annotate_source (scm_vector (scm_read_sexp (chr, port, opts)),
1098                                 port, opts, line, column);
1099 }
1100
1101 /* Helper used by scm_read_array */
1102 static int
1103 read_decimal_integer (SCM port, int c, ssize_t *resp)
1104 {
1105   ssize_t sign = 1;
1106   ssize_t res = 0;
1107   int got_it = 0;
1108
1109   if (c == '-')
1110     {
1111       sign = -1;
1112       c = scm_getc_unlocked (port);
1113     }
1114
1115   while ('0' <= c && c <= '9')
1116     {
1117       res = 10*res + c-'0';
1118       got_it = 1;
1119       c = scm_getc_unlocked (port);
1120     }
1121
1122   if (got_it)
1123     *resp = sign * res;
1124   return c;
1125 }
1126
1127 /* Read an array.  This function can also read vectors and uniform
1128    vectors.  Also, the conflict between '#f' and '#f32' and '#f64' is
1129    handled here.
1130
1131    C is the first character read after the '#'. */
1132 static SCM
1133 scm_read_array (int c, SCM port, scm_t_read_opts *opts, long line, int column)
1134 {
1135   ssize_t rank;
1136   scm_t_wchar tag_buf[8];
1137   int tag_len;
1138
1139   SCM tag, shape = SCM_BOOL_F, elements, array;
1140
1141   /* XXX - shortcut for ordinary vectors.  Shouldn't be necessary but
1142      the array code can not deal with zero-length dimensions yet, and
1143      we want to allow zero-length vectors, of course. */
1144   if (c == '(')
1145     return scm_read_vector (c, port, opts, line, column);
1146
1147   /* Disambiguate between '#f' and uniform floating point vectors. */
1148   if (c == 'f')
1149     {
1150       c = scm_getc_unlocked (port);
1151       if (c != '3' && c != '6')
1152         {
1153           if (c != EOF)
1154             scm_ungetc_unlocked (c, port);
1155           return SCM_BOOL_F;
1156         }
1157       rank = 1;
1158       tag_buf[0] = 'f';
1159       tag_len = 1;
1160       goto continue_reading_tag;
1161     }
1162
1163   /* Read rank. */
1164   rank = 1;
1165   c = read_decimal_integer (port, c, &rank);
1166   if (rank < 0)
1167     scm_i_input_error (NULL, port, "array rank must be non-negative",
1168                        SCM_EOL);
1169
1170   /* Read tag. */
1171   tag_len = 0;
1172  continue_reading_tag:
1173   while (c != EOF && c != '(' && c != '@' && c != ':'
1174          && tag_len < sizeof tag_buf / sizeof tag_buf[0])
1175     {
1176       tag_buf[tag_len++] = c;
1177       c = scm_getc_unlocked (port);
1178     }
1179   if (tag_len == 0)
1180     tag = SCM_BOOL_T;
1181   else
1182     {
1183       tag = scm_string_to_symbol (scm_from_utf32_stringn (tag_buf, tag_len));
1184       if (tag_len == sizeof tag_buf / sizeof tag_buf[0])
1185         scm_i_input_error (NULL, port, "invalid array tag, starting with: ~a",
1186                            scm_list_1 (tag));
1187     }
1188
1189   /* Read shape. */
1190   if (c == '@' || c == ':')
1191     {
1192       shape = SCM_EOL;
1193
1194       do
1195         {
1196           ssize_t lbnd = 0, len = 0;
1197           SCM s;
1198
1199           if (c == '@')
1200             {
1201               c = scm_getc_unlocked (port);
1202               c = read_decimal_integer (port, c, &lbnd);
1203             }
1204
1205           s = scm_from_ssize_t (lbnd);
1206
1207           if (c == ':')
1208             {
1209               c = scm_getc_unlocked (port);
1210               c = read_decimal_integer (port, c, &len);
1211               if (len < 0)
1212                 scm_i_input_error (NULL, port,
1213                                    "array length must be non-negative",
1214                                    SCM_EOL);
1215
1216               s = scm_list_2 (s, scm_from_ssize_t (lbnd+len-1));
1217             }
1218
1219           shape = scm_cons (s, shape);
1220         } while (c == '@' || c == ':');
1221
1222       shape = scm_reverse_x (shape, SCM_EOL);
1223     }
1224
1225   /* Read nested lists of elements. */
1226   if (c != '(')
1227     scm_i_input_error (NULL, port,
1228                        "missing '(' in vector or array literal",
1229                        SCM_EOL);
1230   elements = scm_read_sexp (c, port, opts);
1231
1232   if (scm_is_false (shape))
1233     shape = scm_from_ssize_t (rank);
1234   else if (scm_ilength (shape) != rank)
1235     scm_i_input_error
1236       (NULL, port,
1237        "the number of shape specifications must match the array rank",
1238        SCM_EOL);
1239
1240   /* Handle special print syntax of rank zero arrays; see
1241      scm_i_print_array for a rationale. */
1242   if (rank == 0)
1243     {
1244       if (!scm_is_pair (elements))
1245         scm_i_input_error (NULL, port,
1246                            "too few elements in array literal, need 1",
1247                            SCM_EOL);
1248       if (!scm_is_null (SCM_CDR (elements)))
1249         scm_i_input_error (NULL, port,
1250                            "too many elements in array literal, want 1",
1251                            SCM_EOL);
1252       elements = SCM_CAR (elements);
1253     }
1254
1255   /* Construct array, annotate with source location, and return. */
1256   array = scm_list_to_typed_array (tag, shape, elements);
1257   return maybe_annotate_source (array, port, opts, line, column);
1258 }
1259
1260 static SCM
1261 scm_read_srfi4_vector (int chr, SCM port, scm_t_read_opts *opts,
1262                        long line, int column)
1263 {
1264   return scm_read_array (chr, port, opts, line, column);
1265 }
1266
1267 static SCM
1268 scm_read_bytevector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1269                      long line, int column)
1270 {
1271   chr = scm_getc_unlocked (port);
1272   if (chr != 'u')
1273     goto syntax;
1274
1275   chr = scm_getc_unlocked (port);
1276   if (chr != '8')
1277     goto syntax;
1278
1279   chr = scm_getc_unlocked (port);
1280   if (chr != '(')
1281     goto syntax;
1282
1283   return maybe_annotate_source
1284     (scm_u8_list_to_bytevector (scm_read_sexp (chr, port, opts)),
1285      port, opts, line, column);
1286
1287  syntax:
1288   scm_i_input_error ("read_bytevector", port,
1289                      "invalid bytevector prefix",
1290                      SCM_MAKE_CHAR (chr));
1291   return SCM_UNSPECIFIED;
1292 }
1293
1294 static SCM
1295 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1296                            long line, int column)
1297 {
1298   /* Read the `#*10101'-style read syntax for bit vectors in Guile.  This is
1299      terribly inefficient but who cares?  */
1300   SCM s_bits = SCM_EOL;
1301
1302   for (chr = scm_getc_unlocked (port);
1303        (chr != EOF) && ((chr == '0') || (chr == '1'));
1304        chr = scm_getc_unlocked (port))
1305     {
1306       s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1307     }
1308
1309   if (chr != EOF)
1310     scm_ungetc_unlocked (chr, port);
1311
1312   return maybe_annotate_source
1313     (scm_bitvector (scm_reverse_x (s_bits, SCM_EOL)),
1314      port, opts, line, column);
1315 }
1316
1317 static SCM
1318 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1319 {
1320   int bang_seen = 0;
1321
1322   for (;;)
1323     {
1324       int c = scm_getc_unlocked (port);
1325
1326       if (c == EOF)
1327         scm_i_input_error ("skip_block_comment", port,
1328                            "unterminated `#! ... !#' comment", SCM_EOL);
1329
1330       if (c == '!')
1331         bang_seen = 1;
1332       else if (c == '#' && bang_seen)
1333         break;
1334       else
1335         bang_seen = 0;
1336     }
1337
1338   return SCM_UNSPECIFIED;
1339 }
1340
1341 static void set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts,
1342                                          int value);
1343 static void set_port_square_brackets_p (SCM port, scm_t_read_opts *opts,
1344                                         int value);
1345 static void set_port_curly_infix_p (SCM port, scm_t_read_opts *opts,
1346                                     int value);
1347
1348 static SCM
1349 scm_read_shebang (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
1350 {
1351   char name[READER_DIRECTIVE_NAME_MAX_SIZE + 1];
1352   int c;
1353   int i = 0;
1354
1355   while (i <= READER_DIRECTIVE_NAME_MAX_SIZE)
1356     {
1357       c = scm_getc_unlocked (port);
1358       if (c == EOF)
1359         scm_i_input_error ("skip_block_comment", port,
1360                            "unterminated `#! ... !#' comment", SCM_EOL);
1361       else if (('a' <= c && c <= 'z') || ('0' <= c && c <= '9') || c == '-')
1362         name[i++] = c;
1363       else if (CHAR_IS_DELIMITER (c))
1364         {
1365           scm_ungetc_unlocked (c, port);
1366           name[i] = '\0';
1367           if (0 == strcmp ("r6rs", name))
1368             ;  /* Silently ignore */
1369           else if (0 == strcmp ("fold-case", name))
1370             set_port_case_insensitive_p (port, opts, 1);
1371           else if (0 == strcmp ("no-fold-case", name))
1372             set_port_case_insensitive_p (port, opts, 0);
1373           else if (0 == strcmp ("curly-infix", name))
1374             set_port_curly_infix_p (port, opts, 1);
1375           else if (0 == strcmp ("curly-infix-and-bracket-lists", name))
1376             {
1377               set_port_curly_infix_p (port, opts, 1);
1378               set_port_square_brackets_p (port, opts, 0);
1379             }
1380           else
1381             break;
1382
1383           return SCM_UNSPECIFIED;
1384         }
1385       else
1386         {
1387           scm_ungetc_unlocked (c, port);
1388           break;
1389         }
1390     }
1391   while (i > 0)
1392     scm_ungetc_unlocked (name[--i], port);
1393   return scm_read_scsh_block_comment (chr, port);
1394 }
1395
1396 static SCM
1397 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1398 {
1399   /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1400      nested.  So care must be taken.  */
1401   int nesting_level = 1;
1402
1403   int a = scm_getc_unlocked (port);
1404
1405   if (a == EOF)
1406     scm_i_input_error ("scm_read_r6rs_block_comment", port,
1407                        "unterminated `#| ... |#' comment", SCM_EOL);
1408
1409   while (nesting_level > 0)
1410     {
1411       int b = scm_getc_unlocked (port);
1412
1413       if (b == EOF)
1414         scm_i_input_error ("scm_read_r6rs_block_comment", port,
1415                            "unterminated `#| ... |#' comment", SCM_EOL);
1416
1417       if (a == '|' && b == '#')
1418         {
1419           nesting_level--;
1420           b = EOF;
1421         }
1422       else if (a == '#' && b == '|')
1423         {
1424           nesting_level++;
1425           b = EOF;
1426         }
1427
1428       a = b;
1429     }
1430
1431   return SCM_UNSPECIFIED;
1432 }
1433
1434 static SCM
1435 scm_read_commented_expression (scm_t_wchar chr, SCM port,
1436                                scm_t_read_opts *opts)
1437 {
1438   scm_t_wchar c;
1439
1440   c = flush_ws (port, opts, (char *) NULL);
1441   if (EOF == c)
1442     scm_i_input_error ("read_commented_expression", port,
1443                        "no expression after #; comment", SCM_EOL);
1444   scm_ungetc_unlocked (c, port);
1445   scm_read_expression (port, opts);
1446   return SCM_UNSPECIFIED;
1447 }
1448
1449 static SCM
1450 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1451 {
1452   /* Guile's extended symbol read syntax looks like this:
1453
1454        #{This is all a symbol name}#
1455
1456      So here, CHR is expected to be `{'.  */
1457   int saw_brace = 0;
1458   size_t len = 0;
1459   SCM buf = scm_i_make_string (1024, NULL, 0);
1460
1461   buf = scm_i_string_start_writing (buf);
1462
1463   while ((chr = scm_getc_unlocked (port)) != EOF)
1464     {
1465       if (saw_brace)
1466         {
1467           if (chr == '#')
1468             {
1469               break;
1470             }
1471           else
1472             {
1473               saw_brace = 0;
1474               scm_i_string_set_x (buf, len++, '}');
1475             }
1476         }
1477
1478       if (chr == '}')
1479         saw_brace = 1;
1480       else if (chr == '\\')
1481         {
1482           /* It used to be that print.c would print extended-read-syntax
1483              symbols with backslashes before "non-standard" chars, but
1484              this routine wouldn't do anything with those escapes.
1485              Bummer.  What we've done is to change print.c to output
1486              R6RS hex escapes for those characters, relying on the fact
1487              that the extended read syntax would never put a `\' before
1488              an `x'.  For now, we just ignore other instances of
1489              backslash in the string.  */
1490           switch ((chr = scm_getc_unlocked (port)))
1491             {
1492             case EOF:
1493               goto done;
1494             case 'x':
1495               {
1496                 scm_t_wchar c;
1497
1498                 SCM_READ_HEX_ESCAPE (10, ';');
1499                 scm_i_string_set_x (buf, len++, c);
1500                 break;
1501
1502               str_eof:
1503                 chr = EOF;
1504                 goto done;
1505
1506               bad_escaped:
1507                 scm_i_string_stop_writing ();
1508                 scm_i_input_error ("scm_read_extended_symbol", port,
1509                                    "illegal character in escape sequence: ~S",
1510                                    scm_list_1 (SCM_MAKE_CHAR (c)));
1511                 break;
1512               }
1513             default:
1514               scm_i_string_set_x (buf, len++, chr);
1515               break;
1516             }
1517         }
1518       else
1519         scm_i_string_set_x (buf, len++, chr);
1520
1521       if (len >= scm_i_string_length (buf) - 2)
1522         {
1523           SCM addy;
1524
1525           scm_i_string_stop_writing ();
1526           addy = scm_i_make_string (1024, NULL, 0);
1527           buf = scm_string_append (scm_list_2 (buf, addy));
1528           len = 0;
1529           buf = scm_i_string_start_writing (buf);
1530         }
1531     }
1532
1533  done:
1534   scm_i_string_stop_writing ();
1535   if (chr == EOF)
1536     scm_i_input_error ("scm_read_extended_symbol", port,
1537                        "end of file while reading symbol", SCM_EOL);
1538
1539   return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1540 }
1541
1542
1543 \f
1544 /* Top-level token readers, i.e., dispatchers.  */
1545
1546 static SCM
1547 scm_read_sharp_extension (int chr, SCM port, scm_t_read_opts *opts)
1548 {
1549   SCM proc;
1550
1551   proc = scm_get_hash_procedure (chr);
1552   if (scm_is_true (scm_procedure_p (proc)))
1553     {
1554       long line = SCM_LINUM (port);
1555       int column = SCM_COL (port) - 2;
1556       SCM got;
1557
1558       got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1559
1560       if (opts->record_positions_p && SCM_NIMP (got)
1561           && !scm_i_has_source_properties (got))
1562         scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1563
1564       return got;
1565     }
1566
1567   return SCM_UNSPECIFIED;
1568 }
1569
1570 /* The reader for the sharp `#' character.  It basically dispatches reads
1571    among the above token readers.   */
1572 static SCM
1573 scm_read_sharp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1574                 long line, int column)
1575 #define FUNC_NAME "scm_lreadr"
1576 {
1577   SCM result;
1578
1579   chr = scm_getc_unlocked (port);
1580
1581   result = scm_read_sharp_extension (chr, port, opts);
1582   if (!scm_is_eq (result, SCM_UNSPECIFIED))
1583     return result;
1584
1585   switch (chr)
1586     {
1587     case '\\':
1588       return (scm_read_character (chr, port, opts));
1589     case '(':
1590       return (scm_read_vector (chr, port, opts, line, column));
1591     case 's':
1592     case 'u':
1593     case 'f':
1594     case 'c':
1595       /* This one may return either a boolean or an SRFI-4 vector.  */
1596       return (scm_read_srfi4_vector (chr, port, opts, line, column));
1597     case 'v':
1598       return (scm_read_bytevector (chr, port, opts, line, column));
1599     case '*':
1600       return (scm_read_guile_bit_vector (chr, port, opts, line, column));
1601     case 't':
1602     case 'T':
1603     case 'F':
1604       return (scm_read_boolean (chr, port));
1605     case ':':
1606       return (scm_read_keyword (chr, port, opts));
1607     case '0': case '1': case '2': case '3': case '4':
1608     case '5': case '6': case '7': case '8': case '9':
1609     case '@':
1610       return (scm_read_array (chr, port, opts, line, column));
1611
1612     case 'i':
1613     case 'e':
1614     case 'b':
1615     case 'B':
1616     case 'o':
1617     case 'O':
1618     case 'd':
1619     case 'D':
1620     case 'x':
1621     case 'X':
1622     case 'I':
1623     case 'E':
1624       return (scm_read_number_and_radix (chr, port, opts));
1625     case '{':
1626       return (scm_read_extended_symbol (chr, port));
1627     case '!':
1628       return (scm_read_shebang (chr, port, opts));
1629     case ';':
1630       return (scm_read_commented_expression (chr, port, opts));
1631     case '`':
1632     case '\'':
1633     case ',':
1634       return (scm_read_syntax (chr, port, opts));
1635     case 'n':
1636       return (scm_read_nil (chr, port, opts));
1637     default:
1638       result = scm_read_sharp_extension (chr, port, opts);
1639       if (scm_is_eq (result, SCM_UNSPECIFIED))
1640         {
1641           /* To remain compatible with 1.8 and earlier, the following
1642              characters have lower precedence than `read-hash-extend'
1643              characters.  */
1644           switch (chr)
1645             {
1646             case '|':
1647               return scm_read_r6rs_block_comment (chr, port);
1648             default:
1649               scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1650                                  scm_list_1 (SCM_MAKE_CHAR (chr)));
1651             }
1652         }
1653       else
1654         return result;
1655     }
1656
1657   return SCM_UNSPECIFIED;
1658 }
1659 #undef FUNC_NAME
1660
1661 static SCM
1662 read_inner_expression (SCM port, scm_t_read_opts *opts)
1663 #define FUNC_NAME "read_inner_expression"
1664 {
1665   while (1)
1666     {
1667       scm_t_wchar chr;
1668
1669       chr = scm_getc_unlocked (port);
1670
1671       switch (chr)
1672         {
1673         case SCM_WHITE_SPACES:
1674         case SCM_LINE_INCREMENTORS:
1675           break;
1676         case ';':
1677           (void) scm_read_semicolon_comment (chr, port);
1678           break;
1679         case '{':
1680           if (opts->curly_infix_p)
1681             {
1682               if (opts->neoteric_p)
1683                 return scm_read_sexp (chr, port, opts);
1684               else
1685                 {
1686                   SCM expr;
1687
1688                   /* Enable neoteric expressions within curly braces */
1689                   opts->neoteric_p = 1;
1690                   expr = scm_read_sexp (chr, port, opts);
1691                   opts->neoteric_p = 0;
1692                   return expr;
1693                 }
1694             }
1695           else
1696             return scm_read_mixed_case_symbol (chr, port, opts);
1697         case '[':
1698           if (opts->square_brackets_p)
1699             return scm_read_sexp (chr, port, opts);
1700           else if (opts->curly_infix_p)
1701             {
1702               /* The syntax of neoteric expressions requires that '[' be
1703                  a delimiter when curly-infix is enabled, so it cannot
1704                  be part of an unescaped symbol.  We might as well do
1705                  something useful with it, so we adopt Kawa's convention:
1706                  [...] => ($bracket-list$ ...) */
1707               long line = SCM_LINUM (port);
1708               int column = SCM_COL (port) - 1;
1709               return maybe_annotate_source
1710                 (scm_cons (sym_bracket_list, scm_read_sexp (chr, port, opts)),
1711                  port, opts, line, column);
1712             }
1713           else
1714             return scm_read_mixed_case_symbol (chr, port, opts);
1715         case '(':
1716           return (scm_read_sexp (chr, port, opts));
1717         case '"':
1718           return (scm_read_string (chr, port, opts));
1719         case '\'':
1720         case '`':
1721         case ',':
1722           return (scm_read_quote (chr, port, opts));
1723         case '#':
1724           {
1725             long line  = SCM_LINUM (port);
1726             int column = SCM_COL (port) - 1;
1727             SCM result = scm_read_sharp (chr, port, opts, line, column);
1728             if (scm_is_eq (result, SCM_UNSPECIFIED))
1729               /* We read a comment or some such.  */
1730               break;
1731             else
1732               return result;
1733           }
1734         case ')':
1735           scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1736           break;
1737         case '}':
1738           if (opts->curly_infix_p)
1739             scm_i_input_error (FUNC_NAME, port, "unexpected \"}\"", SCM_EOL);
1740           else
1741             return scm_read_mixed_case_symbol (chr, port, opts);
1742         case ']':
1743           if (opts->square_brackets_p)
1744             scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1745           /* otherwise fall through */
1746         case EOF:
1747           return SCM_EOF_VAL;
1748         case ':':
1749           if (opts->keyword_style == KEYWORD_STYLE_PREFIX)
1750             return scm_symbol_to_keyword (scm_read_expression (port, opts));
1751           /* Fall through.  */
1752
1753         default:
1754           {
1755             if (((chr >= '0') && (chr <= '9'))
1756                 || (strchr ("+-.", chr)))
1757               return (scm_read_number (chr, port, opts));
1758             else
1759               return (scm_read_mixed_case_symbol (chr, port, opts));
1760           }
1761         }
1762     }
1763 }
1764 #undef FUNC_NAME
1765
1766 static SCM
1767 scm_read_expression (SCM port, scm_t_read_opts *opts)
1768 #define FUNC_NAME "scm_read_expression"
1769 {
1770   if (!opts->neoteric_p)
1771     return read_inner_expression (port, opts);
1772   else
1773     {
1774       long line = 0;
1775       int column = 0;
1776       SCM expr;
1777
1778       if (opts->record_positions_p)
1779         {
1780           /* We need to get the position of the first non-whitespace
1781              character in order to correctly annotate neoteric
1782              expressions.  For example, for the expression 'f(x)', the
1783              first call to 'read_inner_expression' reads the 'f' (which
1784              cannot be annotated), and then we later read the '(x)' and
1785              use it to construct the new list (f x). */
1786           int c = flush_ws (port, opts, (char *) NULL);
1787           if (c == EOF)
1788             return SCM_EOF_VAL;
1789           scm_ungetc_unlocked (c, port);
1790           line = SCM_LINUM (port);
1791           column = SCM_COL (port);
1792         }
1793
1794       expr = read_inner_expression (port, opts);
1795
1796       /* 'expr' is the first component of the neoteric expression.  Now
1797          we loop, and as long as the next character is '(', '[', or '{',
1798          (without any intervening whitespace), we use it to construct a
1799          new expression.  For example, f{n - 1}(x) => ((f (- n 1)) x). */
1800       for (;;)
1801         {
1802           int chr = scm_getc_unlocked (port);
1803
1804           if (chr == '(')
1805             /* e(...) => (e ...) */
1806             expr = scm_cons (expr, scm_read_sexp (chr, port, opts));
1807           else if (chr == '[')
1808             /* e[...] => ($bracket-apply$ e ...) */
1809             expr = scm_cons (sym_bracket_apply,
1810                              scm_cons (expr,
1811                                        scm_read_sexp (chr, port, opts)));
1812           else if (chr == '{')
1813             {
1814               SCM arg = scm_read_sexp (chr, port, opts);
1815
1816               if (scm_is_null (arg))
1817                 expr = scm_list_1 (expr);       /* e{} => (e) */
1818               else
1819                 expr = scm_list_2 (expr, arg);  /* e{...} => (e {...}) */
1820             }
1821           else
1822             {
1823               if (chr != EOF)
1824                 scm_ungetc_unlocked (chr, port);
1825               break;
1826             }
1827           maybe_annotate_source (expr, port, opts, line, column);
1828         }
1829       return expr;
1830     }
1831 }
1832 #undef FUNC_NAME
1833
1834 \f
1835 /* Actual reader.  */
1836
1837 static void init_read_options (SCM port, scm_t_read_opts *opts);
1838
1839 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1840             (SCM port),
1841             "Read an s-expression from the input port @var{port}, or from\n"
1842             "the current input port if @var{port} is not specified.\n"
1843             "Any whitespace before the next token is discarded.")
1844 #define FUNC_NAME s_scm_read
1845 {
1846   scm_t_read_opts opts;
1847   int c;
1848
1849   if (SCM_UNBNDP (port))
1850     port = scm_current_input_port ();
1851   SCM_VALIDATE_OPINPORT (1, port);
1852
1853   init_read_options (port, &opts);
1854
1855   c = flush_ws (port, &opts, (char *) NULL);
1856   if (EOF == c)
1857     return SCM_EOF_VAL;
1858   scm_ungetc_unlocked (c, port);
1859
1860   return (scm_read_expression (port, &opts));
1861 }
1862 #undef FUNC_NAME
1863
1864
1865 \f
1866
1867 /* Manipulate the read-hash-procedures alist.  This could be written in
1868    Scheme, but maybe it will also be used by C code during initialisation.  */
1869 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1870             (SCM chr, SCM proc),
1871             "Install the procedure @var{proc} for reading expressions\n"
1872             "starting with the character sequence @code{#} and @var{chr}.\n"
1873             "@var{proc} will be called with two arguments:  the character\n"
1874             "@var{chr} and the port to read further data from. The object\n"
1875             "returned will be the return value of @code{read}. \n"
1876             "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1877             )
1878 #define FUNC_NAME s_scm_read_hash_extend
1879 {
1880   SCM this;
1881   SCM prev;
1882
1883   SCM_VALIDATE_CHAR (1, chr);
1884   SCM_ASSERT (scm_is_false (proc)
1885               || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1886               proc, SCM_ARG2, FUNC_NAME);
1887
1888   /* Check if chr is already in the alist.  */
1889   this = scm_i_read_hash_procedures_ref ();
1890   prev = SCM_BOOL_F;
1891   while (1)
1892     {
1893       if (scm_is_null (this))
1894         {
1895           /* not found, so add it to the beginning.  */
1896           if (scm_is_true (proc))
1897             {
1898               SCM new = scm_cons (scm_cons (chr, proc),
1899                                   scm_i_read_hash_procedures_ref ());
1900               scm_i_read_hash_procedures_set_x (new);
1901             }
1902           break;
1903         }
1904       if (scm_is_eq (chr, SCM_CAAR (this)))
1905         {
1906           /* already in the alist.  */
1907           if (scm_is_false (proc))
1908             {
1909               /* remove it.  */
1910               if (scm_is_false (prev))
1911                 {
1912                   SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1913                   scm_i_read_hash_procedures_set_x (rest);
1914                 }
1915               else
1916                 scm_set_cdr_x (prev, SCM_CDR (this));
1917             }
1918           else
1919             {
1920               /* replace it.  */
1921               scm_set_cdr_x (SCM_CAR (this), proc);
1922             }
1923           break;
1924         }
1925       prev = this;
1926       this = SCM_CDR (this);
1927     }
1928
1929   return SCM_UNSPECIFIED;
1930 }
1931 #undef FUNC_NAME
1932
1933 /* Recover the read-hash procedure corresponding to char c.  */
1934 static SCM
1935 scm_get_hash_procedure (int c)
1936 {
1937   SCM rest = scm_i_read_hash_procedures_ref ();
1938
1939   while (1)
1940     {
1941       if (scm_is_null (rest))
1942         return SCM_BOOL_F;
1943
1944       if (SCM_CHAR (SCM_CAAR (rest)) == c)
1945         return SCM_CDAR (rest);
1946
1947       rest = SCM_CDR (rest);
1948     }
1949 }
1950
1951 #define SCM_ENCODING_SEARCH_SIZE (500)
1952
1953 static int
1954 is_encoding_char (char c)
1955 {
1956   if (c >= 'a' && c <= 'z') return 1;
1957   if (c >= 'A' && c <= 'Z') return 1;
1958   if (c >= '0' && c <= '9') return 1;
1959   return strchr ("_-.:/,+=()", c) != NULL;
1960 }
1961
1962 /* Search the first few hundred characters of a file for an Emacs-like coding
1963    declaration.  Returns either NULL or a string whose storage has been
1964    allocated with `scm_gc_malloc ()'.  */
1965 char *
1966 scm_i_scan_for_encoding (SCM port)
1967 {
1968   scm_t_port *pt;
1969   char header[SCM_ENCODING_SEARCH_SIZE+1];
1970   size_t bytes_read, encoding_length, i;
1971   char *encoding = NULL;
1972   int utf8_bom = 0;
1973   char *pos, *encoding_start;
1974   int in_comment;
1975
1976   pt = SCM_PTAB_ENTRY (port);
1977
1978   if (pt->rw_active == SCM_PORT_WRITE)
1979     scm_flush_unlocked (port);
1980
1981   if (pt->rw_random)
1982     pt->rw_active = SCM_PORT_READ;
1983
1984   if (pt->read_pos == pt->read_end)
1985     {
1986       /* We can use the read buffer, and thus avoid a seek. */
1987       if (scm_fill_input_unlocked (port) == EOF)
1988         return NULL;
1989
1990       bytes_read = pt->read_end - pt->read_pos;
1991       if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
1992         bytes_read = SCM_ENCODING_SEARCH_SIZE;
1993
1994       if (bytes_read <= 1)
1995         /* An unbuffered port -- don't scan.  */
1996         return NULL;
1997
1998       memcpy (header, pt->read_pos, bytes_read);
1999       header[bytes_read] = '\0';
2000     }
2001   else
2002     {
2003       /* Try to read some bytes and then seek back.  Not all ports
2004          support seeking back; and indeed some file ports (like
2005          /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
2006          check performed by SCM_FPORT_FDES---but fail to seek
2007          backwards.  Hence this block comes second.  We prefer to use
2008          the read buffer in-place.  */
2009       if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
2010         return NULL;
2011
2012       bytes_read = scm_c_read_unlocked (port, header, SCM_ENCODING_SEARCH_SIZE);
2013       header[bytes_read] = '\0';
2014       scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
2015     }
2016
2017   if (bytes_read > 3
2018       && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
2019     utf8_bom = 1;
2020
2021   /* search past "coding[:=]" */
2022   pos = header;
2023   while (1)
2024     {
2025       if ((pos = strstr(pos, "coding")) == NULL)
2026         return NULL;
2027
2028       pos += strlen("coding");
2029       if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
2030           (*pos == ':' || *pos == '='))
2031         {
2032           pos ++;
2033           break;
2034         }
2035     }
2036
2037   /* skip spaces */
2038   while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
2039          (*pos == ' ' || *pos == '\t'))
2040     pos ++;
2041
2042   /* grab the next token */
2043   encoding_start = pos;
2044   i = 0;
2045   while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
2046          && encoding_start + i - header < bytes_read
2047          && is_encoding_char (encoding_start[i]))
2048     i++;
2049
2050   encoding_length = i;
2051   if (encoding_length == 0)
2052     return NULL;
2053
2054   encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
2055
2056   /* push backwards to make sure we were in a comment */
2057   in_comment = 0;
2058   pos = encoding_start;
2059   while (pos >= header)
2060     {
2061       if (*pos == ';')
2062         {
2063           in_comment = 1;
2064           break;
2065         }
2066       else if (*pos == '\n' || pos == header)
2067         {
2068           /* This wasn't in a semicolon comment. Check for a
2069            hash-bang comment. */
2070           char *beg = strstr (header, "#!");
2071           char *end = strstr (header, "!#");
2072           if (beg < encoding_start && encoding_start + encoding_length <= end)
2073             in_comment = 1;
2074           break;
2075         }
2076       else
2077         {
2078           pos --;
2079           continue;
2080         }
2081     }
2082   if (!in_comment)
2083     /* This wasn't in a comment */
2084     return NULL;
2085
2086   if (utf8_bom && strcasecmp (encoding, "UTF-8"))
2087     scm_misc_error (NULL,
2088                     "the port input declares the encoding ~s but is encoded as UTF-8",
2089                     scm_list_1 (scm_from_locale_string (encoding)));
2090
2091   return encoding;
2092 }
2093
2094 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
2095             (SCM port),
2096             "Scans the port for an Emacs-like character coding declaration\n"
2097             "near the top of the contents of a port with random-accessible contents.\n"
2098             "The coding declaration is of the form\n"
2099             "@code{coding: XXXXX} and must appear in a scheme comment.\n"
2100             "\n"
2101             "Returns a string containing the character encoding of the file\n"
2102             "if a declaration was found, or @code{#f} otherwise.\n")
2103 #define FUNC_NAME s_scm_file_encoding
2104 {
2105   char *enc;
2106   SCM s_enc;
2107
2108   SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
2109
2110   enc = scm_i_scan_for_encoding (port);
2111   if (enc == NULL)
2112     return SCM_BOOL_F;
2113   else
2114     {
2115       s_enc = scm_from_locale_string (enc);
2116       return s_enc;
2117     }
2118
2119   return SCM_BOOL_F;
2120 }
2121 #undef FUNC_NAME
2122
2123 \f
2124 /* Per-port read options.
2125
2126    We store per-port read options in the 'port-read-options' key of the
2127    port's alist.  The value stored in the alist is a single integer that
2128    contains a two-bit field for each read option.
2129
2130    If a bit field contains READ_OPTION_INHERIT (3), that indicates that
2131    the applicable value should be inherited from the corresponding
2132    global read option.  Otherwise, the bit field contains the value of
2133    the read option.  For boolean read options that have been set
2134    per-port, the possible values are 0 or 1.  If the 'keyword_style'
2135    read option has been set per-port, its possible values are those in
2136    'enum t_keyword_style'. */
2137
2138 /* Key to read options in per-port alists. */
2139 SCM_SYMBOL (sym_port_read_options, "port-read-options");
2140
2141 /* Offsets of bit fields for each per-port override */
2142 #define READ_OPTION_COPY_SOURCE_P          0
2143 #define READ_OPTION_RECORD_POSITIONS_P     2
2144 #define READ_OPTION_CASE_INSENSITIVE_P     4
2145 #define READ_OPTION_KEYWORD_STYLE          6
2146 #define READ_OPTION_R6RS_ESCAPES_P         8
2147 #define READ_OPTION_SQUARE_BRACKETS_P     10
2148 #define READ_OPTION_HUNGRY_EOL_ESCAPES_P  12
2149 #define READ_OPTION_CURLY_INFIX_P         14
2150
2151 /* The total width in bits of the per-port overrides */
2152 #define READ_OPTIONS_NUM_BITS             16
2153
2154 #define READ_OPTIONS_INHERIT_ALL  ((1UL << READ_OPTIONS_NUM_BITS) - 1)
2155 #define READ_OPTIONS_MAX_VALUE    READ_OPTIONS_INHERIT_ALL
2156
2157 #define READ_OPTION_MASK     3
2158 #define READ_OPTION_INHERIT  3
2159
2160 static void
2161 set_port_read_option (SCM port, int option, int new_value)
2162 {
2163   SCM scm_read_options;
2164   unsigned int read_options;
2165
2166   new_value &= READ_OPTION_MASK;
2167   scm_read_options = scm_assq_ref (SCM_PTAB_ENTRY(port)->alist,
2168                                    sym_port_read_options);
2169   if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2170     read_options = scm_to_uint (scm_read_options);
2171   else
2172     read_options = READ_OPTIONS_INHERIT_ALL;
2173   read_options &= ~(READ_OPTION_MASK << option);
2174   read_options |= new_value << option;
2175   scm_read_options = scm_from_uint (read_options);
2176   SCM_PTAB_ENTRY(port)->alist = scm_assq_set_x (SCM_PTAB_ENTRY(port)->alist,
2177                                                 sym_port_read_options,
2178                                                 scm_read_options);
2179 }
2180
2181 /* Set OPTS and PORT's case-insensitivity according to VALUE. */
2182 static void
2183 set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts, int value)
2184 {
2185   value = !!value;
2186   opts->case_insensitive_p = value;
2187   set_port_read_option (port, READ_OPTION_CASE_INSENSITIVE_P, value);
2188 }
2189
2190 /* Set OPTS and PORT's square_brackets_p option according to VALUE. */
2191 static void
2192 set_port_square_brackets_p (SCM port, scm_t_read_opts *opts, int value)
2193 {
2194   value = !!value;
2195   opts->square_brackets_p = value;
2196   set_port_read_option (port, READ_OPTION_SQUARE_BRACKETS_P, value);
2197 }
2198
2199 /* Set OPTS and PORT's curly_infix_p option according to VALUE. */
2200 static void
2201 set_port_curly_infix_p (SCM port, scm_t_read_opts *opts, int value)
2202 {
2203   value = !!value;
2204   opts->curly_infix_p = value;
2205   set_port_read_option (port, READ_OPTION_CURLY_INFIX_P, value);
2206 }
2207
2208 /* Initialize OPTS based on PORT's read options and the global read
2209    options. */
2210 static void
2211 init_read_options (SCM port, scm_t_read_opts *opts)
2212 {
2213   SCM val, scm_read_options;
2214   unsigned int read_options, x;
2215
2216   scm_read_options = scm_assq_ref (SCM_PTAB_ENTRY(port)->alist,
2217                                    sym_port_read_options);
2218
2219   if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2220     read_options = scm_to_uint (scm_read_options);
2221   else
2222     read_options = READ_OPTIONS_INHERIT_ALL;
2223
2224   x = READ_OPTION_MASK & (read_options >> READ_OPTION_KEYWORD_STYLE);
2225   if (x == READ_OPTION_INHERIT)
2226     {
2227       val = SCM_PACK (SCM_KEYWORD_STYLE);
2228       if (scm_is_eq (val, scm_keyword_prefix))
2229         x = KEYWORD_STYLE_PREFIX;
2230       else if (scm_is_eq (val, scm_keyword_postfix))
2231         x = KEYWORD_STYLE_POSTFIX;
2232       else
2233         x = KEYWORD_STYLE_HASH_PREFIX;
2234     }
2235   opts->keyword_style = x;
2236
2237 #define RESOLVE_BOOLEAN_OPTION(NAME, name)                              \
2238   do                                                                    \
2239     {                                                                   \
2240       x = READ_OPTION_MASK & (read_options >> READ_OPTION_ ## NAME);    \
2241       if (x == READ_OPTION_INHERIT)                                     \
2242         x = !!SCM_ ## NAME;                                             \
2243           opts->name = x;                                               \
2244     }                                                                   \
2245   while (0)
2246
2247   RESOLVE_BOOLEAN_OPTION (COPY_SOURCE_P,        copy_source_p);
2248   RESOLVE_BOOLEAN_OPTION (RECORD_POSITIONS_P,   record_positions_p);
2249   RESOLVE_BOOLEAN_OPTION (CASE_INSENSITIVE_P,   case_insensitive_p);
2250   RESOLVE_BOOLEAN_OPTION (R6RS_ESCAPES_P,       r6rs_escapes_p);
2251   RESOLVE_BOOLEAN_OPTION (SQUARE_BRACKETS_P,    square_brackets_p);
2252   RESOLVE_BOOLEAN_OPTION (HUNGRY_EOL_ESCAPES_P, hungry_eol_escapes_p);
2253   RESOLVE_BOOLEAN_OPTION (CURLY_INFIX_P,        curly_infix_p);
2254
2255 #undef RESOLVE_BOOLEAN_OPTION
2256
2257   opts->neoteric_p = 0;
2258 }
2259
2260 void
2261 scm_init_read ()
2262 {
2263   SCM read_hash_procs;
2264
2265   read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
2266
2267   scm_i_read_hash_procedures =
2268     SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
2269
2270   scm_init_opts (scm_read_options, scm_read_opts);
2271 #include "libguile/read.x"
2272 }
2273
2274 /*
2275   Local Variables:
2276   c-file-style: "gnu"
2277   End:
2278 */