libguile/read.c

   1 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
   2  *   2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public License
   6  * as published by the Free Software Foundation; either version 3 of
   7  * the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful, but
  10  * WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  17  * 02110-1301 USA
  18  */
  19
  20
  21 \f
  22
  23 #ifdef HAVE_CONFIG_H
  24 # include <config.h>
  25 #endif
  26
  27 #include <stdio.h>
  28 #include <ctype.h>
  29 #include <string.h>
  30 #include <unistd.h>
  31 #include <unicase.h>
  32 #include <unictype.h>
  33
  34 #include "libguile/_scm.h"
  35 #include "libguile/bytevectors.h"
  36 #include "libguile/chars.h"
  37 #include "libguile/eval.h"
  38 #include "libguile/arrays.h"
  39 #include "libguile/bitvectors.h"
  40 #include "libguile/keywords.h"
  41 #include "libguile/alist.h"
  42 #include "libguile/srcprop.h"
  43 #include "libguile/hashtab.h"
  44 #include "libguile/hash.h"
  45 #include "libguile/ports.h"
  46 #include "libguile/fports.h"
  47 #include "libguile/root.h"
  48 #include "libguile/strings.h"
  49 #include "libguile/strports.h"
  50 #include "libguile/vectors.h"
  51 #include "libguile/validate.h"
  52 #include "libguile/srfi-4.h"
  53 #include "libguile/srfi-13.h"
  54
  55 #include "libguile/read.h"
  56 #include "libguile/private-options.h"
  57
  58
  59 \f
  60
  61 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
  62 SCM_SYMBOL (scm_keyword_prefix, "prefix");
  63 SCM_SYMBOL (scm_keyword_postfix, "postfix");
  64 SCM_SYMBOL (sym_nil, "nil");
  65
  66 /* SRFI-105 curly infix expression support */
  67 SCM_SYMBOL (sym_nfx, "$nfx$");
  68 SCM_SYMBOL (sym_bracket_list, "$bracket-list$");
  69 SCM_SYMBOL (sym_bracket_apply, "$bracket-apply$");
  70
  71 scm_t_option scm_read_opts[] =
  72   {
  73     { SCM_OPTION_BOOLEAN, "copy", 0,
  74       "Copy source code expressions." },
  75     { SCM_OPTION_BOOLEAN, "positions", 1,
  76       "Record positions of source code expressions." },
  77     { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
  78       "Convert symbols to lower case."},
  79     { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
  80       "Style of keyword recognition: #f, 'prefix or 'postfix."},
  81     { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
  82       "Use R6RS variable-length character and string hex escapes."},
  83     { SCM_OPTION_BOOLEAN, "square-brackets", 1,
  84       "Treat `[' and `]' as parentheses, for R6RS compatibility."},
  85     { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
  86       "In strings, consume leading whitespace after an escaped end-of-line."},
  87     { SCM_OPTION_BOOLEAN, "curly-infix", 0,
  88       "Support SRFI-105 curly infix expressions."},
  89     { 0, },
  90   };
  91
  92 /* Internal read options structure.  This is initialized by 'scm_read'
  93    from the global and per-port read options, and a pointer is passed
  94    down to all helper functions. */
  95
  96 enum t_keyword_style
  97   {
  98     KEYWORD_STYLE_HASH_PREFIX,
  99     KEYWORD_STYLE_PREFIX,
 100     KEYWORD_STYLE_POSTFIX
 101   };
 102
 103 struct t_read_opts
 104 {
 105   enum t_keyword_style keyword_style;
 106   unsigned int copy_source_p        : 1;
 107   unsigned int record_positions_p   : 1;
 108   unsigned int case_insensitive_p   : 1;
 109   unsigned int r6rs_escapes_p       : 1;
 110   unsigned int square_brackets_p    : 1;
 111   unsigned int hungry_eol_escapes_p : 1;
 112   unsigned int curly_infix_p        : 1;
 113   unsigned int neoteric_p           : 1;
 114 };
 115
 116 typedef struct t_read_opts scm_t_read_opts;
 117
 118
 119 /*
 120   Give meaningful error messages for errors
 121
 122   We use the format
 123
 124   FILE:LINE:COL: MESSAGE
 125   This happened in ....
 126
 127   This is not standard GNU format, but the test-suite likes the real
 128   message to be in front.
 129
 130  */
 131
 132
 133 void
 134 scm_i_input_error (char const *function,
 135                    SCM port, const char *message, SCM arg)
 136 {
 137   SCM fn = (scm_is_string (SCM_FILENAME(port))
 138             ? SCM_FILENAME(port)
 139             : scm_from_locale_string ("#<unknown port>"));
 140
 141   SCM string_port = scm_open_output_string ();
 142   SCM string = SCM_EOL;
 143   scm_simple_format (string_port,
 144                      scm_from_locale_string ("~A:~S:~S: ~A"),
 145                      scm_list_4 (fn,
 146                                  scm_from_long (SCM_LINUM (port) + 1),
 147                                  scm_from_int (SCM_COL (port) + 1),
 148                                  scm_from_locale_string (message)));
 149
 150   string = scm_get_output_string (string_port);
 151   scm_close_output_port (string_port);
 152   scm_error_scm (scm_from_latin1_symbol ("read-error"),
 153                  function? scm_from_locale_string (function) : SCM_BOOL_F,
 154                  string,
 155                  arg,
 156                  SCM_BOOL_F);
 157 }
 158
 159
 160 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
 161             (SCM setting),
 162             "Option interface for the read options. Instead of using\n"
 163             "this procedure directly, use the procedures @code{read-enable},\n"
 164             "@code{read-disable}, @code{read-set!} and @code{read-options}.")
 165 #define FUNC_NAME s_scm_read_options
 166 {
 167   SCM ans = scm_options (setting,
 168                          scm_read_opts,
 169                          FUNC_NAME);
 170   if (SCM_COPY_SOURCE_P)
 171     SCM_RECORD_POSITIONS_P = 1;
 172   return ans;
 173 }
 174 #undef FUNC_NAME
 175
 176 /* A fluid referring to an association list mapping extra hash
 177    characters to procedures.  */
 178 static SCM *scm_i_read_hash_procedures;
 179
 180 static SCM
 181 scm_i_read_hash_procedures_ref (void)
 182 {
 183   return scm_fluid_ref (*scm_i_read_hash_procedures);
 184 }
 185
 186 static void
 187 scm_i_read_hash_procedures_set_x (SCM value)
 188 {
 189   scm_fluid_set_x (*scm_i_read_hash_procedures, value);
 190 }
 191
 192 \f
 193 /* Token readers.  */
 194
 195
 196 /* Size of the C buffer used to read symbols and numbers.  */
 197 #define READER_BUFFER_SIZE            128
 198
 199 /* Number of 32-bit codepoints in the buffer used to read strings.  */
 200 #define READER_STRING_BUFFER_SIZE     128
 201
 202 /* The maximum size of Scheme character names.  */
 203 #define READER_CHAR_NAME_MAX_SIZE      50
 204
 205 /* The maximum size of reader directive names.  */
 206 #define READER_DIRECTIVE_NAME_MAX_SIZE 50
 207
 208
 209 /* `isblank' is only in C99.  */
 210 #define CHAR_IS_BLANK_(_chr)                                    \
 211   (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n')      \
 212    || ((_chr) == '\f') || ((_chr) == '\r'))
 213
 214 #ifdef MSDOS
 215 # define CHAR_IS_BLANK(_chr)                    \
 216   ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
 217 #else
 218 # define CHAR_IS_BLANK CHAR_IS_BLANK_
 219 #endif
 220
 221
 222 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
 223    structure'').  */
 224 #define CHAR_IS_R5RS_DELIMITER(c)                               \
 225   (CHAR_IS_BLANK (c)                                            \
 226    || (c) == ')' || (c) == '(' || (c) == ';' || (c) == '"')
 227
 228 #define CHAR_IS_DELIMITER(c)                                    \
 229   (CHAR_IS_R5RS_DELIMITER (c)                                   \
 230    || (((c) == ']' || (c) == '[') && (opts->square_brackets_p   \
 231                                       || opts->curly_infix_p))  \
 232    || (((c) == '}' || (c) == '{') && opts->curly_infix_p))
 233
 234 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
 235    Structure''.  */
 236 #define CHAR_IS_EXPONENT_MARKER(_chr)                           \
 237   (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f')        \
 238    || ((_chr) == 'd') || ((_chr) == 'l'))
 239
 240 /* Read an SCSH block comment.  */
 241 static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
 242 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
 243 static SCM scm_read_commented_expression (scm_t_wchar, SCM, scm_t_read_opts *);
 244 static SCM scm_read_shebang (scm_t_wchar, SCM, scm_t_read_opts *);
 245 static SCM scm_get_hash_procedure (int);
 246
 247 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 248    result in the pre-allocated buffer BUF.  Return zero if the whole token has
 249    fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
 250    bytes actually read.  */
 251 static int
 252 read_token (SCM port, scm_t_read_opts *opts,
 253             char *buf, size_t buf_size, size_t *read)
 254 {
 255    *read = 0;
 256
 257    while (*read < buf_size)
 258      {
 259        int chr;
 260
 261        chr = scm_get_byte_or_eof (port);
 262
 263        if (chr == EOF)
 264         return 0;
 265       else if (CHAR_IS_DELIMITER (chr))
 266         {
 267           scm_unget_byte (chr, port);
 268           return 0;
 269         }
 270       else
 271         {
 272           *buf = (char) chr;
 273           buf++, (*read)++;
 274         }
 275      }
 276
 277    return 1;
 278  }
 279
 280 /* Like `read_token', but return either BUFFER, or a GC-allocated buffer
 281    if the token doesn't fit in BUFFER_SIZE bytes.  */
 282 static char *
 283 read_complete_token (SCM port, scm_t_read_opts *opts,
 284                      char *buffer, size_t buffer_size, size_t *read)
 285 {
 286   int overflow = 0;
 287   size_t bytes_read, overflow_size = 0;
 288   char *overflow_buffer = NULL;
 289
 290   do
 291     {
 292       overflow = read_token (port, opts, buffer, buffer_size, &bytes_read);
 293       if (bytes_read == 0)
 294         break;
 295       if (overflow || overflow_size != 0)
 296         {
 297           if (overflow_size == 0)
 298             {
 299               overflow_buffer = scm_gc_malloc_pointerless (bytes_read, "read");
 300               memcpy (overflow_buffer, buffer, bytes_read);
 301               overflow_size = bytes_read;
 302             }
 303           else
 304             {
 305               char *new_buf =
 306                 scm_gc_malloc_pointerless (overflow_size + bytes_read, "read");
 307
 308               memcpy (new_buf, overflow_buffer, overflow_size);
 309               memcpy (new_buf + overflow_size, buffer, bytes_read);
 310
 311               overflow_buffer = new_buf;
 312               overflow_size += bytes_read;
 313             }
 314         }
 315     }
 316   while (overflow);
 317
 318   if (overflow_size)
 319     *read = overflow_size;
 320   else
 321     *read = bytes_read;
 322
 323   return (overflow_size > 0 ? overflow_buffer : buffer);
 324 }
 325
 326 /* Skip whitespace from PORT and return the first non-whitespace character
 327    read.  Raise an error on end-of-file.  */
 328 static int
 329 flush_ws (SCM port, scm_t_read_opts *opts, const char *eoferr)
 330 {
 331   scm_t_wchar c;
 332   while (1)
 333     switch (c = scm_getc (port))
 334       {
 335       case EOF:
 336       goteof:
 337         if (eoferr)
 338           {
 339             scm_i_input_error (eoferr,
 340                                port,
 341                                "end of file",
 342                                SCM_EOL);
 343           }
 344         return c;
 345
 346       case ';':
 347       lp:
 348         switch (c = scm_getc (port))
 349           {
 350           case EOF:
 351             goto goteof;
 352           default:
 353             goto lp;
 354           case SCM_LINE_INCREMENTORS:
 355             break;
 356           }
 357         break;
 358
 359       case '#':
 360         switch (c = scm_getc (port))
 361           {
 362           case EOF:
 363             eoferr = "read_sharp";
 364             goto goteof;
 365           case '!':
 366             scm_read_shebang (c, port, opts);
 367             break;
 368           case ';':
 369             scm_read_commented_expression (c, port, opts);
 370             break;
 371           case '|':
 372             if (scm_is_false (scm_get_hash_procedure (c)))
 373               {
 374                 scm_read_r6rs_block_comment (c, port);
 375                 break;
 376               }
 377             /* fall through */
 378           default:
 379             scm_ungetc (c, port);
 380             return '#';
 381           }
 382         break;
 383
 384       case SCM_LINE_INCREMENTORS:
 385       case SCM_SINGLE_SPACES:
 386       case '\t':
 387         break;
 388
 389       default:
 390         return c;
 391       }
 392
 393   return 0;
 394 }
 395
 396
 397 \f
 398 /* Token readers.  */
 399
 400 static SCM scm_read_expression (SCM port, scm_t_read_opts *opts);
 401 static SCM scm_read_sharp (int chr, SCM port, scm_t_read_opts *opts,
 402                            long line, int column);
 403
 404
 405 static SCM
 406 maybe_annotate_source (SCM x, SCM port, scm_t_read_opts *opts,
 407                        long line, int column)
 408 {
 409   if (opts->record_positions_p)
 410     scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
 411   return x;
 412 }
 413
 414 static SCM
 415 scm_read_sexp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 416 #define FUNC_NAME "scm_i_lreadparen"
 417 {
 418   int c;
 419   SCM tmp, tl, ans = SCM_EOL;
 420   const int curly_list_p = (chr == '{') && opts->curly_infix_p;
 421   const int terminating_char = ((chr == '{') ? '}'
 422                                 : ((chr == '[') ? ']'
 423                                    : ')'));
 424
 425   /* Need to capture line and column numbers here. */
 426   long line = SCM_LINUM (port);
 427   int column = SCM_COL (port) - 1;
 428
 429   c = flush_ws (port, opts, FUNC_NAME);
 430   if (terminating_char == c)
 431     return SCM_EOL;
 432
 433   scm_ungetc (c, port);
 434   tmp = scm_read_expression (port, opts);
 435
 436   /* Note that it is possible for scm_read_expression to return
 437      scm_sym_dot, but not as part of a dotted pair: as in #{.}#.  So
 438      check that it's a real dot by checking `c'.  */
 439   if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
 440     {
 441       ans = scm_read_expression (port, opts);
 442       if (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
 443         scm_i_input_error (FUNC_NAME, port, "missing close paren",
 444                            SCM_EOL);
 445       return ans;
 446     }
 447
 448   /* Build the head of the list structure. */
 449   ans = tl = scm_cons (tmp, SCM_EOL);
 450
 451   while (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
 452     {
 453       SCM new_tail;
 454
 455       if (c == ')' || (c == ']' && opts->square_brackets_p)
 456           || ((c == '}' || c == ']') && opts->curly_infix_p))
 457         scm_i_input_error (FUNC_NAME, port,
 458                            "in pair: mismatched close paren: ~A",
 459                            scm_list_1 (SCM_MAKE_CHAR (c)));
 460
 461       scm_ungetc (c, port);
 462       tmp = scm_read_expression (port, opts);
 463
 464       /* See above note about scm_sym_dot.  */
 465       if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
 466         {
 467           SCM_SETCDR (tl, scm_read_expression (port, opts));
 468
 469           c = flush_ws (port, opts, FUNC_NAME);
 470           if (terminating_char != c)
 471             scm_i_input_error (FUNC_NAME, port,
 472                                "in pair: missing close paren", SCM_EOL);
 473           break;
 474         }
 475
 476       new_tail = scm_cons (tmp, SCM_EOL);
 477       SCM_SETCDR (tl, new_tail);
 478       tl = new_tail;
 479     }
 480
 481   if (curly_list_p)
 482     {
 483       /* In addition to finding the length, 'scm_ilength' checks for
 484          improper or circular lists, in which case it returns -1. */
 485       int len = scm_ilength (ans);
 486
 487       /* The (len == 0) case is handled above */
 488       if (len == 1)
 489         /* Return directly to avoid re-annotating the element's source
 490            location with the position of the outer brace.  Also, it
 491            might not be possible to annotate the element. */
 492         return scm_car (ans);  /* {e} => e */
 493       else if (len == 2)
 494         ;  /* Leave the list unchanged: {e1 e2} => (e1 e2) */
 495       else if (len >= 3 && (len & 1))
 496         {
 497           /* It's a proper list whose length is odd and at least 3.  If
 498              the elements at odd indices (the infix operator positions)
 499              are all 'equal?', then it's a simple curly-infix list.
 500              Otherwise it's a mixed curly-infix list. */
 501           SCM op = scm_cadr (ans);
 502
 503           /* Check to see if the elements at odd indices are 'equal?' */
 504           for (tl = scm_cdddr (ans); ; tl = scm_cddr (tl))
 505             {
 506               if (scm_is_null (tl))
 507                 {
 508                   /* Convert simple curly-infix list to prefix:
 509                      {a <op> b <op> ...} => (<op> a b ...) */
 510                   tl = ans;
 511                   while (scm_is_pair (scm_cdr (tl)))
 512                     {
 513                       tmp = scm_cddr (tl);
 514                       SCM_SETCDR (tl, tmp);
 515                       tl = tmp;
 516                     }
 517                   ans = scm_cons (op, ans);
 518                   break;
 519                 }
 520               else if (scm_is_false (scm_equal_p (op, scm_car (tl))))
 521                 {
 522                   /* Mixed curly-infix list: {e ...} => ($nfx$ e ...) */
 523                   ans = scm_cons (sym_nfx, ans);
 524                   break;
 525                 }
 526             }
 527         }
 528       else
 529         /* Mixed curly-infix (possibly improper) list:
 530            {e . tail} => ($nfx$ e . tail) */
 531         ans = scm_cons (sym_nfx, ans);
 532     }
 533
 534   return maybe_annotate_source (ans, port, opts, line, column);
 535 }
 536 #undef FUNC_NAME
 537
 538
 539 /* Read a hexadecimal number NDIGITS in length.  Put its value into the variable
 540    C.  If TERMINATOR is non-null, terminate early if the TERMINATOR character is
 541    found.  */
 542 #define SCM_READ_HEX_ESCAPE(ndigits, terminator)                   \
 543   do                                                               \
 544     {                                                              \
 545       scm_t_wchar a;                                               \
 546       size_t i = 0;                                                \
 547       c = 0;                                                       \
 548       while (i < ndigits)                                          \
 549         {                                                          \
 550           a = scm_getc (port);                                     \
 551           if (a == EOF)                                            \
 552             goto str_eof;                                          \
 553           if (terminator                                           \
 554               && (a == (scm_t_wchar) terminator)                   \
 555               && (i > 0))                                          \
 556             break;                                                 \
 557           if ('0' <= a && a <= '9')                                \
 558             a -= '0';                                              \
 559           else if ('A' <= a && a <= 'F')                           \
 560             a = a - 'A' + 10;                                      \
 561           else if ('a' <= a && a <= 'f')                           \
 562             a = a - 'a' + 10;                                      \
 563           else                                                     \
 564             {                                                      \
 565               c = a;                                               \
 566               goto bad_escaped;                                    \
 567             }                                                      \
 568           c = c * 16 + a;                                          \
 569           i ++;                                                    \
 570         }                                                          \
 571     } while (0)
 572
 573 static void
 574 skip_intraline_whitespace (SCM port)
 575 {
 576   scm_t_wchar c;
 577
 578   do
 579     {
 580       c = scm_getc (port);
 581       if (c == EOF)
 582         return;
 583     }
 584   while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
 585
 586   scm_ungetc (c, port);
 587 }
 588
 589 static SCM
 590 scm_read_string (int chr, SCM port, scm_t_read_opts *opts)
 591 #define FUNC_NAME "scm_lreadr"
 592 {
 593   /* For strings smaller than C_STR, this function creates only one Scheme
 594      object (the string returned).  */
 595
 596   SCM str = SCM_EOL;
 597   size_t c_str_len = 0;
 598   scm_t_wchar c, c_str[READER_STRING_BUFFER_SIZE];
 599
 600   /* Need to capture line and column numbers here. */
 601   long line = SCM_LINUM (port);
 602   int column = SCM_COL (port) - 1;
 603
 604   while ('"' != (c = scm_getc (port)))
 605     {
 606       if (c == EOF)
 607         {
 608         str_eof:
 609           scm_i_input_error (FUNC_NAME, port,
 610                              "end of file in string constant", SCM_EOL);
 611         }
 612
 613       if (c_str_len + 1 >= READER_STRING_BUFFER_SIZE)
 614         {
 615           str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
 616           c_str_len = 0;
 617         }
 618
 619       if (c == '\\')
 620         {
 621           switch (c = scm_getc (port))
 622             {
 623             case EOF:
 624               goto str_eof;
 625             case '"':
 626             case '\\':
 627               break;
 628             case '\n':
 629               if (opts->hungry_eol_escapes_p)
 630                 skip_intraline_whitespace (port);
 631               continue;
 632             case '0':
 633               c = '\0';
 634               break;
 635             case 'f':
 636               c = '\f';
 637               break;
 638             case 'n':
 639               c = '\n';
 640               break;
 641             case 'r':
 642               c = '\r';
 643               break;
 644             case 't':
 645               c = '\t';
 646               break;
 647             case 'a':
 648               c = '\007';
 649               break;
 650             case 'v':
 651               c = '\v';
 652               break;
 653             case 'b':
 654               c = '\010';
 655               break;
 656             case 'x':
 657               if (opts->r6rs_escapes_p)
 658                 SCM_READ_HEX_ESCAPE (10, ';');
 659               else
 660                 SCM_READ_HEX_ESCAPE (2, '\0');
 661               break;
 662             case 'u':
 663               if (!opts->r6rs_escapes_p)
 664                 {
 665                   SCM_READ_HEX_ESCAPE (4, '\0');
 666                   break;
 667                 }
 668             case 'U':
 669               if (!opts->r6rs_escapes_p)
 670                 {
 671                   SCM_READ_HEX_ESCAPE (6, '\0');
 672                   break;
 673                 }
 674             default:
 675             bad_escaped:
 676               scm_i_input_error (FUNC_NAME, port,
 677                                  "illegal character in escape sequence: ~S",
 678                                  scm_list_1 (SCM_MAKE_CHAR (c)));
 679             }
 680         }
 681
 682       c_str[c_str_len++] = c;
 683     }
 684
 685   if (scm_is_null (str))
 686     /* Fast path: we got a string that fits in C_STR.  */
 687     str = scm_from_utf32_stringn (c_str, c_str_len);
 688   else
 689     {
 690       if (c_str_len > 0)
 691         str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
 692
 693       str = scm_string_concatenate_reverse (str, SCM_UNDEFINED, SCM_UNDEFINED);
 694     }
 695
 696   return maybe_annotate_source (str, port, opts, line, column);
 697 }
 698 #undef FUNC_NAME
 699
 700
 701 static SCM
 702 scm_read_number (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 703 {
 704   SCM result, str = SCM_EOL;
 705   char local_buffer[READER_BUFFER_SIZE], *buffer;
 706   size_t bytes_read;
 707   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 708
 709   /* Need to capture line and column numbers here. */
 710   long line = SCM_LINUM (port);
 711   int column = SCM_COL (port) - 1;
 712
 713   scm_ungetc (chr, port);
 714   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 715                                 &bytes_read);
 716
 717   str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
 718
 719   result = scm_string_to_number (str, SCM_UNDEFINED);
 720   if (scm_is_false (result))
 721     {
 722       /* Return a symbol instead of a number */
 723       if (opts->case_insensitive_p)
 724         str = scm_string_downcase_x (str);
 725       result = scm_string_to_symbol (str);
 726     }
 727   else if (SCM_NIMP (result))
 728     result = maybe_annotate_source (result, port, opts, line, column);
 729
 730   SCM_COL (port) += scm_i_string_length (str);
 731   return result;
 732 }
 733
 734 static SCM
 735 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 736 {
 737   SCM result;
 738   int ends_with_colon = 0;
 739   size_t bytes_read;
 740   int postfix = (opts->keyword_style == KEYWORD_STYLE_POSTFIX);
 741   char local_buffer[READER_BUFFER_SIZE], *buffer;
 742   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 743   SCM str;
 744
 745   scm_ungetc (chr, port);
 746   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 747                                 &bytes_read);
 748   if (bytes_read > 0)
 749     ends_with_colon = buffer[bytes_read - 1] == ':';
 750
 751   if (postfix && ends_with_colon && (bytes_read > 1))
 752     {
 753       str = scm_from_stringn (buffer, bytes_read - 1,
 754                               pt->encoding, pt->ilseq_handler);
 755
 756       if (opts->case_insensitive_p)
 757         str = scm_string_downcase_x (str);
 758       result = scm_symbol_to_keyword (scm_string_to_symbol (str));
 759     }
 760   else
 761     {
 762       str = scm_from_stringn (buffer, bytes_read,
 763                               pt->encoding, pt->ilseq_handler);
 764
 765       if (opts->case_insensitive_p)
 766         str = scm_string_downcase_x (str);
 767       result = scm_string_to_symbol (str);
 768     }
 769
 770   SCM_COL (port) += scm_i_string_length (str);
 771   return result;
 772 }
 773
 774 static SCM
 775 scm_read_number_and_radix (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 776 #define FUNC_NAME "scm_lreadr"
 777 {
 778   SCM result;
 779   size_t read;
 780   char local_buffer[READER_BUFFER_SIZE], *buffer;
 781   unsigned int radix;
 782   SCM str;
 783   scm_t_port *pt;
 784
 785   switch (chr)
 786     {
 787     case 'B':
 788     case 'b':
 789       radix = 2;
 790       break;
 791
 792     case 'o':
 793     case 'O':
 794       radix = 8;
 795       break;
 796
 797     case 'd':
 798     case 'D':
 799       radix = 10;
 800       break;
 801
 802     case 'x':
 803     case 'X':
 804       radix = 16;
 805       break;
 806
 807     default:
 808       scm_ungetc (chr, port);
 809       scm_ungetc ('#', port);
 810       radix = 10;
 811     }
 812
 813   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 814                                 &read);
 815
 816   pt = SCM_PTAB_ENTRY (port);
 817   str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
 818
 819   result = scm_string_to_number (str, scm_from_uint (radix));
 820
 821   SCM_COL (port) += scm_i_string_length (str);
 822
 823   if (scm_is_true (result))
 824     return result;
 825
 826   scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
 827
 828   return SCM_BOOL_F;
 829 }
 830 #undef FUNC_NAME
 831
 832 static SCM
 833 scm_read_quote (int chr, SCM port, scm_t_read_opts *opts)
 834 {
 835   SCM p;
 836   long line = SCM_LINUM (port);
 837   int column = SCM_COL (port) - 1;
 838
 839   switch (chr)
 840     {
 841     case '`':
 842       p = scm_sym_quasiquote;
 843       break;
 844
 845     case '\'':
 846       p = scm_sym_quote;
 847       break;
 848
 849     case ',':
 850       {
 851         scm_t_wchar c;
 852
 853         c = scm_getc (port);
 854         if ('@' == c)
 855           p = scm_sym_uq_splicing;
 856         else
 857           {
 858             scm_ungetc (c, port);
 859             p = scm_sym_unquote;
 860           }
 861         break;
 862       }
 863
 864     default:
 865       fprintf (stderr, "%s: unhandled quote character (%i)\n",
 866                "scm_read_quote", chr);
 867       abort ();
 868     }
 869
 870   p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
 871   return maybe_annotate_source (p, port, opts, line, column);
 872 }
 873
 874 SCM_SYMBOL (sym_syntax, "syntax");
 875 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
 876 SCM_SYMBOL (sym_unsyntax, "unsyntax");
 877 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
 878
 879 static SCM
 880 scm_read_syntax (int chr, SCM port, scm_t_read_opts *opts)
 881 {
 882   SCM p;
 883   long line = SCM_LINUM (port);
 884   int column = SCM_COL (port) - 1;
 885
 886   switch (chr)
 887     {
 888     case '`':
 889       p = sym_quasisyntax;
 890       break;
 891
 892     case '\'':
 893       p = sym_syntax;
 894       break;
 895
 896     case ',':
 897       {
 898         int c;
 899
 900         c = scm_getc (port);
 901         if ('@' == c)
 902           p = sym_unsyntax_splicing;
 903         else
 904           {
 905             scm_ungetc (c, port);
 906             p = sym_unsyntax;
 907           }
 908         break;
 909       }
 910
 911     default:
 912       fprintf (stderr, "%s: unhandled syntax character (%i)\n",
 913                "scm_read_syntax", chr);
 914       abort ();
 915     }
 916
 917   p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
 918   return maybe_annotate_source (p, port, opts, line, column);
 919 }
 920
 921 static SCM
 922 scm_read_nil (int chr, SCM port, scm_t_read_opts *opts)
 923 {
 924   SCM id = scm_read_mixed_case_symbol (chr, port, opts);
 925
 926   if (!scm_is_eq (id, sym_nil))
 927     scm_i_input_error ("scm_read_nil", port,
 928                        "unexpected input while reading #nil: ~a",
 929                        scm_list_1 (id));
 930
 931   return SCM_ELISP_NIL;
 932 }
 933
 934 static SCM
 935 scm_read_semicolon_comment (int chr, SCM port)
 936 {
 937   int c;
 938
 939   /* We use the get_byte here because there is no need to get the
 940      locale correct with comment input. This presumes that newline
 941      always represents itself no matter what the encoding is.  */
 942   for (c = scm_get_byte_or_eof (port);
 943        (c != EOF) && (c != '\n');
 944        c = scm_get_byte_or_eof (port));
 945
 946   return SCM_UNSPECIFIED;
 947 }
 948
 949 \f
 950 /* Sharp readers, i.e. readers called after a `#' sign has been read.  */
 951
 952 static SCM
 953 scm_read_boolean (int chr, SCM port)
 954 {
 955   switch (chr)
 956     {
 957     case 't':
 958     case 'T':
 959       return SCM_BOOL_T;
 960
 961     case 'f':
 962     case 'F':
 963       return SCM_BOOL_F;
 964     }
 965
 966   return SCM_UNSPECIFIED;
 967 }
 968
 969 static SCM
 970 scm_read_character (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 971 #define FUNC_NAME "scm_lreadr"
 972 {
 973   char buffer[READER_CHAR_NAME_MAX_SIZE];
 974   SCM charname;
 975   size_t charname_len, bytes_read;
 976   scm_t_wchar cp;
 977   int overflow;
 978   scm_t_port *pt;
 979
 980   overflow = read_token (port, opts, buffer, READER_CHAR_NAME_MAX_SIZE,
 981                          &bytes_read);
 982   if (overflow)
 983     scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
 984
 985   if (bytes_read == 0)
 986     {
 987       chr = scm_getc (port);
 988       if (chr == EOF)
 989         scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
 990                            "while reading character", SCM_EOL);
 991
 992       /* CHR must be a token delimiter, like a whitespace.  */
 993       return (SCM_MAKE_CHAR (chr));
 994     }
 995
 996   pt = SCM_PTAB_ENTRY (port);
 997
 998   /* Simple ASCII characters can be processed immediately.  Also, simple
 999      ISO-8859-1 characters can be processed immediately if the encoding for this
1000      port is ISO-8859-1.  */
1001   if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
1002     {
1003       SCM_COL (port) += 1;
1004       return SCM_MAKE_CHAR (buffer[0]);
1005     }
1006
1007   /* Otherwise, convert the buffer into a proper scheme string for
1008      processing.  */
1009   charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
1010                                pt->ilseq_handler);
1011   charname_len = scm_i_string_length (charname);
1012   SCM_COL (port) += charname_len;
1013   cp = scm_i_string_ref (charname, 0);
1014   if (charname_len == 1)
1015     return SCM_MAKE_CHAR (cp);
1016
1017   /* Ignore dotted circles, which may be used to keep combining characters from
1018      combining with the backslash in #\charname.  */
1019   if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
1020     return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
1021
1022   if (cp >= '0' && cp < '8')
1023     {
1024       /* Dirk:FIXME::  This type of character syntax is not R5RS
1025        * compliant.  Further, it should be verified that the constant
1026        * does only consist of octal digits.  */
1027       SCM p = scm_string_to_number (charname, scm_from_uint (8));
1028       if (SCM_I_INUMP (p))
1029         {
1030           scm_t_wchar c = scm_to_uint32 (p);
1031           if (SCM_IS_UNICODE_CHAR (c))
1032             return SCM_MAKE_CHAR (c);
1033           else
1034             scm_i_input_error (FUNC_NAME, port,
1035                                "out-of-range octal character escape: ~a",
1036                                scm_list_1 (charname));
1037         }
1038     }
1039
1040   if (cp == 'x' && (charname_len > 1))
1041     {
1042       SCM p;
1043
1044       /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1045       p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
1046                                 scm_from_uint (16));
1047       if (SCM_I_INUMP (p))
1048         {
1049           scm_t_wchar c = scm_to_uint32 (p);
1050           if (SCM_IS_UNICODE_CHAR (c))
1051             return SCM_MAKE_CHAR (c);
1052           else
1053             scm_i_input_error (FUNC_NAME, port,
1054                                "out-of-range hex character escape: ~a",
1055                                scm_list_1 (charname));
1056         }
1057     }
1058
1059   /* The names of characters should never have non-Latin1
1060      characters.  */
1061   if (scm_i_is_narrow_string (charname)
1062       || scm_i_try_narrow_string (charname))
1063     { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
1064                                        charname_len);
1065       if (scm_is_true (ch))
1066         return ch;
1067     }
1068
1069   scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
1070                      scm_list_1 (charname));
1071
1072   return SCM_UNSPECIFIED;
1073 }
1074 #undef FUNC_NAME
1075
1076 static SCM
1077 scm_read_keyword (int chr, SCM port, scm_t_read_opts *opts)
1078 {
1079   SCM symbol;
1080
1081   /* Read the symbol that comprises the keyword.  Doing this instead of
1082      invoking a specific symbol reader function allows `scm_read_keyword ()'
1083      to adapt to the delimiters currently valid of symbols.
1084
1085      XXX: This implementation allows sloppy syntaxes like `#:  key'.  */
1086   symbol = scm_read_expression (port, opts);
1087   if (!scm_is_symbol (symbol))
1088     scm_i_input_error ("scm_read_keyword", port,
1089                        "keyword prefix `~a' not followed by a symbol: ~s",
1090                        scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1091
1092   return (scm_symbol_to_keyword (symbol));
1093 }
1094
1095 static SCM
1096 scm_read_vector (int chr, SCM port, scm_t_read_opts *opts,
1097                  long line, int column)
1098 {
1099   /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1100      guarantee that it's going to do what we want.  After all, this is an
1101      implementation detail of `scm_read_vector ()', not a desirable
1102      property.  */
1103   return maybe_annotate_source (scm_vector (scm_read_sexp (chr, port, opts)),
1104                                 port, opts, line, column);
1105 }
1106
1107 /* Helper used by scm_read_array */
1108 static int
1109 read_decimal_integer (SCM port, int c, ssize_t *resp)
1110 {
1111   ssize_t sign = 1;
1112   ssize_t res = 0;
1113   int got_it = 0;
1114
1115   if (c == '-')
1116     {
1117       sign = -1;
1118       c = scm_getc (port);
1119     }
1120
1121   while ('0' <= c && c <= '9')
1122     {
1123       res = 10*res + c-'0';
1124       got_it = 1;
1125       c = scm_getc (port);
1126     }
1127
1128   if (got_it)
1129     *resp = sign * res;
1130   return c;
1131 }
1132
1133 /* Read an array.  This function can also read vectors and uniform
1134    vectors.  Also, the conflict between '#f' and '#f32' and '#f64' is
1135    handled here.
1136
1137    C is the first character read after the '#'.
1138 */
1139 static SCM
1140 scm_read_array (int c, SCM port, scm_t_read_opts *opts, long line, int column)
1141 {
1142   ssize_t rank;
1143   scm_t_wchar tag_buf[8];
1144   int tag_len;
1145
1146   SCM tag, shape = SCM_BOOL_F, elements, array;
1147
1148   /* XXX - shortcut for ordinary vectors.  Shouldn't be necessary but
1149      the array code can not deal with zero-length dimensions yet, and
1150      we want to allow zero-length vectors, of course.
1151   */
1152   if (c == '(')
1153     return scm_read_vector (c, port, opts, line, column);
1154
1155   /* Disambiguate between '#f' and uniform floating point vectors.
1156    */
1157   if (c == 'f')
1158     {
1159       c = scm_getc (port);
1160       if (c != '3' && c != '6')
1161         {
1162           if (c != EOF)
1163             scm_ungetc (c, port);
1164           return SCM_BOOL_F;
1165         }
1166       rank = 1;
1167       tag_buf[0] = 'f';
1168       tag_len = 1;
1169       goto continue_reading_tag;
1170     }
1171
1172   /* Read rank. */
1173   rank = 1;
1174   c = read_decimal_integer (port, c, &rank);
1175   if (rank < 0)
1176     scm_i_input_error (NULL, port, "array rank must be non-negative",
1177                        SCM_EOL);
1178
1179   /* Read tag. */
1180   tag_len = 0;
1181  continue_reading_tag:
1182   while (c != EOF && c != '(' && c != '@' && c != ':'
1183          && tag_len < sizeof tag_buf / sizeof tag_buf[0])
1184     {
1185       tag_buf[tag_len++] = c;
1186       c = scm_getc (port);
1187     }
1188   if (tag_len == 0)
1189     tag = SCM_BOOL_T;
1190   else
1191     {
1192       tag = scm_string_to_symbol (scm_from_utf32_stringn (tag_buf, tag_len));
1193       if (tag_len == sizeof tag_buf / sizeof tag_buf[0])
1194         scm_i_input_error (NULL, port, "invalid array tag, starting with: ~a",
1195                            scm_list_1 (tag));
1196     }
1197
1198   /* Read shape. */
1199   if (c == '@' || c == ':')
1200     {
1201       shape = SCM_EOL;
1202
1203       do
1204         {
1205           ssize_t lbnd = 0, len = 0;
1206           SCM s;
1207
1208           if (c == '@')
1209             {
1210               c = scm_getc (port);
1211               c = read_decimal_integer (port, c, &lbnd);
1212             }
1213
1214           s = scm_from_ssize_t (lbnd);
1215
1216           if (c == ':')
1217             {
1218               c = scm_getc (port);
1219               c = read_decimal_integer (port, c, &len);
1220               if (len < 0)
1221                 scm_i_input_error (NULL, port,
1222                                    "array length must be non-negative",
1223                                    SCM_EOL);
1224
1225               s = scm_list_2 (s, scm_from_ssize_t (lbnd+len-1));
1226             }
1227
1228           shape = scm_cons (s, shape);
1229         } while (c == '@' || c == ':');
1230
1231       shape = scm_reverse_x (shape, SCM_EOL);
1232     }
1233
1234   /* Read nested lists of elements. */
1235   if (c != '(')
1236     scm_i_input_error (NULL, port,
1237                        "missing '(' in vector or array literal",
1238                        SCM_EOL);
1239   elements = scm_read_sexp (c, port, opts);
1240
1241   if (scm_is_false (shape))
1242     shape = scm_from_ssize_t (rank);
1243   else if (scm_ilength (shape) != rank)
1244     scm_i_input_error
1245       (NULL, port,
1246        "the number of shape specifications must match the array rank",
1247        SCM_EOL);
1248
1249   /* Handle special print syntax of rank zero arrays; see
1250      scm_i_print_array for a rationale. */
1251   if (rank == 0)
1252     {
1253       if (!scm_is_pair (elements))
1254         scm_i_input_error (NULL, port,
1255                            "too few elements in array literal, need 1",
1256                            SCM_EOL);
1257       if (!scm_is_null (SCM_CDR (elements)))
1258         scm_i_input_error (NULL, port,
1259                            "too many elements in array literal, want 1",
1260                            SCM_EOL);
1261       elements = SCM_CAR (elements);
1262     }
1263
1264   /* Construct array, annotate with source location, and return. */
1265   array = scm_list_to_typed_array (tag, shape, elements);
1266   return maybe_annotate_source (array, port, opts, line, column);
1267 }
1268
1269 static SCM
1270 scm_read_srfi4_vector (int chr, SCM port, scm_t_read_opts *opts,
1271                        long line, int column)
1272 {
1273   return scm_read_array (chr, port, opts, line, column);
1274 }
1275
1276 static SCM
1277 scm_read_bytevector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1278                      long line, int column)
1279 {
1280   chr = scm_getc (port);
1281   if (chr != 'u')
1282     goto syntax;
1283
1284   chr = scm_getc (port);
1285   if (chr != '8')
1286     goto syntax;
1287
1288   chr = scm_getc (port);
1289   if (chr != '(')
1290     goto syntax;
1291
1292   return maybe_annotate_source
1293     (scm_u8_list_to_bytevector (scm_read_sexp (chr, port, opts)),
1294      port, opts, line, column);
1295
1296  syntax:
1297   scm_i_input_error ("read_bytevector", port,
1298                      "invalid bytevector prefix",
1299                      SCM_MAKE_CHAR (chr));
1300   return SCM_UNSPECIFIED;
1301 }
1302
1303 static SCM
1304 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1305                            long line, int column)
1306 {
1307   /* Read the `#*10101'-style read syntax for bit vectors in Guile.  This is
1308      terribly inefficient but who cares?  */
1309   SCM s_bits = SCM_EOL;
1310
1311   for (chr = scm_getc (port);
1312        (chr != EOF) && ((chr == '0') || (chr == '1'));
1313        chr = scm_getc (port))
1314     {
1315       s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1316     }
1317
1318   if (chr != EOF)
1319     scm_ungetc (chr, port);
1320
1321   return maybe_annotate_source
1322     (scm_bitvector (scm_reverse_x (s_bits, SCM_EOL)),
1323      port, opts, line, column);
1324 }
1325
1326 static SCM
1327 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1328 {
1329   int bang_seen = 0;
1330
1331   for (;;)
1332     {
1333       int c = scm_getc (port);
1334
1335       if (c == EOF)
1336         scm_i_input_error ("skip_block_comment", port,
1337                            "unterminated `#! ... !#' comment", SCM_EOL);
1338
1339       if (c == '!')
1340         bang_seen = 1;
1341       else if (c == '#' && bang_seen)
1342         break;
1343       else
1344         bang_seen = 0;
1345     }
1346
1347   return SCM_UNSPECIFIED;
1348 }
1349
1350 static void set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts,
1351                                          int value);
1352 static void set_port_square_brackets_p (SCM port, scm_t_read_opts *opts,
1353                                         int value);
1354 static void set_port_curly_infix_p (SCM port, scm_t_read_opts *opts,
1355                                     int value);
1356
1357 static SCM
1358 scm_read_shebang (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
1359 {
1360   char name[READER_DIRECTIVE_NAME_MAX_SIZE + 1];
1361   int c;
1362   int i = 0;
1363
1364   while (i <= READER_DIRECTIVE_NAME_MAX_SIZE)
1365     {
1366       c = scm_getc (port);
1367       if (c == EOF)
1368         scm_i_input_error ("skip_block_comment", port,
1369                            "unterminated `#! ... !#' comment", SCM_EOL);
1370       else if (('a' <= c && c <= 'z') || ('0' <= c && c <= '9') || c == '-')
1371         name[i++] = c;
1372       else if (CHAR_IS_DELIMITER (c))
1373         {
1374           scm_ungetc (c, port);
1375           name[i] = '\0';
1376           if (0 == strcmp ("r6rs", name))
1377             ;  /* Silently ignore */
1378           else if (0 == strcmp ("fold-case", name))
1379             set_port_case_insensitive_p (port, opts, 1);
1380           else if (0 == strcmp ("no-fold-case", name))
1381             set_port_case_insensitive_p (port, opts, 0);
1382           else if (0 == strcmp ("curly-infix", name))
1383             set_port_curly_infix_p (port, opts, 1);
1384           else if (0 == strcmp ("curly-infix-and-bracket-lists", name))
1385             {
1386               set_port_curly_infix_p (port, opts, 1);
1387               set_port_square_brackets_p (port, opts, 0);
1388             }
1389           else
1390             break;
1391
1392           return SCM_UNSPECIFIED;
1393         }
1394     }
1395   while (i > 0)
1396     scm_ungetc (name[--i], port);
1397   return scm_read_scsh_block_comment (chr, port);
1398 }
1399
1400 static SCM
1401 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1402 {
1403   /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1404      nested.  So care must be taken.  */
1405   int nesting_level = 1;
1406
1407   int a = scm_getc (port);
1408
1409   if (a == EOF)
1410     scm_i_input_error ("scm_read_r6rs_block_comment", port,
1411                        "unterminated `#| ... |#' comment", SCM_EOL);
1412
1413   while (nesting_level > 0)
1414     {
1415       int b = scm_getc (port);
1416
1417       if (b == EOF)
1418         scm_i_input_error ("scm_read_r6rs_block_comment", port,
1419                            "unterminated `#| ... |#' comment", SCM_EOL);
1420
1421       if (a == '|' && b == '#')
1422         {
1423           nesting_level--;
1424           b = EOF;
1425         }
1426       else if (a == '#' && b == '|')
1427         {
1428           nesting_level++;
1429           b = EOF;
1430         }
1431
1432       a = b;
1433     }
1434
1435   return SCM_UNSPECIFIED;
1436 }
1437
1438 static SCM
1439 scm_read_commented_expression (scm_t_wchar chr, SCM port,
1440                                scm_t_read_opts *opts)
1441 {
1442   scm_t_wchar c;
1443
1444   c = flush_ws (port, opts, (char *) NULL);
1445   if (EOF == c)
1446     scm_i_input_error ("read_commented_expression", port,
1447                        "no expression after #; comment", SCM_EOL);
1448   scm_ungetc (c, port);
1449   scm_read_expression (port, opts);
1450   return SCM_UNSPECIFIED;
1451 }
1452
1453 static SCM
1454 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1455 {
1456   /* Guile's extended symbol read syntax looks like this:
1457
1458        #{This is all a symbol name}#
1459
1460      So here, CHR is expected to be `{'.  */
1461   int saw_brace = 0;
1462   size_t len = 0;
1463   SCM buf = scm_i_make_string (1024, NULL, 0);
1464
1465   buf = scm_i_string_start_writing (buf);
1466
1467   while ((chr = scm_getc (port)) != EOF)
1468     {
1469       if (saw_brace)
1470         {
1471           if (chr == '#')
1472             {
1473               break;
1474             }
1475           else
1476             {
1477               saw_brace = 0;
1478               scm_i_string_set_x (buf, len++, '}');
1479             }
1480         }
1481
1482       if (chr == '}')
1483         saw_brace = 1;
1484       else if (chr == '\\')
1485         {
1486           /* It used to be that print.c would print extended-read-syntax
1487              symbols with backslashes before "non-standard" chars, but
1488              this routine wouldn't do anything with those escapes.
1489              Bummer.  What we've done is to change print.c to output
1490              R6RS hex escapes for those characters, relying on the fact
1491              that the extended read syntax would never put a `\' before
1492              an `x'.  For now, we just ignore other instances of
1493              backslash in the string.  */
1494           switch ((chr = scm_getc (port)))
1495             {
1496             case EOF:
1497               goto done;
1498             case 'x':
1499               {
1500                 scm_t_wchar c;
1501
1502                 SCM_READ_HEX_ESCAPE (10, ';');
1503                 scm_i_string_set_x (buf, len++, c);
1504                 break;
1505
1506               str_eof:
1507                 chr = EOF;
1508                 goto done;
1509
1510               bad_escaped:
1511                 scm_i_string_stop_writing ();
1512                 scm_i_input_error ("scm_read_extended_symbol", port,
1513                                    "illegal character in escape sequence: ~S",
1514                                    scm_list_1 (SCM_MAKE_CHAR (c)));
1515                 break;
1516               }
1517             default:
1518               scm_i_string_set_x (buf, len++, chr);
1519               break;
1520             }
1521         }
1522       else
1523         scm_i_string_set_x (buf, len++, chr);
1524
1525       if (len >= scm_i_string_length (buf) - 2)
1526         {
1527           SCM addy;
1528
1529           scm_i_string_stop_writing ();
1530           addy = scm_i_make_string (1024, NULL, 0);
1531           buf = scm_string_append (scm_list_2 (buf, addy));
1532           len = 0;
1533           buf = scm_i_string_start_writing (buf);
1534         }
1535     }
1536
1537  done:
1538   scm_i_string_stop_writing ();
1539   if (chr == EOF)
1540     scm_i_input_error ("scm_read_extended_symbol", port,
1541                        "end of file while reading symbol", SCM_EOL);
1542
1543   return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1544 }
1545
1546
1547 \f
1548 /* Top-level token readers, i.e., dispatchers.  */
1549
1550 static SCM
1551 scm_read_sharp_extension (int chr, SCM port, scm_t_read_opts *opts)
1552 {
1553   SCM proc;
1554
1555   proc = scm_get_hash_procedure (chr);
1556   if (scm_is_true (scm_procedure_p (proc)))
1557     {
1558       long line = SCM_LINUM (port);
1559       int column = SCM_COL (port) - 2;
1560       SCM got;
1561
1562       got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1563
1564       if (opts->record_positions_p && SCM_NIMP (got)
1565           && !scm_i_has_source_properties (got))
1566         scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1567
1568       return got;
1569     }
1570
1571   return SCM_UNSPECIFIED;
1572 }
1573
1574 /* The reader for the sharp `#' character.  It basically dispatches reads
1575    among the above token readers.   */
1576 static SCM
1577 scm_read_sharp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1578                 long line, int column)
1579 #define FUNC_NAME "scm_lreadr"
1580 {
1581   SCM result;
1582
1583   chr = scm_getc (port);
1584
1585   result = scm_read_sharp_extension (chr, port, opts);
1586   if (!scm_is_eq (result, SCM_UNSPECIFIED))
1587     return result;
1588
1589   switch (chr)
1590     {
1591     case '\\':
1592       return (scm_read_character (chr, port, opts));
1593     case '(':
1594       return (scm_read_vector (chr, port, opts, line, column));
1595     case 's':
1596     case 'u':
1597     case 'f':
1598     case 'c':
1599       /* This one may return either a boolean or an SRFI-4 vector.  */
1600       return (scm_read_srfi4_vector (chr, port, opts, line, column));
1601     case 'v':
1602       return (scm_read_bytevector (chr, port, opts, line, column));
1603     case '*':
1604       return (scm_read_guile_bit_vector (chr, port, opts, line, column));
1605     case 't':
1606     case 'T':
1607     case 'F':
1608       return (scm_read_boolean (chr, port));
1609     case ':':
1610       return (scm_read_keyword (chr, port, opts));
1611     case '0': case '1': case '2': case '3': case '4':
1612     case '5': case '6': case '7': case '8': case '9':
1613     case '@':
1614 #if SCM_ENABLE_DEPRECATED
1615       /* See below for 'i' and 'e'. */
1616     case 'a':
1617     case 'y':
1618     case 'h':
1619     case 'l':
1620 #endif
1621       return (scm_read_array (chr, port, opts, line, column));
1622
1623     case 'i':
1624     case 'e':
1625 #if SCM_ENABLE_DEPRECATED
1626       {
1627         /* When next char is '(', it really is an old-style
1628            uniform array. */
1629         scm_t_wchar next_c = scm_getc (port);
1630         if (next_c != EOF)
1631           scm_ungetc (next_c, port);
1632         if (next_c == '(')
1633           return scm_read_array (chr, port, opts, line, column);
1634         /* Fall through. */
1635       }
1636 #endif
1637     case 'b':
1638     case 'B':
1639     case 'o':
1640     case 'O':
1641     case 'd':
1642     case 'D':
1643     case 'x':
1644     case 'X':
1645     case 'I':
1646     case 'E':
1647       return (scm_read_number_and_radix (chr, port, opts));
1648     case '{':
1649       return (scm_read_extended_symbol (chr, port));
1650     case '!':
1651       return (scm_read_shebang (chr, port, opts));
1652     case ';':
1653       return (scm_read_commented_expression (chr, port, opts));
1654     case '`':
1655     case '\'':
1656     case ',':
1657       return (scm_read_syntax (chr, port, opts));
1658     case 'n':
1659       return (scm_read_nil (chr, port, opts));
1660     default:
1661       result = scm_read_sharp_extension (chr, port, opts);
1662       if (scm_is_eq (result, SCM_UNSPECIFIED))
1663         {
1664           /* To remain compatible with 1.8 and earlier, the following
1665              characters have lower precedence than `read-hash-extend'
1666              characters.  */
1667           switch (chr)
1668             {
1669             case '|':
1670               return scm_read_r6rs_block_comment (chr, port);
1671             default:
1672               scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1673                                  scm_list_1 (SCM_MAKE_CHAR (chr)));
1674             }
1675         }
1676       else
1677         return result;
1678     }
1679
1680   return SCM_UNSPECIFIED;
1681 }
1682 #undef FUNC_NAME
1683
1684 static SCM
1685 read_inner_expression (SCM port, scm_t_read_opts *opts)
1686 #define FUNC_NAME "read_inner_expression"
1687 {
1688   while (1)
1689     {
1690       scm_t_wchar chr;
1691
1692       chr = scm_getc (port);
1693
1694       switch (chr)
1695         {
1696         case SCM_WHITE_SPACES:
1697         case SCM_LINE_INCREMENTORS:
1698           break;
1699         case ';':
1700           (void) scm_read_semicolon_comment (chr, port);
1701           break;
1702         case '{':
1703           if (opts->curly_infix_p)
1704             {
1705               if (opts->neoteric_p)
1706                 return scm_read_sexp (chr, port, opts);
1707               else
1708                 {
1709                   SCM expr;
1710
1711                   /* Enable neoteric expressions within curly braces */
1712                   opts->neoteric_p = 1;
1713                   expr = scm_read_sexp (chr, port, opts);
1714                   opts->neoteric_p = 0;
1715                   return expr;
1716                 }
1717             }
1718           else
1719             return scm_read_mixed_case_symbol (chr, port, opts);
1720         case '[':
1721           if (opts->square_brackets_p)
1722             return scm_read_sexp (chr, port, opts);
1723           else if (opts->curly_infix_p)
1724             {
1725               /* The syntax of neoteric expressions requires that '[' be
1726                  a delimiter when curly-infix is enabled, so it cannot
1727                  be part of an unescaped symbol.  We might as well do
1728                  something useful with it, so we adopt Kawa's convention:
1729                  [...] => ($bracket-list$ ...) */
1730               long line = SCM_LINUM (port);
1731               int column = SCM_COL (port) - 1;
1732               return maybe_annotate_source
1733                 (scm_cons (sym_bracket_list, scm_read_sexp (chr, port, opts)),
1734                  port, opts, line, column);
1735             }
1736           else
1737             return scm_read_mixed_case_symbol (chr, port, opts);
1738         case '(':
1739           return (scm_read_sexp (chr, port, opts));
1740         case '"':
1741           return (scm_read_string (chr, port, opts));
1742         case '\'':
1743         case '`':
1744         case ',':
1745           return (scm_read_quote (chr, port, opts));
1746         case '#':
1747           {
1748             long line  = SCM_LINUM (port);
1749             int column = SCM_COL (port) - 1;
1750             SCM result = scm_read_sharp (chr, port, opts, line, column);
1751             if (scm_is_eq (result, SCM_UNSPECIFIED))
1752               /* We read a comment or some such.  */
1753               break;
1754             else
1755               return result;
1756           }
1757         case ')':
1758           scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1759           break;
1760         case '}':
1761           if (opts->curly_infix_p)
1762             scm_i_input_error (FUNC_NAME, port, "unexpected \"}\"", SCM_EOL);
1763           else
1764             return scm_read_mixed_case_symbol (chr, port, opts);
1765         case ']':
1766           if (opts->square_brackets_p)
1767             scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1768           /* otherwise fall through */
1769         case EOF:
1770           return SCM_EOF_VAL;
1771         case ':':
1772           if (opts->keyword_style == KEYWORD_STYLE_PREFIX)
1773             return scm_symbol_to_keyword (scm_read_expression (port, opts));
1774           /* Fall through.  */
1775
1776         default:
1777           {
1778             if (((chr >= '0') && (chr <= '9'))
1779                 || (strchr ("+-.", chr)))
1780               return (scm_read_number (chr, port, opts));
1781             else
1782               return (scm_read_mixed_case_symbol (chr, port, opts));
1783           }
1784         }
1785     }
1786 }
1787 #undef FUNC_NAME
1788
1789 static SCM
1790 scm_read_expression (SCM port, scm_t_read_opts *opts)
1791 #define FUNC_NAME "scm_read_expression"
1792 {
1793   if (!opts->neoteric_p)
1794     return read_inner_expression (port, opts);
1795   else
1796     {
1797       long line = 0;
1798       int column = 0;
1799       SCM expr;
1800
1801       if (opts->record_positions_p)
1802         {
1803           /* We need to get the position of the first non-whitespace
1804              character in order to correctly annotate neoteric
1805              expressions.  For example, for the expression 'f(x)', the
1806              first call to 'read_inner_expression' reads the 'f' (which
1807              cannot be annotated), and then we later read the '(x)' and
1808              use it to construct the new list (f x). */
1809           int c = flush_ws (port, opts, (char *) NULL);
1810           if (c == EOF)
1811             return SCM_EOF_VAL;
1812           scm_ungetc (c, port);
1813           line = SCM_LINUM (port);
1814           column = SCM_COL (port);
1815         }
1816
1817       expr = read_inner_expression (port, opts);
1818
1819       /* 'expr' is the first component of the neoteric expression.  Now
1820          we loop, and as long as the next character is '(', '[', or '{',
1821          (without any intervening whitespace), we use it to construct a
1822          new expression.  For example, f{n - 1}(x) => ((f (- n 1)) x). */
1823       for (;;)
1824         {
1825           int chr = scm_getc (port);
1826
1827           if (chr == '(')
1828             /* e(...) => (e ...) */
1829             expr = scm_cons (expr, scm_read_sexp (chr, port, opts));
1830           else if (chr == '[')
1831             /* e[...] => ($bracket-apply$ e ...) */
1832             expr = scm_cons (sym_bracket_apply,
1833                              scm_cons (expr,
1834                                        scm_read_sexp (chr, port, opts)));
1835           else if (chr == '{')
1836             {
1837               SCM arg = scm_read_sexp (chr, port, opts);
1838
1839               if (scm_is_null (arg))
1840                 expr = scm_list_1 (expr);       /* e{} => (e) */
1841               else
1842                 expr = scm_list_2 (expr, arg);  /* e{...} => (e {...}) */
1843             }
1844           else
1845             {
1846               if (chr != EOF)
1847                 scm_ungetc (chr, port);
1848               break;
1849             }
1850           maybe_annotate_source (expr, port, opts, line, column);
1851         }
1852       return expr;
1853     }
1854 }
1855 #undef FUNC_NAME
1856
1857 \f
1858 /* Actual reader.  */
1859
1860 static void init_read_options (SCM port, scm_t_read_opts *opts);
1861
1862 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1863             (SCM port),
1864             "Read an s-expression from the input port @var{port}, or from\n"
1865             "the current input port if @var{port} is not specified.\n"
1866             "Any whitespace before the next token is discarded.")
1867 #define FUNC_NAME s_scm_read
1868 {
1869   scm_t_read_opts opts;
1870   int c;
1871
1872   if (SCM_UNBNDP (port))
1873     port = scm_current_input_port ();
1874   SCM_VALIDATE_OPINPORT (1, port);
1875
1876   init_read_options (port, &opts);
1877
1878   c = flush_ws (port, &opts, (char *) NULL);
1879   if (EOF == c)
1880     return SCM_EOF_VAL;
1881   scm_ungetc (c, port);
1882
1883   return (scm_read_expression (port, &opts));
1884 }
1885 #undef FUNC_NAME
1886
1887
1888 \f
1889
1890 /* Manipulate the read-hash-procedures alist.  This could be written in
1891    Scheme, but maybe it will also be used by C code during initialisation.  */
1892 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1893             (SCM chr, SCM proc),
1894             "Install the procedure @var{proc} for reading expressions\n"
1895             "starting with the character sequence @code{#} and @var{chr}.\n"
1896             "@var{proc} will be called with two arguments:  the character\n"
1897             "@var{chr} and the port to read further data from. The object\n"
1898             "returned will be the return value of @code{read}. \n"
1899             "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1900             )
1901 #define FUNC_NAME s_scm_read_hash_extend
1902 {
1903   SCM this;
1904   SCM prev;
1905
1906   SCM_VALIDATE_CHAR (1, chr);
1907   SCM_ASSERT (scm_is_false (proc)
1908               || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1909               proc, SCM_ARG2, FUNC_NAME);
1910
1911   /* Check if chr is already in the alist.  */
1912   this = scm_i_read_hash_procedures_ref ();
1913   prev = SCM_BOOL_F;
1914   while (1)
1915     {
1916       if (scm_is_null (this))
1917         {
1918           /* not found, so add it to the beginning.  */
1919           if (scm_is_true (proc))
1920             {
1921               SCM new = scm_cons (scm_cons (chr, proc),
1922                                   scm_i_read_hash_procedures_ref ());
1923               scm_i_read_hash_procedures_set_x (new);
1924             }
1925           break;
1926         }
1927       if (scm_is_eq (chr, SCM_CAAR (this)))
1928         {
1929           /* already in the alist.  */
1930           if (scm_is_false (proc))
1931             {
1932               /* remove it.  */
1933               if (scm_is_false (prev))
1934                 {
1935                   SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1936                   scm_i_read_hash_procedures_set_x (rest);
1937                 }
1938               else
1939                 scm_set_cdr_x (prev, SCM_CDR (this));
1940             }
1941           else
1942             {
1943               /* replace it.  */
1944               scm_set_cdr_x (SCM_CAR (this), proc);
1945             }
1946           break;
1947         }
1948       prev = this;
1949       this = SCM_CDR (this);
1950     }
1951
1952   return SCM_UNSPECIFIED;
1953 }
1954 #undef FUNC_NAME
1955
1956 /* Recover the read-hash procedure corresponding to char c.  */
1957 static SCM
1958 scm_get_hash_procedure (int c)
1959 {
1960   SCM rest = scm_i_read_hash_procedures_ref ();
1961
1962   while (1)
1963     {
1964       if (scm_is_null (rest))
1965         return SCM_BOOL_F;
1966
1967       if (SCM_CHAR (SCM_CAAR (rest)) == c)
1968         return SCM_CDAR (rest);
1969
1970       rest = SCM_CDR (rest);
1971     }
1972 }
1973
1974 #define SCM_ENCODING_SEARCH_SIZE (500)
1975
1976 /* Search the first few hundred characters of a file for an Emacs-like coding
1977    declaration.  Returns either NULL or a string whose storage has been
1978    allocated with `scm_gc_malloc ()'.  */
1979 char *
1980 scm_i_scan_for_encoding (SCM port)
1981 {
1982   scm_t_port *pt;
1983   char header[SCM_ENCODING_SEARCH_SIZE+1];
1984   size_t bytes_read, encoding_length, i;
1985   char *encoding = NULL;
1986   int utf8_bom = 0;
1987   char *pos, *encoding_start;
1988   int in_comment;
1989
1990   pt = SCM_PTAB_ENTRY (port);
1991
1992   if (pt->rw_active == SCM_PORT_WRITE)
1993     scm_flush (port);
1994
1995   if (pt->rw_random)
1996     pt->rw_active = SCM_PORT_READ;
1997
1998   if (pt->read_pos == pt->read_end)
1999     {
2000       /* We can use the read buffer, and thus avoid a seek. */
2001       if (scm_fill_input (port) == EOF)
2002         return NULL;
2003
2004       bytes_read = pt->read_end - pt->read_pos;
2005       if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
2006         bytes_read = SCM_ENCODING_SEARCH_SIZE;
2007
2008       if (bytes_read <= 1)
2009         /* An unbuffered port -- don't scan.  */
2010         return NULL;
2011
2012       memcpy (header, pt->read_pos, bytes_read);
2013       header[bytes_read] = '\0';
2014     }
2015   else
2016     {
2017       /* Try to read some bytes and then seek back.  Not all ports
2018          support seeking back; and indeed some file ports (like
2019          /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
2020          check performed by SCM_FPORT_FDES---but fail to seek
2021          backwards.  Hence this block comes second.  We prefer to use
2022          the read buffer in-place.  */
2023       if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
2024         return NULL;
2025
2026       bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
2027       header[bytes_read] = '\0';
2028       scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
2029     }
2030
2031   if (bytes_read > 3
2032       && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
2033     utf8_bom = 1;
2034
2035   /* search past "coding[:=]" */
2036   pos = header;
2037   while (1)
2038     {
2039       if ((pos = strstr(pos, "coding")) == NULL)
2040         return NULL;
2041
2042       pos += strlen("coding");
2043       if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
2044           (*pos == ':' || *pos == '='))
2045         {
2046           pos ++;
2047           break;
2048         }
2049     }
2050
2051   /* skip spaces */
2052   while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
2053          (*pos == ' ' || *pos == '\t'))
2054     pos ++;
2055
2056   /* grab the next token */
2057   encoding_start = pos;
2058   i = 0;
2059   while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
2060          && encoding_start + i - header < bytes_read
2061          && (isalnum ((int) encoding_start[i])
2062              || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
2063     i++;
2064
2065   encoding_length = i;
2066   if (encoding_length == 0)
2067     return NULL;
2068
2069   encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
2070   for (i = 0; i < encoding_length; i++)
2071     encoding[i] = toupper ((int) encoding[i]);
2072
2073   /* push backwards to make sure we were in a comment */
2074   in_comment = 0;
2075   pos = encoding_start;
2076   while (pos >= header)
2077     {
2078       if (*pos == ';')
2079         {
2080           in_comment = 1;
2081           break;
2082         }
2083       else if (*pos == '\n' || pos == header)
2084         {
2085           /* This wasn't in a semicolon comment. Check for a
2086            hash-bang comment. */
2087           char *beg = strstr (header, "#!");
2088           char *end = strstr (header, "!#");
2089           if (beg < encoding_start && encoding_start + encoding_length <= end)
2090             in_comment = 1;
2091           break;
2092         }
2093       else
2094         {
2095           pos --;
2096           continue;
2097         }
2098     }
2099   if (!in_comment)
2100     /* This wasn't in a comment */
2101     return NULL;
2102
2103   if (utf8_bom && strcmp(encoding, "UTF-8"))
2104     scm_misc_error (NULL,
2105                     "the port input declares the encoding ~s but is encoded as UTF-8",
2106                     scm_list_1 (scm_from_locale_string (encoding)));
2107
2108   return encoding;
2109 }
2110
2111 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
2112             (SCM port),
2113             "Scans the port for an Emacs-like character coding declaration\n"
2114             "near the top of the contents of a port with random-accessible contents.\n"
2115             "The coding declaration is of the form\n"
2116             "@code{coding: XXXXX} and must appear in a scheme comment.\n"
2117             "\n"
2118             "Returns a string containing the character encoding of the file\n"
2119             "if a declaration was found, or @code{#f} otherwise.\n")
2120 #define FUNC_NAME s_scm_file_encoding
2121 {
2122   char *enc;
2123   SCM s_enc;
2124
2125   SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
2126
2127   enc = scm_i_scan_for_encoding (port);
2128   if (enc == NULL)
2129     return SCM_BOOL_F;
2130   else
2131     {
2132       s_enc = scm_from_locale_string (enc);
2133       return s_enc;
2134     }
2135
2136   return SCM_BOOL_F;
2137 }
2138 #undef FUNC_NAME
2139
2140 \f
2141 /* Per-port read options.
2142
2143    We store per-port read options in the 'port-read-options' key of the
2144    port's alist, which is stored in 'scm_i_port_weak_hash'.  The value
2145    stored in the alist is a single integer that contains a two-bit field
2146    for each read option.
2147
2148    If a bit field contains READ_OPTION_INHERIT (3), that indicates that
2149    the applicable value should be inherited from the corresponding
2150    global read option.  Otherwise, the bit field contains the value of
2151    the read option.  For boolean read options that have been set
2152    per-port, the possible values are 0 or 1.  If the 'keyword_style'
2153    read option has been set per-port, its possible values are those in
2154    'enum t_keyword_style'. */
2155
2156 /* Key to read options in per-port alists. */
2157 SCM_SYMBOL (sym_port_read_options, "port-read-options");
2158
2159 /* Offsets of bit fields for each per-port override */
2160 #define READ_OPTION_COPY_SOURCE_P          0
2161 #define READ_OPTION_RECORD_POSITIONS_P     2
2162 #define READ_OPTION_CASE_INSENSITIVE_P     4
2163 #define READ_OPTION_KEYWORD_STYLE          6
2164 #define READ_OPTION_R6RS_ESCAPES_P         8
2165 #define READ_OPTION_SQUARE_BRACKETS_P     10
2166 #define READ_OPTION_HUNGRY_EOL_ESCAPES_P  12
2167 #define READ_OPTION_CURLY_INFIX_P         14
2168
2169 /* The total width in bits of the per-port overrides */
2170 #define READ_OPTIONS_NUM_BITS             16
2171
2172 #define READ_OPTIONS_INHERIT_ALL  ((1UL << READ_OPTIONS_NUM_BITS) - 1)
2173 #define READ_OPTIONS_MAX_VALUE    READ_OPTIONS_INHERIT_ALL
2174
2175 #define READ_OPTION_MASK     3
2176 #define READ_OPTION_INHERIT  3
2177
2178 static void
2179 set_port_read_option (SCM port, int option, int new_value)
2180 {
2181   SCM alist, scm_read_options;
2182   unsigned int read_options;
2183
2184   new_value &= READ_OPTION_MASK;
2185   scm_i_scm_pthread_mutex_lock (&scm_i_port_table_mutex);
2186   alist = scm_hashq_ref (scm_i_port_weak_hash, port, SCM_BOOL_F);
2187   scm_read_options = scm_assq_ref (alist, sym_port_read_options);
2188   if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2189     read_options = scm_to_uint (scm_read_options);
2190   else
2191     read_options = READ_OPTIONS_INHERIT_ALL;
2192   read_options &= ~(READ_OPTION_MASK << option);
2193   read_options |= new_value << option;
2194   scm_read_options = scm_from_uint (read_options);
2195   alist = scm_assq_set_x (alist, sym_port_read_options, scm_read_options);
2196   scm_hashq_set_x (scm_i_port_weak_hash, port, alist);
2197   scm_i_pthread_mutex_unlock (&scm_i_port_table_mutex);
2198 }
2199
2200 /* Set OPTS and PORT's case-insensitivity according to VALUE. */
2201 static void
2202 set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts, int value)
2203 {
2204   value = !!value;
2205   opts->case_insensitive_p = value;
2206   set_port_read_option (port, READ_OPTION_CASE_INSENSITIVE_P, value);
2207 }
2208
2209 /* Set OPTS and PORT's square_brackets_p option according to VALUE. */
2210 static void
2211 set_port_square_brackets_p (SCM port, scm_t_read_opts *opts, int value)
2212 {
2213   value = !!value;
2214   opts->square_brackets_p = value;
2215   set_port_read_option (port, READ_OPTION_SQUARE_BRACKETS_P, value);
2216 }
2217
2218 /* Set OPTS and PORT's curly_infix_p option according to VALUE. */
2219 static void
2220 set_port_curly_infix_p (SCM port, scm_t_read_opts *opts, int value)
2221 {
2222   value = !!value;
2223   opts->curly_infix_p = value;
2224   set_port_read_option (port, READ_OPTION_CURLY_INFIX_P, value);
2225 }
2226
2227 /* Initialize OPTS based on PORT's read options and the global read
2228    options. */
2229 static void
2230 init_read_options (SCM port, scm_t_read_opts *opts)
2231 {
2232   SCM alist, val, scm_read_options;
2233   unsigned int read_options, x;
2234
2235   scm_i_scm_pthread_mutex_lock (&scm_i_port_table_mutex);
2236   alist = scm_hashq_ref (scm_i_port_weak_hash, port, SCM_BOOL_F);
2237   scm_read_options = scm_assq_ref (alist, sym_port_read_options);
2238   scm_i_pthread_mutex_unlock (&scm_i_port_table_mutex);
2239
2240   if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2241     read_options = scm_to_uint (scm_read_options);
2242   else
2243     read_options = READ_OPTIONS_INHERIT_ALL;
2244
2245   x = READ_OPTION_MASK & (read_options >> READ_OPTION_KEYWORD_STYLE);
2246   if (x == READ_OPTION_INHERIT)
2247     {
2248       val = SCM_PACK (SCM_KEYWORD_STYLE);
2249       if (scm_is_eq (val, scm_keyword_prefix))
2250         x = KEYWORD_STYLE_PREFIX;
2251       else if (scm_is_eq (val, scm_keyword_postfix))
2252         x = KEYWORD_STYLE_POSTFIX;
2253       else
2254         x = KEYWORD_STYLE_HASH_PREFIX;
2255     }
2256   opts->keyword_style = x;
2257
2258 #define RESOLVE_BOOLEAN_OPTION(NAME, name)                              \
2259   do                                                                    \
2260     {                                                                   \
2261       x = READ_OPTION_MASK & (read_options >> READ_OPTION_ ## NAME);    \
2262       if (x == READ_OPTION_INHERIT)                                     \
2263         x = !!SCM_ ## NAME;                                             \
2264           opts->name = x;                                               \
2265     }                                                                   \
2266   while (0)
2267
2268   RESOLVE_BOOLEAN_OPTION (COPY_SOURCE_P,        copy_source_p);
2269   RESOLVE_BOOLEAN_OPTION (RECORD_POSITIONS_P,   record_positions_p);
2270   RESOLVE_BOOLEAN_OPTION (CASE_INSENSITIVE_P,   case_insensitive_p);
2271   RESOLVE_BOOLEAN_OPTION (R6RS_ESCAPES_P,       r6rs_escapes_p);
2272   RESOLVE_BOOLEAN_OPTION (SQUARE_BRACKETS_P,    square_brackets_p);
2273   RESOLVE_BOOLEAN_OPTION (HUNGRY_EOL_ESCAPES_P, hungry_eol_escapes_p);
2274   RESOLVE_BOOLEAN_OPTION (CURLY_INFIX_P,        curly_infix_p);
2275
2276 #undef RESOLVE_BOOLEAN_OPTION
2277
2278   opts->neoteric_p = 0;
2279 }
2280
2281 void
2282 scm_init_read ()
2283 {
2284   SCM read_hash_procs;
2285
2286   read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
2287
2288   scm_i_read_hash_procedures =
2289     SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
2290
2291   scm_init_opts (scm_read_options, scm_read_opts);
2292 #include "libguile/read.x"
2293 }
2294
2295 /*
2296   Local Variables:
2297   c-file-style: "gnu"
2298   End:
2299 */