libguile/read.c

   1 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
   2  *   2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public License
   6  * as published by the Free Software Foundation; either version 3 of
   7  * the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful, but
  10  * WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  17  * 02110-1301 USA
  18  */
  19
  20
  21 \f
  22
  23 #ifdef HAVE_CONFIG_H
  24 # include <config.h>
  25 #endif
  26
  27 #include <stdio.h>
  28 #include <ctype.h>
  29 #include <string.h>
  30 #include <unistd.h>
  31 #include <unicase.h>
  32 #include <unictype.h>
  33
  34 #include "libguile/_scm.h"
  35 #include "libguile/bytevectors.h"
  36 #include "libguile/chars.h"
  37 #include "libguile/eval.h"
  38 #include "libguile/arrays.h"
  39 #include "libguile/bitvectors.h"
  40 #include "libguile/keywords.h"
  41 #include "libguile/alist.h"
  42 #include "libguile/srcprop.h"
  43 #include "libguile/hashtab.h"
  44 #include "libguile/hash.h"
  45 #include "libguile/ports.h"
  46 #include "libguile/fports.h"
  47 #include "libguile/root.h"
  48 #include "libguile/strings.h"
  49 #include "libguile/strports.h"
  50 #include "libguile/vectors.h"
  51 #include "libguile/validate.h"
  52 #include "libguile/srfi-4.h"
  53 #include "libguile/srfi-13.h"
  54
  55 #include "libguile/read.h"
  56 #include "libguile/private-options.h"
  57
  58
  59 \f
  60
/* The symbol `.'.  `scm_read_sexp' compares freshly read expressions
   against it to recognise the dot of a dotted pair.  */
SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
/* Symbols `prefix' and `postfix'; presumably the values accepted by the
   `keywords' read option (see `scm_read_opts') -- their use is not
   visible in this part of the file.  */
SCM_SYMBOL (scm_keyword_prefix, "prefix");
SCM_SYMBOL (scm_keyword_postfix, "postfix");
/* The symbol `nil'; `scm_read_nil' requires the token it reads to be
   this symbol.  */
SCM_SYMBOL (sym_nil, "nil");

/* SRFI-105 curly infix expression support.  `sym_nfx' is consed onto
   mixed curly-infix lists by `scm_read_sexp'.  */
SCM_SYMBOL (sym_nfx, "$nfx$");
SCM_SYMBOL (sym_bracket_list, "$bracket-list$");
SCM_SYMBOL (sym_bracket_apply, "$bracket-apply$");
  70
/* Table of the global read options, handed to `scm_options' by
   `scm_read_options'.  Each entry appears to hold, in order: the option
   type, its name, its default value and a documentation string -- the
   layout of `scm_t_option' is declared elsewhere, TODO confirm.  The
   final `{ 0, }' entry presumably terminates the table.

   NOTE(review): accessor macros such as SCM_COPY_SOURCE_P and
   SCM_RECORD_POSITIONS_P (defined elsewhere) presumably index into this
   array, so the order of entries must not be changed casually.  */
scm_t_option scm_read_opts[] =
  {
    { SCM_OPTION_BOOLEAN, "copy", 0,
      "Copy source code expressions." },
    { SCM_OPTION_BOOLEAN, "positions", 1,
      "Record positions of source code expressions." },
    { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
      "Convert symbols to lower case."},
    { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
      "Style of keyword recognition: #f, 'prefix or 'postfix."},
    { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
      "Use R6RS variable-length character and string hex escapes."},
    { SCM_OPTION_BOOLEAN, "square-brackets", 1,
      "Treat `[' and `]' as parentheses, for R6RS compatibility."},
    { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
      "In strings, consume leading whitespace after an escaped end-of-line."},
    { SCM_OPTION_BOOLEAN, "curly-infix", 0,
      "Support SRFI-105 curly infix expressions."},
    { 0, },
  };
  91
  92 /* Internal read options structure.  This is initialized by 'scm_read'
  93    from the global and per-port read options, and a pointer is passed
  94    down to all helper functions. */
  95
/* How keywords are recognised by the reader.  The POSTFIX style is the
   one `scm_read_mixed_case_symbol' checks for: a token ending in `:'
   becomes a keyword.  HASH_PREFIX and PREFIX presumably correspond to
   the `keywords' option values #f and 'prefix respectively -- TODO
   confirm against the code that fills in `scm_t_read_opts'.  */
enum t_keyword_style
  {
    KEYWORD_STYLE_HASH_PREFIX,
    KEYWORD_STYLE_PREFIX,
    KEYWORD_STYLE_POSTFIX
  };
 102
/* Per-read snapshot of the reader options.  A pointer to this structure
   is threaded through the helper functions below (always under the
   parameter name `opts', which some macros rely on).  */
struct t_read_opts
{
  /* Keyword syntax in effect; see `enum t_keyword_style'.  */
  enum t_keyword_style keyword_style;
  /* Presumably mirrors the `copy' option; not consulted in the part of
     the file shown here.  */
  unsigned int copy_source_p        : 1;
  /* When set, `maybe_annotate_source' attaches line, column and file
     name to the objects read.  */
  unsigned int record_positions_p   : 1;
  /* When set, symbol names are downcased before being interned.  */
  unsigned int case_insensitive_p   : 1;
  /* Selects the R6RS `\x...;' form of string hex escapes in
     `scm_read_string'.  */
  unsigned int r6rs_escapes_p       : 1;
  /* When set, `[' and `]' are token delimiters and list brackets.  */
  unsigned int square_brackets_p    : 1;
  /* When set, an escaped newline in a string also swallows the
     whitespace that follows it.  */
  unsigned int hungry_eol_escapes_p : 1;
  /* When set, `{', `}', `[' and `]' are delimiters and `{...}' lists
     receive the SRFI-105 curly-infix transformation.  */
  unsigned int curly_infix_p        : 1;
  /* Presumably enables SRFI-105 neoteric expressions; not consulted in
     the part of the file shown here.  */
  unsigned int neoteric_p           : 1;
};

typedef struct t_read_opts scm_t_read_opts;
 117
 118
 119 /*
 120   Give meaningful error messages for errors
 121
 122   We use the format
 123
 124   FILE:LINE:COL: MESSAGE
 125   This happened in ....
 126
 127   This is not standard GNU format, but the test-suite likes the real
 128   message to be in front.
 129
 130  */
 131
 132
 133 void
 134 scm_i_input_error (char const *function,
 135                    SCM port, const char *message, SCM arg)
 136 {
 137   SCM fn = (scm_is_string (SCM_FILENAME(port))
 138             ? SCM_FILENAME(port)
 139             : scm_from_locale_string ("#<unknown port>"));
 140
 141   SCM string_port = scm_open_output_string ();
 142   SCM string = SCM_EOL;
 143   scm_simple_format (string_port,
 144                      scm_from_locale_string ("~A:~S:~S: ~A"),
 145                      scm_list_4 (fn,
 146                                  scm_from_long (SCM_LINUM (port) + 1),
 147                                  scm_from_int (SCM_COL (port) + 1),
 148                                  scm_from_locale_string (message)));
 149
 150   string = scm_get_output_string (string_port);
 151   scm_close_output_port (string_port);
 152   scm_error_scm (scm_from_latin1_symbol ("read-error"),
 153                  function? scm_from_locale_string (function) : SCM_BOOL_F,
 154                  string,
 155                  arg,
 156                  SCM_BOOL_F);
 157 }
 158
 159
 160 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
 161             (SCM setting),
 162             "Option interface for the read options. Instead of using\n"
 163             "this procedure directly, use the procedures @code{read-enable},\n"
 164             "@code{read-disable}, @code{read-set!} and @code{read-options}.")
 165 #define FUNC_NAME s_scm_read_options
 166 {
 167   SCM ans = scm_options (setting,
 168                          scm_read_opts,
 169                          FUNC_NAME);
 170   if (SCM_COPY_SOURCE_P)
 171     SCM_RECORD_POSITIONS_P = 1;
 172   return ans;
 173 }
 174 #undef FUNC_NAME
 175
 176 /* A fluid referring to an association list mapping extra hash
 177    characters to procedures.  */
 178 static SCM *scm_i_read_hash_procedures;
 179
 180 static SCM
 181 scm_i_read_hash_procedures_ref (void)
 182 {
 183   return scm_fluid_ref (*scm_i_read_hash_procedures);
 184 }
 185
 186 static void
 187 scm_i_read_hash_procedures_set_x (SCM value)
 188 {
 189   scm_fluid_set_x (*scm_i_read_hash_procedures, value);
 190 }
 191
 192 \f
 193 /* Token readers.  */
 194
 195
/* Size, in bytes, of the on-stack C buffer used to read symbols and
   numbers.  Longer tokens are still accepted: `read_complete_token'
   falls back to a GC-allocated buffer.  */
#define READER_BUFFER_SIZE            128

/* Number of 32-bit codepoints in the on-stack buffer used to read
   strings.  Longer strings are accumulated in several chunks by
   `scm_read_string'.  */
#define READER_STRING_BUFFER_SIZE     128

/* The maximum size, in bytes, of Scheme character names.  Reaching it
   makes `scm_read_character' signal "character name too long".  */
#define READER_CHAR_NAME_MAX_SIZE      50

/* The maximum size of reader directive names.  */
#define READER_DIRECTIVE_NAME_MAX_SIZE 50
 207
 208
 209 /* `isblank' is only in C99.  */
 210 #define CHAR_IS_BLANK_(_chr)                                    \
 211   (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n')      \
 212    || ((_chr) == '\f') || ((_chr) == '\r'))
 213
 214 #ifdef MSDOS
 215 # define CHAR_IS_BLANK(_chr)                    \
 216   ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
 217 #else
 218 # define CHAR_IS_BLANK CHAR_IS_BLANK_
 219 #endif
 220
 221
/* R5RS one-character delimiters (see section 7.1.1, ``Lexical
   structure''): blanks, parentheses, `;' and `"'.  The argument is
   evaluated several times.  */
#define CHAR_IS_R5RS_DELIMITER(c)                               \
  (CHAR_IS_BLANK (c)                                            \
   || (c) == ')' || (c) == '(' || (c) == ';' || (c) == '"')

/* Delimiters under the current read options: the R5RS ones, plus `['
   and `]' when square brackets or curly-infix are enabled, plus `{' and
   `}' when curly-infix is enabled.

   This macro is not self-contained: it reads a variable named `opts'
   (a `scm_t_read_opts *') that must be in scope wherever it is
   expanded.  The argument is evaluated several times.  */
#define CHAR_IS_DELIMITER(c)                                    \
  (CHAR_IS_R5RS_DELIMITER (c)                                   \
   || (((c) == ']' || (c) == '[') && (opts->square_brackets_p   \
                                      || opts->curly_infix_p))  \
   || (((c) == '}' || (c) == '{') && opts->curly_infix_p))
 233
/* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
   Structure''.  Only the lower-case letters `e', `s', `f', `d' and `l'
   are matched.  The argument is evaluated several times.  */
#define CHAR_IS_EXPONENT_MARKER(_chr)                           \
  (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f')        \
   || ((_chr) == 'd') || ((_chr) == 'l'))
 239
/* Forward declarations of readers and helpers that are used before
   their definitions; `flush_ws' below calls all of them except
   `scm_read_scsh_block_comment'.  */
static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
static SCM scm_read_commented_expression (scm_t_wchar, SCM, scm_t_read_opts *);
static SCM scm_read_shebang (scm_t_wchar, SCM, scm_t_read_opts *);
static SCM scm_get_hash_procedure (int);
 246
 247 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 248    result in the pre-allocated buffer BUF.  Return zero if the whole token has
 249    fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
 250    bytes actually read.  */
 251 static int
 252 read_token (SCM port, scm_t_read_opts *opts,
 253             char *buf, size_t buf_size, size_t *read)
 254 {
 255    *read = 0;
 256
 257    while (*read < buf_size)
 258      {
 259        int chr;
 260
 261        chr = scm_get_byte_or_eof (port);
 262
 263        if (chr == EOF)
 264         return 0;
 265       else if (CHAR_IS_DELIMITER (chr))
 266         {
 267           scm_unget_byte (chr, port);
 268           return 0;
 269         }
 270       else
 271         {
 272           *buf = (char) chr;
 273           buf++, (*read)++;
 274         }
 275      }
 276
 277    return 1;
 278  }
 279
 280 /* Like `read_token', but return either BUFFER, or a GC-allocated buffer
 281    if the token doesn't fit in BUFFER_SIZE bytes.  */
 282 static char *
 283 read_complete_token (SCM port, scm_t_read_opts *opts,
 284                      char *buffer, size_t buffer_size, size_t *read)
 285 {
 286   int overflow = 0;
 287   size_t bytes_read, overflow_size = 0;
 288   char *overflow_buffer = NULL;
 289
 290   do
 291     {
 292       overflow = read_token (port, opts, buffer, buffer_size, &bytes_read);
 293       if (bytes_read == 0)
 294         break;
 295       if (overflow || overflow_size != 0)
 296         {
 297           if (overflow_size == 0)
 298             {
 299               overflow_buffer = scm_gc_malloc_pointerless (bytes_read, "read");
 300               memcpy (overflow_buffer, buffer, bytes_read);
 301               overflow_size = bytes_read;
 302             }
 303           else
 304             {
 305               char *new_buf =
 306                 scm_gc_malloc_pointerless (overflow_size + bytes_read, "read");
 307
 308               memcpy (new_buf, overflow_buffer, overflow_size);
 309               memcpy (new_buf + overflow_size, buffer, bytes_read);
 310
 311               overflow_buffer = new_buf;
 312               overflow_size += bytes_read;
 313             }
 314         }
 315     }
 316   while (overflow);
 317
 318   if (overflow_size)
 319     *read = overflow_size;
 320   else
 321     *read = bytes_read;
 322
 323   return (overflow_size > 0 ? overflow_buffer : buffer);
 324 }
 325
/* Skip whitespace and comments from PORT and return the first character
   that is neither.

   Skipped input: blanks and line terminators, `;' comments up to the
   end of the line, `#!' and `#;' forms (delegated to `scm_read_shebang'
   and `scm_read_commented_expression'), and `#|' block comments when
   `scm_get_hash_procedure' returns false for `|' (presumably meaning no
   user reader is installed for it).  Any other `#' sequence is not
   consumed: the character after `#' is pushed back and '#' is returned.

   On end of file, if EOFERR is non-NULL an error is signalled with
   EOFERR as the reporting function name; if EOFERR is NULL, EOF is
   returned.  End of file right after a `#' always signals an error.  */
static int
flush_ws (SCM port, scm_t_read_opts *opts, const char *eoferr)
{
  scm_t_wchar c;
  while (1)
    switch (c = scm_getc (port))
      {
      case EOF:
      goteof:
        /* Also reached by `goto' from the comment and `#' cases.  */
        if (eoferr)
          {
            scm_i_input_error (eoferr,
                               port,
                               "end of file",
                               SCM_EOL);
          }
        return c;

      case ';':
      lp:
        /* Line comment: discard characters until a line terminator.  */
        switch (c = scm_getc (port))
          {
          case EOF:
            goto goteof;
          default:
            goto lp;
          case SCM_LINE_INCREMENTORS:
            break;
          }
        break;

      case '#':
        switch (c = scm_getc (port))
          {
          case EOF:
            /* Force an error even if the caller tolerated EOF.  */
            eoferr = "read_sharp";
            goto goteof;
          case '!':
            scm_read_shebang (c, port, opts);
            break;
          case ';':
            scm_read_commented_expression (c, port, opts);
            break;
          case '|':
            if (scm_is_false (scm_get_hash_procedure (c)))
              {
                scm_read_r6rs_block_comment (c, port);
                break;
              }
            /* fall through */
          default:
            /* Not a comment: hand the `#' form back to the caller.  */
            scm_ungetc (c, port);
            return '#';
          }
        break;

      case SCM_LINE_INCREMENTORS:
      case SCM_SINGLE_SPACES:
      case '\t':
        break;

      default:
        return c;
      }

  /* Not reached: the loop above only exits through `return'.  */
  return 0;
}
 395
 396
 397 \f
 398 /* Token readers.  */
 399
/* Forward declarations: the readers below call `scm_read_expression'
   recursively before it is defined.  */
static SCM scm_read_expression (SCM port, scm_t_read_opts *opts);
static SCM scm_read_sharp (int chr, SCM port, scm_t_read_opts *opts,
                           long line, int column);
 403
 404
 405 static SCM
 406 maybe_annotate_source (SCM x, SCM port, scm_t_read_opts *opts,
 407                        long line, int column)
 408 {
 409   if (opts->record_positions_p)
 410     scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
 411   return x;
 412 }
 413
/* Read the rest of a list from PORT.  CHR is the opening delimiter
   (`(', `[' or `{'), presumably already consumed by the caller; it
   selects the matching closing delimiter.

   Returns SCM_EOL for an empty list, the datum after the dot for a list
   that starts with a dot, and otherwise the (possibly improper) list
   read.  When CHR is `{' and curly-infix is enabled, the list is
   additionally rewritten according to SRFI-105 (see below).  Errors
   (end of file, mismatched or missing closing delimiter) are signalled
   through `scm_i_input_error'.  */
static SCM
scm_read_sexp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
#define FUNC_NAME "scm_i_lreadparen"
{
  int c;
  SCM tmp, tl, ans = SCM_EOL;
  const int curly_list_p = (chr == '{') && opts->curly_infix_p;
  const int terminating_char = ((chr == '{') ? '}'
                                : ((chr == '[') ? ']'
                                   : ')'));

  /* Need to capture line and column numbers here.  The column is
     backed up by one so that it designates the opening delimiter.  */
  long line = SCM_LINUM (port);
  int column = SCM_COL (port) - 1;

  c = flush_ws (port, opts, FUNC_NAME);
  if (terminating_char == c)
    return SCM_EOL;

  scm_ungetc (c, port);
  tmp = scm_read_expression (port, opts);

  /* Note that it is possible for scm_read_expression to return
     scm_sym_dot, but not as part of a dotted pair: as in #{.}#.  So
     check that it's a real dot by checking `c'.  */
  if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
    {
      /* A dot before any element: the result is the single datum that
         follows it, returned without source annotation.  */
      ans = scm_read_expression (port, opts);
      if (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
        scm_i_input_error (FUNC_NAME, port, "missing close paren",
                           SCM_EOL);
      return ans;
    }

  /* Build the head of the list structure. */
  ans = tl = scm_cons (tmp, SCM_EOL);

  while (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
    {
      SCM new_tail;

      /* Any closing delimiter other than the expected one is an
         error.  */
      if (c == ')' || (c == ']' && opts->square_brackets_p)
          || ((c == '}' || c == ']') && opts->curly_infix_p))
        scm_i_input_error (FUNC_NAME, port,
                           "in pair: mismatched close paren: ~A",
                           scm_list_1 (SCM_MAKE_CHAR (c)));

      scm_ungetc (c, port);
      tmp = scm_read_expression (port, opts);

      /* See above note about scm_sym_dot.  */
      if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
        {
          /* Dotted tail: exactly one datum, then the closing
             delimiter.  */
          SCM_SETCDR (tl, scm_read_expression (port, opts));

          c = flush_ws (port, opts, FUNC_NAME);
          if (terminating_char != c)
            scm_i_input_error (FUNC_NAME, port,
                               "in pair: missing close paren", SCM_EOL);
          break;
        }

      new_tail = scm_cons (tmp, SCM_EOL);
      SCM_SETCDR (tl, new_tail);
      tl = new_tail;
    }

  if (curly_list_p)
    {
      /* In addition to finding the length, 'scm_ilength' checks for
         improper or circular lists, in which case it returns -1. */
      int len = scm_ilength (ans);

      /* The (len == 0) case is handled above */
      if (len == 1)
        /* Return directly to avoid re-annotating the element's source
           location with the position of the outer brace.  Also, it
           might not be possible to annotate the element. */
        return scm_car (ans);  /* {e} => e */
      else if (len == 2)
        ;  /* Leave the list unchanged: {e1 e2} => (e1 e2) */
      else if (len >= 3 && (len & 1))
        {
          /* It's a proper list whose length is odd and at least 3.  If
             the elements at odd indices (the infix operator positions)
             are all 'equal?', then it's a simple curly-infix list.
             Otherwise it's a mixed curly-infix list. */
          SCM op = scm_cadr (ans);

          /* Check to see if the elements at odd indices are 'equal?' */
          for (tl = scm_cdddr (ans); ; tl = scm_cddr (tl))
            {
              if (scm_is_null (tl))
                {
                  /* Convert simple curly-infix list to prefix:
                     {a <op> b <op> ...} => (<op> a b ...).  The loop
                     unlinks every operator cell in place.  */
                  tl = ans;
                  while (scm_is_pair (scm_cdr (tl)))
                    {
                      tmp = scm_cddr (tl);
                      SCM_SETCDR (tl, tmp);
                      tl = tmp;
                    }
                  ans = scm_cons (op, ans);
                  break;
                }
              else if (scm_is_false (scm_equal_p (op, scm_car (tl))))
                {
                  /* Mixed curly-infix list: {e ...} => ($nfx$ e ...) */
                  ans = scm_cons (sym_nfx, ans);
                  break;
                }
            }
        }
      else
        /* Mixed curly-infix (possibly improper) list:
           {e . tail} => ($nfx$ e . tail) */
        ans = scm_cons (sym_nfx, ans);
    }

  return maybe_annotate_source (ans, port, opts, line, column);
}
#undef FUNC_NAME
 537
 538
/* Read a hexadecimal number of at most NDIGITS digits.  Put its value
   into the variable C.  If TERMINATOR is non-null, terminate early if
   the TERMINATOR character is found after at least one digit; the
   terminator is consumed in that case.

   This macro is not self-contained.  The expansion site must provide:
     - a variable `port' to read from and a variable `c' to assign;
     - a label `str_eof', jumped to on end of file;
     - a label `bad_escaped', jumped to on a non-hexadecimal character,
       with `c' set to the offending character.

   When NDIGITS digits are read without meeting TERMINATOR, reading
   stops there and the terminator, if it follows, is left on the port.

   NOTE(review): the value is accumulated as `c = c * 16 + a' without a
   range check; with NDIGITS as large as 10 it may exceed what the type
   of `c' at the expansion site can hold -- confirm.  */
#define SCM_READ_HEX_ESCAPE(ndigits, terminator)                   \
  do                                                               \
    {                                                              \
      scm_t_wchar a;                                               \
      size_t i = 0;                                                \
      c = 0;                                                       \
      while (i < ndigits)                                          \
        {                                                          \
          a = scm_getc (port);                                     \
          if (a == EOF)                                            \
            goto str_eof;                                          \
          if (terminator                                           \
              && (a == (scm_t_wchar) terminator)                   \
              && (i > 0))                                          \
            break;                                                 \
          if ('0' <= a && a <= '9')                                \
            a -= '0';                                              \
          else if ('A' <= a && a <= 'F')                           \
            a = a - 'A' + 10;                                      \
          else if ('a' <= a && a <= 'f')                           \
            a = a - 'a' + 10;                                      \
          else                                                     \
            {                                                      \
              c = a;                                               \
              goto bad_escaped;                                    \
            }                                                      \
          c = c * 16 + a;                                          \
          i ++;                                                    \
        }                                                          \
    } while (0)
 572
 573 static void
 574 skip_intraline_whitespace (SCM port)
 575 {
 576   scm_t_wchar c;
 577
 578   do
 579     {
 580       c = scm_getc (port);
 581       if (c == EOF)
 582         return;
 583     }
 584   while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
 585
 586   scm_ungetc (c, port);
 587 }
 588
/* Read a string literal from PORT up to the closing double quote and
   return it as a Scheme string, annotated with its source position when
   OPTS requests it.  CHR is not used; the opening quote has presumably
   been consumed by the caller.

   Backslash escapes are decoded here.  OPTS selects between the R6RS
   form of `\x' (`\x<hex>;') and the fixed-width `\xHH', `\uHHHH' and
   `\UHHHHHH' forms, and controls whether an escaped newline also
   swallows the following whitespace.  End of file inside the string or
   an unknown escape signals an error.  */
static SCM
scm_read_string (int chr, SCM port, scm_t_read_opts *opts)
#define FUNC_NAME "scm_lreadr"
{
  /* For strings smaller than C_STR, this function creates only one Scheme
     object (the string returned).  */

  /* STR is a list of the chunks already flushed out of C_STR, most
     recent first.  */
  SCM str = SCM_EOL;
  size_t c_str_len = 0;
  scm_t_wchar c, c_str[READER_STRING_BUFFER_SIZE];

  /* Need to capture line and column numbers here. */
  long line = SCM_LINUM (port);
  int column = SCM_COL (port) - 1;

  while ('"' != (c = scm_getc (port)))
    {
      if (c == EOF)
        {
          /* Also the target of `goto str_eof' below and inside
             SCM_READ_HEX_ESCAPE.  */
        str_eof:
          scm_i_input_error (FUNC_NAME, port,
                             "end of file in string constant", SCM_EOL);
        }

      /* Flush the chunk buffer before it fills up, so that the store at
         the bottom of the loop always has room.  */
      if (c_str_len + 1 >= READER_STRING_BUFFER_SIZE)
        {
          str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
          c_str_len = 0;
        }

      if (c == '\\')
        {
          switch (c = scm_getc (port))
            {
            case EOF:
              goto str_eof;
            case '"':
            case '\\':
              break;
            case '\n':
              /* Escaped newline: contributes no character at all.  */
              if (opts->hungry_eol_escapes_p)
                skip_intraline_whitespace (port);
              continue;
            case '0':
              c = '\0';
              break;
            case 'f':
              c = '\f';
              break;
            case 'n':
              c = '\n';
              break;
            case 'r':
              c = '\r';
              break;
            case 't':
              c = '\t';
              break;
            case 'a':
              c = '\007';
              break;
            case 'v':
              c = '\v';
              break;
            case 'b':
              c = '\010';
              break;
            case 'x':
              if (opts->r6rs_escapes_p)
                SCM_READ_HEX_ESCAPE (10, ';');
              else
                SCM_READ_HEX_ESCAPE (2, '\0');
              break;
            case 'u':
              if (!opts->r6rs_escapes_p)
                {
                  SCM_READ_HEX_ESCAPE (4, '\0');
                  break;
                }
              /* With R6RS escapes enabled, `\u' is not a valid escape:
                 fall through to the error below.  */
            case 'U':
              if (!opts->r6rs_escapes_p)
                {
                  SCM_READ_HEX_ESCAPE (6, '\0');
                  break;
                }
              /* Likewise for `\U': fall through.  */
            default:
              /* Also the target of `goto bad_escaped' inside
                 SCM_READ_HEX_ESCAPE.  */
            bad_escaped:
              scm_i_input_error (FUNC_NAME, port,
                                 "illegal character in escape sequence: ~S",
                                 scm_list_1 (SCM_MAKE_CHAR (c)));
            }
        }

      c_str[c_str_len++] = c;
    }

  if (scm_is_null (str))
    /* Fast path: we got a string that fits in C_STR.  */
    str = scm_from_utf32_stringn (c_str, c_str_len);
  else
    {
      /* Push the last partial chunk, then join the chunks, which are
         held in reverse order.  */
      if (c_str_len > 0)
        str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);

      str = scm_string_concatenate_reverse (str, SCM_UNDEFINED, SCM_UNDEFINED);
    }

  return maybe_annotate_source (str, port, opts, line, column);
}
#undef FUNC_NAME
 699
 700
 701 static SCM
 702 scm_read_number (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 703 {
 704   SCM result, str = SCM_EOL;
 705   char local_buffer[READER_BUFFER_SIZE], *buffer;
 706   size_t bytes_read;
 707   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 708
 709   /* Need to capture line and column numbers here. */
 710   long line = SCM_LINUM (port);
 711   int column = SCM_COL (port) - 1;
 712
 713   scm_ungetc (chr, port);
 714   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 715                                 &bytes_read);
 716
 717   str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
 718
 719   result = scm_string_to_number (str, SCM_UNDEFINED);
 720   if (scm_is_false (result))
 721     {
 722       /* Return a symbol instead of a number */
 723       if (opts->case_insensitive_p)
 724         str = scm_string_downcase_x (str);
 725       result = scm_string_to_symbol (str);
 726     }
 727   else if (SCM_NIMP (result))
 728     result = maybe_annotate_source (result, port, opts, line, column);
 729
 730   SCM_COL (port) += scm_i_string_length (str);
 731   return result;
 732 }
 733
 734 static SCM
 735 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 736 {
 737   SCM result;
 738   int ends_with_colon = 0;
 739   size_t bytes_read;
 740   int postfix = (opts->keyword_style == KEYWORD_STYLE_POSTFIX);
 741   char local_buffer[READER_BUFFER_SIZE], *buffer;
 742   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 743   SCM str;
 744
 745   scm_ungetc (chr, port);
 746   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 747                                 &bytes_read);
 748   if (bytes_read > 0)
 749     ends_with_colon = buffer[bytes_read - 1] == ':';
 750
 751   if (postfix && ends_with_colon && (bytes_read > 1))
 752     {
 753       str = scm_from_stringn (buffer, bytes_read - 1,
 754                               pt->encoding, pt->ilseq_handler);
 755
 756       if (opts->case_insensitive_p)
 757         str = scm_string_downcase_x (str);
 758       result = scm_symbol_to_keyword (scm_string_to_symbol (str));
 759     }
 760   else
 761     {
 762       str = scm_from_stringn (buffer, bytes_read,
 763                               pt->encoding, pt->ilseq_handler);
 764
 765       if (opts->case_insensitive_p)
 766         str = scm_string_downcase_x (str);
 767       result = scm_string_to_symbol (str);
 768     }
 769
 770   SCM_COL (port) += scm_i_string_length (str);
 771   return result;
 772 }
 773
 774 static SCM
 775 scm_read_number_and_radix (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 776 #define FUNC_NAME "scm_lreadr"
 777 {
 778   SCM result;
 779   size_t read;
 780   char local_buffer[READER_BUFFER_SIZE], *buffer;
 781   unsigned int radix;
 782   SCM str;
 783   scm_t_port *pt;
 784
 785   switch (chr)
 786     {
 787     case 'B':
 788     case 'b':
 789       radix = 2;
 790       break;
 791
 792     case 'o':
 793     case 'O':
 794       radix = 8;
 795       break;
 796
 797     case 'd':
 798     case 'D':
 799       radix = 10;
 800       break;
 801
 802     case 'x':
 803     case 'X':
 804       radix = 16;
 805       break;
 806
 807     default:
 808       scm_ungetc (chr, port);
 809       scm_ungetc ('#', port);
 810       radix = 10;
 811     }
 812
 813   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 814                                 &read);
 815
 816   pt = SCM_PTAB_ENTRY (port);
 817   str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
 818
 819   result = scm_string_to_number (str, scm_from_uint (radix));
 820
 821   SCM_COL (port) += scm_i_string_length (str);
 822
 823   if (scm_is_true (result))
 824     return result;
 825
 826   scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
 827
 828   return SCM_BOOL_F;
 829 }
 830 #undef FUNC_NAME
 831
 832 static SCM
 833 scm_read_quote (int chr, SCM port, scm_t_read_opts *opts)
 834 {
 835   SCM p;
 836   long line = SCM_LINUM (port);
 837   int column = SCM_COL (port) - 1;
 838
 839   switch (chr)
 840     {
 841     case '`':
 842       p = scm_sym_quasiquote;
 843       break;
 844
 845     case '\'':
 846       p = scm_sym_quote;
 847       break;
 848
 849     case ',':
 850       {
 851         scm_t_wchar c;
 852
 853         c = scm_getc (port);
 854         if ('@' == c)
 855           p = scm_sym_uq_splicing;
 856         else
 857           {
 858             scm_ungetc (c, port);
 859             p = scm_sym_unquote;
 860           }
 861         break;
 862       }
 863
 864     default:
 865       fprintf (stderr, "%s: unhandled quote character (%i)\n",
 866                "scm_read_quote", chr);
 867       abort ();
 868     }
 869
 870   p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
 871   return maybe_annotate_source (p, port, opts, line, column);
 872 }
 873
/* Head symbols of the forms built by `scm_read_syntax' below.  */
SCM_SYMBOL (sym_syntax, "syntax");
SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
SCM_SYMBOL (sym_unsyntax, "unsyntax");
SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
 878
 879 static SCM
 880 scm_read_syntax (int chr, SCM port, scm_t_read_opts *opts)
 881 {
 882   SCM p;
 883   long line = SCM_LINUM (port);
 884   int column = SCM_COL (port) - 1;
 885
 886   switch (chr)
 887     {
 888     case '`':
 889       p = sym_quasisyntax;
 890       break;
 891
 892     case '\'':
 893       p = sym_syntax;
 894       break;
 895
 896     case ',':
 897       {
 898         int c;
 899
 900         c = scm_getc (port);
 901         if ('@' == c)
 902           p = sym_unsyntax_splicing;
 903         else
 904           {
 905             scm_ungetc (c, port);
 906             p = sym_unsyntax;
 907           }
 908         break;
 909       }
 910
 911     default:
 912       fprintf (stderr, "%s: unhandled syntax character (%i)\n",
 913                "scm_read_syntax", chr);
 914       abort ();
 915     }
 916
 917   p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
 918   return maybe_annotate_source (p, port, opts, line, column);
 919 }
 920
 921 static SCM
 922 scm_read_nil (int chr, SCM port, scm_t_read_opts *opts)
 923 {
 924   SCM id = scm_read_mixed_case_symbol (chr, port, opts);
 925
 926   if (!scm_is_eq (id, sym_nil))
 927     scm_i_input_error ("scm_read_nil", port,
 928                        "unexpected input while reading #nil: ~a",
 929                        scm_list_1 (id));
 930
 931   return SCM_ELISP_NIL;
 932 }
 933
 934 static SCM
 935 scm_read_semicolon_comment (int chr, SCM port)
 936 {
 937   int c;
 938
 939   /* We use the get_byte here because there is no need to get the
 940      locale correct with comment input. This presumes that newline
 941      always represents itself no matter what the encoding is.  */
 942   for (c = scm_get_byte_or_eof (port);
 943        (c != EOF) && (c != '\n');
 944        c = scm_get_byte_or_eof (port));
 945
 946   return SCM_UNSPECIFIED;
 947 }
 948
 949 \f
 950 /* Sharp readers, i.e. readers called after a `#' sign has been read.  */
 951
 952 static SCM
 953 scm_read_boolean (int chr, SCM port)
 954 {
 955   switch (chr)
 956     {
 957     case 't':
 958     case 'T':
 959       return SCM_BOOL_T;
 960
 961     case 'f':
 962     case 'F':
 963       return SCM_BOOL_F;
 964     }
 965
 966   return SCM_UNSPECIFIED;
 967 }
 968
 969 static SCM
 970 scm_read_character (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 971 #define FUNC_NAME "scm_lreadr"
 972 {
 973   char buffer[READER_CHAR_NAME_MAX_SIZE];
 974   SCM charname;
 975   size_t charname_len, bytes_read;
 976   scm_t_wchar cp;
 977   int overflow;
 978   scm_t_port *pt;
 979
 980   overflow = read_token (port, opts, buffer, READER_CHAR_NAME_MAX_SIZE,
 981                          &bytes_read);
 982   if (overflow)
 983     scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
 984
 985   if (bytes_read == 0)
 986     {
 987       chr = scm_getc (port);
 988       if (chr == EOF)
 989         scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
 990                            "while reading character", SCM_EOL);
 991
 992       /* CHR must be a token delimiter, like a whitespace.  */
 993       return (SCM_MAKE_CHAR (chr));
 994     }
 995
 996   pt = SCM_PTAB_ENTRY (port);
 997
 998   /* Simple ASCII characters can be processed immediately.  Also, simple
 999      ISO-8859-1 characters can be processed immediately if the encoding for this
1000      port is ISO-8859-1.  */
1001   if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
1002     {
1003       SCM_COL (port) += 1;
1004       return SCM_MAKE_CHAR (buffer[0]);
1005     }
1006
1007   /* Otherwise, convert the buffer into a proper scheme string for
1008      processing.  */
1009   charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
1010                                pt->ilseq_handler);
1011   charname_len = scm_i_string_length (charname);
1012   SCM_COL (port) += charname_len;
1013   cp = scm_i_string_ref (charname, 0);
1014   if (charname_len == 1)
1015     return SCM_MAKE_CHAR (cp);
1016
1017   /* Ignore dotted circles, which may be used to keep combining characters from
1018      combining with the backslash in #\charname.  */
1019   if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
1020     return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
1021
1022   if (cp >= '0' && cp < '8')
1023     {
1024       /* Dirk:FIXME::  This type of character syntax is not R5RS
1025        * compliant.  Further, it should be verified that the constant
1026        * does only consist of octal digits.  */
1027       SCM p = scm_string_to_number (charname, scm_from_uint (8));
1028       if (SCM_I_INUMP (p))
1029         {
1030           scm_t_wchar c = scm_to_uint32 (p);
1031           if (SCM_IS_UNICODE_CHAR (c))
1032             return SCM_MAKE_CHAR (c);
1033           else
1034             scm_i_input_error (FUNC_NAME, port,
1035                                "out-of-range octal character escape: ~a",
1036                                scm_list_1 (charname));
1037         }
1038     }
1039
1040   if (cp == 'x' && (charname_len > 1))
1041     {
1042       SCM p;
1043
1044       /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1045       p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
1046                                 scm_from_uint (16));
1047       if (SCM_I_INUMP (p))
1048         {
1049           scm_t_wchar c = scm_to_uint32 (p);
1050           if (SCM_IS_UNICODE_CHAR (c))
1051             return SCM_MAKE_CHAR (c);
1052           else
1053             scm_i_input_error (FUNC_NAME, port,
1054                                "out-of-range hex character escape: ~a",
1055                                scm_list_1 (charname));
1056         }
1057     }
1058
1059   /* The names of characters should never have non-Latin1
1060      characters.  */
1061   if (scm_i_is_narrow_string (charname)
1062       || scm_i_try_narrow_string (charname))
1063     { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
1064                                        charname_len);
1065       if (scm_is_true (ch))
1066         return ch;
1067     }
1068
1069   scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
1070                      scm_list_1 (charname));
1071
1072   return SCM_UNSPECIFIED;
1073 }
1074 #undef FUNC_NAME
1075
1076 static SCM
1077 scm_read_keyword (int chr, SCM port, scm_t_read_opts *opts)
1078 {
1079   SCM symbol;
1080
1081   /* Read the symbol that comprises the keyword.  Doing this instead of
1082      invoking a specific symbol reader function allows `scm_read_keyword ()'
1083      to adapt to the delimiters currently valid of symbols.
1084
1085      XXX: This implementation allows sloppy syntaxes like `#:  key'.  */
1086   symbol = scm_read_expression (port, opts);
1087   if (!scm_is_symbol (symbol))
1088     scm_i_input_error ("scm_read_keyword", port,
1089                        "keyword prefix `~a' not followed by a symbol: ~s",
1090                        scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1091
1092   return (scm_symbol_to_keyword (symbol));
1093 }
1094
1095 static SCM
1096 scm_read_vector (int chr, SCM port, scm_t_read_opts *opts,
1097                  long line, int column)
1098 {
1099   /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1100      guarantee that it's going to do what we want.  After all, this is an
1101      implementation detail of `scm_read_vector ()', not a desirable
1102      property.  */
1103   return maybe_annotate_source (scm_vector (scm_read_sexp (chr, port, opts)),
1104                                 port, opts, line, column);
1105 }
1106
1107 /* Helper used by scm_read_array */
1108 static int
1109 read_decimal_integer (SCM port, int c, ssize_t *resp)
1110 {
1111   ssize_t sign = 1;
1112   ssize_t res = 0;
1113   int got_it = 0;
1114
1115   if (c == '-')
1116     {
1117       sign = -1;
1118       c = scm_getc (port);
1119     }
1120
1121   while ('0' <= c && c <= '9')
1122     {
1123       res = 10*res + c-'0';
1124       got_it = 1;
1125       c = scm_getc (port);
1126     }
1127
1128   if (got_it)
1129     *resp = sign * res;
1130   return c;
1131 }
1132
1133 /* Read an array.  This function can also read vectors and uniform
1134    vectors.  Also, the conflict between '#f' and '#f32' and '#f64' is
1135    handled here.
1136
1137    C is the first character read after the '#'. */
1138 static SCM
1139 scm_read_array (int c, SCM port, scm_t_read_opts *opts, long line, int column)
1140 {
1141   ssize_t rank;
1142   scm_t_wchar tag_buf[8];
1143   int tag_len;
1144
1145   SCM tag, shape = SCM_BOOL_F, elements, array;
1146
1147   /* XXX - shortcut for ordinary vectors.  Shouldn't be necessary but
1148      the array code can not deal with zero-length dimensions yet, and
1149      we want to allow zero-length vectors, of course. */
1150   if (c == '(')
1151     return scm_read_vector (c, port, opts, line, column);
1152
1153   /* Disambiguate between '#f' and uniform floating point vectors. */
1154   if (c == 'f')
1155     {
1156       c = scm_getc (port);
1157       if (c != '3' && c != '6')
1158         {
1159           if (c != EOF)
1160             scm_ungetc (c, port);
1161           return SCM_BOOL_F;
1162         }
1163       rank = 1;
1164       tag_buf[0] = 'f';
1165       tag_len = 1;
1166       goto continue_reading_tag;
1167     }
1168
1169   /* Read rank. */
1170   rank = 1;
1171   c = read_decimal_integer (port, c, &rank);
1172   if (rank < 0)
1173     scm_i_input_error (NULL, port, "array rank must be non-negative",
1174                        SCM_EOL);
1175
1176   /* Read tag. */
1177   tag_len = 0;
1178  continue_reading_tag:
1179   while (c != EOF && c != '(' && c != '@' && c != ':'
1180          && tag_len < sizeof tag_buf / sizeof tag_buf[0])
1181     {
1182       tag_buf[tag_len++] = c;
1183       c = scm_getc (port);
1184     }
1185   if (tag_len == 0)
1186     tag = SCM_BOOL_T;
1187   else
1188     {
1189       tag = scm_string_to_symbol (scm_from_utf32_stringn (tag_buf, tag_len));
1190       if (tag_len == sizeof tag_buf / sizeof tag_buf[0])
1191         scm_i_input_error (NULL, port, "invalid array tag, starting with: ~a",
1192                            scm_list_1 (tag));
1193     }
1194
1195   /* Read shape. */
1196   if (c == '@' || c == ':')
1197     {
1198       shape = SCM_EOL;
1199
1200       do
1201         {
1202           ssize_t lbnd = 0, len = 0;
1203           SCM s;
1204
1205           if (c == '@')
1206             {
1207               c = scm_getc (port);
1208               c = read_decimal_integer (port, c, &lbnd);
1209             }
1210
1211           s = scm_from_ssize_t (lbnd);
1212
1213           if (c == ':')
1214             {
1215               c = scm_getc (port);
1216               c = read_decimal_integer (port, c, &len);
1217               if (len < 0)
1218                 scm_i_input_error (NULL, port,
1219                                    "array length must be non-negative",
1220                                    SCM_EOL);
1221
1222               s = scm_list_2 (s, scm_from_ssize_t (lbnd+len-1));
1223             }
1224
1225           shape = scm_cons (s, shape);
1226         } while (c == '@' || c == ':');
1227
1228       shape = scm_reverse_x (shape, SCM_EOL);
1229     }
1230
1231   /* Read nested lists of elements. */
1232   if (c != '(')
1233     scm_i_input_error (NULL, port,
1234                        "missing '(' in vector or array literal",
1235                        SCM_EOL);
1236   elements = scm_read_sexp (c, port, opts);
1237
1238   if (scm_is_false (shape))
1239     shape = scm_from_ssize_t (rank);
1240   else if (scm_ilength (shape) != rank)
1241     scm_i_input_error
1242       (NULL, port,
1243        "the number of shape specifications must match the array rank",
1244        SCM_EOL);
1245
1246   /* Handle special print syntax of rank zero arrays; see
1247      scm_i_print_array for a rationale. */
1248   if (rank == 0)
1249     {
1250       if (!scm_is_pair (elements))
1251         scm_i_input_error (NULL, port,
1252                            "too few elements in array literal, need 1",
1253                            SCM_EOL);
1254       if (!scm_is_null (SCM_CDR (elements)))
1255         scm_i_input_error (NULL, port,
1256                            "too many elements in array literal, want 1",
1257                            SCM_EOL);
1258       elements = SCM_CAR (elements);
1259     }
1260
1261   /* Construct array, annotate with source location, and return. */
1262   array = scm_list_to_typed_array (tag, shape, elements);
1263   return maybe_annotate_source (array, port, opts, line, column);
1264 }
1265
1266 static SCM
1267 scm_read_srfi4_vector (int chr, SCM port, scm_t_read_opts *opts,
1268                        long line, int column)
1269 {
1270   return scm_read_array (chr, port, opts, line, column);
1271 }
1272
1273 static SCM
1274 scm_read_bytevector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1275                      long line, int column)
1276 {
1277   chr = scm_getc (port);
1278   if (chr != 'u')
1279     goto syntax;
1280
1281   chr = scm_getc (port);
1282   if (chr != '8')
1283     goto syntax;
1284
1285   chr = scm_getc (port);
1286   if (chr != '(')
1287     goto syntax;
1288
1289   return maybe_annotate_source
1290     (scm_u8_list_to_bytevector (scm_read_sexp (chr, port, opts)),
1291      port, opts, line, column);
1292
1293  syntax:
1294   scm_i_input_error ("read_bytevector", port,
1295                      "invalid bytevector prefix",
1296                      SCM_MAKE_CHAR (chr));
1297   return SCM_UNSPECIFIED;
1298 }
1299
1300 static SCM
1301 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1302                            long line, int column)
1303 {
1304   /* Read the `#*10101'-style read syntax for bit vectors in Guile.  This is
1305      terribly inefficient but who cares?  */
1306   SCM s_bits = SCM_EOL;
1307
1308   for (chr = scm_getc (port);
1309        (chr != EOF) && ((chr == '0') || (chr == '1'));
1310        chr = scm_getc (port))
1311     {
1312       s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1313     }
1314
1315   if (chr != EOF)
1316     scm_ungetc (chr, port);
1317
1318   return maybe_annotate_source
1319     (scm_bitvector (scm_reverse_x (s_bits, SCM_EOL)),
1320      port, opts, line, column);
1321 }
1322
1323 static SCM
1324 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1325 {
1326   int bang_seen = 0;
1327
1328   for (;;)
1329     {
1330       int c = scm_getc (port);
1331
1332       if (c == EOF)
1333         scm_i_input_error ("skip_block_comment", port,
1334                            "unterminated `#! ... !#' comment", SCM_EOL);
1335
1336       if (c == '!')
1337         bang_seen = 1;
1338       else if (c == '#' && bang_seen)
1339         break;
1340       else
1341         bang_seen = 0;
1342     }
1343
1344   return SCM_UNSPECIFIED;
1345 }
1346
1347 static void set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts,
1348                                          int value);
1349 static void set_port_square_brackets_p (SCM port, scm_t_read_opts *opts,
1350                                         int value);
1351 static void set_port_curly_infix_p (SCM port, scm_t_read_opts *opts,
1352                                     int value);
1353
1354 static SCM
1355 scm_read_shebang (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
1356 {
1357   char name[READER_DIRECTIVE_NAME_MAX_SIZE + 1];
1358   int c;
1359   int i = 0;
1360
1361   while (i <= READER_DIRECTIVE_NAME_MAX_SIZE)
1362     {
1363       c = scm_getc (port);
1364       if (c == EOF)
1365         scm_i_input_error ("skip_block_comment", port,
1366                            "unterminated `#! ... !#' comment", SCM_EOL);
1367       else if (('a' <= c && c <= 'z') || ('0' <= c && c <= '9') || c == '-')
1368         name[i++] = c;
1369       else if (CHAR_IS_DELIMITER (c))
1370         {
1371           scm_ungetc (c, port);
1372           name[i] = '\0';
1373           if (0 == strcmp ("r6rs", name))
1374             ;  /* Silently ignore */
1375           else if (0 == strcmp ("fold-case", name))
1376             set_port_case_insensitive_p (port, opts, 1);
1377           else if (0 == strcmp ("no-fold-case", name))
1378             set_port_case_insensitive_p (port, opts, 0);
1379           else if (0 == strcmp ("curly-infix", name))
1380             set_port_curly_infix_p (port, opts, 1);
1381           else if (0 == strcmp ("curly-infix-and-bracket-lists", name))
1382             {
1383               set_port_curly_infix_p (port, opts, 1);
1384               set_port_square_brackets_p (port, opts, 0);
1385             }
1386           else
1387             break;
1388
1389           return SCM_UNSPECIFIED;
1390         }
1391       else
1392         {
1393           scm_ungetc (c, port);
1394           break;
1395         }
1396     }
1397   while (i > 0)
1398     scm_ungetc (name[--i], port);
1399   return scm_read_scsh_block_comment (chr, port);
1400 }
1401
1402 static SCM
1403 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1404 {
1405   /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1406      nested.  So care must be taken.  */
1407   int nesting_level = 1;
1408
1409   int a = scm_getc (port);
1410
1411   if (a == EOF)
1412     scm_i_input_error ("scm_read_r6rs_block_comment", port,
1413                        "unterminated `#| ... |#' comment", SCM_EOL);
1414
1415   while (nesting_level > 0)
1416     {
1417       int b = scm_getc (port);
1418
1419       if (b == EOF)
1420         scm_i_input_error ("scm_read_r6rs_block_comment", port,
1421                            "unterminated `#| ... |#' comment", SCM_EOL);
1422
1423       if (a == '|' && b == '#')
1424         {
1425           nesting_level--;
1426           b = EOF;
1427         }
1428       else if (a == '#' && b == '|')
1429         {
1430           nesting_level++;
1431           b = EOF;
1432         }
1433
1434       a = b;
1435     }
1436
1437   return SCM_UNSPECIFIED;
1438 }
1439
1440 static SCM
1441 scm_read_commented_expression (scm_t_wchar chr, SCM port,
1442                                scm_t_read_opts *opts)
1443 {
1444   scm_t_wchar c;
1445
1446   c = flush_ws (port, opts, (char *) NULL);
1447   if (EOF == c)
1448     scm_i_input_error ("read_commented_expression", port,
1449                        "no expression after #; comment", SCM_EOL);
1450   scm_ungetc (c, port);
1451   scm_read_expression (port, opts);
1452   return SCM_UNSPECIFIED;
1453 }
1454
1455 static SCM
1456 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1457 {
1458   /* Guile's extended symbol read syntax looks like this:
1459
1460        #{This is all a symbol name}#
1461
1462      So here, CHR is expected to be `{'.  */
1463   int saw_brace = 0;
1464   size_t len = 0;
1465   SCM buf = scm_i_make_string (1024, NULL, 0);
1466
1467   buf = scm_i_string_start_writing (buf);
1468
1469   while ((chr = scm_getc (port)) != EOF)
1470     {
1471       if (saw_brace)
1472         {
1473           if (chr == '#')
1474             {
1475               break;
1476             }
1477           else
1478             {
1479               saw_brace = 0;
1480               scm_i_string_set_x (buf, len++, '}');
1481             }
1482         }
1483
1484       if (chr == '}')
1485         saw_brace = 1;
1486       else if (chr == '\\')
1487         {
1488           /* It used to be that print.c would print extended-read-syntax
1489              symbols with backslashes before "non-standard" chars, but
1490              this routine wouldn't do anything with those escapes.
1491              Bummer.  What we've done is to change print.c to output
1492              R6RS hex escapes for those characters, relying on the fact
1493              that the extended read syntax would never put a `\' before
1494              an `x'.  For now, we just ignore other instances of
1495              backslash in the string.  */
1496           switch ((chr = scm_getc (port)))
1497             {
1498             case EOF:
1499               goto done;
1500             case 'x':
1501               {
1502                 scm_t_wchar c;
1503
1504                 SCM_READ_HEX_ESCAPE (10, ';');
1505                 scm_i_string_set_x (buf, len++, c);
1506                 break;
1507
1508               str_eof:
1509                 chr = EOF;
1510                 goto done;
1511
1512               bad_escaped:
1513                 scm_i_string_stop_writing ();
1514                 scm_i_input_error ("scm_read_extended_symbol", port,
1515                                    "illegal character in escape sequence: ~S",
1516                                    scm_list_1 (SCM_MAKE_CHAR (c)));
1517                 break;
1518               }
1519             default:
1520               scm_i_string_set_x (buf, len++, chr);
1521               break;
1522             }
1523         }
1524       else
1525         scm_i_string_set_x (buf, len++, chr);
1526
1527       if (len >= scm_i_string_length (buf) - 2)
1528         {
1529           SCM addy;
1530
1531           scm_i_string_stop_writing ();
1532           addy = scm_i_make_string (1024, NULL, 0);
1533           buf = scm_string_append (scm_list_2 (buf, addy));
1534           len = 0;
1535           buf = scm_i_string_start_writing (buf);
1536         }
1537     }
1538
1539  done:
1540   scm_i_string_stop_writing ();
1541   if (chr == EOF)
1542     scm_i_input_error ("scm_read_extended_symbol", port,
1543                        "end of file while reading symbol", SCM_EOL);
1544
1545   return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1546 }
1547
1548
1549 \f
1550 /* Top-level token readers, i.e., dispatchers.  */
1551
1552 static SCM
1553 scm_read_sharp_extension (int chr, SCM port, scm_t_read_opts *opts)
1554 {
1555   SCM proc;
1556
1557   proc = scm_get_hash_procedure (chr);
1558   if (scm_is_true (scm_procedure_p (proc)))
1559     {
1560       long line = SCM_LINUM (port);
1561       int column = SCM_COL (port) - 2;
1562       SCM got;
1563
1564       got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1565
1566       if (opts->record_positions_p && SCM_NIMP (got)
1567           && !scm_i_has_source_properties (got))
1568         scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1569
1570       return got;
1571     }
1572
1573   return SCM_UNSPECIFIED;
1574 }
1575
1576 /* The reader for the sharp `#' character.  It basically dispatches reads
1577    among the above token readers.   */
1578 static SCM
1579 scm_read_sharp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1580                 long line, int column)
1581 #define FUNC_NAME "scm_lreadr"
1582 {
1583   SCM result;
1584
1585   chr = scm_getc (port);
1586
1587   result = scm_read_sharp_extension (chr, port, opts);
1588   if (!scm_is_eq (result, SCM_UNSPECIFIED))
1589     return result;
1590
1591   switch (chr)
1592     {
1593     case '\\':
1594       return (scm_read_character (chr, port, opts));
1595     case '(':
1596       return (scm_read_vector (chr, port, opts, line, column));
1597     case 's':
1598     case 'u':
1599     case 'f':
1600     case 'c':
1601       /* This one may return either a boolean or an SRFI-4 vector.  */
1602       return (scm_read_srfi4_vector (chr, port, opts, line, column));
1603     case 'v':
1604       return (scm_read_bytevector (chr, port, opts, line, column));
1605     case '*':
1606       return (scm_read_guile_bit_vector (chr, port, opts, line, column));
1607     case 't':
1608     case 'T':
1609     case 'F':
1610       return (scm_read_boolean (chr, port));
1611     case ':':
1612       return (scm_read_keyword (chr, port, opts));
1613     case '0': case '1': case '2': case '3': case '4':
1614     case '5': case '6': case '7': case '8': case '9':
1615     case '@':
1616 #if SCM_ENABLE_DEPRECATED
1617       /* See below for 'i' and 'e'. */
1618     case 'a':
1619     case 'y':
1620     case 'h':
1621     case 'l':
1622 #endif
1623       return (scm_read_array (chr, port, opts, line, column));
1624
1625     case 'i':
1626     case 'e':
1627 #if SCM_ENABLE_DEPRECATED
1628       {
1629         /* When next char is '(', it really is an old-style
1630            uniform array. */
1631         scm_t_wchar next_c = scm_getc (port);
1632         if (next_c != EOF)
1633           scm_ungetc (next_c, port);
1634         if (next_c == '(')
1635           return scm_read_array (chr, port, opts, line, column);
1636         /* Fall through. */
1637       }
1638 #endif
1639     case 'b':
1640     case 'B':
1641     case 'o':
1642     case 'O':
1643     case 'd':
1644     case 'D':
1645     case 'x':
1646     case 'X':
1647     case 'I':
1648     case 'E':
1649       return (scm_read_number_and_radix (chr, port, opts));
1650     case '{':
1651       return (scm_read_extended_symbol (chr, port));
1652     case '!':
1653       return (scm_read_shebang (chr, port, opts));
1654     case ';':
1655       return (scm_read_commented_expression (chr, port, opts));
1656     case '`':
1657     case '\'':
1658     case ',':
1659       return (scm_read_syntax (chr, port, opts));
1660     case 'n':
1661       return (scm_read_nil (chr, port, opts));
1662     default:
1663       result = scm_read_sharp_extension (chr, port, opts);
1664       if (scm_is_eq (result, SCM_UNSPECIFIED))
1665         {
1666           /* To remain compatible with 1.8 and earlier, the following
1667              characters have lower precedence than `read-hash-extend'
1668              characters.  */
1669           switch (chr)
1670             {
1671             case '|':
1672               return scm_read_r6rs_block_comment (chr, port);
1673             default:
1674               scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1675                                  scm_list_1 (SCM_MAKE_CHAR (chr)));
1676             }
1677         }
1678       else
1679         return result;
1680     }
1681
1682   return SCM_UNSPECIFIED;
1683 }
1684 #undef FUNC_NAME
1685
1686 static SCM
1687 read_inner_expression (SCM port, scm_t_read_opts *opts)
1688 #define FUNC_NAME "read_inner_expression"
1689 {
1690   while (1)
1691     {
1692       scm_t_wchar chr;
1693
1694       chr = scm_getc (port);
1695
1696       switch (chr)
1697         {
1698         case SCM_WHITE_SPACES:
1699         case SCM_LINE_INCREMENTORS:
1700           break;
1701         case ';':
1702           (void) scm_read_semicolon_comment (chr, port);
1703           break;
1704         case '{':
1705           if (opts->curly_infix_p)
1706             {
1707               if (opts->neoteric_p)
1708                 return scm_read_sexp (chr, port, opts);
1709               else
1710                 {
1711                   SCM expr;
1712
1713                   /* Enable neoteric expressions within curly braces */
1714                   opts->neoteric_p = 1;
1715                   expr = scm_read_sexp (chr, port, opts);
1716                   opts->neoteric_p = 0;
1717                   return expr;
1718                 }
1719             }
1720           else
1721             return scm_read_mixed_case_symbol (chr, port, opts);
1722         case '[':
1723           if (opts->square_brackets_p)
1724             return scm_read_sexp (chr, port, opts);
1725           else if (opts->curly_infix_p)
1726             {
1727               /* The syntax of neoteric expressions requires that '[' be
1728                  a delimiter when curly-infix is enabled, so it cannot
1729                  be part of an unescaped symbol.  We might as well do
1730                  something useful with it, so we adopt Kawa's convention:
1731                  [...] => ($bracket-list$ ...) */
1732               long line = SCM_LINUM (port);
1733               int column = SCM_COL (port) - 1;
1734               return maybe_annotate_source
1735                 (scm_cons (sym_bracket_list, scm_read_sexp (chr, port, opts)),
1736                  port, opts, line, column);
1737             }
1738           else
1739             return scm_read_mixed_case_symbol (chr, port, opts);
1740         case '(':
1741           return (scm_read_sexp (chr, port, opts));
1742         case '"':
1743           return (scm_read_string (chr, port, opts));
1744         case '\'':
1745         case '`':
1746         case ',':
1747           return (scm_read_quote (chr, port, opts));
1748         case '#':
1749           {
1750             long line  = SCM_LINUM (port);
1751             int column = SCM_COL (port) - 1;
1752             SCM result = scm_read_sharp (chr, port, opts, line, column);
1753             if (scm_is_eq (result, SCM_UNSPECIFIED))
1754               /* We read a comment or some such.  */
1755               break;
1756             else
1757               return result;
1758           }
1759         case ')':
1760           scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1761           break;
1762         case '}':
1763           if (opts->curly_infix_p)
1764             scm_i_input_error (FUNC_NAME, port, "unexpected \"}\"", SCM_EOL);
1765           else
1766             return scm_read_mixed_case_symbol (chr, port, opts);
1767         case ']':
1768           if (opts->square_brackets_p)
1769             scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1770           /* otherwise fall through */
1771         case EOF:
1772           return SCM_EOF_VAL;
1773         case ':':
1774           if (opts->keyword_style == KEYWORD_STYLE_PREFIX)
1775             return scm_symbol_to_keyword (scm_read_expression (port, opts));
1776           /* Fall through.  */
1777
1778         default:
1779           {
1780             if (((chr >= '0') && (chr <= '9'))
1781                 || (strchr ("+-.", chr)))
1782               return (scm_read_number (chr, port, opts));
1783             else
1784               return (scm_read_mixed_case_symbol (chr, port, opts));
1785           }
1786         }
1787     }
1788 }
1789 #undef FUNC_NAME
1790
1791 static SCM
1792 scm_read_expression (SCM port, scm_t_read_opts *opts)
1793 #define FUNC_NAME "scm_read_expression"
1794 {
1795   if (!opts->neoteric_p)
1796     return read_inner_expression (port, opts);
1797   else
1798     {
1799       long line = 0;
1800       int column = 0;
1801       SCM expr;
1802
1803       if (opts->record_positions_p)
1804         {
1805           /* We need to get the position of the first non-whitespace
1806              character in order to correctly annotate neoteric
1807              expressions.  For example, for the expression 'f(x)', the
1808              first call to 'read_inner_expression' reads the 'f' (which
1809              cannot be annotated), and then we later read the '(x)' and
1810              use it to construct the new list (f x). */
1811           int c = flush_ws (port, opts, (char *) NULL);
1812           if (c == EOF)
1813             return SCM_EOF_VAL;
1814           scm_ungetc (c, port);
1815           line = SCM_LINUM (port);
1816           column = SCM_COL (port);
1817         }
1818
1819       expr = read_inner_expression (port, opts);
1820
1821       /* 'expr' is the first component of the neoteric expression.  Now
1822          we loop, and as long as the next character is '(', '[', or '{',
1823          (without any intervening whitespace), we use it to construct a
1824          new expression.  For example, f{n - 1}(x) => ((f (- n 1)) x). */
1825       for (;;)
1826         {
1827           int chr = scm_getc (port);
1828
1829           if (chr == '(')
1830             /* e(...) => (e ...) */
1831             expr = scm_cons (expr, scm_read_sexp (chr, port, opts));
1832           else if (chr == '[')
1833             /* e[...] => ($bracket-apply$ e ...) */
1834             expr = scm_cons (sym_bracket_apply,
1835                              scm_cons (expr,
1836                                        scm_read_sexp (chr, port, opts)));
1837           else if (chr == '{')
1838             {
1839               SCM arg = scm_read_sexp (chr, port, opts);
1840
1841               if (scm_is_null (arg))
1842                 expr = scm_list_1 (expr);       /* e{} => (e) */
1843               else
1844                 expr = scm_list_2 (expr, arg);  /* e{...} => (e {...}) */
1845             }
1846           else
1847             {
1848               if (chr != EOF)
1849                 scm_ungetc (chr, port);
1850               break;
1851             }
1852           maybe_annotate_source (expr, port, opts, line, column);
1853         }
1854       return expr;
1855     }
1856 }
1857 #undef FUNC_NAME
1858
1859 \f
1860 /* Actual reader.  */
1861
1862 static void init_read_options (SCM port, scm_t_read_opts *opts);
1863
1864 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1865             (SCM port),
1866             "Read an s-expression from the input port @var{port}, or from\n"
1867             "the current input port if @var{port} is not specified.\n"
1868             "Any whitespace before the next token is discarded.")
1869 #define FUNC_NAME s_scm_read
1870 {
1871   scm_t_read_opts opts;
1872   int c;
1873
1874   if (SCM_UNBNDP (port))
1875     port = scm_current_input_port ();
1876   SCM_VALIDATE_OPINPORT (1, port);
1877
1878   init_read_options (port, &opts);
1879
1880   c = flush_ws (port, &opts, (char *) NULL);
1881   if (EOF == c)
1882     return SCM_EOF_VAL;
1883   scm_ungetc (c, port);
1884
1885   return (scm_read_expression (port, &opts));
1886 }
1887 #undef FUNC_NAME
1888
1889
1890 \f
1891
1892 /* Manipulate the read-hash-procedures alist.  This could be written in
1893    Scheme, but maybe it will also be used by C code during initialisation.  */
1894 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1895             (SCM chr, SCM proc),
1896             "Install the procedure @var{proc} for reading expressions\n"
1897             "starting with the character sequence @code{#} and @var{chr}.\n"
1898             "@var{proc} will be called with two arguments:  the character\n"
1899             "@var{chr} and the port to read further data from. The object\n"
1900             "returned will be the return value of @code{read}. \n"
1901             "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1902             )
1903 #define FUNC_NAME s_scm_read_hash_extend
1904 {
1905   SCM this;
1906   SCM prev;
1907
1908   SCM_VALIDATE_CHAR (1, chr);
1909   SCM_ASSERT (scm_is_false (proc)
1910               || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1911               proc, SCM_ARG2, FUNC_NAME);
1912
1913   /* Check if chr is already in the alist.  */
1914   this = scm_i_read_hash_procedures_ref ();
1915   prev = SCM_BOOL_F;
1916   while (1)
1917     {
1918       if (scm_is_null (this))
1919         {
1920           /* not found, so add it to the beginning.  */
1921           if (scm_is_true (proc))
1922             {
1923               SCM new = scm_cons (scm_cons (chr, proc),
1924                                   scm_i_read_hash_procedures_ref ());
1925               scm_i_read_hash_procedures_set_x (new);
1926             }
1927           break;
1928         }
1929       if (scm_is_eq (chr, SCM_CAAR (this)))
1930         {
1931           /* already in the alist.  */
1932           if (scm_is_false (proc))
1933             {
1934               /* remove it.  */
1935               if (scm_is_false (prev))
1936                 {
1937                   SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1938                   scm_i_read_hash_procedures_set_x (rest);
1939                 }
1940               else
1941                 scm_set_cdr_x (prev, SCM_CDR (this));
1942             }
1943           else
1944             {
1945               /* replace it.  */
1946               scm_set_cdr_x (SCM_CAR (this), proc);
1947             }
1948           break;
1949         }
1950       prev = this;
1951       this = SCM_CDR (this);
1952     }
1953
1954   return SCM_UNSPECIFIED;
1955 }
1956 #undef FUNC_NAME
1957
1958 /* Recover the read-hash procedure corresponding to char c.  */
1959 static SCM
1960 scm_get_hash_procedure (int c)
1961 {
1962   SCM rest = scm_i_read_hash_procedures_ref ();
1963
1964   while (1)
1965     {
1966       if (scm_is_null (rest))
1967         return SCM_BOOL_F;
1968
1969       if (SCM_CHAR (SCM_CAAR (rest)) == c)
1970         return SCM_CDAR (rest);
1971
1972       rest = SCM_CDR (rest);
1973     }
1974 }
1975
1976 #define SCM_ENCODING_SEARCH_SIZE (500)
1977
1978 /* Search the first few hundred characters of a file for an Emacs-like coding
1979    declaration.  Returns either NULL or a string whose storage has been
1980    allocated with `scm_gc_malloc ()'.  */
1981 char *
1982 scm_i_scan_for_encoding (SCM port)
1983 {
1984   scm_t_port *pt;
1985   char header[SCM_ENCODING_SEARCH_SIZE+1];
1986   size_t bytes_read, encoding_length, i;
1987   char *encoding = NULL;
1988   int utf8_bom = 0;
1989   char *pos, *encoding_start;
1990   int in_comment;
1991
1992   pt = SCM_PTAB_ENTRY (port);
1993
1994   if (pt->rw_active == SCM_PORT_WRITE)
1995     scm_flush (port);
1996
1997   if (pt->rw_random)
1998     pt->rw_active = SCM_PORT_READ;
1999
2000   if (pt->read_pos == pt->read_end)
2001     {
2002       /* We can use the read buffer, and thus avoid a seek. */
2003       if (scm_fill_input (port) == EOF)
2004         return NULL;
2005
2006       bytes_read = pt->read_end - pt->read_pos;
2007       if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
2008         bytes_read = SCM_ENCODING_SEARCH_SIZE;
2009
2010       if (bytes_read <= 1)
2011         /* An unbuffered port -- don't scan.  */
2012         return NULL;
2013
2014       memcpy (header, pt->read_pos, bytes_read);
2015       header[bytes_read] = '\0';
2016     }
2017   else
2018     {
2019       /* Try to read some bytes and then seek back.  Not all ports
2020          support seeking back; and indeed some file ports (like
2021          /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
2022          check performed by SCM_FPORT_FDES---but fail to seek
2023          backwards.  Hence this block comes second.  We prefer to use
2024          the read buffer in-place.  */
2025       if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
2026         return NULL;
2027
2028       bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
2029       header[bytes_read] = '\0';
2030       scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
2031     }
2032
2033   if (bytes_read > 3
2034       && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
2035     utf8_bom = 1;
2036
2037   /* search past "coding[:=]" */
2038   pos = header;
2039   while (1)
2040     {
2041       if ((pos = strstr(pos, "coding")) == NULL)
2042         return NULL;
2043
2044       pos += strlen("coding");
2045       if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
2046           (*pos == ':' || *pos == '='))
2047         {
2048           pos ++;
2049           break;
2050         }
2051     }
2052
2053   /* skip spaces */
2054   while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
2055          (*pos == ' ' || *pos == '\t'))
2056     pos ++;
2057
2058   /* grab the next token */
2059   encoding_start = pos;
2060   i = 0;
2061   while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
2062          && encoding_start + i - header < bytes_read
2063          && (isalnum ((int) encoding_start[i])
2064              || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
2065     i++;
2066
2067   encoding_length = i;
2068   if (encoding_length == 0)
2069     return NULL;
2070
2071   encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
2072   for (i = 0; i < encoding_length; i++)
2073     encoding[i] = toupper ((int) encoding[i]);
2074
2075   /* push backwards to make sure we were in a comment */
2076   in_comment = 0;
2077   pos = encoding_start;
2078   while (pos >= header)
2079     {
2080       if (*pos == ';')
2081         {
2082           in_comment = 1;
2083           break;
2084         }
2085       else if (*pos == '\n' || pos == header)
2086         {
2087           /* This wasn't in a semicolon comment. Check for a
2088            hash-bang comment. */
2089           char *beg = strstr (header, "#!");
2090           char *end = strstr (header, "!#");
2091           if (beg < encoding_start && encoding_start + encoding_length <= end)
2092             in_comment = 1;
2093           break;
2094         }
2095       else
2096         {
2097           pos --;
2098           continue;
2099         }
2100     }
2101   if (!in_comment)
2102     /* This wasn't in a comment */
2103     return NULL;
2104
2105   if (utf8_bom && strcmp(encoding, "UTF-8"))
2106     scm_misc_error (NULL,
2107                     "the port input declares the encoding ~s but is encoded as UTF-8",
2108                     scm_list_1 (scm_from_locale_string (encoding)));
2109
2110   return encoding;
2111 }
2112
2113 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
2114             (SCM port),
2115             "Scans the port for an Emacs-like character coding declaration\n"
2116             "near the top of the contents of a port with random-accessible contents.\n"
2117             "The coding declaration is of the form\n"
2118             "@code{coding: XXXXX} and must appear in a scheme comment.\n"
2119             "\n"
2120             "Returns a string containing the character encoding of the file\n"
2121             "if a declaration was found, or @code{#f} otherwise.\n")
2122 #define FUNC_NAME s_scm_file_encoding
2123 {
2124   char *enc;
2125   SCM s_enc;
2126
2127   SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
2128
2129   enc = scm_i_scan_for_encoding (port);
2130   if (enc == NULL)
2131     return SCM_BOOL_F;
2132   else
2133     {
2134       s_enc = scm_from_locale_string (enc);
2135       return s_enc;
2136     }
2137
2138   return SCM_BOOL_F;
2139 }
2140 #undef FUNC_NAME
2141
2142 \f
2143 /* Per-port read options.
2144
2145    We store per-port read options in the 'port-read-options' key of the
2146    port's alist, which is stored in the internal port structure.  The
2147    value stored in the alist is a single integer that contains a two-bit
2148    field for each read option.
2149
2150    If a bit field contains READ_OPTION_INHERIT (3), that indicates that
2151    the applicable value should be inherited from the corresponding
2152    global read option.  Otherwise, the bit field contains the value of
2153    the read option.  For boolean read options that have been set
2154    per-port, the possible values are 0 or 1.  If the 'keyword_style'
2155    read option has been set per-port, its possible values are those in
2156    'enum t_keyword_style'. */
2157
2158 /* Key to read options in per-port alists. */
2159 SCM_SYMBOL (sym_port_read_options, "port-read-options");
2160
/* Bit offset, within the per-port read-options integer, of the two-bit
   override field belonging to each read option.  */
#define READ_OPTION_COPY_SOURCE_P          0
#define READ_OPTION_RECORD_POSITIONS_P     2
#define READ_OPTION_CASE_INSENSITIVE_P     4
#define READ_OPTION_KEYWORD_STYLE          6
#define READ_OPTION_R6RS_ESCAPES_P         8
#define READ_OPTION_SQUARE_BRACKETS_P     10
#define READ_OPTION_HUNGRY_EOL_ESCAPES_P  12
#define READ_OPTION_CURLY_INFIX_P         14

/* Number of bits occupied by all of the override fields together.  */
#define READ_OPTIONS_NUM_BITS             16

/* The options value whose every field is READ_OPTION_INHERIT; it is
   also the largest value the options integer may take.  */
#define READ_OPTIONS_INHERIT_ALL  ((1UL << READ_OPTIONS_NUM_BITS) - 1)
#define READ_OPTIONS_MAX_VALUE    READ_OPTIONS_INHERIT_ALL

/* Mask selecting a single two-bit field, and the field value meaning
   that the option is inherited rather than set per-port.  */
#define READ_OPTION_MASK     0x3
#define READ_OPTION_INHERIT  0x3
2179
2180 static void
2181 set_port_read_option (SCM port, int option, int new_value)
2182 {
2183   SCM alist, scm_read_options;
2184   unsigned int read_options;
2185
2186   new_value &= READ_OPTION_MASK;
2187   alist = scm_i_port_alist (port);
2188   scm_read_options = scm_assq_ref (alist, sym_port_read_options);
2189   if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2190     read_options = scm_to_uint (scm_read_options);
2191   else
2192     read_options = READ_OPTIONS_INHERIT_ALL;
2193   read_options &= ~(READ_OPTION_MASK << option);
2194   read_options |= new_value << option;
2195   scm_read_options = scm_from_uint (read_options);
2196   alist = scm_assq_set_x (alist, sym_port_read_options, scm_read_options);
2197   scm_i_set_port_alist_x (port, alist);
2198 }
2199
2200 /* Set OPTS and PORT's case-insensitivity according to VALUE. */
2201 static void
2202 set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts, int value)
2203 {
2204   value = !!value;
2205   opts->case_insensitive_p = value;
2206   set_port_read_option (port, READ_OPTION_CASE_INSENSITIVE_P, value);
2207 }
2208
2209 /* Set OPTS and PORT's square_brackets_p option according to VALUE. */
2210 static void
2211 set_port_square_brackets_p (SCM port, scm_t_read_opts *opts, int value)
2212 {
2213   value = !!value;
2214   opts->square_brackets_p = value;
2215   set_port_read_option (port, READ_OPTION_SQUARE_BRACKETS_P, value);
2216 }
2217
2218 /* Set OPTS and PORT's curly_infix_p option according to VALUE. */
2219 static void
2220 set_port_curly_infix_p (SCM port, scm_t_read_opts *opts, int value)
2221 {
2222   value = !!value;
2223   opts->curly_infix_p = value;
2224   set_port_read_option (port, READ_OPTION_CURLY_INFIX_P, value);
2225 }
2226
2227 /* Initialize OPTS based on PORT's read options and the global read
2228    options. */
2229 static void
2230 init_read_options (SCM port, scm_t_read_opts *opts)
2231 {
2232   SCM alist, val, scm_read_options;
2233   unsigned int read_options, x;
2234
2235   alist = scm_i_port_alist (port);
2236   scm_read_options = scm_assq_ref (alist, sym_port_read_options);
2237
2238   if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2239     read_options = scm_to_uint (scm_read_options);
2240   else
2241     read_options = READ_OPTIONS_INHERIT_ALL;
2242
2243   x = READ_OPTION_MASK & (read_options >> READ_OPTION_KEYWORD_STYLE);
2244   if (x == READ_OPTION_INHERIT)
2245     {
2246       val = SCM_PACK (SCM_KEYWORD_STYLE);
2247       if (scm_is_eq (val, scm_keyword_prefix))
2248         x = KEYWORD_STYLE_PREFIX;
2249       else if (scm_is_eq (val, scm_keyword_postfix))
2250         x = KEYWORD_STYLE_POSTFIX;
2251       else
2252         x = KEYWORD_STYLE_HASH_PREFIX;
2253     }
2254   opts->keyword_style = x;
2255
2256 #define RESOLVE_BOOLEAN_OPTION(NAME, name)                              \
2257   do                                                                    \
2258     {                                                                   \
2259       x = READ_OPTION_MASK & (read_options >> READ_OPTION_ ## NAME);    \
2260       if (x == READ_OPTION_INHERIT)                                     \
2261         x = !!SCM_ ## NAME;                                             \
2262           opts->name = x;                                               \
2263     }                                                                   \
2264   while (0)
2265
2266   RESOLVE_BOOLEAN_OPTION (COPY_SOURCE_P,        copy_source_p);
2267   RESOLVE_BOOLEAN_OPTION (RECORD_POSITIONS_P,   record_positions_p);
2268   RESOLVE_BOOLEAN_OPTION (CASE_INSENSITIVE_P,   case_insensitive_p);
2269   RESOLVE_BOOLEAN_OPTION (R6RS_ESCAPES_P,       r6rs_escapes_p);
2270   RESOLVE_BOOLEAN_OPTION (SQUARE_BRACKETS_P,    square_brackets_p);
2271   RESOLVE_BOOLEAN_OPTION (HUNGRY_EOL_ESCAPES_P, hungry_eol_escapes_p);
2272   RESOLVE_BOOLEAN_OPTION (CURLY_INFIX_P,        curly_infix_p);
2273
2274 #undef RESOLVE_BOOLEAN_OPTION
2275
2276   opts->neoteric_p = 0;
2277 }
2278
2279 void
2280 scm_init_read ()
2281 {
2282   SCM read_hash_procs;
2283
2284   read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
2285
2286   scm_i_read_hash_procedures =
2287     SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
2288
2289   scm_init_opts (scm_read_options, scm_read_opts);
2290 #include "libguile/read.x"
2291 }
2292
2293 /*
2294   Local Variables:
2295   c-file-style: "gnu"
2296   End:
2297 */