libguile/read.c

   1 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
   2  *   2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public License
   6  * as published by the Free Software Foundation; either version 3 of
   7  * the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful, but
  10  * WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  17  * 02110-1301 USA
  18  */
  19
  20
  21 \f
  22
  23 #ifdef HAVE_CONFIG_H
  24 # include <config.h>
  25 #endif
  26
  27 #include <stdio.h>
  28 #include <ctype.h>
  29 #include <string.h>
  30 #include <unistd.h>
  31 #include <unicase.h>
  32 #include <unictype.h>
  33
  34 #include "libguile/_scm.h"
  35 #include "libguile/bytevectors.h"
  36 #include "libguile/chars.h"
  37 #include "libguile/eval.h"
  38 #include "libguile/arrays.h"
  39 #include "libguile/bitvectors.h"
  40 #include "libguile/keywords.h"
  41 #include "libguile/alist.h"
  42 #include "libguile/srcprop.h"
  43 #include "libguile/hashtab.h"
  44 #include "libguile/hash.h"
  45 #include "libguile/ports.h"
  46 #include "libguile/fports.h"
  47 #include "libguile/root.h"
  48 #include "libguile/strings.h"
  49 #include "libguile/strports.h"
  50 #include "libguile/vectors.h"
  51 #include "libguile/validate.h"
  52 #include "libguile/srfi-4.h"
  53 #include "libguile/srfi-13.h"
  54
  55 #include "libguile/read.h"
  56 #include "libguile/private-options.h"
  57
  58
  59 \f
  60
  61 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
  62 SCM_SYMBOL (scm_keyword_prefix, "prefix");
  63 SCM_SYMBOL (scm_keyword_postfix, "postfix");
  64 SCM_SYMBOL (sym_nil, "nil");
  65
  66 /* SRFI-105 curly infix expression support */
  67 SCM_SYMBOL (sym_nfx, "$nfx$");
  68 SCM_SYMBOL (sym_bracket_list, "$bracket-list$");
  69 SCM_SYMBOL (sym_bracket_apply, "$bracket-apply$");
  70
  71 scm_t_option scm_read_opts[] =
  72   {
  73     { SCM_OPTION_BOOLEAN, "copy", 0,
  74       "Copy source code expressions." },
  75     { SCM_OPTION_BOOLEAN, "positions", 1,
  76       "Record positions of source code expressions." },
  77     { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
  78       "Convert symbols to lower case."},
  79     { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
  80       "Style of keyword recognition: #f, 'prefix or 'postfix."},
  81     { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
  82       "Use R6RS variable-length character and string hex escapes."},
  83     { SCM_OPTION_BOOLEAN, "square-brackets", 1,
  84       "Treat `[' and `]' as parentheses, for R6RS compatibility."},
  85     { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
  86       "In strings, consume leading whitespace after an escaped end-of-line."},
  87     { SCM_OPTION_BOOLEAN, "curly-infix", 0,
  88       "Support SRFI-105 curly infix expressions."},
  89     { 0, },
  90   };
  91
  92 /* Internal read options structure.  This is initialized by 'scm_read'
  93    from the global and per-port read options, and a pointer is passed
  94    down to all helper functions. */
  95
  96 enum t_keyword_style
  97   {
  98     KEYWORD_STYLE_HASH_PREFIX,
  99     KEYWORD_STYLE_PREFIX,
 100     KEYWORD_STYLE_POSTFIX
 101   };
 102
 103 struct t_read_opts
 104 {
 105   enum t_keyword_style keyword_style;
 106   unsigned int copy_source_p        : 1;
 107   unsigned int record_positions_p   : 1;
 108   unsigned int case_insensitive_p   : 1;
 109   unsigned int r6rs_escapes_p       : 1;
 110   unsigned int square_brackets_p    : 1;
 111   unsigned int hungry_eol_escapes_p : 1;
 112   unsigned int curly_infix_p        : 1;
 113   unsigned int neoteric_p           : 1;
 114 };
 115
 116 typedef struct t_read_opts scm_t_read_opts;
 117
 118
 119 /*
 120   Give meaningful error messages for errors
 121
 122   We use the format
 123
 124   FILE:LINE:COL: MESSAGE
 125   This happened in ....
 126
 127   This is not standard GNU format, but the test-suite likes the real
 128   message to be in front.
 129
 130  */
 131
 132
 133 void
 134 scm_i_input_error (char const *function,
 135                    SCM port, const char *message, SCM arg)
 136 {
 137   SCM fn = (scm_is_string (SCM_FILENAME(port))
 138             ? SCM_FILENAME(port)
 139             : scm_from_locale_string ("#<unknown port>"));
 140
 141   SCM string_port = scm_open_output_string ();
 142   SCM string = SCM_EOL;
 143   scm_simple_format (string_port,
 144                      scm_from_locale_string ("~A:~S:~S: ~A"),
 145                      scm_list_4 (fn,
 146                                  scm_from_long (SCM_LINUM (port) + 1),
 147                                  scm_from_int (SCM_COL (port) + 1),
 148                                  scm_from_locale_string (message)));
 149
 150   string = scm_get_output_string (string_port);
 151   scm_close_output_port (string_port);
 152   scm_error_scm (scm_from_latin1_symbol ("read-error"),
 153                  function? scm_from_locale_string (function) : SCM_BOOL_F,
 154                  string,
 155                  arg,
 156                  SCM_BOOL_F);
 157 }
 158
 159
 160 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
 161             (SCM setting),
 162             "Option interface for the read options. Instead of using\n"
 163             "this procedure directly, use the procedures @code{read-enable},\n"
 164             "@code{read-disable}, @code{read-set!} and @code{read-options}.")
 165 #define FUNC_NAME s_scm_read_options
 166 {
 167   SCM ans = scm_options (setting,
 168                          scm_read_opts,
 169                          FUNC_NAME);
 170   if (SCM_COPY_SOURCE_P)
 171     SCM_RECORD_POSITIONS_P = 1;
 172   return ans;
 173 }
 174 #undef FUNC_NAME
 175
 176 /* A fluid referring to an association list mapping extra hash
 177    characters to procedures.  */
 178 static SCM *scm_i_read_hash_procedures;
 179
 180 static SCM
 181 scm_i_read_hash_procedures_ref (void)
 182 {
 183   return scm_fluid_ref (*scm_i_read_hash_procedures);
 184 }
 185
 186 static void
 187 scm_i_read_hash_procedures_set_x (SCM value)
 188 {
 189   scm_fluid_set_x (*scm_i_read_hash_procedures, value);
 190 }
 191
 192 \f
 193 /* Token readers.  */
 194
 195
 196 /* Size of the C buffer used to read symbols and numbers.  */
 197 #define READER_BUFFER_SIZE            128
 198
 199 /* Number of 32-bit codepoints in the buffer used to read strings.  */
 200 #define READER_STRING_BUFFER_SIZE     128
 201
 202 /* The maximum size of Scheme character names.  */
 203 #define READER_CHAR_NAME_MAX_SIZE      50
 204
 205 /* The maximum size of reader directive names.  */
 206 #define READER_DIRECTIVE_NAME_MAX_SIZE 50
 207
 208
 209 /* `isblank' is only in C99.  */
 210 #define CHAR_IS_BLANK_(_chr)                                    \
 211   (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n')      \
 212    || ((_chr) == '\f') || ((_chr) == '\r'))
 213
 214 #ifdef MSDOS
 215 # define CHAR_IS_BLANK(_chr)                    \
 216   ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
 217 #else
 218 # define CHAR_IS_BLANK CHAR_IS_BLANK_
 219 #endif
 220
 221
 222 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
 223    structure'').  */
 224 #define CHAR_IS_R5RS_DELIMITER(c)                               \
 225   (CHAR_IS_BLANK (c)                                            \
 226    || (c) == ')' || (c) == '(' || (c) == ';' || (c) == '"')
 227
 228 #define CHAR_IS_DELIMITER(c)                                    \
 229   (CHAR_IS_R5RS_DELIMITER (c)                                   \
 230    || (((c) == ']' || (c) == '[') && (opts->square_brackets_p   \
 231                                       || opts->curly_infix_p))  \
 232    || (((c) == '}' || (c) == '{') && opts->curly_infix_p))
 233
 234 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
 235    Structure''.  */
 236 #define CHAR_IS_EXPONENT_MARKER(_chr)                           \
 237   (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f')        \
 238    || ((_chr) == 'd') || ((_chr) == 'l'))
 239
 240 /* Read an SCSH block comment.  */
 241 static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
 242 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
 243 static SCM scm_read_commented_expression (scm_t_wchar, SCM, scm_t_read_opts *);
 244 static SCM scm_read_shebang (scm_t_wchar, SCM, scm_t_read_opts *);
 245 static SCM scm_get_hash_procedure (int);
 246
 247 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 248    result in the pre-allocated buffer BUF.  Return zero if the whole token has
 249    fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
 250    bytes actually read.  */
 251 static int
 252 read_token (SCM port, scm_t_read_opts *opts,
 253             char *buf, size_t buf_size, size_t *read)
 254 {
 255    *read = 0;
 256
 257    while (*read < buf_size)
 258      {
 259        int chr;
 260
 261        chr = scm_get_byte_or_eof_unlocked (port);
 262
 263        if (chr == EOF)
 264         return 0;
 265       else if (CHAR_IS_DELIMITER (chr))
 266         {
 267           scm_unget_byte_unlocked (chr, port);
 268           return 0;
 269         }
 270       else
 271         {
 272           *buf = (char) chr;
 273           buf++, (*read)++;
 274         }
 275      }
 276
 277    return 1;
 278  }
 279
 280 /* Like `read_token', but return either BUFFER, or a GC-allocated buffer
 281    if the token doesn't fit in BUFFER_SIZE bytes.  */
 282 static char *
 283 read_complete_token (SCM port, scm_t_read_opts *opts,
 284                      char *buffer, size_t buffer_size, size_t *read)
 285 {
 286   int overflow = 0;
 287   size_t bytes_read, overflow_size = 0;
 288   char *overflow_buffer = NULL;
 289
 290   do
 291     {
 292       overflow = read_token (port, opts, buffer, buffer_size, &bytes_read);
 293       if (bytes_read == 0)
 294         break;
 295       if (overflow || overflow_size != 0)
 296         {
 297           if (overflow_size == 0)
 298             {
 299               overflow_buffer = scm_gc_malloc_pointerless (bytes_read, "read");
 300               memcpy (overflow_buffer, buffer, bytes_read);
 301               overflow_size = bytes_read;
 302             }
 303           else
 304             {
 305               char *new_buf =
 306                 scm_gc_malloc_pointerless (overflow_size + bytes_read, "read");
 307
 308               memcpy (new_buf, overflow_buffer, overflow_size);
 309               memcpy (new_buf + overflow_size, buffer, bytes_read);
 310
 311               overflow_buffer = new_buf;
 312               overflow_size += bytes_read;
 313             }
 314         }
 315     }
 316   while (overflow);
 317
 318   if (overflow_size)
 319     *read = overflow_size;
 320   else
 321     *read = bytes_read;
 322
 323   return (overflow_size > 0 ? overflow_buffer : buffer);
 324 }
 325
 326 /* Skip whitespace from PORT and return the first non-whitespace character
 327    read.  Raise an error on end-of-file.  */
 328 static int
 329 flush_ws (SCM port, scm_t_read_opts *opts, const char *eoferr)
 330 {
 331   scm_t_wchar c;
 332   while (1)
 333     switch (c = scm_getc_unlocked (port))
 334       {
 335       case EOF:
 336       goteof:
 337         if (eoferr)
 338           {
 339             scm_i_input_error (eoferr,
 340                                port,
 341                                "end of file",
 342                                SCM_EOL);
 343           }
 344         return c;
 345
 346       case ';':
 347       lp:
 348         switch (c = scm_getc_unlocked (port))
 349           {
 350           case EOF:
 351             goto goteof;
 352           default:
 353             goto lp;
 354           case SCM_LINE_INCREMENTORS:
 355             break;
 356           }
 357         break;
 358
 359       case '#':
 360         switch (c = scm_getc_unlocked (port))
 361           {
 362           case EOF:
 363             eoferr = "read_sharp";
 364             goto goteof;
 365           case '!':
 366             scm_read_shebang (c, port, opts);
 367             break;
 368           case ';':
 369             scm_read_commented_expression (c, port, opts);
 370             break;
 371           case '|':
 372             if (scm_is_false (scm_get_hash_procedure (c)))
 373               {
 374                 scm_read_r6rs_block_comment (c, port);
 375                 break;
 376               }
 377             /* fall through */
 378           default:
 379             scm_ungetc_unlocked (c, port);
 380             return '#';
 381           }
 382         break;
 383
 384       case SCM_LINE_INCREMENTORS:
 385       case SCM_SINGLE_SPACES:
 386       case '\t':
 387         break;
 388
 389       default:
 390         return c;
 391       }
 392
 393   return 0;
 394 }
 395
 396
 397 \f
 398 /* Token readers.  */
 399
 400 static SCM scm_read_expression (SCM port, scm_t_read_opts *opts);
 401 static SCM scm_read_sharp (int chr, SCM port, scm_t_read_opts *opts,
 402                            long line, int column);
 403
 404
 405 static SCM
 406 maybe_annotate_source (SCM x, SCM port, scm_t_read_opts *opts,
 407                        long line, int column)
 408 {
 409   if (opts->record_positions_p)
 410     scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
 411   return x;
 412 }
 413
 414 static SCM
 415 scm_read_sexp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 416 #define FUNC_NAME "scm_i_lreadparen"
 417 {
 418   int c;
 419   SCM tmp, tl, ans = SCM_EOL;
 420   const int curly_list_p = (chr == '{') && opts->curly_infix_p;
 421   const int terminating_char = ((chr == '{') ? '}'
 422                                 : ((chr == '[') ? ']'
 423                                    : ')'));
 424
 425   /* Need to capture line and column numbers here. */
 426   long line = SCM_LINUM (port);
 427   int column = SCM_COL (port) - 1;
 428
 429   c = flush_ws (port, opts, FUNC_NAME);
 430   if (terminating_char == c)
 431     return SCM_EOL;
 432
 433   scm_ungetc_unlocked (c, port);
 434   tmp = scm_read_expression (port, opts);
 435
 436   /* Note that it is possible for scm_read_expression to return
 437      scm_sym_dot, but not as part of a dotted pair: as in #{.}#.  So
 438      check that it's a real dot by checking `c'.  */
 439   if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
 440     {
 441       ans = scm_read_expression (port, opts);
 442       if (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
 443         scm_i_input_error (FUNC_NAME, port, "missing close paren",
 444                            SCM_EOL);
 445       return ans;
 446     }
 447
 448   /* Build the head of the list structure. */
 449   ans = tl = scm_cons (tmp, SCM_EOL);
 450
 451   while (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
 452     {
 453       SCM new_tail;
 454
 455       if (c == ')' || (c == ']' && opts->square_brackets_p)
 456           || ((c == '}' || c == ']') && opts->curly_infix_p))
 457         scm_i_input_error (FUNC_NAME, port,
 458                            "in pair: mismatched close paren: ~A",
 459                            scm_list_1 (SCM_MAKE_CHAR (c)));
 460
 461       scm_ungetc_unlocked (c, port);
 462       tmp = scm_read_expression (port, opts);
 463
 464       /* See above note about scm_sym_dot.  */
 465       if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
 466         {
 467           SCM_SETCDR (tl, scm_read_expression (port, opts));
 468
 469           c = flush_ws (port, opts, FUNC_NAME);
 470           if (terminating_char != c)
 471             scm_i_input_error (FUNC_NAME, port,
 472                                "in pair: missing close paren", SCM_EOL);
 473           break;
 474         }
 475
 476       new_tail = scm_cons (tmp, SCM_EOL);
 477       SCM_SETCDR (tl, new_tail);
 478       tl = new_tail;
 479     }
 480
 481   if (curly_list_p)
 482     {
 483       /* In addition to finding the length, 'scm_ilength' checks for
 484          improper or circular lists, in which case it returns -1. */
 485       int len = scm_ilength (ans);
 486
 487       /* The (len == 0) case is handled above */
 488       if (len == 1)
 489         /* Return directly to avoid re-annotating the element's source
 490            location with the position of the outer brace.  Also, it
 491            might not be possible to annotate the element. */
 492         return scm_car (ans);  /* {e} => e */
 493       else if (len == 2)
 494         ;  /* Leave the list unchanged: {e1 e2} => (e1 e2) */
 495       else if (len >= 3 && (len & 1))
 496         {
 497           /* It's a proper list whose length is odd and at least 3.  If
 498              the elements at odd indices (the infix operator positions)
 499              are all 'equal?', then it's a simple curly-infix list.
 500              Otherwise it's a mixed curly-infix list. */
 501           SCM op = scm_cadr (ans);
 502
 503           /* Check to see if the elements at odd indices are 'equal?' */
 504           for (tl = scm_cdddr (ans); ; tl = scm_cddr (tl))
 505             {
 506               if (scm_is_null (tl))
 507                 {
 508                   /* Convert simple curly-infix list to prefix:
 509                      {a <op> b <op> ...} => (<op> a b ...) */
 510                   tl = ans;
 511                   while (scm_is_pair (scm_cdr (tl)))
 512                     {
 513                       tmp = scm_cddr (tl);
 514                       SCM_SETCDR (tl, tmp);
 515                       tl = tmp;
 516                     }
 517                   ans = scm_cons (op, ans);
 518                   break;
 519                 }
 520               else if (scm_is_false (scm_equal_p (op, scm_car (tl))))
 521                 {
 522                   /* Mixed curly-infix list: {e ...} => ($nfx$ e ...) */
 523                   ans = scm_cons (sym_nfx, ans);
 524                   break;
 525                 }
 526             }
 527         }
 528       else
 529         /* Mixed curly-infix (possibly improper) list:
 530            {e . tail} => ($nfx$ e . tail) */
 531         ans = scm_cons (sym_nfx, ans);
 532     }
 533
 534   return maybe_annotate_source (ans, port, opts, line, column);
 535 }
 536 #undef FUNC_NAME
 537
 538
 539 /* Read a hexadecimal number NDIGITS in length.  Put its value into the variable
 540    C.  If TERMINATOR is non-null, terminate early if the TERMINATOR character is
 541    found.  */
 542 #define SCM_READ_HEX_ESCAPE(ndigits, terminator)                   \
 543   do                                                               \
 544     {                                                              \
 545       scm_t_wchar a;                                               \
 546       size_t i = 0;                                                \
 547       c = 0;                                                       \
 548       while (i < ndigits)                                          \
 549         {                                                          \
 550           a = scm_getc_unlocked (port);                                     \
 551           if (a == EOF)                                            \
 552             goto str_eof;                                          \
 553           if (terminator                                           \
 554               && (a == (scm_t_wchar) terminator)                   \
 555               && (i > 0))                                          \
 556             break;                                                 \
 557           if ('0' <= a && a <= '9')                                \
 558             a -= '0';                                              \
 559           else if ('A' <= a && a <= 'F')                           \
 560             a = a - 'A' + 10;                                      \
 561           else if ('a' <= a && a <= 'f')                           \
 562             a = a - 'a' + 10;                                      \
 563           else                                                     \
 564             {                                                      \
 565               c = a;                                               \
 566               goto bad_escaped;                                    \
 567             }                                                      \
 568           c = c * 16 + a;                                          \
 569           i ++;                                                    \
 570         }                                                          \
 571     } while (0)
 572
 573 static void
 574 skip_intraline_whitespace (SCM port)
 575 {
 576   scm_t_wchar c;
 577
 578   do
 579     {
 580       c = scm_getc_unlocked (port);
 581       if (c == EOF)
 582         return;
 583     }
 584   while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
 585
 586   scm_ungetc_unlocked (c, port);
 587 }
 588
 589 static SCM
 590 scm_read_string (int chr, SCM port, scm_t_read_opts *opts)
 591 #define FUNC_NAME "scm_lreadr"
 592 {
 593   /* For strings smaller than C_STR, this function creates only one Scheme
 594      object (the string returned).  */
 595
 596   SCM str = SCM_EOL;
 597   size_t c_str_len = 0;
 598   scm_t_wchar c, c_str[READER_STRING_BUFFER_SIZE];
 599
 600   /* Need to capture line and column numbers here. */
 601   long line = SCM_LINUM (port);
 602   int column = SCM_COL (port) - 1;
 603
 604   while ('"' != (c = scm_getc_unlocked (port)))
 605     {
 606       if (c == EOF)
 607         {
 608         str_eof:
 609           scm_i_input_error (FUNC_NAME, port,
 610                              "end of file in string constant", SCM_EOL);
 611         }
 612
 613       if (c_str_len + 1 >= READER_STRING_BUFFER_SIZE)
 614         {
 615           str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
 616           c_str_len = 0;
 617         }
 618
 619       if (c == '\\')
 620         {
 621           switch (c = scm_getc_unlocked (port))
 622             {
 623             case EOF:
 624               goto str_eof;
 625             case '"':
 626             case '\\':
 627               break;
 628             case '\n':
 629               if (opts->hungry_eol_escapes_p)
 630                 skip_intraline_whitespace (port);
 631               continue;
 632             case '0':
 633               c = '\0';
 634               break;
 635             case 'f':
 636               c = '\f';
 637               break;
 638             case 'n':
 639               c = '\n';
 640               break;
 641             case 'r':
 642               c = '\r';
 643               break;
 644             case 't':
 645               c = '\t';
 646               break;
 647             case 'a':
 648               c = '\007';
 649               break;
 650             case 'v':
 651               c = '\v';
 652               break;
 653             case 'b':
 654               c = '\010';
 655               break;
 656             case 'x':
 657               if (opts->r6rs_escapes_p)
 658                 SCM_READ_HEX_ESCAPE (10, ';');
 659               else
 660                 SCM_READ_HEX_ESCAPE (2, '\0');
 661               break;
 662             case 'u':
 663               if (!opts->r6rs_escapes_p)
 664                 {
 665                   SCM_READ_HEX_ESCAPE (4, '\0');
 666                   break;
 667                 }
 668             case 'U':
 669               if (!opts->r6rs_escapes_p)
 670                 {
 671                   SCM_READ_HEX_ESCAPE (6, '\0');
 672                   break;
 673                 }
 674             default:
 675             bad_escaped:
 676               scm_i_input_error (FUNC_NAME, port,
 677                                  "illegal character in escape sequence: ~S",
 678                                  scm_list_1 (SCM_MAKE_CHAR (c)));
 679             }
 680         }
 681
 682       c_str[c_str_len++] = c;
 683     }
 684
 685   if (scm_is_null (str))
 686     /* Fast path: we got a string that fits in C_STR.  */
 687     str = scm_from_utf32_stringn (c_str, c_str_len);
 688   else
 689     {
 690       if (c_str_len > 0)
 691         str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
 692
 693       str = scm_string_concatenate_reverse (str, SCM_UNDEFINED, SCM_UNDEFINED);
 694     }
 695
 696   return maybe_annotate_source (str, port, opts, line, column);
 697 }
 698 #undef FUNC_NAME
 699
 700
 701 static SCM
 702 scm_read_number (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 703 {
 704   SCM result, str = SCM_EOL;
 705   char local_buffer[READER_BUFFER_SIZE], *buffer;
 706   size_t bytes_read;
 707
 708   /* Need to capture line and column numbers here. */
 709   long line = SCM_LINUM (port);
 710   int column = SCM_COL (port) - 1;
 711
 712   scm_ungetc_unlocked (chr, port);
 713   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 714                                 &bytes_read);
 715
 716   str = scm_from_port_stringn (buffer, bytes_read, port);
 717
 718   result = scm_string_to_number (str, SCM_UNDEFINED);
 719   if (scm_is_false (result))
 720     {
 721       /* Return a symbol instead of a number */
 722       if (opts->case_insensitive_p)
 723         str = scm_string_downcase_x (str);
 724       result = scm_string_to_symbol (str);
 725     }
 726   else if (SCM_NIMP (result))
 727     result = maybe_annotate_source (result, port, opts, line, column);
 728
 729   SCM_COL (port) += scm_i_string_length (str);
 730   return result;
 731 }
 732
 733 static SCM
 734 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 735 {
 736   SCM result;
 737   int ends_with_colon = 0;
 738   size_t bytes_read;
 739   int postfix = (opts->keyword_style == KEYWORD_STYLE_POSTFIX);
 740   char local_buffer[READER_BUFFER_SIZE], *buffer;
 741   SCM str;
 742
 743   scm_ungetc_unlocked (chr, port);
 744   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 745                                 &bytes_read);
 746   if (bytes_read > 0)
 747     ends_with_colon = buffer[bytes_read - 1] == ':';
 748
 749   if (postfix && ends_with_colon && (bytes_read > 1))
 750     {
 751       str = scm_from_port_stringn (buffer, bytes_read - 1, port);
 752
 753       if (opts->case_insensitive_p)
 754         str = scm_string_downcase_x (str);
 755       result = scm_symbol_to_keyword (scm_string_to_symbol (str));
 756     }
 757   else
 758     {
 759       str = scm_from_port_stringn (buffer, bytes_read, port);
 760
 761       if (opts->case_insensitive_p)
 762         str = scm_string_downcase_x (str);
 763       result = scm_string_to_symbol (str);
 764     }
 765
 766   SCM_COL (port) += scm_i_string_length (str);
 767   return result;
 768 }
 769
 770 static SCM
 771 scm_read_number_and_radix (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 772 #define FUNC_NAME "scm_lreadr"
 773 {
 774   SCM result;
 775   size_t read;
 776   char local_buffer[READER_BUFFER_SIZE], *buffer;
 777   unsigned int radix;
 778   SCM str;
 779
 780   switch (chr)
 781     {
 782     case 'B':
 783     case 'b':
 784       radix = 2;
 785       break;
 786
 787     case 'o':
 788     case 'O':
 789       radix = 8;
 790       break;
 791
 792     case 'd':
 793     case 'D':
 794       radix = 10;
 795       break;
 796
 797     case 'x':
 798     case 'X':
 799       radix = 16;
 800       break;
 801
 802     default:
 803       scm_ungetc_unlocked (chr, port);
 804       scm_ungetc_unlocked ('#', port);
 805       radix = 10;
 806     }
 807
 808   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 809                                 &read);
 810
 811   str = scm_from_port_stringn (buffer, read, port);
 812
 813   result = scm_string_to_number (str, scm_from_uint (radix));
 814
 815   SCM_COL (port) += scm_i_string_length (str);
 816
 817   if (scm_is_true (result))
 818     return result;
 819
 820   scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
 821
 822   return SCM_BOOL_F;
 823 }
 824 #undef FUNC_NAME
 825
 826 static SCM
 827 scm_read_quote (int chr, SCM port, scm_t_read_opts *opts)
 828 {
 829   SCM p;
 830   long line = SCM_LINUM (port);
 831   int column = SCM_COL (port) - 1;
 832
 833   switch (chr)
 834     {
 835     case '`':
 836       p = scm_sym_quasiquote;
 837       break;
 838
 839     case '\'':
 840       p = scm_sym_quote;
 841       break;
 842
 843     case ',':
 844       {
 845         scm_t_wchar c;
 846
 847         c = scm_getc_unlocked (port);
 848         if ('@' == c)
 849           p = scm_sym_uq_splicing;
 850         else
 851           {
 852             scm_ungetc_unlocked (c, port);
 853             p = scm_sym_unquote;
 854           }
 855         break;
 856       }
 857
 858     default:
 859       fprintf (stderr, "%s: unhandled quote character (%i)\n",
 860                "scm_read_quote", chr);
 861       abort ();
 862     }
 863
 864   p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
 865   return maybe_annotate_source (p, port, opts, line, column);
 866 }
 867
 868 SCM_SYMBOL (sym_syntax, "syntax");
 869 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
 870 SCM_SYMBOL (sym_unsyntax, "unsyntax");
 871 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
 872
 873 static SCM
 874 scm_read_syntax (int chr, SCM port, scm_t_read_opts *opts)
 875 {
 876   SCM p;
 877   long line = SCM_LINUM (port);
 878   int column = SCM_COL (port) - 1;
 879
 880   switch (chr)
 881     {
 882     case '`':
 883       p = sym_quasisyntax;
 884       break;
 885
 886     case '\'':
 887       p = sym_syntax;
 888       break;
 889
 890     case ',':
 891       {
 892         int c;
 893
 894         c = scm_getc_unlocked (port);
 895         if ('@' == c)
 896           p = sym_unsyntax_splicing;
 897         else
 898           {
 899             scm_ungetc_unlocked (c, port);
 900             p = sym_unsyntax;
 901           }
 902         break;
 903       }
 904
 905     default:
 906       fprintf (stderr, "%s: unhandled syntax character (%i)\n",
 907                "scm_read_syntax", chr);
 908       abort ();
 909     }
 910
 911   p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
 912   return maybe_annotate_source (p, port, opts, line, column);
 913 }
 914
 915 static SCM
 916 scm_read_nil (int chr, SCM port, scm_t_read_opts *opts)
 917 {
 918   SCM id = scm_read_mixed_case_symbol (chr, port, opts);
 919
 920   if (!scm_is_eq (id, sym_nil))
 921     scm_i_input_error ("scm_read_nil", port,
 922                        "unexpected input while reading #nil: ~a",
 923                        scm_list_1 (id));
 924
 925   return SCM_ELISP_NIL;
 926 }
 927
 928 static SCM
 929 scm_read_semicolon_comment (int chr, SCM port)
 930 {
 931   int c;
 932
 933   /* We use the get_byte here because there is no need to get the
 934      locale correct with comment input. This presumes that newline
 935      always represents itself no matter what the encoding is.  */
 936   for (c = scm_get_byte_or_eof_unlocked (port);
 937        (c != EOF) && (c != '\n');
 938        c = scm_get_byte_or_eof_unlocked (port));
 939
 940   return SCM_UNSPECIFIED;
 941 }
 942
 943 \f
 944 /* Sharp readers, i.e. readers called after a `#' sign has been read.  */
 945
 946 static SCM
 947 scm_read_boolean (int chr, SCM port)
 948 {
 949   switch (chr)
 950     {
 951     case 't':
 952     case 'T':
 953       return SCM_BOOL_T;
 954
 955     case 'f':
 956     case 'F':
 957       return SCM_BOOL_F;
 958     }
 959
 960   return SCM_UNSPECIFIED;
 961 }
 962
 963 static SCM
 964 scm_read_character (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 965 #define FUNC_NAME "scm_lreadr"
 966 {
 967   char buffer[READER_CHAR_NAME_MAX_SIZE];
 968   SCM charname;
 969   size_t charname_len, bytes_read;
 970   scm_t_wchar cp;
 971   int overflow;
 972   scm_t_port *pt;
 973
 974   overflow = read_token (port, opts, buffer, READER_CHAR_NAME_MAX_SIZE,
 975                          &bytes_read);
 976   if (overflow)
 977     scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
 978
 979   if (bytes_read == 0)
 980     {
 981       chr = scm_getc_unlocked (port);
 982       if (chr == EOF)
 983         scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
 984                            "while reading character", SCM_EOL);
 985
 986       /* CHR must be a token delimiter, like a whitespace.  */
 987       return (SCM_MAKE_CHAR (chr));
 988     }
 989
 990   pt = SCM_PTAB_ENTRY (port);
 991
 992   /* Simple ASCII characters can be processed immediately.  Also, simple
 993      ISO-8859-1 characters can be processed immediately if the encoding for this
 994      port is ISO-8859-1.  */
 995   if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
 996     {
 997       SCM_COL (port) += 1;
 998       return SCM_MAKE_CHAR (buffer[0]);
 999     }
1000
1001   /* Otherwise, convert the buffer into a proper scheme string for
1002      processing.  */
1003   charname = scm_from_port_stringn (buffer, bytes_read, port);
1004   charname_len = scm_i_string_length (charname);
1005   SCM_COL (port) += charname_len;
1006   cp = scm_i_string_ref (charname, 0);
1007   if (charname_len == 1)
1008     return SCM_MAKE_CHAR (cp);
1009
1010   /* Ignore dotted circles, which may be used to keep combining characters from
1011      combining with the backslash in #\charname.  */
1012   if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
1013     return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
1014
1015   if (cp >= '0' && cp < '8')
1016     {
1017       /* Dirk:FIXME::  This type of character syntax is not R5RS
1018        * compliant.  Further, it should be verified that the constant
1019        * does only consist of octal digits.  */
1020       SCM p = scm_string_to_number (charname, scm_from_uint (8));
1021       if (SCM_I_INUMP (p))
1022         {
1023           scm_t_wchar c = scm_to_uint32 (p);
1024           if (SCM_IS_UNICODE_CHAR (c))
1025             return SCM_MAKE_CHAR (c);
1026           else
1027             scm_i_input_error (FUNC_NAME, port,
1028                                "out-of-range octal character escape: ~a",
1029                                scm_list_1 (charname));
1030         }
1031     }
1032
1033   if (cp == 'x' && (charname_len > 1))
1034     {
1035       SCM p;
1036
1037       /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1038       p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
1039                                 scm_from_uint (16));
1040       if (SCM_I_INUMP (p))
1041         {
1042           scm_t_wchar c = scm_to_uint32 (p);
1043           if (SCM_IS_UNICODE_CHAR (c))
1044             return SCM_MAKE_CHAR (c);
1045           else
1046             scm_i_input_error (FUNC_NAME, port,
1047                                "out-of-range hex character escape: ~a",
1048                                scm_list_1 (charname));
1049         }
1050     }
1051
1052   /* The names of characters should never have non-Latin1
1053      characters.  */
1054   if (scm_i_is_narrow_string (charname)
1055       || scm_i_try_narrow_string (charname))
1056     { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
1057                                        charname_len);
1058       if (scm_is_true (ch))
1059         return ch;
1060     }
1061
1062   scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
1063                      scm_list_1 (charname));
1064
1065   return SCM_UNSPECIFIED;
1066 }
1067 #undef FUNC_NAME
1068
1069 static SCM
1070 scm_read_keyword (int chr, SCM port, scm_t_read_opts *opts)
1071 {
1072   SCM symbol;
1073
1074   /* Read the symbol that comprises the keyword.  Doing this instead of
1075      invoking a specific symbol reader function allows `scm_read_keyword ()'
1076      to adapt to the delimiters currently valid of symbols.
1077
1078      XXX: This implementation allows sloppy syntaxes like `#:  key'.  */
1079   symbol = scm_read_expression (port, opts);
1080   if (!scm_is_symbol (symbol))
1081     scm_i_input_error ("scm_read_keyword", port,
1082                        "keyword prefix `~a' not followed by a symbol: ~s",
1083                        scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1084
1085   return (scm_symbol_to_keyword (symbol));
1086 }
1087
1088 static SCM
1089 scm_read_vector (int chr, SCM port, scm_t_read_opts *opts,
1090                  long line, int column)
1091 {
1092   /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1093      guarantee that it's going to do what we want.  After all, this is an
1094      implementation detail of `scm_read_vector ()', not a desirable
1095      property.  */
1096   return maybe_annotate_source (scm_vector (scm_read_sexp (chr, port, opts)),
1097                                 port, opts, line, column);
1098 }
1099
1100 /* Helper used by scm_read_array */
1101 static int
1102 read_decimal_integer (SCM port, int c, ssize_t *resp)
1103 {
1104   ssize_t sign = 1;
1105   ssize_t res = 0;
1106   int got_it = 0;
1107
1108   if (c == '-')
1109     {
1110       sign = -1;
1111       c = scm_getc_unlocked (port);
1112     }
1113
1114   while ('0' <= c && c <= '9')
1115     {
1116       res = 10*res + c-'0';
1117       got_it = 1;
1118       c = scm_getc_unlocked (port);
1119     }
1120
1121   if (got_it)
1122     *resp = sign * res;
1123   return c;
1124 }
1125
1126 /* Read an array.  This function can also read vectors and uniform
1127    vectors.  Also, the conflict between '#f' and '#f32' and '#f64' is
1128    handled here.
1129
1130    C is the first character read after the '#'. */
1131 static SCM
1132 scm_read_array (int c, SCM port, scm_t_read_opts *opts, long line, int column)
1133 {
1134   ssize_t rank;
1135   scm_t_wchar tag_buf[8];
1136   int tag_len;
1137
1138   SCM tag, shape = SCM_BOOL_F, elements, array;
1139
1140   /* XXX - shortcut for ordinary vectors.  Shouldn't be necessary but
1141      the array code can not deal with zero-length dimensions yet, and
1142      we want to allow zero-length vectors, of course. */
1143   if (c == '(')
1144     return scm_read_vector (c, port, opts, line, column);
1145
1146   /* Disambiguate between '#f' and uniform floating point vectors. */
1147   if (c == 'f')
1148     {
1149       c = scm_getc_unlocked (port);
1150       if (c != '3' && c != '6')
1151         {
1152           if (c != EOF)
1153             scm_ungetc_unlocked (c, port);
1154           return SCM_BOOL_F;
1155         }
1156       rank = 1;
1157       tag_buf[0] = 'f';
1158       tag_len = 1;
1159       goto continue_reading_tag;
1160     }
1161
1162   /* Read rank. */
1163   rank = 1;
1164   c = read_decimal_integer (port, c, &rank);
1165   if (rank < 0)
1166     scm_i_input_error (NULL, port, "array rank must be non-negative",
1167                        SCM_EOL);
1168
1169   /* Read tag. */
1170   tag_len = 0;
1171  continue_reading_tag:
1172   while (c != EOF && c != '(' && c != '@' && c != ':'
1173          && tag_len < sizeof tag_buf / sizeof tag_buf[0])
1174     {
1175       tag_buf[tag_len++] = c;
1176       c = scm_getc_unlocked (port);
1177     }
1178   if (tag_len == 0)
1179     tag = SCM_BOOL_T;
1180   else
1181     {
1182       tag = scm_string_to_symbol (scm_from_utf32_stringn (tag_buf, tag_len));
1183       if (tag_len == sizeof tag_buf / sizeof tag_buf[0])
1184         scm_i_input_error (NULL, port, "invalid array tag, starting with: ~a",
1185                            scm_list_1 (tag));
1186     }
1187
1188   /* Read shape. */
1189   if (c == '@' || c == ':')
1190     {
1191       shape = SCM_EOL;
1192
1193       do
1194         {
1195           ssize_t lbnd = 0, len = 0;
1196           SCM s;
1197
1198           if (c == '@')
1199             {
1200               c = scm_getc_unlocked (port);
1201               c = read_decimal_integer (port, c, &lbnd);
1202             }
1203
1204           s = scm_from_ssize_t (lbnd);
1205
1206           if (c == ':')
1207             {
1208               c = scm_getc_unlocked (port);
1209               c = read_decimal_integer (port, c, &len);
1210               if (len < 0)
1211                 scm_i_input_error (NULL, port,
1212                                    "array length must be non-negative",
1213                                    SCM_EOL);
1214
1215               s = scm_list_2 (s, scm_from_ssize_t (lbnd+len-1));
1216             }
1217
1218           shape = scm_cons (s, shape);
1219         } while (c == '@' || c == ':');
1220
1221       shape = scm_reverse_x (shape, SCM_EOL);
1222     }
1223
1224   /* Read nested lists of elements. */
1225   if (c != '(')
1226     scm_i_input_error (NULL, port,
1227                        "missing '(' in vector or array literal",
1228                        SCM_EOL);
1229   elements = scm_read_sexp (c, port, opts);
1230
1231   if (scm_is_false (shape))
1232     shape = scm_from_ssize_t (rank);
1233   else if (scm_ilength (shape) != rank)
1234     scm_i_input_error
1235       (NULL, port,
1236        "the number of shape specifications must match the array rank",
1237        SCM_EOL);
1238
1239   /* Handle special print syntax of rank zero arrays; see
1240      scm_i_print_array for a rationale. */
1241   if (rank == 0)
1242     {
1243       if (!scm_is_pair (elements))
1244         scm_i_input_error (NULL, port,
1245                            "too few elements in array literal, need 1",
1246                            SCM_EOL);
1247       if (!scm_is_null (SCM_CDR (elements)))
1248         scm_i_input_error (NULL, port,
1249                            "too many elements in array literal, want 1",
1250                            SCM_EOL);
1251       elements = SCM_CAR (elements);
1252     }
1253
1254   /* Construct array, annotate with source location, and return. */
1255   array = scm_list_to_typed_array (tag, shape, elements);
1256   return maybe_annotate_source (array, port, opts, line, column);
1257 }
1258
1259 static SCM
1260 scm_read_srfi4_vector (int chr, SCM port, scm_t_read_opts *opts,
1261                        long line, int column)
1262 {
1263   return scm_read_array (chr, port, opts, line, column);
1264 }
1265
1266 static SCM
1267 scm_read_bytevector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1268                      long line, int column)
1269 {
1270   chr = scm_getc_unlocked (port);
1271   if (chr != 'u')
1272     goto syntax;
1273
1274   chr = scm_getc_unlocked (port);
1275   if (chr != '8')
1276     goto syntax;
1277
1278   chr = scm_getc_unlocked (port);
1279   if (chr != '(')
1280     goto syntax;
1281
1282   return maybe_annotate_source
1283     (scm_u8_list_to_bytevector (scm_read_sexp (chr, port, opts)),
1284      port, opts, line, column);
1285
1286  syntax:
1287   scm_i_input_error ("read_bytevector", port,
1288                      "invalid bytevector prefix",
1289                      SCM_MAKE_CHAR (chr));
1290   return SCM_UNSPECIFIED;
1291 }
1292
1293 static SCM
1294 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1295                            long line, int column)
1296 {
1297   /* Read the `#*10101'-style read syntax for bit vectors in Guile.  This is
1298      terribly inefficient but who cares?  */
1299   SCM s_bits = SCM_EOL;
1300
1301   for (chr = scm_getc_unlocked (port);
1302        (chr != EOF) && ((chr == '0') || (chr == '1'));
1303        chr = scm_getc_unlocked (port))
1304     {
1305       s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1306     }
1307
1308   if (chr != EOF)
1309     scm_ungetc_unlocked (chr, port);
1310
1311   return maybe_annotate_source
1312     (scm_bitvector (scm_reverse_x (s_bits, SCM_EOL)),
1313      port, opts, line, column);
1314 }
1315
1316 static SCM
1317 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1318 {
1319   int bang_seen = 0;
1320
1321   for (;;)
1322     {
1323       int c = scm_getc_unlocked (port);
1324
1325       if (c == EOF)
1326         scm_i_input_error ("skip_block_comment", port,
1327                            "unterminated `#! ... !#' comment", SCM_EOL);
1328
1329       if (c == '!')
1330         bang_seen = 1;
1331       else if (c == '#' && bang_seen)
1332         break;
1333       else
1334         bang_seen = 0;
1335     }
1336
1337   return SCM_UNSPECIFIED;
1338 }
1339
1340 static void set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts,
1341                                          int value);
1342 static void set_port_square_brackets_p (SCM port, scm_t_read_opts *opts,
1343                                         int value);
1344 static void set_port_curly_infix_p (SCM port, scm_t_read_opts *opts,
1345                                     int value);
1346
1347 static SCM
1348 scm_read_shebang (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
1349 {
1350   char name[READER_DIRECTIVE_NAME_MAX_SIZE + 1];
1351   int c;
1352   int i = 0;
1353
1354   while (i <= READER_DIRECTIVE_NAME_MAX_SIZE)
1355     {
1356       c = scm_getc_unlocked (port);
1357       if (c == EOF)
1358         scm_i_input_error ("skip_block_comment", port,
1359                            "unterminated `#! ... !#' comment", SCM_EOL);
1360       else if (('a' <= c && c <= 'z') || ('0' <= c && c <= '9') || c == '-')
1361         name[i++] = c;
1362       else if (CHAR_IS_DELIMITER (c))
1363         {
1364           scm_ungetc_unlocked (c, port);
1365           name[i] = '\0';
1366           if (0 == strcmp ("r6rs", name))
1367             ;  /* Silently ignore */
1368           else if (0 == strcmp ("fold-case", name))
1369             set_port_case_insensitive_p (port, opts, 1);
1370           else if (0 == strcmp ("no-fold-case", name))
1371             set_port_case_insensitive_p (port, opts, 0);
1372           else if (0 == strcmp ("curly-infix", name))
1373             set_port_curly_infix_p (port, opts, 1);
1374           else if (0 == strcmp ("curly-infix-and-bracket-lists", name))
1375             {
1376               set_port_curly_infix_p (port, opts, 1);
1377               set_port_square_brackets_p (port, opts, 0);
1378             }
1379           else
1380             break;
1381
1382           return SCM_UNSPECIFIED;
1383         }
1384       else
1385         {
1386           scm_ungetc_unlocked (c, port);
1387           break;
1388         }
1389     }
1390   while (i > 0)
1391     scm_ungetc_unlocked (name[--i], port);
1392   return scm_read_scsh_block_comment (chr, port);
1393 }
1394
1395 static SCM
1396 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1397 {
1398   /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1399      nested.  So care must be taken.  */
1400   int nesting_level = 1;
1401
1402   int a = scm_getc_unlocked (port);
1403
1404   if (a == EOF)
1405     scm_i_input_error ("scm_read_r6rs_block_comment", port,
1406                        "unterminated `#| ... |#' comment", SCM_EOL);
1407
1408   while (nesting_level > 0)
1409     {
1410       int b = scm_getc_unlocked (port);
1411
1412       if (b == EOF)
1413         scm_i_input_error ("scm_read_r6rs_block_comment", port,
1414                            "unterminated `#| ... |#' comment", SCM_EOL);
1415
1416       if (a == '|' && b == '#')
1417         {
1418           nesting_level--;
1419           b = EOF;
1420         }
1421       else if (a == '#' && b == '|')
1422         {
1423           nesting_level++;
1424           b = EOF;
1425         }
1426
1427       a = b;
1428     }
1429
1430   return SCM_UNSPECIFIED;
1431 }
1432
1433 static SCM
1434 scm_read_commented_expression (scm_t_wchar chr, SCM port,
1435                                scm_t_read_opts *opts)
1436 {
1437   scm_t_wchar c;
1438
1439   c = flush_ws (port, opts, (char *) NULL);
1440   if (EOF == c)
1441     scm_i_input_error ("read_commented_expression", port,
1442                        "no expression after #; comment", SCM_EOL);
1443   scm_ungetc_unlocked (c, port);
1444   scm_read_expression (port, opts);
1445   return SCM_UNSPECIFIED;
1446 }
1447
1448 static SCM
1449 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1450 {
1451   /* Guile's extended symbol read syntax looks like this:
1452
1453        #{This is all a symbol name}#
1454
1455      So here, CHR is expected to be `{'.  */
1456   int saw_brace = 0;
1457   size_t len = 0;
1458   SCM buf = scm_i_make_string (1024, NULL, 0);
1459
1460   buf = scm_i_string_start_writing (buf);
1461
1462   while ((chr = scm_getc_unlocked (port)) != EOF)
1463     {
1464       if (saw_brace)
1465         {
1466           if (chr == '#')
1467             {
1468               break;
1469             }
1470           else
1471             {
1472               saw_brace = 0;
1473               scm_i_string_set_x (buf, len++, '}');
1474             }
1475         }
1476
1477       if (chr == '}')
1478         saw_brace = 1;
1479       else if (chr == '\\')
1480         {
1481           /* It used to be that print.c would print extended-read-syntax
1482              symbols with backslashes before "non-standard" chars, but
1483              this routine wouldn't do anything with those escapes.
1484              Bummer.  What we've done is to change print.c to output
1485              R6RS hex escapes for those characters, relying on the fact
1486              that the extended read syntax would never put a `\' before
1487              an `x'.  For now, we just ignore other instances of
1488              backslash in the string.  */
1489           switch ((chr = scm_getc_unlocked (port)))
1490             {
1491             case EOF:
1492               goto done;
1493             case 'x':
1494               {
1495                 scm_t_wchar c;
1496
1497                 SCM_READ_HEX_ESCAPE (10, ';');
1498                 scm_i_string_set_x (buf, len++, c);
1499                 break;
1500
1501               str_eof:
1502                 chr = EOF;
1503                 goto done;
1504
1505               bad_escaped:
1506                 scm_i_string_stop_writing ();
1507                 scm_i_input_error ("scm_read_extended_symbol", port,
1508                                    "illegal character in escape sequence: ~S",
1509                                    scm_list_1 (SCM_MAKE_CHAR (c)));
1510                 break;
1511               }
1512             default:
1513               scm_i_string_set_x (buf, len++, chr);
1514               break;
1515             }
1516         }
1517       else
1518         scm_i_string_set_x (buf, len++, chr);
1519
1520       if (len >= scm_i_string_length (buf) - 2)
1521         {
1522           SCM addy;
1523
1524           scm_i_string_stop_writing ();
1525           addy = scm_i_make_string (1024, NULL, 0);
1526           buf = scm_string_append (scm_list_2 (buf, addy));
1527           len = 0;
1528           buf = scm_i_string_start_writing (buf);
1529         }
1530     }
1531
1532  done:
1533   scm_i_string_stop_writing ();
1534   if (chr == EOF)
1535     scm_i_input_error ("scm_read_extended_symbol", port,
1536                        "end of file while reading symbol", SCM_EOL);
1537
1538   return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1539 }
1540
1541
1542 \f
1543 /* Top-level token readers, i.e., dispatchers.  */
1544
1545 static SCM
1546 scm_read_sharp_extension (int chr, SCM port, scm_t_read_opts *opts)
1547 {
1548   SCM proc;
1549
1550   proc = scm_get_hash_procedure (chr);
1551   if (scm_is_true (scm_procedure_p (proc)))
1552     {
1553       long line = SCM_LINUM (port);
1554       int column = SCM_COL (port) - 2;
1555       SCM got;
1556
1557       got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1558
1559       if (opts->record_positions_p && SCM_NIMP (got)
1560           && !scm_i_has_source_properties (got))
1561         scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1562
1563       return got;
1564     }
1565
1566   return SCM_UNSPECIFIED;
1567 }
1568
1569 /* The reader for the sharp `#' character.  It basically dispatches reads
1570    among the above token readers.   */
1571 static SCM
1572 scm_read_sharp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1573                 long line, int column)
1574 #define FUNC_NAME "scm_lreadr"
1575 {
1576   SCM result;
1577
1578   chr = scm_getc_unlocked (port);
1579
1580   result = scm_read_sharp_extension (chr, port, opts);
1581   if (!scm_is_eq (result, SCM_UNSPECIFIED))
1582     return result;
1583
1584   switch (chr)
1585     {
1586     case '\\':
1587       return (scm_read_character (chr, port, opts));
1588     case '(':
1589       return (scm_read_vector (chr, port, opts, line, column));
1590     case 's':
1591     case 'u':
1592     case 'f':
1593     case 'c':
1594       /* This one may return either a boolean or an SRFI-4 vector.  */
1595       return (scm_read_srfi4_vector (chr, port, opts, line, column));
1596     case 'v':
1597       return (scm_read_bytevector (chr, port, opts, line, column));
1598     case '*':
1599       return (scm_read_guile_bit_vector (chr, port, opts, line, column));
1600     case 't':
1601     case 'T':
1602     case 'F':
1603       return (scm_read_boolean (chr, port));
1604     case ':':
1605       return (scm_read_keyword (chr, port, opts));
1606     case '0': case '1': case '2': case '3': case '4':
1607     case '5': case '6': case '7': case '8': case '9':
1608     case '@':
1609       return (scm_read_array (chr, port, opts, line, column));
1610
1611     case 'i':
1612     case 'e':
1613     case 'b':
1614     case 'B':
1615     case 'o':
1616     case 'O':
1617     case 'd':
1618     case 'D':
1619     case 'x':
1620     case 'X':
1621     case 'I':
1622     case 'E':
1623       return (scm_read_number_and_radix (chr, port, opts));
1624     case '{':
1625       return (scm_read_extended_symbol (chr, port));
1626     case '!':
1627       return (scm_read_shebang (chr, port, opts));
1628     case ';':
1629       return (scm_read_commented_expression (chr, port, opts));
1630     case '`':
1631     case '\'':
1632     case ',':
1633       return (scm_read_syntax (chr, port, opts));
1634     case 'n':
1635       return (scm_read_nil (chr, port, opts));
1636     default:
1637       result = scm_read_sharp_extension (chr, port, opts);
1638       if (scm_is_eq (result, SCM_UNSPECIFIED))
1639         {
1640           /* To remain compatible with 1.8 and earlier, the following
1641              characters have lower precedence than `read-hash-extend'
1642              characters.  */
1643           switch (chr)
1644             {
1645             case '|':
1646               return scm_read_r6rs_block_comment (chr, port);
1647             default:
1648               scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1649                                  scm_list_1 (SCM_MAKE_CHAR (chr)));
1650             }
1651         }
1652       else
1653         return result;
1654     }
1655
1656   return SCM_UNSPECIFIED;
1657 }
1658 #undef FUNC_NAME
1659
1660 static SCM
1661 read_inner_expression (SCM port, scm_t_read_opts *opts)
1662 #define FUNC_NAME "read_inner_expression"
1663 {
1664   while (1)
1665     {
1666       scm_t_wchar chr;
1667
1668       chr = scm_getc_unlocked (port);
1669
1670       switch (chr)
1671         {
1672         case SCM_WHITE_SPACES:
1673         case SCM_LINE_INCREMENTORS:
1674           break;
1675         case ';':
1676           (void) scm_read_semicolon_comment (chr, port);
1677           break;
1678         case '{':
1679           if (opts->curly_infix_p)
1680             {
1681               if (opts->neoteric_p)
1682                 return scm_read_sexp (chr, port, opts);
1683               else
1684                 {
1685                   SCM expr;
1686
1687                   /* Enable neoteric expressions within curly braces */
1688                   opts->neoteric_p = 1;
1689                   expr = scm_read_sexp (chr, port, opts);
1690                   opts->neoteric_p = 0;
1691                   return expr;
1692                 }
1693             }
1694           else
1695             return scm_read_mixed_case_symbol (chr, port, opts);
1696         case '[':
1697           if (opts->square_brackets_p)
1698             return scm_read_sexp (chr, port, opts);
1699           else if (opts->curly_infix_p)
1700             {
1701               /* The syntax of neoteric expressions requires that '[' be
1702                  a delimiter when curly-infix is enabled, so it cannot
1703                  be part of an unescaped symbol.  We might as well do
1704                  something useful with it, so we adopt Kawa's convention:
1705                  [...] => ($bracket-list$ ...) */
1706               long line = SCM_LINUM (port);
1707               int column = SCM_COL (port) - 1;
1708               return maybe_annotate_source
1709                 (scm_cons (sym_bracket_list, scm_read_sexp (chr, port, opts)),
1710                  port, opts, line, column);
1711             }
1712           else
1713             return scm_read_mixed_case_symbol (chr, port, opts);
1714         case '(':
1715           return (scm_read_sexp (chr, port, opts));
1716         case '"':
1717           return (scm_read_string (chr, port, opts));
1718         case '\'':
1719         case '`':
1720         case ',':
1721           return (scm_read_quote (chr, port, opts));
1722         case '#':
1723           {
1724             long line  = SCM_LINUM (port);
1725             int column = SCM_COL (port) - 1;
1726             SCM result = scm_read_sharp (chr, port, opts, line, column);
1727             if (scm_is_eq (result, SCM_UNSPECIFIED))
1728               /* We read a comment or some such.  */
1729               break;
1730             else
1731               return result;
1732           }
1733         case ')':
1734           scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1735           break;
1736         case '}':
1737           if (opts->curly_infix_p)
1738             scm_i_input_error (FUNC_NAME, port, "unexpected \"}\"", SCM_EOL);
1739           else
1740             return scm_read_mixed_case_symbol (chr, port, opts);
1741         case ']':
1742           if (opts->square_brackets_p)
1743             scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1744           /* otherwise fall through */
1745         case EOF:
1746           return SCM_EOF_VAL;
1747         case ':':
1748           if (opts->keyword_style == KEYWORD_STYLE_PREFIX)
1749             return scm_symbol_to_keyword (scm_read_expression (port, opts));
1750           /* Fall through.  */
1751
1752         default:
1753           {
1754             if (((chr >= '0') && (chr <= '9'))
1755                 || (strchr ("+-.", chr)))
1756               return (scm_read_number (chr, port, opts));
1757             else
1758               return (scm_read_mixed_case_symbol (chr, port, opts));
1759           }
1760         }
1761     }
1762 }
1763 #undef FUNC_NAME
1764
1765 static SCM
1766 scm_read_expression (SCM port, scm_t_read_opts *opts)
1767 #define FUNC_NAME "scm_read_expression"
1768 {
1769   if (!opts->neoteric_p)
1770     return read_inner_expression (port, opts);
1771   else
1772     {
1773       long line = 0;
1774       int column = 0;
1775       SCM expr;
1776
1777       if (opts->record_positions_p)
1778         {
1779           /* We need to get the position of the first non-whitespace
1780              character in order to correctly annotate neoteric
1781              expressions.  For example, for the expression 'f(x)', the
1782              first call to 'read_inner_expression' reads the 'f' (which
1783              cannot be annotated), and then we later read the '(x)' and
1784              use it to construct the new list (f x). */
1785           int c = flush_ws (port, opts, (char *) NULL);
1786           if (c == EOF)
1787             return SCM_EOF_VAL;
1788           scm_ungetc_unlocked (c, port);
1789           line = SCM_LINUM (port);
1790           column = SCM_COL (port);
1791         }
1792
1793       expr = read_inner_expression (port, opts);
1794
1795       /* 'expr' is the first component of the neoteric expression.  Now
1796          we loop, and as long as the next character is '(', '[', or '{',
1797          (without any intervening whitespace), we use it to construct a
1798          new expression.  For example, f{n - 1}(x) => ((f (- n 1)) x). */
1799       for (;;)
1800         {
1801           int chr = scm_getc_unlocked (port);
1802
1803           if (chr == '(')
1804             /* e(...) => (e ...) */
1805             expr = scm_cons (expr, scm_read_sexp (chr, port, opts));
1806           else if (chr == '[')
1807             /* e[...] => ($bracket-apply$ e ...) */
1808             expr = scm_cons (sym_bracket_apply,
1809                              scm_cons (expr,
1810                                        scm_read_sexp (chr, port, opts)));
1811           else if (chr == '{')
1812             {
1813               SCM arg = scm_read_sexp (chr, port, opts);
1814
1815               if (scm_is_null (arg))
1816                 expr = scm_list_1 (expr);       /* e{} => (e) */
1817               else
1818                 expr = scm_list_2 (expr, arg);  /* e{...} => (e {...}) */
1819             }
1820           else
1821             {
1822               if (chr != EOF)
1823                 scm_ungetc_unlocked (chr, port);
1824               break;
1825             }
1826           maybe_annotate_source (expr, port, opts, line, column);
1827         }
1828       return expr;
1829     }
1830 }
1831 #undef FUNC_NAME
1832
1833 \f
1834 /* Actual reader.  */
1835
1836 static void init_read_options (SCM port, scm_t_read_opts *opts);
1837
1838 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1839             (SCM port),
1840             "Read an s-expression from the input port @var{port}, or from\n"
1841             "the current input port if @var{port} is not specified.\n"
1842             "Any whitespace before the next token is discarded.")
1843 #define FUNC_NAME s_scm_read
1844 {
1845   scm_t_read_opts opts;
1846   int c;
1847
1848   if (SCM_UNBNDP (port))
1849     port = scm_current_input_port ();
1850   SCM_VALIDATE_OPINPORT (1, port);
1851
1852   init_read_options (port, &opts);
1853
1854   c = flush_ws (port, &opts, (char *) NULL);
1855   if (EOF == c)
1856     return SCM_EOF_VAL;
1857   scm_ungetc_unlocked (c, port);
1858
1859   return (scm_read_expression (port, &opts));
1860 }
1861 #undef FUNC_NAME
1862
1863
1864 \f
1865
1866 /* Manipulate the read-hash-procedures alist.  This could be written in
1867    Scheme, but maybe it will also be used by C code during initialisation.  */
1868 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1869             (SCM chr, SCM proc),
1870             "Install the procedure @var{proc} for reading expressions\n"
1871             "starting with the character sequence @code{#} and @var{chr}.\n"
1872             "@var{proc} will be called with two arguments:  the character\n"
1873             "@var{chr} and the port to read further data from. The object\n"
1874             "returned will be the return value of @code{read}. \n"
1875             "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1876             )
1877 #define FUNC_NAME s_scm_read_hash_extend
1878 {
1879   SCM this;
1880   SCM prev;
1881
1882   SCM_VALIDATE_CHAR (1, chr);
1883   SCM_ASSERT (scm_is_false (proc)
1884               || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1885               proc, SCM_ARG2, FUNC_NAME);
1886
1887   /* Check if chr is already in the alist.  */
1888   this = scm_i_read_hash_procedures_ref ();
1889   prev = SCM_BOOL_F;
1890   while (1)
1891     {
1892       if (scm_is_null (this))
1893         {
1894           /* not found, so add it to the beginning.  */
1895           if (scm_is_true (proc))
1896             {
1897               SCM new = scm_cons (scm_cons (chr, proc),
1898                                   scm_i_read_hash_procedures_ref ());
1899               scm_i_read_hash_procedures_set_x (new);
1900             }
1901           break;
1902         }
1903       if (scm_is_eq (chr, SCM_CAAR (this)))
1904         {
1905           /* already in the alist.  */
1906           if (scm_is_false (proc))
1907             {
1908               /* remove it.  */
1909               if (scm_is_false (prev))
1910                 {
1911                   SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1912                   scm_i_read_hash_procedures_set_x (rest);
1913                 }
1914               else
1915                 scm_set_cdr_x (prev, SCM_CDR (this));
1916             }
1917           else
1918             {
1919               /* replace it.  */
1920               scm_set_cdr_x (SCM_CAR (this), proc);
1921             }
1922           break;
1923         }
1924       prev = this;
1925       this = SCM_CDR (this);
1926     }
1927
1928   return SCM_UNSPECIFIED;
1929 }
1930 #undef FUNC_NAME
1931
1932 /* Recover the read-hash procedure corresponding to char c.  */
1933 static SCM
1934 scm_get_hash_procedure (int c)
1935 {
1936   SCM rest = scm_i_read_hash_procedures_ref ();
1937
1938   while (1)
1939     {
1940       if (scm_is_null (rest))
1941         return SCM_BOOL_F;
1942
1943       if (SCM_CHAR (SCM_CAAR (rest)) == c)
1944         return SCM_CDAR (rest);
1945
1946       rest = SCM_CDR (rest);
1947     }
1948 }
1949
1950 #define SCM_ENCODING_SEARCH_SIZE (500)
1951
1952 /* Search the first few hundred characters of a file for an Emacs-like coding
1953    declaration.  Returns either NULL or a string whose storage has been
1954    allocated with `scm_gc_malloc ()'.  */
1955 char *
1956 scm_i_scan_for_encoding (SCM port)
1957 {
1958   scm_t_port *pt;
1959   char header[SCM_ENCODING_SEARCH_SIZE+1];
1960   size_t bytes_read, encoding_length, i;
1961   char *encoding = NULL;
1962   int utf8_bom = 0;
1963   char *pos, *encoding_start;
1964   int in_comment;
1965
1966   pt = SCM_PTAB_ENTRY (port);
1967
1968   if (pt->rw_active == SCM_PORT_WRITE)
1969     scm_flush_unlocked (port);
1970
1971   if (pt->rw_random)
1972     pt->rw_active = SCM_PORT_READ;
1973
1974   if (pt->read_pos == pt->read_end)
1975     {
1976       /* We can use the read buffer, and thus avoid a seek. */
1977       if (scm_fill_input_unlocked (port) == EOF)
1978         return NULL;
1979
1980       bytes_read = pt->read_end - pt->read_pos;
1981       if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
1982         bytes_read = SCM_ENCODING_SEARCH_SIZE;
1983
1984       if (bytes_read <= 1)
1985         /* An unbuffered port -- don't scan.  */
1986         return NULL;
1987
1988       memcpy (header, pt->read_pos, bytes_read);
1989       header[bytes_read] = '\0';
1990     }
1991   else
1992     {
1993       /* Try to read some bytes and then seek back.  Not all ports
1994          support seeking back; and indeed some file ports (like
1995          /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
1996          check performed by SCM_FPORT_FDES---but fail to seek
1997          backwards.  Hence this block comes second.  We prefer to use
1998          the read buffer in-place.  */
1999       if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
2000         return NULL;
2001
2002       bytes_read = scm_c_read_unlocked (port, header, SCM_ENCODING_SEARCH_SIZE);
2003       header[bytes_read] = '\0';
2004       scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
2005     }
2006
2007   if (bytes_read > 3
2008       && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
2009     utf8_bom = 1;
2010
2011   /* search past "coding[:=]" */
2012   pos = header;
2013   while (1)
2014     {
2015       if ((pos = strstr(pos, "coding")) == NULL)
2016         return NULL;
2017
2018       pos += strlen("coding");
2019       if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
2020           (*pos == ':' || *pos == '='))
2021         {
2022           pos ++;
2023           break;
2024         }
2025     }
2026
2027   /* skip spaces */
2028   while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
2029          (*pos == ' ' || *pos == '\t'))
2030     pos ++;
2031
2032   /* grab the next token */
2033   encoding_start = pos;
2034   i = 0;
2035   while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
2036          && encoding_start + i - header < bytes_read
2037          && (isalnum ((int) encoding_start[i])
2038              || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
2039     i++;
2040
2041   encoding_length = i;
2042   if (encoding_length == 0)
2043     return NULL;
2044
2045   encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
2046   for (i = 0; i < encoding_length; i++)
2047     encoding[i] = toupper ((int) encoding[i]);
2048
2049   /* push backwards to make sure we were in a comment */
2050   in_comment = 0;
2051   pos = encoding_start;
2052   while (pos >= header)
2053     {
2054       if (*pos == ';')
2055         {
2056           in_comment = 1;
2057           break;
2058         }
2059       else if (*pos == '\n' || pos == header)
2060         {
2061           /* This wasn't in a semicolon comment. Check for a
2062            hash-bang comment. */
2063           char *beg = strstr (header, "#!");
2064           char *end = strstr (header, "!#");
2065           if (beg < encoding_start && encoding_start + encoding_length <= end)
2066             in_comment = 1;
2067           break;
2068         }
2069       else
2070         {
2071           pos --;
2072           continue;
2073         }
2074     }
2075   if (!in_comment)
2076     /* This wasn't in a comment */
2077     return NULL;
2078
2079   if (utf8_bom && strcmp(encoding, "UTF-8"))
2080     scm_misc_error (NULL,
2081                     "the port input declares the encoding ~s but is encoded as UTF-8",
2082                     scm_list_1 (scm_from_locale_string (encoding)));
2083
2084   return encoding;
2085 }
2086
2087 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
2088             (SCM port),
2089             "Scans the port for an Emacs-like character coding declaration\n"
2090             "near the top of the contents of a port with random-accessible contents.\n"
2091             "The coding declaration is of the form\n"
2092             "@code{coding: XXXXX} and must appear in a scheme comment.\n"
2093             "\n"
2094             "Returns a string containing the character encoding of the file\n"
2095             "if a declaration was found, or @code{#f} otherwise.\n")
2096 #define FUNC_NAME s_scm_file_encoding
2097 {
2098   char *enc;
2099   SCM s_enc;
2100
2101   SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
2102
2103   enc = scm_i_scan_for_encoding (port);
2104   if (enc == NULL)
2105     return SCM_BOOL_F;
2106   else
2107     {
2108       s_enc = scm_from_locale_string (enc);
2109       return s_enc;
2110     }
2111
2112   return SCM_BOOL_F;
2113 }
2114 #undef FUNC_NAME
2115
2116 \f
2117 /* Per-port read options.
2118
2119    We store per-port read options in the 'port-read-options' key of the
2120    port's alist.  The value stored in the alist is a single integer that
2121    contains a two-bit field for each read option.
2122
2123    If a bit field contains READ_OPTION_INHERIT (3), that indicates that
2124    the applicable value should be inherited from the corresponding
2125    global read option.  Otherwise, the bit field contains the value of
2126    the read option.  For boolean read options that have been set
2127    per-port, the possible values are 0 or 1.  If the 'keyword_style'
2128    read option has been set per-port, its possible values are those in
2129    'enum t_keyword_style'. */
2130
2131 /* Key to read options in per-port alists. */
2132 SCM_SYMBOL (sym_port_read_options, "port-read-options");
2133
2134 /* Offsets of bit fields for each per-port override */
2135 #define READ_OPTION_COPY_SOURCE_P          0
2136 #define READ_OPTION_RECORD_POSITIONS_P     2
2137 #define READ_OPTION_CASE_INSENSITIVE_P     4
2138 #define READ_OPTION_KEYWORD_STYLE          6
2139 #define READ_OPTION_R6RS_ESCAPES_P         8
2140 #define READ_OPTION_SQUARE_BRACKETS_P     10
2141 #define READ_OPTION_HUNGRY_EOL_ESCAPES_P  12
2142 #define READ_OPTION_CURLY_INFIX_P         14
2143
2144 /* The total width in bits of the per-port overrides */
2145 #define READ_OPTIONS_NUM_BITS             16
2146
2147 #define READ_OPTIONS_INHERIT_ALL  ((1UL << READ_OPTIONS_NUM_BITS) - 1)
2148 #define READ_OPTIONS_MAX_VALUE    READ_OPTIONS_INHERIT_ALL
2149
2150 #define READ_OPTION_MASK     3
2151 #define READ_OPTION_INHERIT  3
2152
2153 static void
2154 set_port_read_option (SCM port, int option, int new_value)
2155 {
2156   SCM scm_read_options;
2157   unsigned int read_options;
2158
2159   new_value &= READ_OPTION_MASK;
2160   scm_read_options = scm_assq_ref (SCM_PTAB_ENTRY(port)->alist,
2161                                    sym_port_read_options);
2162   if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2163     read_options = scm_to_uint (scm_read_options);
2164   else
2165     read_options = READ_OPTIONS_INHERIT_ALL;
2166   read_options &= ~(READ_OPTION_MASK << option);
2167   read_options |= new_value << option;
2168   scm_read_options = scm_from_uint (read_options);
2169   SCM_PTAB_ENTRY(port)->alist = scm_assq_set_x (SCM_PTAB_ENTRY(port)->alist,
2170                                                 sym_port_read_options,
2171                                                 scm_read_options);
2172 }
2173
2174 /* Set OPTS and PORT's case-insensitivity according to VALUE. */
2175 static void
2176 set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts, int value)
2177 {
2178   value = !!value;
2179   opts->case_insensitive_p = value;
2180   set_port_read_option (port, READ_OPTION_CASE_INSENSITIVE_P, value);
2181 }
2182
2183 /* Set OPTS and PORT's square_brackets_p option according to VALUE. */
2184 static void
2185 set_port_square_brackets_p (SCM port, scm_t_read_opts *opts, int value)
2186 {
2187   value = !!value;
2188   opts->square_brackets_p = value;
2189   set_port_read_option (port, READ_OPTION_SQUARE_BRACKETS_P, value);
2190 }
2191
2192 /* Set OPTS and PORT's curly_infix_p option according to VALUE. */
2193 static void
2194 set_port_curly_infix_p (SCM port, scm_t_read_opts *opts, int value)
2195 {
2196   value = !!value;
2197   opts->curly_infix_p = value;
2198   set_port_read_option (port, READ_OPTION_CURLY_INFIX_P, value);
2199 }
2200
2201 /* Initialize OPTS based on PORT's read options and the global read
2202    options. */
2203 static void
2204 init_read_options (SCM port, scm_t_read_opts *opts)
2205 {
2206   SCM val, scm_read_options;
2207   unsigned int read_options, x;
2208
2209   scm_read_options = scm_assq_ref (SCM_PTAB_ENTRY(port)->alist,
2210                                    sym_port_read_options);
2211
2212   if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2213     read_options = scm_to_uint (scm_read_options);
2214   else
2215     read_options = READ_OPTIONS_INHERIT_ALL;
2216
2217   x = READ_OPTION_MASK & (read_options >> READ_OPTION_KEYWORD_STYLE);
2218   if (x == READ_OPTION_INHERIT)
2219     {
2220       val = SCM_PACK (SCM_KEYWORD_STYLE);
2221       if (scm_is_eq (val, scm_keyword_prefix))
2222         x = KEYWORD_STYLE_PREFIX;
2223       else if (scm_is_eq (val, scm_keyword_postfix))
2224         x = KEYWORD_STYLE_POSTFIX;
2225       else
2226         x = KEYWORD_STYLE_HASH_PREFIX;
2227     }
2228   opts->keyword_style = x;
2229
2230 #define RESOLVE_BOOLEAN_OPTION(NAME, name)                              \
2231   do                                                                    \
2232     {                                                                   \
2233       x = READ_OPTION_MASK & (read_options >> READ_OPTION_ ## NAME);    \
2234       if (x == READ_OPTION_INHERIT)                                     \
2235         x = !!SCM_ ## NAME;                                             \
2236           opts->name = x;                                               \
2237     }                                                                   \
2238   while (0)
2239
2240   RESOLVE_BOOLEAN_OPTION (COPY_SOURCE_P,        copy_source_p);
2241   RESOLVE_BOOLEAN_OPTION (RECORD_POSITIONS_P,   record_positions_p);
2242   RESOLVE_BOOLEAN_OPTION (CASE_INSENSITIVE_P,   case_insensitive_p);
2243   RESOLVE_BOOLEAN_OPTION (R6RS_ESCAPES_P,       r6rs_escapes_p);
2244   RESOLVE_BOOLEAN_OPTION (SQUARE_BRACKETS_P,    square_brackets_p);
2245   RESOLVE_BOOLEAN_OPTION (HUNGRY_EOL_ESCAPES_P, hungry_eol_escapes_p);
2246   RESOLVE_BOOLEAN_OPTION (CURLY_INFIX_P,        curly_infix_p);
2247
2248 #undef RESOLVE_BOOLEAN_OPTION
2249
2250   opts->neoteric_p = 0;
2251 }
2252
2253 void
2254 scm_init_read ()
2255 {
2256   SCM read_hash_procs;
2257
2258   read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
2259
2260   scm_i_read_hash_procedures =
2261     SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
2262
2263   scm_init_opts (scm_read_options, scm_read_opts);
2264 #include "libguile/read.x"
2265 }
2266
2267 /*
2268   Local Variables:
2269   c-file-style: "gnu"
2270   End:
2271 */