libguile/read.c

   1 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
   2  *   2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public License
   6  * as published by the Free Software Foundation; either version 3 of
   7  * the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful, but
  10  * WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  17  * 02110-1301 USA
  18  */
  19
  20
  21 \f
  22
  23 #ifdef HAVE_CONFIG_H
  24 # include <config.h>
  25 #endif
  26
  27 #include <stdio.h>
  28 #include <string.h>
  29 #include <unistd.h>
  30 #include <unicase.h>
  31 #include <unictype.h>
  32 #include <c-strcase.h>
  33
  34 #include "libguile/_scm.h"
  35 #include "libguile/bytevectors.h"
  36 #include "libguile/chars.h"
  37 #include "libguile/eval.h"
  38 #include "libguile/arrays.h"
  39 #include "libguile/bitvectors.h"
  40 #include "libguile/keywords.h"
  41 #include "libguile/alist.h"
  42 #include "libguile/srcprop.h"
  43 #include "libguile/hashtab.h"
  44 #include "libguile/hash.h"
  45 #include "libguile/ports.h"
  46 #include "libguile/ports-internal.h"
  47 #include "libguile/fports.h"
  48 #include "libguile/root.h"
  49 #include "libguile/strings.h"
  50 #include "libguile/strports.h"
  51 #include "libguile/vectors.h"
  52 #include "libguile/validate.h"
  53 #include "libguile/srfi-4.h"
  54 #include "libguile/srfi-13.h"
  55
  56 #include "libguile/read.h"
  57 #include "libguile/private-options.h"
  58
  59
  60 \f
  61
  62 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
  63 SCM_SYMBOL (scm_keyword_prefix, "prefix");
  64 SCM_SYMBOL (scm_keyword_postfix, "postfix");
  65 SCM_SYMBOL (sym_nil, "nil");
  66
  67 /* SRFI-105 curly infix expression support */
  68 SCM_SYMBOL (sym_nfx, "$nfx$");
  69 SCM_SYMBOL (sym_bracket_list, "$bracket-list$");
  70 SCM_SYMBOL (sym_bracket_apply, "$bracket-apply$");
  71
  72 scm_t_option scm_read_opts[] =
  73   {
  74     { SCM_OPTION_BOOLEAN, "copy", 0,
  75       "Copy source code expressions." },
  76     { SCM_OPTION_BOOLEAN, "positions", 1,
  77       "Record positions of source code expressions." },
  78     { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
  79       "Convert symbols to lower case."},
  80     { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
  81       "Style of keyword recognition: #f, 'prefix or 'postfix."},
  82     { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
  83       "Use R6RS variable-length character and string hex escapes."},
  84     { SCM_OPTION_BOOLEAN, "square-brackets", 1,
  85       "Treat `[' and `]' as parentheses, for R6RS compatibility."},
  86     { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
  87       "In strings, consume leading whitespace after an escaped end-of-line."},
  88     { SCM_OPTION_BOOLEAN, "curly-infix", 0,
  89       "Support SRFI-105 curly infix expressions."},
  90     { 0, },
  91   };
  92
  93 /* Internal read options structure.  This is initialized by 'scm_read'
  94    from the global and per-port read options, and a pointer is passed
  95    down to all helper functions. */
  96
  97 enum t_keyword_style
  98   {
  99     KEYWORD_STYLE_HASH_PREFIX,
 100     KEYWORD_STYLE_PREFIX,
 101     KEYWORD_STYLE_POSTFIX
 102   };
 103
 104 struct t_read_opts
 105 {
 106   enum t_keyword_style keyword_style;
 107   unsigned int copy_source_p        : 1;
 108   unsigned int record_positions_p   : 1;
 109   unsigned int case_insensitive_p   : 1;
 110   unsigned int r6rs_escapes_p       : 1;
 111   unsigned int square_brackets_p    : 1;
 112   unsigned int hungry_eol_escapes_p : 1;
 113   unsigned int curly_infix_p        : 1;
 114   unsigned int neoteric_p           : 1;
 115 };
 116
 117 typedef struct t_read_opts scm_t_read_opts;
 118
 119
 120 /*
 121   Give meaningful error messages for errors
 122
 123   We use the format
 124
 125   FILE:LINE:COL: MESSAGE
 126   This happened in ....
 127
 128   This is not standard GNU format, but the test-suite likes the real
 129   message to be in front.
 130
 131  */
 132
 133
 134 void
 135 scm_i_input_error (char const *function,
 136                    SCM port, const char *message, SCM arg)
 137 {
 138   SCM fn = (scm_is_string (SCM_FILENAME(port))
 139             ? SCM_FILENAME(port)
 140             : scm_from_locale_string ("#<unknown port>"));
 141
 142   SCM string_port = scm_open_output_string ();
 143   SCM string = SCM_EOL;
 144   scm_simple_format (string_port,
 145                      scm_from_locale_string ("~A:~S:~S: ~A"),
 146                      scm_list_4 (fn,
 147                                  scm_from_long (SCM_LINUM (port) + 1),
 148                                  scm_from_int (SCM_COL (port) + 1),
 149                                  scm_from_locale_string (message)));
 150
 151   string = scm_get_output_string (string_port);
 152   scm_close_output_port (string_port);
 153   scm_error_scm (scm_from_latin1_symbol ("read-error"),
 154                  function? scm_from_locale_string (function) : SCM_BOOL_F,
 155                  string,
 156                  arg,
 157                  SCM_BOOL_F);
 158 }
 159
 160
 161 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
 162             (SCM setting),
 163             "Option interface for the read options. Instead of using\n"
 164             "this procedure directly, use the procedures @code{read-enable},\n"
 165             "@code{read-disable}, @code{read-set!} and @code{read-options}.")
 166 #define FUNC_NAME s_scm_read_options
 167 {
 168   SCM ans = scm_options (setting,
 169                          scm_read_opts,
 170                          FUNC_NAME);
 171   if (SCM_COPY_SOURCE_P)
 172     SCM_RECORD_POSITIONS_P = 1;
 173   return ans;
 174 }
 175 #undef FUNC_NAME
 176
 177 /* A fluid referring to an association list mapping extra hash
 178    characters to procedures.  */
 179 static SCM *scm_i_read_hash_procedures;
 180
 181 static SCM
 182 scm_i_read_hash_procedures_ref (void)
 183 {
 184   return scm_fluid_ref (*scm_i_read_hash_procedures);
 185 }
 186
 187 static void
 188 scm_i_read_hash_procedures_set_x (SCM value)
 189 {
 190   scm_fluid_set_x (*scm_i_read_hash_procedures, value);
 191 }
 192
 193 \f
 194 /* Token readers.  */
 195
 196
 197 /* Size of the C buffer used to read symbols and numbers.  */
 198 #define READER_BUFFER_SIZE            128
 199
 200 /* Number of 32-bit codepoints in the buffer used to read strings.  */
 201 #define READER_STRING_BUFFER_SIZE     128
 202
 203 /* The maximum size of Scheme character names.  */
 204 #define READER_CHAR_NAME_MAX_SIZE      50
 205
 206 /* The maximum size of reader directive names.  */
 207 #define READER_DIRECTIVE_NAME_MAX_SIZE 50
 208
 209
 210 /* `isblank' is only in C99.  */
 211 #define CHAR_IS_BLANK_(_chr)                                    \
 212   (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n')      \
 213    || ((_chr) == '\f') || ((_chr) == '\r'))
 214
 215 #ifdef MSDOS
 216 # define CHAR_IS_BLANK(_chr)                    \
 217   ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
 218 #else
 219 # define CHAR_IS_BLANK CHAR_IS_BLANK_
 220 #endif
 221
 222
 223 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
 224    structure'').  */
 225 #define CHAR_IS_R5RS_DELIMITER(c)                               \
 226   (CHAR_IS_BLANK (c)                                            \
 227    || (c) == ')' || (c) == '(' || (c) == ';' || (c) == '"')
 228
 229 #define CHAR_IS_DELIMITER(c)                                    \
 230   (CHAR_IS_R5RS_DELIMITER (c)                                   \
 231    || (((c) == ']' || (c) == '[') && (opts->square_brackets_p   \
 232                                       || opts->curly_infix_p))  \
 233    || (((c) == '}' || (c) == '{') && opts->curly_infix_p))
 234
 235 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
 236    Structure''.  */
 237 #define CHAR_IS_EXPONENT_MARKER(_chr)                           \
 238   (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f')        \
 239    || ((_chr) == 'd') || ((_chr) == 'l'))
 240
 241 /* Read an SCSH block comment.  */
 242 static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
 243 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
 244 static SCM scm_read_commented_expression (scm_t_wchar, SCM, scm_t_read_opts *);
 245 static SCM scm_read_shebang (scm_t_wchar, SCM, scm_t_read_opts *);
 246 static SCM scm_get_hash_procedure (int);
 247
 248 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 249    result in the pre-allocated buffer BUF.  Return zero if the whole token has
 250    fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
 251    bytes actually read.  */
 252 static int
 253 read_token (SCM port, scm_t_read_opts *opts,
 254             char *buf, size_t buf_size, size_t *read)
 255 {
 256    *read = 0;
 257
 258    while (*read < buf_size)
 259      {
 260        int chr;
 261
 262        chr = scm_get_byte_or_eof_unlocked (port);
 263
 264        if (chr == EOF)
 265         return 0;
 266       else if (CHAR_IS_DELIMITER (chr))
 267         {
 268           scm_unget_byte_unlocked (chr, port);
 269           return 0;
 270         }
 271       else
 272         {
 273           *buf = (char) chr;
 274           buf++, (*read)++;
 275         }
 276      }
 277
 278    return 1;
 279  }
 280
 281 /* Like `read_token', but return either BUFFER, or a GC-allocated buffer
 282    if the token doesn't fit in BUFFER_SIZE bytes.  */
 283 static char *
 284 read_complete_token (SCM port, scm_t_read_opts *opts,
 285                      char *buffer, size_t buffer_size, size_t *read)
 286 {
 287   int overflow = 0;
 288   size_t bytes_read, overflow_size = 0;
 289   char *overflow_buffer = NULL;
 290
 291   do
 292     {
 293       overflow = read_token (port, opts, buffer, buffer_size, &bytes_read);
 294       if (bytes_read == 0)
 295         break;
 296       if (overflow || overflow_size != 0)
 297         {
 298           if (overflow_size == 0)
 299             {
 300               overflow_buffer = scm_gc_malloc_pointerless (bytes_read, "read");
 301               memcpy (overflow_buffer, buffer, bytes_read);
 302               overflow_size = bytes_read;
 303             }
 304           else
 305             {
 306               char *new_buf =
 307                 scm_gc_malloc_pointerless (overflow_size + bytes_read, "read");
 308
 309               memcpy (new_buf, overflow_buffer, overflow_size);
 310               memcpy (new_buf + overflow_size, buffer, bytes_read);
 311
 312               overflow_buffer = new_buf;
 313               overflow_size += bytes_read;
 314             }
 315         }
 316     }
 317   while (overflow);
 318
 319   if (overflow_size)
 320     *read = overflow_size;
 321   else
 322     *read = bytes_read;
 323
 324   return (overflow_size > 0 ? overflow_buffer : buffer);
 325 }
 326
 327 /* Skip whitespace from PORT and return the first non-whitespace character
 328    read.  Raise an error on end-of-file.  */
 329 static int
 330 flush_ws (SCM port, scm_t_read_opts *opts, const char *eoferr)
 331 {
 332   scm_t_wchar c;
 333   while (1)
 334     switch (c = scm_getc_unlocked (port))
 335       {
 336       case EOF:
 337       goteof:
 338         if (eoferr)
 339           {
 340             scm_i_input_error (eoferr,
 341                                port,
 342                                "end of file",
 343                                SCM_EOL);
 344           }
 345         return c;
 346
 347       case ';':
 348       lp:
 349         switch (c = scm_getc_unlocked (port))
 350           {
 351           case EOF:
 352             goto goteof;
 353           default:
 354             goto lp;
 355           case SCM_LINE_INCREMENTORS:
 356             break;
 357           }
 358         break;
 359
 360       case '#':
 361         switch (c = scm_getc_unlocked (port))
 362           {
 363           case EOF:
 364             eoferr = "read_sharp";
 365             goto goteof;
 366           case '!':
 367             scm_read_shebang (c, port, opts);
 368             break;
 369           case ';':
 370             scm_read_commented_expression (c, port, opts);
 371             break;
 372           case '|':
 373             if (scm_is_false (scm_get_hash_procedure (c)))
 374               {
 375                 scm_read_r6rs_block_comment (c, port);
 376                 break;
 377               }
 378             /* fall through */
 379           default:
 380             scm_ungetc_unlocked (c, port);
 381             return '#';
 382           }
 383         break;
 384
 385       case SCM_LINE_INCREMENTORS:
 386       case SCM_SINGLE_SPACES:
 387       case '\t':
 388         break;
 389
 390       default:
 391         return c;
 392       }
 393
 394   return 0;
 395 }
 396
 397
 398 \f
 399 /* Token readers.  */
 400
 401 static SCM scm_read_expression (SCM port, scm_t_read_opts *opts);
 402 static SCM scm_read_sharp (int chr, SCM port, scm_t_read_opts *opts,
 403                            long line, int column);
 404
 405
 406 static SCM
 407 maybe_annotate_source (SCM x, SCM port, scm_t_read_opts *opts,
 408                        long line, int column)
 409 {
 410   if (opts->record_positions_p)
 411     scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
 412   return x;
 413 }
 414
 415 static SCM
 416 scm_read_sexp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 417 #define FUNC_NAME "scm_i_lreadparen"
 418 {
 419   int c;
 420   SCM tmp, tl, ans = SCM_EOL;
 421   const int curly_list_p = (chr == '{') && opts->curly_infix_p;
 422   const int terminating_char = ((chr == '{') ? '}'
 423                                 : ((chr == '[') ? ']'
 424                                    : ')'));
 425
 426   /* Need to capture line and column numbers here. */
 427   long line = SCM_LINUM (port);
 428   int column = SCM_COL (port) - 1;
 429
 430   c = flush_ws (port, opts, FUNC_NAME);
 431   if (terminating_char == c)
 432     return SCM_EOL;
 433
 434   scm_ungetc_unlocked (c, port);
 435   tmp = scm_read_expression (port, opts);
 436
 437   /* Note that it is possible for scm_read_expression to return
 438      scm_sym_dot, but not as part of a dotted pair: as in #{.}#.  So
 439      check that it's a real dot by checking `c'.  */
 440   if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
 441     {
 442       ans = scm_read_expression (port, opts);
 443       if (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
 444         scm_i_input_error (FUNC_NAME, port, "missing close paren",
 445                            SCM_EOL);
 446       return ans;
 447     }
 448
 449   /* Build the head of the list structure. */
 450   ans = tl = scm_cons (tmp, SCM_EOL);
 451
 452   while (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
 453     {
 454       SCM new_tail;
 455
 456       if (c == ')' || (c == ']' && opts->square_brackets_p)
 457           || ((c == '}' || c == ']') && opts->curly_infix_p))
 458         scm_i_input_error (FUNC_NAME, port,
 459                            "in pair: mismatched close paren: ~A",
 460                            scm_list_1 (SCM_MAKE_CHAR (c)));
 461
 462       scm_ungetc_unlocked (c, port);
 463       tmp = scm_read_expression (port, opts);
 464
 465       /* See above note about scm_sym_dot.  */
 466       if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
 467         {
 468           SCM_SETCDR (tl, scm_read_expression (port, opts));
 469
 470           c = flush_ws (port, opts, FUNC_NAME);
 471           if (terminating_char != c)
 472             scm_i_input_error (FUNC_NAME, port,
 473                                "in pair: missing close paren", SCM_EOL);
 474           break;
 475         }
 476
 477       new_tail = scm_cons (tmp, SCM_EOL);
 478       SCM_SETCDR (tl, new_tail);
 479       tl = new_tail;
 480     }
 481
 482   if (curly_list_p)
 483     {
 484       /* In addition to finding the length, 'scm_ilength' checks for
 485          improper or circular lists, in which case it returns -1. */
 486       int len = scm_ilength (ans);
 487
 488       /* The (len == 0) case is handled above */
 489       if (len == 1)
 490         /* Return directly to avoid re-annotating the element's source
 491            location with the position of the outer brace.  Also, it
 492            might not be possible to annotate the element. */
 493         return scm_car (ans);  /* {e} => e */
 494       else if (len == 2)
 495         ;  /* Leave the list unchanged: {e1 e2} => (e1 e2) */
 496       else if (len >= 3 && (len & 1))
 497         {
 498           /* It's a proper list whose length is odd and at least 3.  If
 499              the elements at odd indices (the infix operator positions)
 500              are all 'equal?', then it's a simple curly-infix list.
 501              Otherwise it's a mixed curly-infix list. */
 502           SCM op = scm_cadr (ans);
 503
 504           /* Check to see if the elements at odd indices are 'equal?' */
 505           for (tl = scm_cdddr (ans); ; tl = scm_cddr (tl))
 506             {
 507               if (scm_is_null (tl))
 508                 {
 509                   /* Convert simple curly-infix list to prefix:
 510                      {a <op> b <op> ...} => (<op> a b ...) */
 511                   tl = ans;
 512                   while (scm_is_pair (scm_cdr (tl)))
 513                     {
 514                       tmp = scm_cddr (tl);
 515                       SCM_SETCDR (tl, tmp);
 516                       tl = tmp;
 517                     }
 518                   ans = scm_cons (op, ans);
 519                   break;
 520                 }
 521               else if (scm_is_false (scm_equal_p (op, scm_car (tl))))
 522                 {
 523                   /* Mixed curly-infix list: {e ...} => ($nfx$ e ...) */
 524                   ans = scm_cons (sym_nfx, ans);
 525                   break;
 526                 }
 527             }
 528         }
 529       else
 530         /* Mixed curly-infix (possibly improper) list:
 531            {e . tail} => ($nfx$ e . tail) */
 532         ans = scm_cons (sym_nfx, ans);
 533     }
 534
 535   return maybe_annotate_source (ans, port, opts, line, column);
 536 }
 537 #undef FUNC_NAME
 538
 539
 540 /* Read a hexadecimal number NDIGITS in length.  Put its value into the variable
 541    C.  If TERMINATOR is non-null, terminate early if the TERMINATOR character is
 542    found.  */
 543 #define SCM_READ_HEX_ESCAPE(ndigits, terminator)                   \
 544   do                                                               \
 545     {                                                              \
 546       scm_t_wchar a;                                               \
 547       size_t i = 0;                                                \
 548       c = 0;                                                       \
 549       while (i < ndigits)                                          \
 550         {                                                          \
 551           a = scm_getc_unlocked (port);                                     \
 552           if (a == EOF)                                            \
 553             goto str_eof;                                          \
 554           if (terminator                                           \
 555               && (a == (scm_t_wchar) terminator)                   \
 556               && (i > 0))                                          \
 557             break;                                                 \
 558           if ('0' <= a && a <= '9')                                \
 559             a -= '0';                                              \
 560           else if ('A' <= a && a <= 'F')                           \
 561             a = a - 'A' + 10;                                      \
 562           else if ('a' <= a && a <= 'f')                           \
 563             a = a - 'a' + 10;                                      \
 564           else                                                     \
 565             {                                                      \
 566               c = a;                                               \
 567               goto bad_escaped;                                    \
 568             }                                                      \
 569           c = c * 16 + a;                                          \
 570           i ++;                                                    \
 571         }                                                          \
 572     } while (0)
 573
 574 static void
 575 skip_intraline_whitespace (SCM port)
 576 {
 577   scm_t_wchar c;
 578
 579   do
 580     {
 581       c = scm_getc_unlocked (port);
 582       if (c == EOF)
 583         return;
 584     }
 585   while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
 586
 587   scm_ungetc_unlocked (c, port);
 588 }
 589
 590 static SCM
 591 scm_read_string (int chr, SCM port, scm_t_read_opts *opts)
 592 #define FUNC_NAME "scm_lreadr"
 593 {
 594   /* For strings smaller than C_STR, this function creates only one Scheme
 595      object (the string returned).  */
 596
 597   SCM str = SCM_EOL;
 598   size_t c_str_len = 0;
 599   scm_t_wchar c, c_str[READER_STRING_BUFFER_SIZE];
 600
 601   /* Need to capture line and column numbers here. */
 602   long line = SCM_LINUM (port);
 603   int column = SCM_COL (port) - 1;
 604
 605   while ('"' != (c = scm_getc_unlocked (port)))
 606     {
 607       if (c == EOF)
 608         {
 609         str_eof:
 610           scm_i_input_error (FUNC_NAME, port,
 611                              "end of file in string constant", SCM_EOL);
 612         }
 613
 614       if (c_str_len + 1 >= READER_STRING_BUFFER_SIZE)
 615         {
 616           str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
 617           c_str_len = 0;
 618         }
 619
 620       if (c == '\\')
 621         {
 622           switch (c = scm_getc_unlocked (port))
 623             {
 624             case EOF:
 625               goto str_eof;
 626             case '"':
 627             case '\\':
 628               break;
 629             case '\n':
 630               if (opts->hungry_eol_escapes_p)
 631                 skip_intraline_whitespace (port);
 632               continue;
 633             case '0':
 634               c = '\0';
 635               break;
 636             case 'f':
 637               c = '\f';
 638               break;
 639             case 'n':
 640               c = '\n';
 641               break;
 642             case 'r':
 643               c = '\r';
 644               break;
 645             case 't':
 646               c = '\t';
 647               break;
 648             case 'a':
 649               c = '\007';
 650               break;
 651             case 'v':
 652               c = '\v';
 653               break;
 654             case 'b':
 655               c = '\010';
 656               break;
 657             case 'x':
 658               if (opts->r6rs_escapes_p)
 659                 SCM_READ_HEX_ESCAPE (10, ';');
 660               else
 661                 SCM_READ_HEX_ESCAPE (2, '\0');
 662               break;
 663             case 'u':
 664               if (!opts->r6rs_escapes_p)
 665                 {
 666                   SCM_READ_HEX_ESCAPE (4, '\0');
 667                   break;
 668                 }
 669             case 'U':
 670               if (!opts->r6rs_escapes_p)
 671                 {
 672                   SCM_READ_HEX_ESCAPE (6, '\0');
 673                   break;
 674                 }
 675             default:
 676             bad_escaped:
 677               scm_i_input_error (FUNC_NAME, port,
 678                                  "illegal character in escape sequence: ~S",
 679                                  scm_list_1 (SCM_MAKE_CHAR (c)));
 680             }
 681         }
 682
 683       c_str[c_str_len++] = c;
 684     }
 685
 686   if (scm_is_null (str))
 687     /* Fast path: we got a string that fits in C_STR.  */
 688     str = scm_from_utf32_stringn (c_str, c_str_len);
 689   else
 690     {
 691       if (c_str_len > 0)
 692         str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
 693
 694       str = scm_string_concatenate_reverse (str, SCM_UNDEFINED, SCM_UNDEFINED);
 695     }
 696
 697   return maybe_annotate_source (str, port, opts, line, column);
 698 }
 699 #undef FUNC_NAME
 700
 701
 702 static SCM
 703 scm_read_number (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 704 {
 705   SCM result, str = SCM_EOL;
 706   char local_buffer[READER_BUFFER_SIZE], *buffer;
 707   size_t bytes_read;
 708
 709   /* Need to capture line and column numbers here. */
 710   long line = SCM_LINUM (port);
 711   int column = SCM_COL (port) - 1;
 712
 713   scm_ungetc_unlocked (chr, port);
 714   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 715                                 &bytes_read);
 716
 717   str = scm_from_port_stringn (buffer, bytes_read, port);
 718
 719   result = scm_string_to_number (str, SCM_UNDEFINED);
 720   if (scm_is_false (result))
 721     {
 722       /* Return a symbol instead of a number */
 723       if (opts->case_insensitive_p)
 724         str = scm_string_downcase_x (str);
 725       result = scm_string_to_symbol (str);
 726     }
 727   else if (SCM_NIMP (result))
 728     result = maybe_annotate_source (result, port, opts, line, column);
 729
 730   SCM_COL (port) += scm_i_string_length (str);
 731   return result;
 732 }
 733
 734 static SCM
 735 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 736 {
 737   SCM result;
 738   int ends_with_colon = 0;
 739   size_t bytes_read;
 740   int postfix = (opts->keyword_style == KEYWORD_STYLE_POSTFIX);
 741   char local_buffer[READER_BUFFER_SIZE], *buffer;
 742   SCM str;
 743
 744   scm_ungetc_unlocked (chr, port);
 745   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 746                                 &bytes_read);
 747   if (bytes_read > 0)
 748     ends_with_colon = buffer[bytes_read - 1] == ':';
 749
 750   if (postfix && ends_with_colon && (bytes_read > 1))
 751     {
 752       str = scm_from_port_stringn (buffer, bytes_read - 1, port);
 753
 754       if (opts->case_insensitive_p)
 755         str = scm_string_downcase_x (str);
 756       result = scm_symbol_to_keyword (scm_string_to_symbol (str));
 757     }
 758   else
 759     {
 760       str = scm_from_port_stringn (buffer, bytes_read, port);
 761
 762       if (opts->case_insensitive_p)
 763         str = scm_string_downcase_x (str);
 764       result = scm_string_to_symbol (str);
 765     }
 766
 767   SCM_COL (port) += scm_i_string_length (str);
 768   return result;
 769 }
 770
 771 static SCM
 772 scm_read_number_and_radix (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 773 #define FUNC_NAME "scm_lreadr"
 774 {
 775   SCM result;
 776   size_t read;
 777   char local_buffer[READER_BUFFER_SIZE], *buffer;
 778   unsigned int radix;
 779   SCM str;
 780
 781   switch (chr)
 782     {
 783     case 'B':
 784     case 'b':
 785       radix = 2;
 786       break;
 787
 788     case 'o':
 789     case 'O':
 790       radix = 8;
 791       break;
 792
 793     case 'd':
 794     case 'D':
 795       radix = 10;
 796       break;
 797
 798     case 'x':
 799     case 'X':
 800       radix = 16;
 801       break;
 802
 803     default:
 804       scm_ungetc_unlocked (chr, port);
 805       scm_ungetc_unlocked ('#', port);
 806       radix = 10;
 807     }
 808
 809   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 810                                 &read);
 811
 812   str = scm_from_port_stringn (buffer, read, port);
 813
 814   result = scm_string_to_number (str, scm_from_uint (radix));
 815
 816   SCM_COL (port) += scm_i_string_length (str);
 817
 818   if (scm_is_true (result))
 819     return result;
 820
 821   scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
 822
 823   return SCM_BOOL_F;
 824 }
 825 #undef FUNC_NAME
 826
 827 static SCM
 828 scm_read_quote (int chr, SCM port, scm_t_read_opts *opts)
 829 {
 830   SCM p;
 831   long line = SCM_LINUM (port);
 832   int column = SCM_COL (port) - 1;
 833
 834   switch (chr)
 835     {
 836     case '`':
 837       p = scm_sym_quasiquote;
 838       break;
 839
 840     case '\'':
 841       p = scm_sym_quote;
 842       break;
 843
 844     case ',':
 845       {
 846         scm_t_wchar c;
 847
 848         c = scm_getc_unlocked (port);
 849         if ('@' == c)
 850           p = scm_sym_uq_splicing;
 851         else
 852           {
 853             scm_ungetc_unlocked (c, port);
 854             p = scm_sym_unquote;
 855           }
 856         break;
 857       }
 858
 859     default:
 860       fprintf (stderr, "%s: unhandled quote character (%i)\n",
 861                "scm_read_quote", chr);
 862       abort ();
 863     }
 864
 865   p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
 866   return maybe_annotate_source (p, port, opts, line, column);
 867 }
 868
 869 SCM_SYMBOL (sym_syntax, "syntax");
 870 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
 871 SCM_SYMBOL (sym_unsyntax, "unsyntax");
 872 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
 873
 874 static SCM
 875 scm_read_syntax (int chr, SCM port, scm_t_read_opts *opts)
 876 {
 877   SCM p;
 878   long line = SCM_LINUM (port);
 879   int column = SCM_COL (port) - 1;
 880
 881   switch (chr)
 882     {
 883     case '`':
 884       p = sym_quasisyntax;
 885       break;
 886
 887     case '\'':
 888       p = sym_syntax;
 889       break;
 890
 891     case ',':
 892       {
 893         int c;
 894
 895         c = scm_getc_unlocked (port);
 896         if ('@' == c)
 897           p = sym_unsyntax_splicing;
 898         else
 899           {
 900             scm_ungetc_unlocked (c, port);
 901             p = sym_unsyntax;
 902           }
 903         break;
 904       }
 905
 906     default:
 907       fprintf (stderr, "%s: unhandled syntax character (%i)\n",
 908                "scm_read_syntax", chr);
 909       abort ();
 910     }
 911
 912   p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
 913   return maybe_annotate_source (p, port, opts, line, column);
 914 }
 915
 916 static SCM
 917 scm_read_nil (int chr, SCM port, scm_t_read_opts *opts)
 918 {
 919   SCM id = scm_read_mixed_case_symbol (chr, port, opts);
 920
 921   if (!scm_is_eq (id, sym_nil))
 922     scm_i_input_error ("scm_read_nil", port,
 923                        "unexpected input while reading #nil: ~a",
 924                        scm_list_1 (id));
 925
 926   return SCM_ELISP_NIL;
 927 }
 928
 929 static SCM
 930 scm_read_semicolon_comment (int chr, SCM port)
 931 {
 932   int c;
 933
 934   /* We use the get_byte here because there is no need to get the
 935      locale correct with comment input. This presumes that newline
 936      always represents itself no matter what the encoding is.  */
 937   for (c = scm_get_byte_or_eof_unlocked (port);
 938        (c != EOF) && (c != '\n');
 939        c = scm_get_byte_or_eof_unlocked (port));
 940
 941   return SCM_UNSPECIFIED;
 942 }
 943
 944 \f
 945 /* Sharp readers, i.e. readers called after a `#' sign has been read.  */
 946
 947 static SCM
 948 scm_read_boolean (int chr, SCM port)
 949 {
 950   switch (chr)
 951     {
 952     case 't':
 953     case 'T':
 954       return SCM_BOOL_T;
 955
 956     case 'f':
 957     case 'F':
 958       return SCM_BOOL_F;
 959     }
 960
 961   return SCM_UNSPECIFIED;
 962 }
 963
 964 static SCM
 965 scm_read_character (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 966 #define FUNC_NAME "scm_lreadr"
 967 {
 968   char buffer[READER_CHAR_NAME_MAX_SIZE];
 969   SCM charname;
 970   size_t charname_len, bytes_read;
 971   scm_t_wchar cp;
 972   int overflow;
 973   scm_t_port_internal *pti;
 974
 975   overflow = read_token (port, opts, buffer, READER_CHAR_NAME_MAX_SIZE,
 976                          &bytes_read);
 977   if (overflow)
 978     scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
 979
 980   if (bytes_read == 0)
 981     {
 982       chr = scm_getc_unlocked (port);
 983       if (chr == EOF)
 984         scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
 985                            "while reading character", SCM_EOL);
 986
 987       /* CHR must be a token delimiter, like a whitespace.  */
 988       return (SCM_MAKE_CHAR (chr));
 989     }
 990
 991   pti = SCM_PORT_GET_INTERNAL (port);
 992
 993   /* Simple ASCII characters can be processed immediately.  Also, simple
 994      ISO-8859-1 characters can be processed immediately if the encoding for this
 995      port is ISO-8859-1.  */
 996   if (bytes_read == 1 &&
 997       ((unsigned char) buffer[0] <= 127
 998        || pti->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1))
 999     {
1000       SCM_COL (port) += 1;
1001       return SCM_MAKE_CHAR (buffer[0]);
1002     }
1003
1004   /* Otherwise, convert the buffer into a proper scheme string for
1005      processing.  */
1006   charname = scm_from_port_stringn (buffer, bytes_read, port);
1007   charname_len = scm_i_string_length (charname);
1008   SCM_COL (port) += charname_len;
1009   cp = scm_i_string_ref (charname, 0);
1010   if (charname_len == 1)
1011     return SCM_MAKE_CHAR (cp);
1012
1013   /* Ignore dotted circles, which may be used to keep combining characters from
1014      combining with the backslash in #\charname.  */
1015   if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
1016     return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
1017
1018   if (cp >= '0' && cp < '8')
1019     {
1020       /* Dirk:FIXME::  This type of character syntax is not R5RS
1021        * compliant.  Further, it should be verified that the constant
1022        * does only consist of octal digits.  */
1023       SCM p = scm_string_to_number (charname, scm_from_uint (8));
1024       if (SCM_I_INUMP (p))
1025         {
1026           scm_t_wchar c = scm_to_uint32 (p);
1027           if (SCM_IS_UNICODE_CHAR (c))
1028             return SCM_MAKE_CHAR (c);
1029           else
1030             scm_i_input_error (FUNC_NAME, port,
1031                                "out-of-range octal character escape: ~a",
1032                                scm_list_1 (charname));
1033         }
1034     }
1035
1036   if (cp == 'x' && (charname_len > 1))
1037     {
1038       SCM p;
1039
1040       /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1041       p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
1042                                 scm_from_uint (16));
1043       if (SCM_I_INUMP (p))
1044         {
1045           scm_t_wchar c = scm_to_uint32 (p);
1046           if (SCM_IS_UNICODE_CHAR (c))
1047             return SCM_MAKE_CHAR (c);
1048           else
1049             scm_i_input_error (FUNC_NAME, port,
1050                                "out-of-range hex character escape: ~a",
1051                                scm_list_1 (charname));
1052         }
1053     }
1054
1055   /* The names of characters should never have non-Latin1
1056      characters.  */
1057   if (scm_i_is_narrow_string (charname)
1058       || scm_i_try_narrow_string (charname))
1059     { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
1060                                        charname_len);
1061       if (scm_is_true (ch))
1062         return ch;
1063     }
1064
1065   scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
1066                      scm_list_1 (charname));
1067
1068   return SCM_UNSPECIFIED;
1069 }
1070 #undef FUNC_NAME
1071
1072 static SCM
1073 scm_read_keyword (int chr, SCM port, scm_t_read_opts *opts)
1074 {
1075   SCM symbol;
1076
1077   /* Read the symbol that comprises the keyword.  Doing this instead of
1078      invoking a specific symbol reader function allows `scm_read_keyword ()'
1079      to adapt to the delimiters currently valid of symbols.
1080
1081      XXX: This implementation allows sloppy syntaxes like `#:  key'.  */
1082   symbol = scm_read_expression (port, opts);
1083   if (!scm_is_symbol (symbol))
1084     scm_i_input_error ("scm_read_keyword", port,
1085                        "keyword prefix `~a' not followed by a symbol: ~s",
1086                        scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1087
1088   return (scm_symbol_to_keyword (symbol));
1089 }
1090
1091 static SCM
1092 scm_read_vector (int chr, SCM port, scm_t_read_opts *opts,
1093                  long line, int column)
1094 {
1095   /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1096      guarantee that it's going to do what we want.  After all, this is an
1097      implementation detail of `scm_read_vector ()', not a desirable
1098      property.  */
1099   return maybe_annotate_source (scm_vector (scm_read_sexp (chr, port, opts)),
1100                                 port, opts, line, column);
1101 }
1102
1103 /* Helper used by scm_read_array */
1104 static int
1105 read_decimal_integer (SCM port, int c, ssize_t *resp)
1106 {
1107   ssize_t sign = 1;
1108   ssize_t res = 0;
1109   int got_it = 0;
1110
1111   if (c == '-')
1112     {
1113       sign = -1;
1114       c = scm_getc_unlocked (port);
1115     }
1116
1117   while ('0' <= c && c <= '9')
1118     {
1119       res = 10*res + c-'0';
1120       got_it = 1;
1121       c = scm_getc_unlocked (port);
1122     }
1123
1124   if (got_it)
1125     *resp = sign * res;
1126   return c;
1127 }
1128
1129 /* Read an array.  This function can also read vectors and uniform
1130    vectors.  Also, the conflict between '#f' and '#f32' and '#f64' is
1131    handled here.
1132
1133    C is the first character read after the '#'. */
1134 static SCM
1135 scm_read_array (int c, SCM port, scm_t_read_opts *opts, long line, int column)
1136 {
1137   ssize_t rank;
1138   scm_t_wchar tag_buf[8];
1139   int tag_len;
1140
1141   SCM tag, shape = SCM_BOOL_F, elements, array;
1142
1143   /* XXX - shortcut for ordinary vectors.  Shouldn't be necessary but
1144      the array code can not deal with zero-length dimensions yet, and
1145      we want to allow zero-length vectors, of course. */
1146   if (c == '(')
1147     return scm_read_vector (c, port, opts, line, column);
1148
1149   /* Disambiguate between '#f' and uniform floating point vectors. */
1150   if (c == 'f')
1151     {
1152       c = scm_getc_unlocked (port);
1153       if (c != '3' && c != '6')
1154         {
1155           if (c != EOF)
1156             scm_ungetc_unlocked (c, port);
1157           return SCM_BOOL_F;
1158         }
1159       rank = 1;
1160       tag_buf[0] = 'f';
1161       tag_len = 1;
1162       goto continue_reading_tag;
1163     }
1164
1165   /* Read rank. */
1166   rank = 1;
1167   c = read_decimal_integer (port, c, &rank);
1168   if (rank < 0)
1169     scm_i_input_error (NULL, port, "array rank must be non-negative",
1170                        SCM_EOL);
1171
1172   /* Read tag. */
1173   tag_len = 0;
1174  continue_reading_tag:
1175   while (c != EOF && c != '(' && c != '@' && c != ':'
1176          && tag_len < sizeof tag_buf / sizeof tag_buf[0])
1177     {
1178       tag_buf[tag_len++] = c;
1179       c = scm_getc_unlocked (port);
1180     }
1181   if (tag_len == 0)
1182     tag = SCM_BOOL_T;
1183   else
1184     {
1185       tag = scm_string_to_symbol (scm_from_utf32_stringn (tag_buf, tag_len));
1186       if (tag_len == sizeof tag_buf / sizeof tag_buf[0])
1187         scm_i_input_error (NULL, port, "invalid array tag, starting with: ~a",
1188                            scm_list_1 (tag));
1189     }
1190
1191   /* Read shape. */
1192   if (c == '@' || c == ':')
1193     {
1194       shape = SCM_EOL;
1195
1196       do
1197         {
1198           ssize_t lbnd = 0, len = 0;
1199           SCM s;
1200
1201           if (c == '@')
1202             {
1203               c = scm_getc_unlocked (port);
1204               c = read_decimal_integer (port, c, &lbnd);
1205             }
1206
1207           s = scm_from_ssize_t (lbnd);
1208
1209           if (c == ':')
1210             {
1211               c = scm_getc_unlocked (port);
1212               c = read_decimal_integer (port, c, &len);
1213               if (len < 0)
1214                 scm_i_input_error (NULL, port,
1215                                    "array length must be non-negative",
1216                                    SCM_EOL);
1217
1218               s = scm_list_2 (s, scm_from_ssize_t (lbnd+len-1));
1219             }
1220
1221           shape = scm_cons (s, shape);
1222         } while (c == '@' || c == ':');
1223
1224       shape = scm_reverse_x (shape, SCM_EOL);
1225     }
1226
1227   /* Read nested lists of elements. */
1228   if (c != '(')
1229     scm_i_input_error (NULL, port,
1230                        "missing '(' in vector or array literal",
1231                        SCM_EOL);
1232   elements = scm_read_sexp (c, port, opts);
1233
1234   if (scm_is_false (shape))
1235     shape = scm_from_ssize_t (rank);
1236   else if (scm_ilength (shape) != rank)
1237     scm_i_input_error
1238       (NULL, port,
1239        "the number of shape specifications must match the array rank",
1240        SCM_EOL);
1241
1242   /* Handle special print syntax of rank zero arrays; see
1243      scm_i_print_array for a rationale. */
1244   if (rank == 0)
1245     {
1246       if (!scm_is_pair (elements))
1247         scm_i_input_error (NULL, port,
1248                            "too few elements in array literal, need 1",
1249                            SCM_EOL);
1250       if (!scm_is_null (SCM_CDR (elements)))
1251         scm_i_input_error (NULL, port,
1252                            "too many elements in array literal, want 1",
1253                            SCM_EOL);
1254       elements = SCM_CAR (elements);
1255     }
1256
1257   /* Construct array, annotate with source location, and return. */
1258   array = scm_list_to_typed_array (tag, shape, elements);
1259   return maybe_annotate_source (array, port, opts, line, column);
1260 }
1261
1262 static SCM
1263 scm_read_srfi4_vector (int chr, SCM port, scm_t_read_opts *opts,
1264                        long line, int column)
1265 {
1266   return scm_read_array (chr, port, opts, line, column);
1267 }
1268
1269 static SCM
1270 scm_read_bytevector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1271                      long line, int column)
1272 {
1273   chr = scm_getc_unlocked (port);
1274   if (chr != 'u')
1275     goto syntax;
1276
1277   chr = scm_getc_unlocked (port);
1278   if (chr != '8')
1279     goto syntax;
1280
1281   chr = scm_getc_unlocked (port);
1282   if (chr != '(')
1283     goto syntax;
1284
1285   return maybe_annotate_source
1286     (scm_u8_list_to_bytevector (scm_read_sexp (chr, port, opts)),
1287      port, opts, line, column);
1288
1289  syntax:
1290   scm_i_input_error ("read_bytevector", port,
1291                      "invalid bytevector prefix",
1292                      SCM_MAKE_CHAR (chr));
1293   return SCM_UNSPECIFIED;
1294 }
1295
1296 static SCM
1297 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1298                            long line, int column)
1299 {
1300   /* Read the `#*10101'-style read syntax for bit vectors in Guile.  This is
1301      terribly inefficient but who cares?  */
1302   SCM s_bits = SCM_EOL;
1303
1304   for (chr = scm_getc_unlocked (port);
1305        (chr != EOF) && ((chr == '0') || (chr == '1'));
1306        chr = scm_getc_unlocked (port))
1307     {
1308       s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1309     }
1310
1311   if (chr != EOF)
1312     scm_ungetc_unlocked (chr, port);
1313
1314   return maybe_annotate_source
1315     (scm_bitvector (scm_reverse_x (s_bits, SCM_EOL)),
1316      port, opts, line, column);
1317 }
1318
1319 static SCM
1320 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1321 {
1322   int bang_seen = 0;
1323
1324   for (;;)
1325     {
1326       int c = scm_getc_unlocked (port);
1327
1328       if (c == EOF)
1329         scm_i_input_error ("skip_block_comment", port,
1330                            "unterminated `#! ... !#' comment", SCM_EOL);
1331
1332       if (c == '!')
1333         bang_seen = 1;
1334       else if (c == '#' && bang_seen)
1335         break;
1336       else
1337         bang_seen = 0;
1338     }
1339
1340   return SCM_UNSPECIFIED;
1341 }
1342
1343 static void set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts,
1344                                          int value);
1345 static void set_port_square_brackets_p (SCM port, scm_t_read_opts *opts,
1346                                         int value);
1347 static void set_port_curly_infix_p (SCM port, scm_t_read_opts *opts,
1348                                     int value);
1349
1350 static SCM
1351 scm_read_shebang (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
1352 {
1353   char name[READER_DIRECTIVE_NAME_MAX_SIZE + 1];
1354   int c;
1355   int i = 0;
1356
1357   while (i <= READER_DIRECTIVE_NAME_MAX_SIZE)
1358     {
1359       c = scm_getc_unlocked (port);
1360       if (c == EOF)
1361         scm_i_input_error ("skip_block_comment", port,
1362                            "unterminated `#! ... !#' comment", SCM_EOL);
1363       else if (('a' <= c && c <= 'z') || ('0' <= c && c <= '9') || c == '-')
1364         name[i++] = c;
1365       else if (CHAR_IS_DELIMITER (c))
1366         {
1367           scm_ungetc_unlocked (c, port);
1368           name[i] = '\0';
1369           if (0 == strcmp ("r6rs", name))
1370             ;  /* Silently ignore */
1371           else if (0 == strcmp ("fold-case", name))
1372             set_port_case_insensitive_p (port, opts, 1);
1373           else if (0 == strcmp ("no-fold-case", name))
1374             set_port_case_insensitive_p (port, opts, 0);
1375           else if (0 == strcmp ("curly-infix", name))
1376             set_port_curly_infix_p (port, opts, 1);
1377           else if (0 == strcmp ("curly-infix-and-bracket-lists", name))
1378             {
1379               set_port_curly_infix_p (port, opts, 1);
1380               set_port_square_brackets_p (port, opts, 0);
1381             }
1382           else
1383             break;
1384
1385           return SCM_UNSPECIFIED;
1386         }
1387       else
1388         {
1389           scm_ungetc_unlocked (c, port);
1390           break;
1391         }
1392     }
1393   while (i > 0)
1394     scm_ungetc_unlocked (name[--i], port);
1395   return scm_read_scsh_block_comment (chr, port);
1396 }
1397
1398 static SCM
1399 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1400 {
1401   /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1402      nested.  So care must be taken.  */
1403   int nesting_level = 1;
1404
1405   int a = scm_getc_unlocked (port);
1406
1407   if (a == EOF)
1408     scm_i_input_error ("scm_read_r6rs_block_comment", port,
1409                        "unterminated `#| ... |#' comment", SCM_EOL);
1410
1411   while (nesting_level > 0)
1412     {
1413       int b = scm_getc_unlocked (port);
1414
1415       if (b == EOF)
1416         scm_i_input_error ("scm_read_r6rs_block_comment", port,
1417                            "unterminated `#| ... |#' comment", SCM_EOL);
1418
1419       if (a == '|' && b == '#')
1420         {
1421           nesting_level--;
1422           b = EOF;
1423         }
1424       else if (a == '#' && b == '|')
1425         {
1426           nesting_level++;
1427           b = EOF;
1428         }
1429
1430       a = b;
1431     }
1432
1433   return SCM_UNSPECIFIED;
1434 }
1435
1436 static SCM
1437 scm_read_commented_expression (scm_t_wchar chr, SCM port,
1438                                scm_t_read_opts *opts)
1439 {
1440   scm_t_wchar c;
1441
1442   c = flush_ws (port, opts, (char *) NULL);
1443   if (EOF == c)
1444     scm_i_input_error ("read_commented_expression", port,
1445                        "no expression after #; comment", SCM_EOL);
1446   scm_ungetc_unlocked (c, port);
1447   scm_read_expression (port, opts);
1448   return SCM_UNSPECIFIED;
1449 }
1450
1451 static SCM
1452 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1453 {
1454   /* Guile's extended symbol read syntax looks like this:
1455
1456        #{This is all a symbol name}#
1457
1458      So here, CHR is expected to be `{'.  */
1459   int saw_brace = 0;
1460   size_t len = 0;
1461   SCM buf = scm_i_make_string (1024, NULL, 0);
1462
1463   buf = scm_i_string_start_writing (buf);
1464
1465   while ((chr = scm_getc_unlocked (port)) != EOF)
1466     {
1467       if (saw_brace)
1468         {
1469           if (chr == '#')
1470             {
1471               break;
1472             }
1473           else
1474             {
1475               saw_brace = 0;
1476               scm_i_string_set_x (buf, len++, '}');
1477             }
1478         }
1479
1480       if (chr == '}')
1481         saw_brace = 1;
1482       else if (chr == '\\')
1483         {
1484           /* It used to be that print.c would print extended-read-syntax
1485              symbols with backslashes before "non-standard" chars, but
1486              this routine wouldn't do anything with those escapes.
1487              Bummer.  What we've done is to change print.c to output
1488              R6RS hex escapes for those characters, relying on the fact
1489              that the extended read syntax would never put a `\' before
1490              an `x'.  For now, we just ignore other instances of
1491              backslash in the string.  */
1492           switch ((chr = scm_getc_unlocked (port)))
1493             {
1494             case EOF:
1495               goto done;
1496             case 'x':
1497               {
1498                 scm_t_wchar c;
1499
1500                 SCM_READ_HEX_ESCAPE (10, ';');
1501                 scm_i_string_set_x (buf, len++, c);
1502                 break;
1503
1504               str_eof:
1505                 chr = EOF;
1506                 goto done;
1507
1508               bad_escaped:
1509                 scm_i_string_stop_writing ();
1510                 scm_i_input_error ("scm_read_extended_symbol", port,
1511                                    "illegal character in escape sequence: ~S",
1512                                    scm_list_1 (SCM_MAKE_CHAR (c)));
1513                 break;
1514               }
1515             default:
1516               scm_i_string_set_x (buf, len++, chr);
1517               break;
1518             }
1519         }
1520       else
1521         scm_i_string_set_x (buf, len++, chr);
1522
1523       if (len >= scm_i_string_length (buf) - 2)
1524         {
1525           SCM addy;
1526
1527           scm_i_string_stop_writing ();
1528           addy = scm_i_make_string (1024, NULL, 0);
1529           buf = scm_string_append (scm_list_2 (buf, addy));
1530           len = 0;
1531           buf = scm_i_string_start_writing (buf);
1532         }
1533     }
1534
1535  done:
1536   scm_i_string_stop_writing ();
1537   if (chr == EOF)
1538     scm_i_input_error ("scm_read_extended_symbol", port,
1539                        "end of file while reading symbol", SCM_EOL);
1540
1541   return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1542 }
1543
1544
1545 \f
1546 /* Top-level token readers, i.e., dispatchers.  */
1547
1548 static SCM
1549 scm_read_sharp_extension (int chr, SCM port, scm_t_read_opts *opts)
1550 {
1551   SCM proc;
1552
1553   proc = scm_get_hash_procedure (chr);
1554   if (scm_is_true (scm_procedure_p (proc)))
1555     {
1556       long line = SCM_LINUM (port);
1557       int column = SCM_COL (port) - 2;
1558       SCM got;
1559
1560       got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1561
1562       if (opts->record_positions_p && SCM_NIMP (got)
1563           && !scm_i_has_source_properties (got))
1564         scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1565
1566       return got;
1567     }
1568
1569   return SCM_UNSPECIFIED;
1570 }
1571
1572 /* The reader for the sharp `#' character.  It basically dispatches reads
1573    among the above token readers.   */
1574 static SCM
1575 scm_read_sharp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1576                 long line, int column)
1577 #define FUNC_NAME "scm_lreadr"
1578 {
1579   SCM result;
1580
1581   chr = scm_getc_unlocked (port);
1582
1583   result = scm_read_sharp_extension (chr, port, opts);
1584   if (!scm_is_eq (result, SCM_UNSPECIFIED))
1585     return result;
1586
1587   switch (chr)
1588     {
1589     case '\\':
1590       return (scm_read_character (chr, port, opts));
1591     case '(':
1592       return (scm_read_vector (chr, port, opts, line, column));
1593     case 's':
1594     case 'u':
1595     case 'f':
1596     case 'c':
1597       /* This one may return either a boolean or an SRFI-4 vector.  */
1598       return (scm_read_srfi4_vector (chr, port, opts, line, column));
1599     case 'v':
1600       return (scm_read_bytevector (chr, port, opts, line, column));
1601     case '*':
1602       return (scm_read_guile_bit_vector (chr, port, opts, line, column));
1603     case 't':
1604     case 'T':
1605     case 'F':
1606       return (scm_read_boolean (chr, port));
1607     case ':':
1608       return (scm_read_keyword (chr, port, opts));
1609     case '0': case '1': case '2': case '3': case '4':
1610     case '5': case '6': case '7': case '8': case '9':
1611     case '@':
1612       return (scm_read_array (chr, port, opts, line, column));
1613
1614     case 'i':
1615     case 'e':
1616     case 'b':
1617     case 'B':
1618     case 'o':
1619     case 'O':
1620     case 'd':
1621     case 'D':
1622     case 'x':
1623     case 'X':
1624     case 'I':
1625     case 'E':
1626       return (scm_read_number_and_radix (chr, port, opts));
1627     case '{':
1628       return (scm_read_extended_symbol (chr, port));
1629     case '!':
1630       return (scm_read_shebang (chr, port, opts));
1631     case ';':
1632       return (scm_read_commented_expression (chr, port, opts));
1633     case '`':
1634     case '\'':
1635     case ',':
1636       return (scm_read_syntax (chr, port, opts));
1637     case 'n':
1638       return (scm_read_nil (chr, port, opts));
1639     default:
1640       result = scm_read_sharp_extension (chr, port, opts);
1641       if (scm_is_eq (result, SCM_UNSPECIFIED))
1642         {
1643           /* To remain compatible with 1.8 and earlier, the following
1644              characters have lower precedence than `read-hash-extend'
1645              characters.  */
1646           switch (chr)
1647             {
1648             case '|':
1649               return scm_read_r6rs_block_comment (chr, port);
1650             default:
1651               scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1652                                  scm_list_1 (SCM_MAKE_CHAR (chr)));
1653             }
1654         }
1655       else
1656         return result;
1657     }
1658
1659   return SCM_UNSPECIFIED;
1660 }
1661 #undef FUNC_NAME
1662
1663 static SCM
1664 read_inner_expression (SCM port, scm_t_read_opts *opts)
1665 #define FUNC_NAME "read_inner_expression"
1666 {
1667   while (1)
1668     {
1669       scm_t_wchar chr;
1670
1671       chr = scm_getc_unlocked (port);
1672
1673       switch (chr)
1674         {
1675         case SCM_WHITE_SPACES:
1676         case SCM_LINE_INCREMENTORS:
1677           break;
1678         case ';':
1679           (void) scm_read_semicolon_comment (chr, port);
1680           break;
1681         case '{':
1682           if (opts->curly_infix_p)
1683             {
1684               if (opts->neoteric_p)
1685                 return scm_read_sexp (chr, port, opts);
1686               else
1687                 {
1688                   SCM expr;
1689
1690                   /* Enable neoteric expressions within curly braces */
1691                   opts->neoteric_p = 1;
1692                   expr = scm_read_sexp (chr, port, opts);
1693                   opts->neoteric_p = 0;
1694                   return expr;
1695                 }
1696             }
1697           else
1698             return scm_read_mixed_case_symbol (chr, port, opts);
1699         case '[':
1700           if (opts->square_brackets_p)
1701             return scm_read_sexp (chr, port, opts);
1702           else if (opts->curly_infix_p)
1703             {
1704               /* The syntax of neoteric expressions requires that '[' be
1705                  a delimiter when curly-infix is enabled, so it cannot
1706                  be part of an unescaped symbol.  We might as well do
1707                  something useful with it, so we adopt Kawa's convention:
1708                  [...] => ($bracket-list$ ...) */
1709               long line = SCM_LINUM (port);
1710               int column = SCM_COL (port) - 1;
1711               return maybe_annotate_source
1712                 (scm_cons (sym_bracket_list, scm_read_sexp (chr, port, opts)),
1713                  port, opts, line, column);
1714             }
1715           else
1716             return scm_read_mixed_case_symbol (chr, port, opts);
1717         case '(':
1718           return (scm_read_sexp (chr, port, opts));
1719         case '"':
1720           return (scm_read_string (chr, port, opts));
1721         case '\'':
1722         case '`':
1723         case ',':
1724           return (scm_read_quote (chr, port, opts));
1725         case '#':
1726           {
1727             long line  = SCM_LINUM (port);
1728             int column = SCM_COL (port) - 1;
1729             SCM result = scm_read_sharp (chr, port, opts, line, column);
1730             if (scm_is_eq (result, SCM_UNSPECIFIED))
1731               /* We read a comment or some such.  */
1732               break;
1733             else
1734               return result;
1735           }
1736         case ')':
1737           scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1738           break;
1739         case '}':
1740           if (opts->curly_infix_p)
1741             scm_i_input_error (FUNC_NAME, port, "unexpected \"}\"", SCM_EOL);
1742           else
1743             return scm_read_mixed_case_symbol (chr, port, opts);
1744         case ']':
1745           if (opts->square_brackets_p)
1746             scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1747           /* otherwise fall through */
1748         case EOF:
1749           return SCM_EOF_VAL;
1750         case ':':
1751           if (opts->keyword_style == KEYWORD_STYLE_PREFIX)
1752             return scm_symbol_to_keyword (scm_read_expression (port, opts));
1753           /* Fall through.  */
1754
1755         default:
1756           {
1757             if (((chr >= '0') && (chr <= '9'))
1758                 || (strchr ("+-.", chr)))
1759               return (scm_read_number (chr, port, opts));
1760             else
1761               return (scm_read_mixed_case_symbol (chr, port, opts));
1762           }
1763         }
1764     }
1765 }
1766 #undef FUNC_NAME
1767
1768 static SCM
1769 scm_read_expression (SCM port, scm_t_read_opts *opts)
1770 #define FUNC_NAME "scm_read_expression"
1771 {
1772   if (!opts->neoteric_p)
1773     return read_inner_expression (port, opts);
1774   else
1775     {
1776       long line = 0;
1777       int column = 0;
1778       SCM expr;
1779
1780       if (opts->record_positions_p)
1781         {
1782           /* We need to get the position of the first non-whitespace
1783              character in order to correctly annotate neoteric
1784              expressions.  For example, for the expression 'f(x)', the
1785              first call to 'read_inner_expression' reads the 'f' (which
1786              cannot be annotated), and then we later read the '(x)' and
1787              use it to construct the new list (f x). */
1788           int c = flush_ws (port, opts, (char *) NULL);
1789           if (c == EOF)
1790             return SCM_EOF_VAL;
1791           scm_ungetc_unlocked (c, port);
1792           line = SCM_LINUM (port);
1793           column = SCM_COL (port);
1794         }
1795
1796       expr = read_inner_expression (port, opts);
1797
1798       /* 'expr' is the first component of the neoteric expression.  Now
1799          we loop, and as long as the next character is '(', '[', or '{',
1800          (without any intervening whitespace), we use it to construct a
1801          new expression.  For example, f{n - 1}(x) => ((f (- n 1)) x). */
1802       for (;;)
1803         {
1804           int chr = scm_getc_unlocked (port);
1805
1806           if (chr == '(')
1807             /* e(...) => (e ...) */
1808             expr = scm_cons (expr, scm_read_sexp (chr, port, opts));
1809           else if (chr == '[')
1810             /* e[...] => ($bracket-apply$ e ...) */
1811             expr = scm_cons (sym_bracket_apply,
1812                              scm_cons (expr,
1813                                        scm_read_sexp (chr, port, opts)));
1814           else if (chr == '{')
1815             {
1816               SCM arg = scm_read_sexp (chr, port, opts);
1817
1818               if (scm_is_null (arg))
1819                 expr = scm_list_1 (expr);       /* e{} => (e) */
1820               else
1821                 expr = scm_list_2 (expr, arg);  /* e{...} => (e {...}) */
1822             }
1823           else
1824             {
1825               if (chr != EOF)
1826                 scm_ungetc_unlocked (chr, port);
1827               break;
1828             }
1829           maybe_annotate_source (expr, port, opts, line, column);
1830         }
1831       return expr;
1832     }
1833 }
1834 #undef FUNC_NAME
1835
1836 \f
1837 /* Actual reader.  */
1838
1839 static void init_read_options (SCM port, scm_t_read_opts *opts);
1840
1841 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1842             (SCM port),
1843             "Read an s-expression from the input port @var{port}, or from\n"
1844             "the current input port if @var{port} is not specified.\n"
1845             "Any whitespace before the next token is discarded.")
1846 #define FUNC_NAME s_scm_read
1847 {
1848   scm_t_read_opts opts;
1849   int c;
1850
1851   if (SCM_UNBNDP (port))
1852     port = scm_current_input_port ();
1853   SCM_VALIDATE_OPINPORT (1, port);
1854
1855   init_read_options (port, &opts);
1856
1857   c = flush_ws (port, &opts, (char *) NULL);
1858   if (EOF == c)
1859     return SCM_EOF_VAL;
1860   scm_ungetc_unlocked (c, port);
1861
1862   return (scm_read_expression (port, &opts));
1863 }
1864 #undef FUNC_NAME
1865
1866
1867 \f
1868
1869 /* Manipulate the read-hash-procedures alist.  This could be written in
1870    Scheme, but maybe it will also be used by C code during initialisation.  */
1871 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1872             (SCM chr, SCM proc),
1873             "Install the procedure @var{proc} for reading expressions\n"
1874             "starting with the character sequence @code{#} and @var{chr}.\n"
1875             "@var{proc} will be called with two arguments:  the character\n"
1876             "@var{chr} and the port to read further data from. The object\n"
1877             "returned will be the return value of @code{read}. \n"
1878             "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1879             )
1880 #define FUNC_NAME s_scm_read_hash_extend
1881 {
1882   SCM this;
1883   SCM prev;
1884
1885   SCM_VALIDATE_CHAR (1, chr);
1886   SCM_ASSERT (scm_is_false (proc)
1887               || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1888               proc, SCM_ARG2, FUNC_NAME);
1889
1890   /* Check if chr is already in the alist.  */
1891   this = scm_i_read_hash_procedures_ref ();
1892   prev = SCM_BOOL_F;
1893   while (1)
1894     {
1895       if (scm_is_null (this))
1896         {
1897           /* not found, so add it to the beginning.  */
1898           if (scm_is_true (proc))
1899             {
1900               SCM new = scm_cons (scm_cons (chr, proc),
1901                                   scm_i_read_hash_procedures_ref ());
1902               scm_i_read_hash_procedures_set_x (new);
1903             }
1904           break;
1905         }
1906       if (scm_is_eq (chr, SCM_CAAR (this)))
1907         {
1908           /* already in the alist.  */
1909           if (scm_is_false (proc))
1910             {
1911               /* remove it.  */
1912               if (scm_is_false (prev))
1913                 {
1914                   SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1915                   scm_i_read_hash_procedures_set_x (rest);
1916                 }
1917               else
1918                 scm_set_cdr_x (prev, SCM_CDR (this));
1919             }
1920           else
1921             {
1922               /* replace it.  */
1923               scm_set_cdr_x (SCM_CAR (this), proc);
1924             }
1925           break;
1926         }
1927       prev = this;
1928       this = SCM_CDR (this);
1929     }
1930
1931   return SCM_UNSPECIFIED;
1932 }
1933 #undef FUNC_NAME
1934
1935 /* Recover the read-hash procedure corresponding to char c.  */
1936 static SCM
1937 scm_get_hash_procedure (int c)
1938 {
1939   SCM rest = scm_i_read_hash_procedures_ref ();
1940
1941   while (1)
1942     {
1943       if (scm_is_null (rest))
1944         return SCM_BOOL_F;
1945
1946       if (SCM_CHAR (SCM_CAAR (rest)) == c)
1947         return SCM_CDAR (rest);
1948
1949       rest = SCM_CDR (rest);
1950     }
1951 }
1952
1953 #define SCM_ENCODING_SEARCH_SIZE (500)
1954
1955 static int
1956 is_encoding_char (char c)
1957 {
1958   if (c >= 'a' && c <= 'z') return 1;
1959   if (c >= 'A' && c <= 'Z') return 1;
1960   if (c >= '0' && c <= '9') return 1;
1961   return strchr ("_-.:/,+=()", c) != NULL;
1962 }
1963
1964 /* Search the first few hundred characters of a file for an Emacs-like coding
1965    declaration.  Returns either NULL or a string whose storage has been
1966    allocated with `scm_gc_malloc ()'.  */
1967 char *
1968 scm_i_scan_for_encoding (SCM port)
1969 {
1970   scm_t_port *pt;
1971   char header[SCM_ENCODING_SEARCH_SIZE+1];
1972   size_t bytes_read, encoding_length, i;
1973   char *encoding = NULL;
1974   char *pos, *encoding_start;
1975   int in_comment;
1976
1977   pt = SCM_PTAB_ENTRY (port);
1978
1979   if (pt->rw_active == SCM_PORT_WRITE)
1980     scm_flush_unlocked (port);
1981
1982   if (pt->rw_random)
1983     pt->rw_active = SCM_PORT_READ;
1984
1985   if (pt->read_pos == pt->read_end)
1986     {
1987       /* We can use the read buffer, and thus avoid a seek. */
1988       if (scm_fill_input_unlocked (port) == EOF)
1989         return NULL;
1990
1991       bytes_read = pt->read_end - pt->read_pos;
1992       if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
1993         bytes_read = SCM_ENCODING_SEARCH_SIZE;
1994
1995       if (bytes_read <= 1)
1996         /* An unbuffered port -- don't scan.  */
1997         return NULL;
1998
1999       memcpy (header, pt->read_pos, bytes_read);
2000       header[bytes_read] = '\0';
2001     }
2002   else
2003     {
2004       /* Try to read some bytes and then seek back.  Not all ports
2005          support seeking back; and indeed some file ports (like
2006          /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
2007          check performed by SCM_FPORT_FDES---but fail to seek
2008          backwards.  Hence this block comes second.  We prefer to use
2009          the read buffer in-place.  */
2010       if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
2011         return NULL;
2012
2013       bytes_read = scm_c_read_unlocked (port, header, SCM_ENCODING_SEARCH_SIZE);
2014       header[bytes_read] = '\0';
2015       scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
2016     }
2017
2018   /* search past "coding[:=]" */
2019   pos = header;
2020   while (1)
2021     {
2022       if ((pos = strstr(pos, "coding")) == NULL)
2023         return NULL;
2024
2025       pos += strlen("coding");
2026       if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
2027           (*pos == ':' || *pos == '='))
2028         {
2029           pos ++;
2030           break;
2031         }
2032     }
2033
2034   /* skip spaces */
2035   while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
2036          (*pos == ' ' || *pos == '\t'))
2037     pos ++;
2038
2039   /* grab the next token */
2040   encoding_start = pos;
2041   i = 0;
2042   while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
2043          && encoding_start + i - header < bytes_read
2044          && is_encoding_char (encoding_start[i]))
2045     i++;
2046
2047   encoding_length = i;
2048   if (encoding_length == 0)
2049     return NULL;
2050
2051   encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
2052
2053   /* push backwards to make sure we were in a comment */
2054   in_comment = 0;
2055   pos = encoding_start;
2056   while (pos >= header)
2057     {
2058       if (*pos == ';')
2059         {
2060           in_comment = 1;
2061           break;
2062         }
2063       else if (*pos == '\n' || pos == header)
2064         {
2065           /* This wasn't in a semicolon comment. Check for a
2066            hash-bang comment. */
2067           char *beg = strstr (header, "#!");
2068           char *end = strstr (header, "!#");
2069           if (beg < encoding_start && encoding_start + encoding_length <= end)
2070             in_comment = 1;
2071           break;
2072         }
2073       else
2074         {
2075           pos --;
2076           continue;
2077         }
2078     }
2079   if (!in_comment)
2080     /* This wasn't in a comment */
2081     return NULL;
2082
2083   return encoding;
2084 }
2085
2086 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
2087             (SCM port),
2088             "Scans the port for an Emacs-like character coding declaration\n"
2089             "near the top of the contents of a port with random-accessible contents.\n"
2090             "The coding declaration is of the form\n"
2091             "@code{coding: XXXXX} and must appear in a scheme comment.\n"
2092             "\n"
2093             "Returns a string containing the character encoding of the file\n"
2094             "if a declaration was found, or @code{#f} otherwise.\n")
2095 #define FUNC_NAME s_scm_file_encoding
2096 {
2097   char *enc;
2098   SCM s_enc;
2099
2100   SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
2101
2102   enc = scm_i_scan_for_encoding (port);
2103   if (enc == NULL)
2104     return SCM_BOOL_F;
2105   else
2106     {
2107       s_enc = scm_string_upcase (scm_from_locale_string (enc));
2108       return s_enc;
2109     }
2110
2111   return SCM_BOOL_F;
2112 }
2113 #undef FUNC_NAME
2114
2115 \f
2116 /* Per-port read options.
2117
2118    We store per-port read options in the 'port-read-options' key of the
2119    port's alist, which is stored in the internal port structure.  The
2120    value stored in the alist is a single integer that contains a two-bit
2121    field for each read option.
2122
2123    If a bit field contains READ_OPTION_INHERIT (3), that indicates that
2124    the applicable value should be inherited from the corresponding
2125    global read option.  Otherwise, the bit field contains the value of
2126    the read option.  For boolean read options that have been set
2127    per-port, the possible values are 0 or 1.  If the 'keyword_style'
2128    read option has been set per-port, its possible values are those in
2129    'enum t_keyword_style'. */
2130
2131 /* Key to read options in per-port alists. */
2132 SCM_SYMBOL (sym_port_read_options, "port-read-options");
2133
2134 /* Offsets of bit fields for each per-port override */
2135 #define READ_OPTION_COPY_SOURCE_P          0
2136 #define READ_OPTION_RECORD_POSITIONS_P     2
2137 #define READ_OPTION_CASE_INSENSITIVE_P     4
2138 #define READ_OPTION_KEYWORD_STYLE          6
2139 #define READ_OPTION_R6RS_ESCAPES_P         8
2140 #define READ_OPTION_SQUARE_BRACKETS_P     10
2141 #define READ_OPTION_HUNGRY_EOL_ESCAPES_P  12
2142 #define READ_OPTION_CURLY_INFIX_P         14
2143
2144 /* The total width in bits of the per-port overrides */
2145 #define READ_OPTIONS_NUM_BITS             16
2146
2147 #define READ_OPTIONS_INHERIT_ALL  ((1UL << READ_OPTIONS_NUM_BITS) - 1)
2148 #define READ_OPTIONS_MAX_VALUE    READ_OPTIONS_INHERIT_ALL
2149
2150 #define READ_OPTION_MASK     3
2151 #define READ_OPTION_INHERIT  3
2152
2153 static void
2154 set_port_read_option (SCM port, int option, int new_value)
2155 {
2156   SCM alist, scm_read_options;
2157   unsigned int read_options;
2158
2159   new_value &= READ_OPTION_MASK;
2160   alist = scm_i_port_alist (port);
2161   scm_read_options = scm_assq_ref (alist, sym_port_read_options);
2162   if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2163     read_options = scm_to_uint (scm_read_options);
2164   else
2165     read_options = READ_OPTIONS_INHERIT_ALL;
2166   read_options &= ~(READ_OPTION_MASK << option);
2167   read_options |= new_value << option;
2168   scm_read_options = scm_from_uint (read_options);
2169   alist = scm_assq_set_x (alist, sym_port_read_options, scm_read_options);
2170   scm_i_set_port_alist_x (port, alist);
2171 }
2172
2173 /* Set OPTS and PORT's case-insensitivity according to VALUE. */
2174 static void
2175 set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts, int value)
2176 {
2177   value = !!value;
2178   opts->case_insensitive_p = value;
2179   set_port_read_option (port, READ_OPTION_CASE_INSENSITIVE_P, value);
2180 }
2181
2182 /* Set OPTS and PORT's square_brackets_p option according to VALUE. */
2183 static void
2184 set_port_square_brackets_p (SCM port, scm_t_read_opts *opts, int value)
2185 {
2186   value = !!value;
2187   opts->square_brackets_p = value;
2188   set_port_read_option (port, READ_OPTION_SQUARE_BRACKETS_P, value);
2189 }
2190
2191 /* Set OPTS and PORT's curly_infix_p option according to VALUE. */
2192 static void
2193 set_port_curly_infix_p (SCM port, scm_t_read_opts *opts, int value)
2194 {
2195   value = !!value;
2196   opts->curly_infix_p = value;
2197   set_port_read_option (port, READ_OPTION_CURLY_INFIX_P, value);
2198 }
2199
2200 /* Initialize OPTS based on PORT's read options and the global read
2201    options. */
2202 static void
2203 init_read_options (SCM port, scm_t_read_opts *opts)
2204 {
2205   SCM alist, val, scm_read_options;
2206   unsigned int read_options, x;
2207
2208   alist = scm_i_port_alist (port);
2209   scm_read_options = scm_assq_ref (alist, sym_port_read_options);
2210
2211   if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2212     read_options = scm_to_uint (scm_read_options);
2213   else
2214     read_options = READ_OPTIONS_INHERIT_ALL;
2215
2216   x = READ_OPTION_MASK & (read_options >> READ_OPTION_KEYWORD_STYLE);
2217   if (x == READ_OPTION_INHERIT)
2218     {
2219       val = SCM_PACK (SCM_KEYWORD_STYLE);
2220       if (scm_is_eq (val, scm_keyword_prefix))
2221         x = KEYWORD_STYLE_PREFIX;
2222       else if (scm_is_eq (val, scm_keyword_postfix))
2223         x = KEYWORD_STYLE_POSTFIX;
2224       else
2225         x = KEYWORD_STYLE_HASH_PREFIX;
2226     }
2227   opts->keyword_style = x;
2228
2229 #define RESOLVE_BOOLEAN_OPTION(NAME, name)                              \
2230   do                                                                    \
2231     {                                                                   \
2232       x = READ_OPTION_MASK & (read_options >> READ_OPTION_ ## NAME);    \
2233       if (x == READ_OPTION_INHERIT)                                     \
2234         x = !!SCM_ ## NAME;                                             \
2235           opts->name = x;                                               \
2236     }                                                                   \
2237   while (0)
2238
2239   RESOLVE_BOOLEAN_OPTION (COPY_SOURCE_P,        copy_source_p);
2240   RESOLVE_BOOLEAN_OPTION (RECORD_POSITIONS_P,   record_positions_p);
2241   RESOLVE_BOOLEAN_OPTION (CASE_INSENSITIVE_P,   case_insensitive_p);
2242   RESOLVE_BOOLEAN_OPTION (R6RS_ESCAPES_P,       r6rs_escapes_p);
2243   RESOLVE_BOOLEAN_OPTION (SQUARE_BRACKETS_P,    square_brackets_p);
2244   RESOLVE_BOOLEAN_OPTION (HUNGRY_EOL_ESCAPES_P, hungry_eol_escapes_p);
2245   RESOLVE_BOOLEAN_OPTION (CURLY_INFIX_P,        curly_infix_p);
2246
2247 #undef RESOLVE_BOOLEAN_OPTION
2248
2249   opts->neoteric_p = 0;
2250 }
2251
2252 void
2253 scm_init_read ()
2254 {
2255   SCM read_hash_procs;
2256
2257   read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
2258
2259   scm_i_read_hash_procedures =
2260     SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
2261
2262   scm_init_opts (scm_read_options, scm_read_opts);
2263 #include "libguile/read.x"
2264 }
2265
2266 /*
2267   Local Variables:
2268   c-file-style: "gnu"
2269   End:
2270 */