libguile/read.c

   1 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
   2  *   2007, 2008, 2009, 2010, 2011, 2012, 2014 Free Software Foundation, Inc.
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public License
   6  * as published by the Free Software Foundation; either version 3 of
   7  * the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful, but
  10  * WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  17  * 02110-1301 USA
  18  */
  19
  20
  21 \f
  22
  23 #ifdef HAVE_CONFIG_H
  24 # include <config.h>
  25 #endif
  26
  27 #include <stdio.h>
  28 #include <ctype.h>
  29 #include <string.h>
  30 #include <unistd.h>
  31 #include <unicase.h>
  32 #include <unictype.h>
  33 #include <c-strcase.h>
  34
  35 #include "libguile/_scm.h"
  36 #include "libguile/bytevectors.h"
  37 #include "libguile/chars.h"
  38 #include "libguile/eval.h"
  39 #include "libguile/arrays.h"
  40 #include "libguile/bitvectors.h"
  41 #include "libguile/keywords.h"
  42 #include "libguile/alist.h"
  43 #include "libguile/srcprop.h"
  44 #include "libguile/hashtab.h"
  45 #include "libguile/hash.h"
  46 #include "libguile/ports.h"
  47 #include "libguile/fports.h"
  48 #include "libguile/root.h"
  49 #include "libguile/strings.h"
  50 #include "libguile/strports.h"
  51 #include "libguile/vectors.h"
  52 #include "libguile/validate.h"
  53 #include "libguile/srfi-4.h"
  54 #include "libguile/srfi-13.h"
  55
  56 #include "libguile/read.h"
  57 #include "libguile/private-options.h"
  58
  59
  60 \f
  61
  62 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
  63 SCM_SYMBOL (scm_keyword_prefix, "prefix");
  64 SCM_SYMBOL (scm_keyword_postfix, "postfix");
  65 SCM_SYMBOL (sym_nil, "nil");
  66
  67 /* SRFI-105 curly infix expression support */
  68 SCM_SYMBOL (sym_nfx, "$nfx$");
  69 SCM_SYMBOL (sym_bracket_list, "$bracket-list$");
  70 SCM_SYMBOL (sym_bracket_apply, "$bracket-apply$");
  71
  72 scm_t_option scm_read_opts[] =
  73   {
  74     { SCM_OPTION_BOOLEAN, "copy", 0,
  75       "Copy source code expressions." },
  76     { SCM_OPTION_BOOLEAN, "positions", 1,
  77       "Record positions of source code expressions." },
  78     { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
  79       "Convert symbols to lower case."},
  80     { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
  81       "Style of keyword recognition: #f, 'prefix or 'postfix."},
  82     { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
  83       "Use R6RS variable-length character and string hex escapes."},
  84     { SCM_OPTION_BOOLEAN, "square-brackets", 1,
  85       "Treat `[' and `]' as parentheses, for R6RS compatibility."},
  86     { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
  87       "In strings, consume leading whitespace after an escaped end-of-line."},
  88     { SCM_OPTION_BOOLEAN, "curly-infix", 0,
  89       "Support SRFI-105 curly infix expressions."},
  90     { 0, },
  91   };
  92
  93 /* Internal read options structure.  This is initialized by 'scm_read'
  94    from the global and per-port read options, and a pointer is passed
  95    down to all helper functions. */
  96
  97 enum t_keyword_style
  98   {
  99     KEYWORD_STYLE_HASH_PREFIX,
 100     KEYWORD_STYLE_PREFIX,
 101     KEYWORD_STYLE_POSTFIX
 102   };
 103
 104 struct t_read_opts
 105 {
 106   enum t_keyword_style keyword_style;
 107   unsigned int copy_source_p        : 1;
 108   unsigned int record_positions_p   : 1;
 109   unsigned int case_insensitive_p   : 1;
 110   unsigned int r6rs_escapes_p       : 1;
 111   unsigned int square_brackets_p    : 1;
 112   unsigned int hungry_eol_escapes_p : 1;
 113   unsigned int curly_infix_p        : 1;
 114   unsigned int neoteric_p           : 1;
 115 };
 116
 117 typedef struct t_read_opts scm_t_read_opts;
 118
 119
 120 /*
 121   Give meaningful error messages for errors
 122
 123   We use the format
 124
 125   FILE:LINE:COL: MESSAGE
 126   This happened in ....
 127
 128   This is not standard GNU format, but the test-suite likes the real
 129   message to be in front.
 130
 131  */
 132
 133
 134 void
 135 scm_i_input_error (char const *function,
 136                    SCM port, const char *message, SCM arg)
 137 {
 138   SCM fn = (scm_is_string (SCM_FILENAME(port))
 139             ? SCM_FILENAME(port)
 140             : scm_from_locale_string ("#<unknown port>"));
 141
 142   SCM string_port = scm_open_output_string ();
 143   SCM string = SCM_EOL;
 144   scm_simple_format (string_port,
 145                      scm_from_locale_string ("~A:~S:~S: ~A"),
 146                      scm_list_4 (fn,
 147                                  scm_from_long (SCM_LINUM (port) + 1),
 148                                  scm_from_int (SCM_COL (port) + 1),
 149                                  scm_from_locale_string (message)));
 150
 151   string = scm_get_output_string (string_port);
 152   scm_close_output_port (string_port);
 153   scm_error_scm (scm_from_latin1_symbol ("read-error"),
 154                  function? scm_from_locale_string (function) : SCM_BOOL_F,
 155                  string,
 156                  arg,
 157                  SCM_BOOL_F);
 158 }
 159
 160
 161 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
 162             (SCM setting),
 163             "Option interface for the read options. Instead of using\n"
 164             "this procedure directly, use the procedures @code{read-enable},\n"
 165             "@code{read-disable}, @code{read-set!} and @code{read-options}.")
 166 #define FUNC_NAME s_scm_read_options
 167 {
 168   SCM ans = scm_options (setting,
 169                          scm_read_opts,
 170                          FUNC_NAME);
 171   if (SCM_COPY_SOURCE_P)
 172     SCM_RECORD_POSITIONS_P = 1;
 173   return ans;
 174 }
 175 #undef FUNC_NAME
 176
 177 /* A fluid referring to an association list mapping extra hash
 178    characters to procedures.  */
 179 static SCM *scm_i_read_hash_procedures;
 180
 181 static SCM
 182 scm_i_read_hash_procedures_ref (void)
 183 {
 184   return scm_fluid_ref (*scm_i_read_hash_procedures);
 185 }
 186
 187 static void
 188 scm_i_read_hash_procedures_set_x (SCM value)
 189 {
 190   scm_fluid_set_x (*scm_i_read_hash_procedures, value);
 191 }
 192
 193 \f
 194 /* Token readers.  */
 195
 196
 197 /* Size of the C buffer used to read symbols and numbers.  */
 198 #define READER_BUFFER_SIZE            128
 199
 200 /* Number of 32-bit codepoints in the buffer used to read strings.  */
 201 #define READER_STRING_BUFFER_SIZE     128
 202
 203 /* The maximum size of Scheme character names.  */
 204 #define READER_CHAR_NAME_MAX_SIZE      50
 205
 206 /* The maximum size of reader directive names.  */
 207 #define READER_DIRECTIVE_NAME_MAX_SIZE 50
 208
 209
 210 /* `isblank' is only in C99.  */
 211 #define CHAR_IS_BLANK_(_chr)                                    \
 212   (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n')      \
 213    || ((_chr) == '\f') || ((_chr) == '\r'))
 214
 215 #ifdef MSDOS
 216 # define CHAR_IS_BLANK(_chr)                    \
 217   ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
 218 #else
 219 # define CHAR_IS_BLANK CHAR_IS_BLANK_
 220 #endif
 221
 222
 223 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
 224    structure'').  */
 225 #define CHAR_IS_R5RS_DELIMITER(c)                               \
 226   (CHAR_IS_BLANK (c)                                            \
 227    || (c) == ')' || (c) == '(' || (c) == ';' || (c) == '"')
 228
 229 #define CHAR_IS_DELIMITER(c)                                    \
 230   (CHAR_IS_R5RS_DELIMITER (c)                                   \
 231    || (((c) == ']' || (c) == '[') && (opts->square_brackets_p   \
 232                                       || opts->curly_infix_p))  \
 233    || (((c) == '}' || (c) == '{') && opts->curly_infix_p))
 234
 235 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
 236    Structure''.  */
 237 #define CHAR_IS_EXPONENT_MARKER(_chr)                           \
 238   (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f')        \
 239    || ((_chr) == 'd') || ((_chr) == 'l'))
 240
 241 /* Read an SCSH block comment.  */
 242 static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
 243 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
 244 static SCM scm_read_commented_expression (scm_t_wchar, SCM, scm_t_read_opts *);
 245 static SCM scm_read_shebang (scm_t_wchar, SCM, scm_t_read_opts *);
 246 static SCM scm_get_hash_procedure (int);
 247
 248 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 249    result in the pre-allocated buffer BUF.  Return zero if the whole token has
 250    fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
 251    bytes actually read.  */
 252 static int
 253 read_token (SCM port, scm_t_read_opts *opts,
 254             char *buf, size_t buf_size, size_t *read)
 255 {
 256    *read = 0;
 257
 258    while (*read < buf_size)
 259      {
 260        int chr;
 261
 262        chr = scm_get_byte_or_eof (port);
 263
 264        if (chr == EOF)
 265         return 0;
 266       else if (CHAR_IS_DELIMITER (chr))
 267         {
 268           scm_unget_byte (chr, port);
 269           return 0;
 270         }
 271       else
 272         {
 273           *buf = (char) chr;
 274           buf++, (*read)++;
 275         }
 276      }
 277
 278    return 1;
 279  }
 280
 281 /* Like `read_token', but return either BUFFER, or a GC-allocated buffer
 282    if the token doesn't fit in BUFFER_SIZE bytes.  */
 283 static char *
 284 read_complete_token (SCM port, scm_t_read_opts *opts,
 285                      char *buffer, size_t buffer_size, size_t *read)
 286 {
 287   int overflow = 0;
 288   size_t bytes_read, overflow_size = 0;
 289   char *overflow_buffer = NULL;
 290
 291   do
 292     {
 293       overflow = read_token (port, opts, buffer, buffer_size, &bytes_read);
 294       if (bytes_read == 0)
 295         break;
 296       if (overflow || overflow_size != 0)
 297         {
 298           if (overflow_size == 0)
 299             {
 300               overflow_buffer = scm_gc_malloc_pointerless (bytes_read, "read");
 301               memcpy (overflow_buffer, buffer, bytes_read);
 302               overflow_size = bytes_read;
 303             }
 304           else
 305             {
 306               char *new_buf =
 307                 scm_gc_malloc_pointerless (overflow_size + bytes_read, "read");
 308
 309               memcpy (new_buf, overflow_buffer, overflow_size);
 310               memcpy (new_buf + overflow_size, buffer, bytes_read);
 311
 312               overflow_buffer = new_buf;
 313               overflow_size += bytes_read;
 314             }
 315         }
 316     }
 317   while (overflow);
 318
 319   if (overflow_size)
 320     *read = overflow_size;
 321   else
 322     *read = bytes_read;
 323
 324   return (overflow_size > 0 ? overflow_buffer : buffer);
 325 }
 326
 327 /* Skip whitespace from PORT and return the first non-whitespace character
 328    read.  Raise an error on end-of-file.  */
 329 static int
 330 flush_ws (SCM port, scm_t_read_opts *opts, const char *eoferr)
 331 {
 332   scm_t_wchar c;
 333   while (1)
 334     switch (c = scm_getc (port))
 335       {
 336       case EOF:
 337       goteof:
 338         if (eoferr)
 339           {
 340             scm_i_input_error (eoferr,
 341                                port,
 342                                "end of file",
 343                                SCM_EOL);
 344           }
 345         return c;
 346
 347       case ';':
 348       lp:
 349         switch (c = scm_getc (port))
 350           {
 351           case EOF:
 352             goto goteof;
 353           default:
 354             goto lp;
 355           case SCM_LINE_INCREMENTORS:
 356             break;
 357           }
 358         break;
 359
 360       case '#':
 361         switch (c = scm_getc (port))
 362           {
 363           case EOF:
 364             eoferr = "read_sharp";
 365             goto goteof;
 366           case '!':
 367             scm_read_shebang (c, port, opts);
 368             break;
 369           case ';':
 370             scm_read_commented_expression (c, port, opts);
 371             break;
 372           case '|':
 373             if (scm_is_false (scm_get_hash_procedure (c)))
 374               {
 375                 scm_read_r6rs_block_comment (c, port);
 376                 break;
 377               }
 378             /* fall through */
 379           default:
 380             scm_ungetc (c, port);
 381             return '#';
 382           }
 383         break;
 384
 385       case SCM_LINE_INCREMENTORS:
 386       case SCM_SINGLE_SPACES:
 387       case '\t':
 388         break;
 389
 390       default:
 391         return c;
 392       }
 393
 394   return 0;
 395 }
 396
 397
 398 \f
 399 /* Token readers.  */
 400
 401 static SCM scm_read_expression (SCM port, scm_t_read_opts *opts);
 402 static SCM scm_read_sharp (int chr, SCM port, scm_t_read_opts *opts,
 403                            long line, int column);
 404
 405
 406 static SCM
 407 maybe_annotate_source (SCM x, SCM port, scm_t_read_opts *opts,
 408                        long line, int column)
 409 {
 410   if (opts->record_positions_p)
 411     scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
 412   return x;
 413 }
 414
 415 static SCM
 416 scm_read_sexp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 417 #define FUNC_NAME "scm_i_lreadparen"
 418 {
 419   int c;
 420   SCM tmp, tl, ans = SCM_EOL;
 421   const int curly_list_p = (chr == '{') && opts->curly_infix_p;
 422   const int terminating_char = ((chr == '{') ? '}'
 423                                 : ((chr == '[') ? ']'
 424                                    : ')'));
 425
 426   /* Need to capture line and column numbers here. */
 427   long line = SCM_LINUM (port);
 428   int column = SCM_COL (port) - 1;
 429
 430   c = flush_ws (port, opts, FUNC_NAME);
 431   if (terminating_char == c)
 432     return SCM_EOL;
 433
 434   scm_ungetc (c, port);
 435   tmp = scm_read_expression (port, opts);
 436
 437   /* Note that it is possible for scm_read_expression to return
 438      scm_sym_dot, but not as part of a dotted pair: as in #{.}#.  So
 439      check that it's a real dot by checking `c'.  */
 440   if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
 441     {
 442       ans = scm_read_expression (port, opts);
 443       if (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
 444         scm_i_input_error (FUNC_NAME, port, "missing close paren",
 445                            SCM_EOL);
 446       return ans;
 447     }
 448
 449   /* Build the head of the list structure. */
 450   ans = tl = scm_cons (tmp, SCM_EOL);
 451
 452   while (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
 453     {
 454       SCM new_tail;
 455
 456       if (c == ')' || (c == ']' && opts->square_brackets_p)
 457           || ((c == '}' || c == ']') && opts->curly_infix_p))
 458         scm_i_input_error (FUNC_NAME, port,
 459                            "in pair: mismatched close paren: ~A",
 460                            scm_list_1 (SCM_MAKE_CHAR (c)));
 461
 462       scm_ungetc (c, port);
 463       tmp = scm_read_expression (port, opts);
 464
 465       /* See above note about scm_sym_dot.  */
 466       if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
 467         {
 468           SCM_SETCDR (tl, scm_read_expression (port, opts));
 469
 470           c = flush_ws (port, opts, FUNC_NAME);
 471           if (terminating_char != c)
 472             scm_i_input_error (FUNC_NAME, port,
 473                                "in pair: missing close paren", SCM_EOL);
 474           break;
 475         }
 476
 477       new_tail = scm_cons (tmp, SCM_EOL);
 478       SCM_SETCDR (tl, new_tail);
 479       tl = new_tail;
 480     }
 481
 482   if (curly_list_p)
 483     {
 484       /* In addition to finding the length, 'scm_ilength' checks for
 485          improper or circular lists, in which case it returns -1. */
 486       int len = scm_ilength (ans);
 487
 488       /* The (len == 0) case is handled above */
 489       if (len == 1)
 490         /* Return directly to avoid re-annotating the element's source
 491            location with the position of the outer brace.  Also, it
 492            might not be possible to annotate the element. */
 493         return scm_car (ans);  /* {e} => e */
 494       else if (len == 2)
 495         ;  /* Leave the list unchanged: {e1 e2} => (e1 e2) */
 496       else if (len >= 3 && (len & 1))
 497         {
 498           /* It's a proper list whose length is odd and at least 3.  If
 499              the elements at odd indices (the infix operator positions)
 500              are all 'equal?', then it's a simple curly-infix list.
 501              Otherwise it's a mixed curly-infix list. */
 502           SCM op = scm_cadr (ans);
 503
 504           /* Check to see if the elements at odd indices are 'equal?' */
 505           for (tl = scm_cdddr (ans); ; tl = scm_cddr (tl))
 506             {
 507               if (scm_is_null (tl))
 508                 {
 509                   /* Convert simple curly-infix list to prefix:
 510                      {a <op> b <op> ...} => (<op> a b ...) */
 511                   tl = ans;
 512                   while (scm_is_pair (scm_cdr (tl)))
 513                     {
 514                       tmp = scm_cddr (tl);
 515                       SCM_SETCDR (tl, tmp);
 516                       tl = tmp;
 517                     }
 518                   ans = scm_cons (op, ans);
 519                   break;
 520                 }
 521               else if (scm_is_false (scm_equal_p (op, scm_car (tl))))
 522                 {
 523                   /* Mixed curly-infix list: {e ...} => ($nfx$ e ...) */
 524                   ans = scm_cons (sym_nfx, ans);
 525                   break;
 526                 }
 527             }
 528         }
 529       else
 530         /* Mixed curly-infix (possibly improper) list:
 531            {e . tail} => ($nfx$ e . tail) */
 532         ans = scm_cons (sym_nfx, ans);
 533     }
 534
 535   return maybe_annotate_source (ans, port, opts, line, column);
 536 }
 537 #undef FUNC_NAME
 538
 539
 540 /* Read a hexadecimal number NDIGITS in length.  Put its value into the variable
 541    C.  If TERMINATOR is non-null, terminate early if the TERMINATOR character is
 542    found.  */
 543 #define SCM_READ_HEX_ESCAPE(ndigits, terminator)                   \
 544   do                                                               \
 545     {                                                              \
 546       scm_t_wchar a;                                               \
 547       size_t i = 0;                                                \
 548       c = 0;                                                       \
 549       while (i < ndigits)                                          \
 550         {                                                          \
 551           a = scm_getc (port);                                     \
 552           if (a == EOF)                                            \
 553             goto str_eof;                                          \
 554           if (terminator                                           \
 555               && (a == (scm_t_wchar) terminator)                   \
 556               && (i > 0))                                          \
 557             break;                                                 \
 558           if ('0' <= a && a <= '9')                                \
 559             a -= '0';                                              \
 560           else if ('A' <= a && a <= 'F')                           \
 561             a = a - 'A' + 10;                                      \
 562           else if ('a' <= a && a <= 'f')                           \
 563             a = a - 'a' + 10;                                      \
 564           else                                                     \
 565             {                                                      \
 566               c = a;                                               \
 567               goto bad_escaped;                                    \
 568             }                                                      \
 569           c = c * 16 + a;                                          \
 570           i ++;                                                    \
 571         }                                                          \
 572     } while (0)
 573
 574 static void
 575 skip_intraline_whitespace (SCM port)
 576 {
 577   scm_t_wchar c;
 578
 579   do
 580     {
 581       c = scm_getc (port);
 582       if (c == EOF)
 583         return;
 584     }
 585   while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
 586
 587   scm_ungetc (c, port);
 588 }
 589
 590 static SCM
 591 scm_read_string (int chr, SCM port, scm_t_read_opts *opts)
 592 #define FUNC_NAME "scm_lreadr"
 593 {
 594   /* For strings smaller than C_STR, this function creates only one Scheme
 595      object (the string returned).  */
 596
 597   SCM str = SCM_EOL;
 598   size_t c_str_len = 0;
 599   scm_t_wchar c, c_str[READER_STRING_BUFFER_SIZE];
 600
 601   /* Need to capture line and column numbers here. */
 602   long line = SCM_LINUM (port);
 603   int column = SCM_COL (port) - 1;
 604
 605   while ('"' != (c = scm_getc (port)))
 606     {
 607       if (c == EOF)
 608         {
 609         str_eof:
 610           scm_i_input_error (FUNC_NAME, port,
 611                              "end of file in string constant", SCM_EOL);
 612         }
 613
 614       if (c_str_len + 1 >= READER_STRING_BUFFER_SIZE)
 615         {
 616           str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
 617           c_str_len = 0;
 618         }
 619
 620       if (c == '\\')
 621         {
 622           switch (c = scm_getc (port))
 623             {
 624             case EOF:
 625               goto str_eof;
 626             case '"':
 627             case '|':
 628             case '\\':
 629               break;
 630             case '\n':
 631               if (opts->hungry_eol_escapes_p)
 632                 skip_intraline_whitespace (port);
 633               continue;
 634             case '0':
 635               c = '\0';
 636               break;
 637             case 'f':
 638               c = '\f';
 639               break;
 640             case 'n':
 641               c = '\n';
 642               break;
 643             case 'r':
 644               c = '\r';
 645               break;
 646             case 't':
 647               c = '\t';
 648               break;
 649             case 'a':
 650               c = '\007';
 651               break;
 652             case 'v':
 653               c = '\v';
 654               break;
 655             case 'b':
 656               c = '\010';
 657               break;
 658             case 'x':
 659               if (opts->r6rs_escapes_p)
 660                 SCM_READ_HEX_ESCAPE (10, ';');
 661               else
 662                 SCM_READ_HEX_ESCAPE (2, '\0');
 663               break;
 664             case 'u':
 665               if (!opts->r6rs_escapes_p)
 666                 {
 667                   SCM_READ_HEX_ESCAPE (4, '\0');
 668                   break;
 669                 }
 670             case 'U':
 671               if (!opts->r6rs_escapes_p)
 672                 {
 673                   SCM_READ_HEX_ESCAPE (6, '\0');
 674                   break;
 675                 }
 676             default:
 677             bad_escaped:
 678               scm_i_input_error (FUNC_NAME, port,
 679                                  "illegal character in escape sequence: ~S",
 680                                  scm_list_1 (SCM_MAKE_CHAR (c)));
 681             }
 682         }
 683
 684       c_str[c_str_len++] = c;
 685     }
 686
 687   if (scm_is_null (str))
 688     /* Fast path: we got a string that fits in C_STR.  */
 689     str = scm_from_utf32_stringn (c_str, c_str_len);
 690   else
 691     {
 692       if (c_str_len > 0)
 693         str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
 694
 695       str = scm_string_concatenate_reverse (str, SCM_UNDEFINED, SCM_UNDEFINED);
 696     }
 697
 698   return maybe_annotate_source (str, port, opts, line, column);
 699 }
 700 #undef FUNC_NAME
 701
 702
 703 static SCM
 704 scm_read_number (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 705 {
 706   SCM result, str = SCM_EOL;
 707   char local_buffer[READER_BUFFER_SIZE], *buffer;
 708   size_t bytes_read;
 709   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 710
 711   /* Need to capture line and column numbers here. */
 712   long line = SCM_LINUM (port);
 713   int column = SCM_COL (port) - 1;
 714
 715   scm_ungetc (chr, port);
 716   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 717                                 &bytes_read);
 718
 719   str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
 720
 721   result = scm_string_to_number (str, SCM_UNDEFINED);
 722   if (scm_is_false (result))
 723     {
 724       /* Return a symbol instead of a number */
 725       if (opts->case_insensitive_p)
 726         str = scm_string_downcase_x (str);
 727       result = scm_string_to_symbol (str);
 728     }
 729   else if (SCM_NIMP (result))
 730     result = maybe_annotate_source (result, port, opts, line, column);
 731
 732   SCM_COL (port) += scm_i_string_length (str);
 733   return result;
 734 }
 735
 736 static SCM
 737 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 738 {
 739   SCM result;
 740   int ends_with_colon = 0;
 741   size_t bytes_read;
 742   int postfix = (opts->keyword_style == KEYWORD_STYLE_POSTFIX);
 743   char local_buffer[READER_BUFFER_SIZE], *buffer;
 744   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 745   SCM str;
 746
 747   scm_ungetc (chr, port);
 748   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 749                                 &bytes_read);
 750   if (bytes_read > 0)
 751     ends_with_colon = buffer[bytes_read - 1] == ':';
 752
 753   if (postfix && ends_with_colon && (bytes_read > 1))
 754     {
 755       str = scm_from_stringn (buffer, bytes_read - 1,
 756                               pt->encoding, pt->ilseq_handler);
 757
 758       if (opts->case_insensitive_p)
 759         str = scm_string_downcase_x (str);
 760       result = scm_symbol_to_keyword (scm_string_to_symbol (str));
 761     }
 762   else
 763     {
 764       str = scm_from_stringn (buffer, bytes_read,
 765                               pt->encoding, pt->ilseq_handler);
 766
 767       if (opts->case_insensitive_p)
 768         str = scm_string_downcase_x (str);
 769       result = scm_string_to_symbol (str);
 770     }
 771
 772   SCM_COL (port) += scm_i_string_length (str);
 773   return result;
 774 }
 775
 776 static SCM
 777 scm_read_number_and_radix (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 778 #define FUNC_NAME "scm_lreadr"
 779 {
 780   SCM result;
 781   size_t read;
 782   char local_buffer[READER_BUFFER_SIZE], *buffer;
 783   unsigned int radix;
 784   SCM str;
 785   scm_t_port *pt;
 786
 787   switch (chr)
 788     {
 789     case 'B':
 790     case 'b':
 791       radix = 2;
 792       break;
 793
 794     case 'o':
 795     case 'O':
 796       radix = 8;
 797       break;
 798
 799     case 'd':
 800     case 'D':
 801       radix = 10;
 802       break;
 803
 804     case 'x':
 805     case 'X':
 806       radix = 16;
 807       break;
 808
 809     default:
 810       scm_ungetc (chr, port);
 811       scm_ungetc ('#', port);
 812       radix = 10;
 813     }
 814
 815   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 816                                 &read);
 817
 818   pt = SCM_PTAB_ENTRY (port);
 819   str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
 820
 821   result = scm_string_to_number (str, scm_from_uint (radix));
 822
 823   SCM_COL (port) += scm_i_string_length (str);
 824
 825   if (scm_is_true (result))
 826     return result;
 827
 828   scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
 829
 830   return SCM_BOOL_F;
 831 }
 832 #undef FUNC_NAME
 833
 834 static SCM
 835 scm_read_quote (int chr, SCM port, scm_t_read_opts *opts)
 836 {
 837   SCM p;
 838   long line = SCM_LINUM (port);
 839   int column = SCM_COL (port) - 1;
 840
 841   switch (chr)
 842     {
 843     case '`':
 844       p = scm_sym_quasiquote;
 845       break;
 846
 847     case '\'':
 848       p = scm_sym_quote;
 849       break;
 850
 851     case ',':
 852       {
 853         scm_t_wchar c;
 854
 855         c = scm_getc (port);
 856         if ('@' == c)
 857           p = scm_sym_uq_splicing;
 858         else
 859           {
 860             scm_ungetc (c, port);
 861             p = scm_sym_unquote;
 862           }
 863         break;
 864       }
 865
 866     default:
 867       fprintf (stderr, "%s: unhandled quote character (%i)\n",
 868                "scm_read_quote", chr);
 869       abort ();
 870     }
 871
 872   p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
 873   return maybe_annotate_source (p, port, opts, line, column);
 874 }
 875
 876 SCM_SYMBOL (sym_syntax, "syntax");
 877 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
 878 SCM_SYMBOL (sym_unsyntax, "unsyntax");
 879 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
 880
 881 static SCM
 882 scm_read_syntax (int chr, SCM port, scm_t_read_opts *opts)
 883 {
 884   SCM p;
 885   long line = SCM_LINUM (port);
 886   int column = SCM_COL (port) - 1;
 887
 888   switch (chr)
 889     {
 890     case '`':
 891       p = sym_quasisyntax;
 892       break;
 893
 894     case '\'':
 895       p = sym_syntax;
 896       break;
 897
 898     case ',':
 899       {
 900         int c;
 901
 902         c = scm_getc (port);
 903         if ('@' == c)
 904           p = sym_unsyntax_splicing;
 905         else
 906           {
 907             scm_ungetc (c, port);
 908             p = sym_unsyntax;
 909           }
 910         break;
 911       }
 912
 913     default:
 914       fprintf (stderr, "%s: unhandled syntax character (%i)\n",
 915                "scm_read_syntax", chr);
 916       abort ();
 917     }
 918
 919   p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
 920   return maybe_annotate_source (p, port, opts, line, column);
 921 }
 922
 923 static SCM
 924 scm_read_nil (int chr, SCM port, scm_t_read_opts *opts)
 925 {
 926   SCM id = scm_read_mixed_case_symbol (chr, port, opts);
 927
 928   if (!scm_is_eq (id, sym_nil))
 929     scm_i_input_error ("scm_read_nil", port,
 930                        "unexpected input while reading #nil: ~a",
 931                        scm_list_1 (id));
 932
 933   return SCM_ELISP_NIL;
 934 }
 935
 936 static SCM
 937 scm_read_semicolon_comment (int chr, SCM port)
 938 {
 939   int c;
 940
 941   /* We use the get_byte here because there is no need to get the
 942      locale correct with comment input. This presumes that newline
 943      always represents itself no matter what the encoding is.  */
 944   for (c = scm_get_byte_or_eof (port);
 945        (c != EOF) && (c != '\n');
 946        c = scm_get_byte_or_eof (port));
 947
 948   return SCM_UNSPECIFIED;
 949 }
 950
 951 /* If the EXPECTED_CHARS are the next ones available from PORT, then
 952    consume them and return 1.  Otherwise leave the port position where
 953    it was and return 0.  EXPECTED_CHARS should be all lowercase, and
 954    will be matched case-insensitively against the characters read from
 955    PORT. */
 956 static int
 957 try_read_ci_chars (SCM port, const char *expected_chars)
 958 {
 959   int num_chars_wanted = strlen (expected_chars);
 960   int num_chars_read = 0;
 961   char *chars_read = alloca (num_chars_wanted);
 962   int c;
 963
 964   while (num_chars_read < num_chars_wanted)
 965     {
 966       c = scm_getc (port);
 967       if (c == EOF)
 968         break;
 969       else if (tolower (c) != expected_chars[num_chars_read])
 970         {
 971           scm_ungetc (c, port);
 972           break;
 973         }
 974       else
 975         chars_read[num_chars_read++] = c;
 976     }
 977
 978   if (num_chars_read == num_chars_wanted)
 979     return 1;
 980   else
 981     {
 982       while (num_chars_read > 0)
 983         scm_ungetc (chars_read[--num_chars_read], port);
 984       return 0;
 985     }
 986 }
 987
 988 \f
 989 /* Sharp readers, i.e. readers called after a `#' sign has been read.  */
 990
 991 static SCM
 992 scm_read_boolean (int chr, SCM port)
 993 {
 994   switch (chr)
 995     {
 996     case 't':
 997     case 'T':
 998       try_read_ci_chars (port, "rue");
 999       return SCM_BOOL_T;
1000
1001     case 'f':
1002     case 'F':
1003       try_read_ci_chars (port, "alse");
1004       return SCM_BOOL_F;
1005     }
1006
1007   return SCM_UNSPECIFIED;
1008 }
1009
1010 static SCM
1011 scm_read_character (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
1012 #define FUNC_NAME "scm_lreadr"
1013 {
1014   char buffer[READER_CHAR_NAME_MAX_SIZE];
1015   SCM charname;
1016   size_t charname_len, bytes_read;
1017   scm_t_wchar cp;
1018   int overflow;
1019   scm_t_port *pt;
1020
1021   overflow = read_token (port, opts, buffer, READER_CHAR_NAME_MAX_SIZE,
1022                          &bytes_read);
1023   if (overflow)
1024     scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
1025
1026   if (bytes_read == 0)
1027     {
1028       chr = scm_getc (port);
1029       if (chr == EOF)
1030         scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
1031                            "while reading character", SCM_EOL);
1032
1033       /* CHR must be a token delimiter, like a whitespace.  */
1034       return (SCM_MAKE_CHAR (chr));
1035     }
1036
1037   pt = SCM_PTAB_ENTRY (port);
1038
1039   /* Simple ASCII characters can be processed immediately.  Also, simple
1040      ISO-8859-1 characters can be processed immediately if the encoding for this
1041      port is ISO-8859-1.  */
1042   if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
1043     {
1044       SCM_COL (port) += 1;
1045       return SCM_MAKE_CHAR (buffer[0]);
1046     }
1047
1048   /* Otherwise, convert the buffer into a proper scheme string for
1049      processing.  */
1050   charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
1051                                pt->ilseq_handler);
1052   charname_len = scm_i_string_length (charname);
1053   SCM_COL (port) += charname_len;
1054   cp = scm_i_string_ref (charname, 0);
1055   if (charname_len == 1)
1056     return SCM_MAKE_CHAR (cp);
1057
1058   /* Ignore dotted circles, which may be used to keep combining characters from
1059      combining with the backslash in #\charname.  */
1060   if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
1061     return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
1062
1063   if (cp >= '0' && cp < '8')
1064     {
1065       /* Dirk:FIXME::  This type of character syntax is not R5RS
1066        * compliant.  Further, it should be verified that the constant
1067        * does only consist of octal digits.  */
1068       SCM p = scm_string_to_number (charname, scm_from_uint (8));
1069       if (SCM_I_INUMP (p))
1070         {
1071           scm_t_wchar c = scm_to_uint32 (p);
1072           if (SCM_IS_UNICODE_CHAR (c))
1073             return SCM_MAKE_CHAR (c);
1074           else
1075             scm_i_input_error (FUNC_NAME, port,
1076                                "out-of-range octal character escape: ~a",
1077                                scm_list_1 (charname));
1078         }
1079     }
1080
1081   if (cp == 'x' && (charname_len > 1))
1082     {
1083       SCM p;
1084
1085       /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1086       p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
1087                                 scm_from_uint (16));
1088       if (SCM_I_INUMP (p))
1089         {
1090           scm_t_wchar c = scm_to_uint32 (p);
1091           if (SCM_IS_UNICODE_CHAR (c))
1092             return SCM_MAKE_CHAR (c);
1093           else
1094             scm_i_input_error (FUNC_NAME, port,
1095                                "out-of-range hex character escape: ~a",
1096                                scm_list_1 (charname));
1097         }
1098     }
1099
1100   /* The names of characters should never have non-Latin1
1101      characters.  */
1102   if (scm_i_is_narrow_string (charname)
1103       || scm_i_try_narrow_string (charname))
1104     { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
1105                                        charname_len);
1106       if (scm_is_true (ch))
1107         return ch;
1108     }
1109
1110   scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
1111                      scm_list_1 (charname));
1112
1113   return SCM_UNSPECIFIED;
1114 }
1115 #undef FUNC_NAME
1116
1117 static SCM
1118 scm_read_keyword (int chr, SCM port, scm_t_read_opts *opts)
1119 {
1120   SCM symbol;
1121
1122   /* Read the symbol that comprises the keyword.  Doing this instead of
1123      invoking a specific symbol reader function allows `scm_read_keyword ()'
1124      to adapt to the delimiters currently valid of symbols.
1125
1126      XXX: This implementation allows sloppy syntaxes like `#:  key'.  */
1127   symbol = scm_read_expression (port, opts);
1128   if (!scm_is_symbol (symbol))
1129     scm_i_input_error ("scm_read_keyword", port,
1130                        "keyword prefix `~a' not followed by a symbol: ~s",
1131                        scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1132
1133   return (scm_symbol_to_keyword (symbol));
1134 }
1135
1136 static SCM
1137 scm_read_vector (int chr, SCM port, scm_t_read_opts *opts,
1138                  long line, int column)
1139 {
1140   /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1141      guarantee that it's going to do what we want.  After all, this is an
1142      implementation detail of `scm_read_vector ()', not a desirable
1143      property.  */
1144   return maybe_annotate_source (scm_vector (scm_read_sexp (chr, port, opts)),
1145                                 port, opts, line, column);
1146 }
1147
1148 /* Helper used by scm_read_array */
1149 static int
1150 read_decimal_integer (SCM port, int c, ssize_t *resp)
1151 {
1152   ssize_t sign = 1;
1153   ssize_t res = 0;
1154   int got_it = 0;
1155
1156   if (c == '-')
1157     {
1158       sign = -1;
1159       c = scm_getc (port);
1160     }
1161
1162   while ('0' <= c && c <= '9')
1163     {
1164       if (((SSIZE_MAX - (c-'0')) / 10) <= res)
1165         scm_i_input_error ("read_decimal_integer", port,
1166                            "number too large", SCM_EOL);
1167       res = 10*res + c-'0';
1168       got_it = 1;
1169       c = scm_getc (port);
1170     }
1171
1172   if (got_it)
1173     *resp = sign * res;
1174   return c;
1175 }
1176
1177 /* Read an array.  This function can also read vectors and uniform
1178    vectors.  Also, the conflict between '#f' and '#f32' and '#f64' is
1179    handled here.
1180
1181    C is the first character read after the '#'. */
1182 static SCM
1183 scm_read_array (int c, SCM port, scm_t_read_opts *opts, long line, int column)
1184 {
1185   ssize_t rank;
1186   scm_t_wchar tag_buf[8];
1187   int tag_len;
1188
1189   SCM tag, shape = SCM_BOOL_F, elements, array;
1190
1191   /* XXX - shortcut for ordinary vectors.  Shouldn't be necessary but
1192      the array code can not deal with zero-length dimensions yet, and
1193      we want to allow zero-length vectors, of course. */
1194   if (c == '(')
1195     return scm_read_vector (c, port, opts, line, column);
1196
1197   /* Disambiguate between '#f' and uniform floating point vectors. */
1198   if (c == 'f')
1199     {
1200       c = scm_getc (port);
1201       if (c != '3' && c != '6')
1202         {
1203           if (c == 'a' && try_read_ci_chars (port, "lse"))
1204             return SCM_BOOL_F;
1205           else if (c != EOF)
1206             scm_ungetc (c, port);
1207           return SCM_BOOL_F;
1208         }
1209       rank = 1;
1210       tag_buf[0] = 'f';
1211       tag_len = 1;
1212       goto continue_reading_tag;
1213     }
1214
1215   /* Read rank. */
1216   rank = 1;
1217   c = read_decimal_integer (port, c, &rank);
1218   if (rank < 0)
1219     scm_i_input_error (NULL, port, "array rank must be non-negative",
1220                        SCM_EOL);
1221
1222   /* Read tag. */
1223   tag_len = 0;
1224  continue_reading_tag:
1225   while (c != EOF && c != '(' && c != '@' && c != ':'
1226          && tag_len < sizeof tag_buf / sizeof tag_buf[0])
1227     {
1228       tag_buf[tag_len++] = c;
1229       c = scm_getc (port);
1230     }
1231   if (tag_len == 0)
1232     tag = SCM_BOOL_T;
1233   else
1234     {
1235       tag = scm_string_to_symbol (scm_from_utf32_stringn (tag_buf, tag_len));
1236       if (tag_len == sizeof tag_buf / sizeof tag_buf[0])
1237         scm_i_input_error (NULL, port, "invalid array tag, starting with: ~a",
1238                            scm_list_1 (tag));
1239     }
1240
1241   /* Read shape. */
1242   if (c == '@' || c == ':')
1243     {
1244       shape = SCM_EOL;
1245
1246       do
1247         {
1248           ssize_t lbnd = 0, len = 0;
1249           SCM s;
1250
1251           if (c == '@')
1252             {
1253               c = scm_getc (port);
1254               c = read_decimal_integer (port, c, &lbnd);
1255             }
1256
1257           s = scm_from_ssize_t (lbnd);
1258
1259           if (c == ':')
1260             {
1261               c = scm_getc (port);
1262               c = read_decimal_integer (port, c, &len);
1263               if (len < 0)
1264                 scm_i_input_error (NULL, port,
1265                                    "array length must be non-negative",
1266                                    SCM_EOL);
1267
1268               s = scm_list_2 (s, scm_from_ssize_t (lbnd+len-1));
1269             }
1270
1271           shape = scm_cons (s, shape);
1272         } while (c == '@' || c == ':');
1273
1274       shape = scm_reverse_x (shape, SCM_EOL);
1275     }
1276
1277   /* Read nested lists of elements. */
1278   if (c != '(')
1279     scm_i_input_error (NULL, port,
1280                        "missing '(' in vector or array literal",
1281                        SCM_EOL);
1282   elements = scm_read_sexp (c, port, opts);
1283
1284   if (scm_is_false (shape))
1285     shape = scm_from_ssize_t (rank);
1286   else if (scm_ilength (shape) != rank)
1287     scm_i_input_error
1288       (NULL, port,
1289        "the number of shape specifications must match the array rank",
1290        SCM_EOL);
1291
1292   /* Handle special print syntax of rank zero arrays; see
1293      scm_i_print_array for a rationale. */
1294   if (rank == 0)
1295     {
1296       if (!scm_is_pair (elements))
1297         scm_i_input_error (NULL, port,
1298                            "too few elements in array literal, need 1",
1299                            SCM_EOL);
1300       if (!scm_is_null (SCM_CDR (elements)))
1301         scm_i_input_error (NULL, port,
1302                            "too many elements in array literal, want 1",
1303                            SCM_EOL);
1304       elements = SCM_CAR (elements);
1305     }
1306
1307   /* Construct array, annotate with source location, and return. */
1308   array = scm_list_to_typed_array (tag, shape, elements);
1309   return maybe_annotate_source (array, port, opts, line, column);
1310 }
1311
1312 static SCM
1313 scm_read_srfi4_vector (int chr, SCM port, scm_t_read_opts *opts,
1314                        long line, int column)
1315 {
1316   return scm_read_array (chr, port, opts, line, column);
1317 }
1318
1319 static SCM
1320 scm_read_bytevector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1321                      long line, int column)
1322 {
1323   chr = scm_getc (port);
1324   if (chr != 'u')
1325     goto syntax;
1326
1327   chr = scm_getc (port);
1328   if (chr != '8')
1329     goto syntax;
1330
1331   chr = scm_getc (port);
1332   if (chr != '(')
1333     goto syntax;
1334
1335   return maybe_annotate_source
1336     (scm_u8_list_to_bytevector (scm_read_sexp (chr, port, opts)),
1337      port, opts, line, column);
1338
1339  syntax:
1340   scm_i_input_error ("read_bytevector", port,
1341                      "invalid bytevector prefix",
1342                      SCM_MAKE_CHAR (chr));
1343   return SCM_UNSPECIFIED;
1344 }
1345
1346 static SCM
1347 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1348                            long line, int column)
1349 {
1350   /* Read the `#*10101'-style read syntax for bit vectors in Guile.  This is
1351      terribly inefficient but who cares?  */
1352   SCM s_bits = SCM_EOL;
1353
1354   for (chr = scm_getc (port);
1355        (chr != EOF) && ((chr == '0') || (chr == '1'));
1356        chr = scm_getc (port))
1357     {
1358       s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1359     }
1360
1361   if (chr != EOF)
1362     scm_ungetc (chr, port);
1363
1364   return maybe_annotate_source
1365     (scm_bitvector (scm_reverse_x (s_bits, SCM_EOL)),
1366      port, opts, line, column);
1367 }
1368
1369 static SCM
1370 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1371 {
1372   int bang_seen = 0;
1373
1374   for (;;)
1375     {
1376       int c = scm_getc (port);
1377
1378       if (c == EOF)
1379         scm_i_input_error ("skip_block_comment", port,
1380                            "unterminated `#! ... !#' comment", SCM_EOL);
1381
1382       if (c == '!')
1383         bang_seen = 1;
1384       else if (c == '#' && bang_seen)
1385         break;
1386       else
1387         bang_seen = 0;
1388     }
1389
1390   return SCM_UNSPECIFIED;
1391 }
1392
1393 static void set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts,
1394                                          int value);
1395 static void set_port_square_brackets_p (SCM port, scm_t_read_opts *opts,
1396                                         int value);
1397 static void set_port_curly_infix_p (SCM port, scm_t_read_opts *opts,
1398                                     int value);
1399
1400 static SCM
1401 scm_read_shebang (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
1402 {
1403   char name[READER_DIRECTIVE_NAME_MAX_SIZE + 1];
1404   int c;
1405   int i = 0;
1406
1407   while (i <= READER_DIRECTIVE_NAME_MAX_SIZE)
1408     {
1409       c = scm_getc (port);
1410       if (c == EOF)
1411         scm_i_input_error ("skip_block_comment", port,
1412                            "unterminated `#! ... !#' comment", SCM_EOL);
1413       else if (('a' <= c && c <= 'z') || ('0' <= c && c <= '9') || c == '-')
1414         name[i++] = c;
1415       else if (CHAR_IS_DELIMITER (c))
1416         {
1417           scm_ungetc (c, port);
1418           name[i] = '\0';
1419           if (0 == strcmp ("r6rs", name))
1420             ;  /* Silently ignore */
1421           else if (0 == strcmp ("fold-case", name))
1422             set_port_case_insensitive_p (port, opts, 1);
1423           else if (0 == strcmp ("no-fold-case", name))
1424             set_port_case_insensitive_p (port, opts, 0);
1425           else if (0 == strcmp ("curly-infix", name))
1426             set_port_curly_infix_p (port, opts, 1);
1427           else if (0 == strcmp ("curly-infix-and-bracket-lists", name))
1428             {
1429               set_port_curly_infix_p (port, opts, 1);
1430               set_port_square_brackets_p (port, opts, 0);
1431             }
1432           else
1433             break;
1434
1435           return SCM_UNSPECIFIED;
1436         }
1437       else
1438         {
1439           scm_ungetc (c, port);
1440           break;
1441         }
1442     }
1443   while (i > 0)
1444     scm_ungetc (name[--i], port);
1445   return scm_read_scsh_block_comment (chr, port);
1446 }
1447
1448 static SCM
1449 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1450 {
1451   /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1452      nested.  So care must be taken.  */
1453   int nesting_level = 1;
1454
1455   int a = scm_getc (port);
1456
1457   if (a == EOF)
1458     scm_i_input_error ("scm_read_r6rs_block_comment", port,
1459                        "unterminated `#| ... |#' comment", SCM_EOL);
1460
1461   while (nesting_level > 0)
1462     {
1463       int b = scm_getc (port);
1464
1465       if (b == EOF)
1466         scm_i_input_error ("scm_read_r6rs_block_comment", port,
1467                            "unterminated `#| ... |#' comment", SCM_EOL);
1468
1469       if (a == '|' && b == '#')
1470         {
1471           nesting_level--;
1472           b = EOF;
1473         }
1474       else if (a == '#' && b == '|')
1475         {
1476           nesting_level++;
1477           b = EOF;
1478         }
1479
1480       a = b;
1481     }
1482
1483   return SCM_UNSPECIFIED;
1484 }
1485
1486 static SCM
1487 scm_read_commented_expression (scm_t_wchar chr, SCM port,
1488                                scm_t_read_opts *opts)
1489 {
1490   scm_t_wchar c;
1491
1492   c = flush_ws (port, opts, (char *) NULL);
1493   if (EOF == c)
1494     scm_i_input_error ("read_commented_expression", port,
1495                        "no expression after #; comment", SCM_EOL);
1496   scm_ungetc (c, port);
1497   scm_read_expression (port, opts);
1498   return SCM_UNSPECIFIED;
1499 }
1500
1501 static SCM
1502 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1503 {
1504   /* Guile's extended symbol read syntax looks like this:
1505
1506        #{This is all a symbol name}#
1507
1508      So here, CHR is expected to be `{'.  */
1509   int saw_brace = 0;
1510   size_t len = 0;
1511   SCM buf = scm_i_make_string (1024, NULL, 0);
1512
1513   buf = scm_i_string_start_writing (buf);
1514
1515   while ((chr = scm_getc (port)) != EOF)
1516     {
1517       if (saw_brace)
1518         {
1519           if (chr == '#')
1520             {
1521               break;
1522             }
1523           else
1524             {
1525               saw_brace = 0;
1526               scm_i_string_set_x (buf, len++, '}');
1527             }
1528         }
1529
1530       if (chr == '}')
1531         saw_brace = 1;
1532       else if (chr == '\\')
1533         {
1534           /* It used to be that print.c would print extended-read-syntax
1535              symbols with backslashes before "non-standard" chars, but
1536              this routine wouldn't do anything with those escapes.
1537              Bummer.  What we've done is to change print.c to output
1538              R6RS hex escapes for those characters, relying on the fact
1539              that the extended read syntax would never put a `\' before
1540              an `x'.  For now, we just ignore other instances of
1541              backslash in the string.  */
1542           switch ((chr = scm_getc (port)))
1543             {
1544             case EOF:
1545               goto done;
1546             case 'x':
1547               {
1548                 scm_t_wchar c;
1549
1550                 SCM_READ_HEX_ESCAPE (10, ';');
1551                 scm_i_string_set_x (buf, len++, c);
1552                 break;
1553
1554               str_eof:
1555                 chr = EOF;
1556                 goto done;
1557
1558               bad_escaped:
1559                 scm_i_string_stop_writing ();
1560                 scm_i_input_error ("scm_read_extended_symbol", port,
1561                                    "illegal character in escape sequence: ~S",
1562                                    scm_list_1 (SCM_MAKE_CHAR (c)));
1563                 break;
1564               }
1565             default:
1566               scm_i_string_set_x (buf, len++, chr);
1567               break;
1568             }
1569         }
1570       else
1571         scm_i_string_set_x (buf, len++, chr);
1572
1573       if (len >= scm_i_string_length (buf) - 2)
1574         {
1575           SCM addy;
1576
1577           scm_i_string_stop_writing ();
1578           addy = scm_i_make_string (1024, NULL, 0);
1579           buf = scm_string_append (scm_list_2 (buf, addy));
1580           len = 0;
1581           buf = scm_i_string_start_writing (buf);
1582         }
1583     }
1584
1585  done:
1586   scm_i_string_stop_writing ();
1587   if (chr == EOF)
1588     scm_i_input_error ("scm_read_extended_symbol", port,
1589                        "end of file while reading symbol", SCM_EOL);
1590
1591   return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1592 }
1593
1594
1595 \f
1596 /* Top-level token readers, i.e., dispatchers.  */
1597
1598 static SCM
1599 scm_read_sharp_extension (int chr, SCM port, scm_t_read_opts *opts)
1600 {
1601   SCM proc;
1602
1603   proc = scm_get_hash_procedure (chr);
1604   if (scm_is_true (scm_procedure_p (proc)))
1605     {
1606       long line = SCM_LINUM (port);
1607       int column = SCM_COL (port) - 2;
1608       SCM got;
1609
1610       got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1611
1612       if (opts->record_positions_p && SCM_NIMP (got)
1613           && !scm_i_has_source_properties (got))
1614         scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1615
1616       return got;
1617     }
1618
1619   return SCM_UNSPECIFIED;
1620 }
1621
1622 /* The reader for the sharp `#' character.  It basically dispatches reads
1623    among the above token readers.   */
1624 static SCM
1625 scm_read_sharp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1626                 long line, int column)
1627 #define FUNC_NAME "scm_lreadr"
1628 {
1629   SCM result;
1630
1631   chr = scm_getc (port);
1632
1633   result = scm_read_sharp_extension (chr, port, opts);
1634   if (!scm_is_eq (result, SCM_UNSPECIFIED))
1635     return result;
1636
1637   switch (chr)
1638     {
1639     case '\\':
1640       return (scm_read_character (chr, port, opts));
1641     case '(':
1642       return (scm_read_vector (chr, port, opts, line, column));
1643     case 's':
1644     case 'u':
1645     case 'f':
1646     case 'c':
1647       /* This one may return either a boolean or an SRFI-4 vector.  */
1648       return (scm_read_srfi4_vector (chr, port, opts, line, column));
1649     case 'v':
1650       return (scm_read_bytevector (chr, port, opts, line, column));
1651     case '*':
1652       return (scm_read_guile_bit_vector (chr, port, opts, line, column));
1653     case 't':
1654     case 'T':
1655     case 'F':
1656       return (scm_read_boolean (chr, port));
1657     case ':':
1658       return (scm_read_keyword (chr, port, opts));
1659     case '0': case '1': case '2': case '3': case '4':
1660     case '5': case '6': case '7': case '8': case '9':
1661     case '@':
1662 #if SCM_ENABLE_DEPRECATED
1663       /* See below for 'i' and 'e'. */
1664     case 'a':
1665     case 'y':
1666     case 'h':
1667     case 'l':
1668 #endif
1669       return (scm_read_array (chr, port, opts, line, column));
1670
1671     case 'i':
1672     case 'e':
1673 #if SCM_ENABLE_DEPRECATED
1674       {
1675         /* When next char is '(', it really is an old-style
1676            uniform array. */
1677         scm_t_wchar next_c = scm_getc (port);
1678         if (next_c != EOF)
1679           scm_ungetc (next_c, port);
1680         if (next_c == '(')
1681           return scm_read_array (chr, port, opts, line, column);
1682         /* Fall through. */
1683       }
1684 #endif
1685     case 'b':
1686     case 'B':
1687     case 'o':
1688     case 'O':
1689     case 'd':
1690     case 'D':
1691     case 'x':
1692     case 'X':
1693     case 'I':
1694     case 'E':
1695       return (scm_read_number_and_radix (chr, port, opts));
1696     case '{':
1697       return (scm_read_extended_symbol (chr, port));
1698     case '!':
1699       return (scm_read_shebang (chr, port, opts));
1700     case ';':
1701       return (scm_read_commented_expression (chr, port, opts));
1702     case '`':
1703     case '\'':
1704     case ',':
1705       return (scm_read_syntax (chr, port, opts));
1706     case 'n':
1707       return (scm_read_nil (chr, port, opts));
1708     default:
1709       result = scm_read_sharp_extension (chr, port, opts);
1710       if (scm_is_eq (result, SCM_UNSPECIFIED))
1711         {
1712           /* To remain compatible with 1.8 and earlier, the following
1713              characters have lower precedence than `read-hash-extend'
1714              characters.  */
1715           switch (chr)
1716             {
1717             case '|':
1718               return scm_read_r6rs_block_comment (chr, port);
1719             default:
1720               scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1721                                  scm_list_1 (SCM_MAKE_CHAR (chr)));
1722             }
1723         }
1724       else
1725         return result;
1726     }
1727
1728   return SCM_UNSPECIFIED;
1729 }
1730 #undef FUNC_NAME
1731
1732 static SCM
1733 read_inner_expression (SCM port, scm_t_read_opts *opts)
1734 #define FUNC_NAME "read_inner_expression"
1735 {
1736   while (1)
1737     {
1738       scm_t_wchar chr;
1739
1740       chr = scm_getc (port);
1741
1742       switch (chr)
1743         {
1744         case SCM_WHITE_SPACES:
1745         case SCM_LINE_INCREMENTORS:
1746           break;
1747         case ';':
1748           (void) scm_read_semicolon_comment (chr, port);
1749           break;
1750         case '{':
1751           if (opts->curly_infix_p)
1752             {
1753               if (opts->neoteric_p)
1754                 return scm_read_sexp (chr, port, opts);
1755               else
1756                 {
1757                   SCM expr;
1758
1759                   /* Enable neoteric expressions within curly braces */
1760                   opts->neoteric_p = 1;
1761                   expr = scm_read_sexp (chr, port, opts);
1762                   opts->neoteric_p = 0;
1763                   return expr;
1764                 }
1765             }
1766           else
1767             return scm_read_mixed_case_symbol (chr, port, opts);
1768         case '[':
1769           if (opts->square_brackets_p)
1770             return scm_read_sexp (chr, port, opts);
1771           else if (opts->curly_infix_p)
1772             {
1773               /* The syntax of neoteric expressions requires that '[' be
1774                  a delimiter when curly-infix is enabled, so it cannot
1775                  be part of an unescaped symbol.  We might as well do
1776                  something useful with it, so we adopt Kawa's convention:
1777                  [...] => ($bracket-list$ ...) */
1778               long line = SCM_LINUM (port);
1779               int column = SCM_COL (port) - 1;
1780               return maybe_annotate_source
1781                 (scm_cons (sym_bracket_list, scm_read_sexp (chr, port, opts)),
1782                  port, opts, line, column);
1783             }
1784           else
1785             return scm_read_mixed_case_symbol (chr, port, opts);
1786         case '(':
1787           return (scm_read_sexp (chr, port, opts));
1788         case '"':
1789           return (scm_read_string (chr, port, opts));
1790         case '\'':
1791         case '`':
1792         case ',':
1793           return (scm_read_quote (chr, port, opts));
1794         case '#':
1795           {
1796             long line  = SCM_LINUM (port);
1797             int column = SCM_COL (port) - 1;
1798             SCM result = scm_read_sharp (chr, port, opts, line, column);
1799             if (scm_is_eq (result, SCM_UNSPECIFIED))
1800               /* We read a comment or some such.  */
1801               break;
1802             else
1803               return result;
1804           }
1805         case ')':
1806           scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1807           break;
1808         case '}':
1809           if (opts->curly_infix_p)
1810             scm_i_input_error (FUNC_NAME, port, "unexpected \"}\"", SCM_EOL);
1811           else
1812             return scm_read_mixed_case_symbol (chr, port, opts);
1813         case ']':
1814           if (opts->square_brackets_p)
1815             scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1816           /* otherwise fall through */
1817         case EOF:
1818           return SCM_EOF_VAL;
1819         case ':':
1820           if (opts->keyword_style == KEYWORD_STYLE_PREFIX)
1821             return scm_symbol_to_keyword (scm_read_expression (port, opts));
1822           /* Fall through.  */
1823
1824         default:
1825           {
1826             if (((chr >= '0') && (chr <= '9'))
1827                 || (strchr ("+-.", chr)))
1828               return (scm_read_number (chr, port, opts));
1829             else
1830               return (scm_read_mixed_case_symbol (chr, port, opts));
1831           }
1832         }
1833     }
1834 }
1835 #undef FUNC_NAME
1836
1837 static SCM
1838 scm_read_expression (SCM port, scm_t_read_opts *opts)
1839 #define FUNC_NAME "scm_read_expression"
1840 {
1841   if (!opts->neoteric_p)
1842     return read_inner_expression (port, opts);
1843   else
1844     {
1845       long line = 0;
1846       int column = 0;
1847       SCM expr;
1848
1849       if (opts->record_positions_p)
1850         {
1851           /* We need to get the position of the first non-whitespace
1852              character in order to correctly annotate neoteric
1853              expressions.  For example, for the expression 'f(x)', the
1854              first call to 'read_inner_expression' reads the 'f' (which
1855              cannot be annotated), and then we later read the '(x)' and
1856              use it to construct the new list (f x). */
1857           int c = flush_ws (port, opts, (char *) NULL);
1858           if (c == EOF)
1859             return SCM_EOF_VAL;
1860           scm_ungetc (c, port);
1861           line = SCM_LINUM (port);
1862           column = SCM_COL (port);
1863         }
1864
1865       expr = read_inner_expression (port, opts);
1866
1867       /* 'expr' is the first component of the neoteric expression.  Now
1868          we loop, and as long as the next character is '(', '[', or '{',
1869          (without any intervening whitespace), we use it to construct a
1870          new expression.  For example, f{n - 1}(x) => ((f (- n 1)) x). */
1871       for (;;)
1872         {
1873           int chr = scm_getc (port);
1874
1875           if (chr == '(')
1876             /* e(...) => (e ...) */
1877             expr = scm_cons (expr, scm_read_sexp (chr, port, opts));
1878           else if (chr == '[')
1879             /* e[...] => ($bracket-apply$ e ...) */
1880             expr = scm_cons (sym_bracket_apply,
1881                              scm_cons (expr,
1882                                        scm_read_sexp (chr, port, opts)));
1883           else if (chr == '{')
1884             {
1885               SCM arg = scm_read_sexp (chr, port, opts);
1886
1887               if (scm_is_null (arg))
1888                 expr = scm_list_1 (expr);       /* e{} => (e) */
1889               else
1890                 expr = scm_list_2 (expr, arg);  /* e{...} => (e {...}) */
1891             }
1892           else
1893             {
1894               if (chr != EOF)
1895                 scm_ungetc (chr, port);
1896               break;
1897             }
1898           maybe_annotate_source (expr, port, opts, line, column);
1899         }
1900       return expr;
1901     }
1902 }
1903 #undef FUNC_NAME
1904
1905 \f
1906 /* Actual reader.  */
1907
1908 static void init_read_options (SCM port, scm_t_read_opts *opts);
1909
1910 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1911             (SCM port),
1912             "Read an s-expression from the input port @var{port}, or from\n"
1913             "the current input port if @var{port} is not specified.\n"
1914             "Any whitespace before the next token is discarded.")
1915 #define FUNC_NAME s_scm_read
1916 {
1917   scm_t_read_opts opts;
1918   int c;
1919
1920   if (SCM_UNBNDP (port))
1921     port = scm_current_input_port ();
1922   SCM_VALIDATE_OPINPORT (1, port);
1923
1924   init_read_options (port, &opts);
1925
1926   c = flush_ws (port, &opts, (char *) NULL);
1927   if (EOF == c)
1928     return SCM_EOF_VAL;
1929   scm_ungetc (c, port);
1930
1931   return (scm_read_expression (port, &opts));
1932 }
1933 #undef FUNC_NAME
1934
1935
1936 \f
1937
1938 /* Manipulate the read-hash-procedures alist.  This could be written in
1939    Scheme, but maybe it will also be used by C code during initialisation.  */
1940 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1941             (SCM chr, SCM proc),
1942             "Install the procedure @var{proc} for reading expressions\n"
1943             "starting with the character sequence @code{#} and @var{chr}.\n"
1944             "@var{proc} will be called with two arguments:  the character\n"
1945             "@var{chr} and the port to read further data from. The object\n"
1946             "returned will be the return value of @code{read}. \n"
1947             "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1948             )
1949 #define FUNC_NAME s_scm_read_hash_extend
1950 {
1951   SCM this;
1952   SCM prev;
1953
1954   SCM_VALIDATE_CHAR (1, chr);
1955   SCM_ASSERT (scm_is_false (proc)
1956               || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1957               proc, SCM_ARG2, FUNC_NAME);
1958
1959   /* Check if chr is already in the alist.  */
1960   this = scm_i_read_hash_procedures_ref ();
1961   prev = SCM_BOOL_F;
1962   while (1)
1963     {
1964       if (scm_is_null (this))
1965         {
1966           /* not found, so add it to the beginning.  */
1967           if (scm_is_true (proc))
1968             {
1969               SCM new = scm_cons (scm_cons (chr, proc),
1970                                   scm_i_read_hash_procedures_ref ());
1971               scm_i_read_hash_procedures_set_x (new);
1972             }
1973           break;
1974         }
1975       if (scm_is_eq (chr, SCM_CAAR (this)))
1976         {
1977           /* already in the alist.  */
1978           if (scm_is_false (proc))
1979             {
1980               /* remove it.  */
1981               if (scm_is_false (prev))
1982                 {
1983                   SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1984                   scm_i_read_hash_procedures_set_x (rest);
1985                 }
1986               else
1987                 scm_set_cdr_x (prev, SCM_CDR (this));
1988             }
1989           else
1990             {
1991               /* replace it.  */
1992               scm_set_cdr_x (SCM_CAR (this), proc);
1993             }
1994           break;
1995         }
1996       prev = this;
1997       this = SCM_CDR (this);
1998     }
1999
2000   return SCM_UNSPECIFIED;
2001 }
2002 #undef FUNC_NAME
2003
2004 /* Recover the read-hash procedure corresponding to char c.  */
2005 static SCM
2006 scm_get_hash_procedure (int c)
2007 {
2008   SCM rest = scm_i_read_hash_procedures_ref ();
2009
2010   while (1)
2011     {
2012       if (scm_is_null (rest))
2013         return SCM_BOOL_F;
2014
2015       if (SCM_CHAR (SCM_CAAR (rest)) == c)
2016         return SCM_CDAR (rest);
2017
2018       rest = SCM_CDR (rest);
2019     }
2020 }
2021
2022 #define SCM_ENCODING_SEARCH_SIZE (500)
2023
2024 /* Search the first few hundred characters of a file for an Emacs-like coding
2025    declaration.  Returns either NULL or a string whose storage has been
2026    allocated with `scm_gc_malloc ()'.  */
2027 char *
2028 scm_i_scan_for_encoding (SCM port)
2029 {
2030   scm_t_port *pt;
2031   char header[SCM_ENCODING_SEARCH_SIZE+1];
2032   size_t bytes_read, encoding_length, i;
2033   char *encoding = NULL;
2034   char *pos, *encoding_start;
2035   int in_comment;
2036
2037   pt = SCM_PTAB_ENTRY (port);
2038
2039   if (pt->rw_active == SCM_PORT_WRITE)
2040     scm_flush (port);
2041
2042   if (pt->rw_random)
2043     pt->rw_active = SCM_PORT_READ;
2044
2045   if (pt->read_pos == pt->read_end)
2046     {
2047       /* We can use the read buffer, and thus avoid a seek. */
2048       if (scm_fill_input (port) == EOF)
2049         return NULL;
2050
2051       bytes_read = pt->read_end - pt->read_pos;
2052       if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
2053         bytes_read = SCM_ENCODING_SEARCH_SIZE;
2054
2055       if (bytes_read <= 1)
2056         /* An unbuffered port -- don't scan.  */
2057         return NULL;
2058
2059       memcpy (header, pt->read_pos, bytes_read);
2060       header[bytes_read] = '\0';
2061     }
2062   else
2063     {
2064       /* Try to read some bytes and then seek back.  Not all ports
2065          support seeking back; and indeed some file ports (like
2066          /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
2067          check performed by SCM_FPORT_FDES---but fail to seek
2068          backwards.  Hence this block comes second.  We prefer to use
2069          the read buffer in-place.  */
2070       if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
2071         return NULL;
2072
2073       bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
2074       header[bytes_read] = '\0';
2075       scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
2076     }
2077
2078   /* search past "coding[:=]" */
2079   pos = header;
2080   while (1)
2081     {
2082       if ((pos = strstr(pos, "coding")) == NULL)
2083         return NULL;
2084
2085       pos += strlen("coding");
2086       if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
2087           (*pos == ':' || *pos == '='))
2088         {
2089           pos ++;
2090           break;
2091         }
2092     }
2093
2094   /* skip spaces */
2095   while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
2096          (*pos == ' ' || *pos == '\t'))
2097     pos ++;
2098
2099   /* grab the next token */
2100   encoding_start = pos;
2101   i = 0;
2102   while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
2103          && encoding_start + i - header < bytes_read
2104          && (isalnum ((int) encoding_start[i])
2105              || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
2106     i++;
2107
2108   encoding_length = i;
2109   if (encoding_length == 0)
2110     return NULL;
2111
2112   encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
2113   for (i = 0; i < encoding_length; i++)
2114     encoding[i] = toupper ((int) encoding[i]);
2115
2116   /* push backwards to make sure we were in a comment */
2117   in_comment = 0;
2118   pos = encoding_start;
2119   while (pos >= header)
2120     {
2121       if (*pos == ';')
2122         {
2123           in_comment = 1;
2124           break;
2125         }
2126       else if (*pos == '\n' || pos == header)
2127         {
2128           /* This wasn't in a semicolon comment. Check for a
2129            hash-bang comment. */
2130           char *beg = strstr (header, "#!");
2131           char *end = strstr (header, "!#");
2132           if (beg < encoding_start && encoding_start + encoding_length <= end)
2133             in_comment = 1;
2134           break;
2135         }
2136       else
2137         {
2138           pos --;
2139           continue;
2140         }
2141     }
2142   if (!in_comment)
2143     /* This wasn't in a comment */
2144     return NULL;
2145
2146   return encoding;
2147 }
2148
2149 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
2150             (SCM port),
2151             "Scans the port for an Emacs-like character coding declaration\n"
2152             "near the top of the contents of a port with random-accessible contents.\n"
2153             "The coding declaration is of the form\n"
2154             "@code{coding: XXXXX} and must appear in a scheme comment.\n"
2155             "\n"
2156             "Returns a string containing the character encoding of the file\n"
2157             "if a declaration was found, or @code{#f} otherwise.\n")
2158 #define FUNC_NAME s_scm_file_encoding
2159 {
2160   char *enc;
2161   SCM s_enc;
2162
2163   SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
2164
2165   enc = scm_i_scan_for_encoding (port);
2166   if (enc == NULL)
2167     return SCM_BOOL_F;
2168   else
2169     {
2170       s_enc = scm_from_locale_string (enc);
2171       return s_enc;
2172     }
2173
2174   return SCM_BOOL_F;
2175 }
2176 #undef FUNC_NAME
2177
2178 \f
2179 /* Per-port read options.
2180
2181    We store per-port read options in the 'port-read-options' port
2182    property, which is stored in the internal port structure.  The value
2183    stored is a single integer that contains a two-bit field for each
2184    read option.
2185
2186    If a bit field contains READ_OPTION_INHERIT (3), that indicates that
2187    the applicable value should be inherited from the corresponding
2188    global read option.  Otherwise, the bit field contains the value of
2189    the read option.  For boolean read options that have been set
2190    per-port, the possible values are 0 or 1.  If the 'keyword_style'
2191    read option has been set per-port, its possible values are those in
2192    'enum t_keyword_style'. */
2193
2194 /* Key to read options in port properties. */
2195 SCM_SYMBOL (sym_port_read_options, "port-read-options");
2196
2197 /* Offsets of bit fields for each per-port override */
2198 #define READ_OPTION_COPY_SOURCE_P          0
2199 #define READ_OPTION_RECORD_POSITIONS_P     2
2200 #define READ_OPTION_CASE_INSENSITIVE_P     4
2201 #define READ_OPTION_KEYWORD_STYLE          6
2202 #define READ_OPTION_R6RS_ESCAPES_P         8
2203 #define READ_OPTION_SQUARE_BRACKETS_P     10
2204 #define READ_OPTION_HUNGRY_EOL_ESCAPES_P  12
2205 #define READ_OPTION_CURLY_INFIX_P         14
2206
2207 /* The total width in bits of the per-port overrides */
2208 #define READ_OPTIONS_NUM_BITS             16
2209
2210 #define READ_OPTIONS_INHERIT_ALL  ((1UL << READ_OPTIONS_NUM_BITS) - 1)
2211 #define READ_OPTIONS_MAX_VALUE    READ_OPTIONS_INHERIT_ALL
2212
2213 #define READ_OPTION_MASK     3
2214 #define READ_OPTION_INHERIT  3
2215
2216 static void
2217 set_port_read_option (SCM port, int option, int new_value)
2218 {
2219   SCM scm_read_options;
2220   unsigned int read_options;
2221
2222   new_value &= READ_OPTION_MASK;
2223   scm_read_options = scm_i_port_property (port, sym_port_read_options);
2224   if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2225     read_options = scm_to_uint (scm_read_options);
2226   else
2227     read_options = READ_OPTIONS_INHERIT_ALL;
2228   read_options &= ~(READ_OPTION_MASK << option);
2229   read_options |= new_value << option;
2230   scm_read_options = scm_from_uint (read_options);
2231   scm_i_set_port_property_x (port, sym_port_read_options, scm_read_options);
2232 }
2233
2234 /* Set OPTS and PORT's case-insensitivity according to VALUE. */
2235 static void
2236 set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts, int value)
2237 {
2238   value = !!value;
2239   opts->case_insensitive_p = value;
2240   set_port_read_option (port, READ_OPTION_CASE_INSENSITIVE_P, value);
2241 }
2242
2243 /* Set OPTS and PORT's square_brackets_p option according to VALUE. */
2244 static void
2245 set_port_square_brackets_p (SCM port, scm_t_read_opts *opts, int value)
2246 {
2247   value = !!value;
2248   opts->square_brackets_p = value;
2249   set_port_read_option (port, READ_OPTION_SQUARE_BRACKETS_P, value);
2250 }
2251
2252 /* Set OPTS and PORT's curly_infix_p option according to VALUE. */
2253 static void
2254 set_port_curly_infix_p (SCM port, scm_t_read_opts *opts, int value)
2255 {
2256   value = !!value;
2257   opts->curly_infix_p = value;
2258   set_port_read_option (port, READ_OPTION_CURLY_INFIX_P, value);
2259 }
2260
2261 /* Initialize OPTS based on PORT's read options and the global read
2262    options. */
2263 static void
2264 init_read_options (SCM port, scm_t_read_opts *opts)
2265 {
2266   SCM val, scm_read_options;
2267   unsigned int read_options, x;
2268
2269   scm_read_options = scm_i_port_property (port, sym_port_read_options);
2270
2271   if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2272     read_options = scm_to_uint (scm_read_options);
2273   else
2274     read_options = READ_OPTIONS_INHERIT_ALL;
2275
2276   x = READ_OPTION_MASK & (read_options >> READ_OPTION_KEYWORD_STYLE);
2277   if (x == READ_OPTION_INHERIT)
2278     {
2279       val = SCM_PACK (SCM_KEYWORD_STYLE);
2280       if (scm_is_eq (val, scm_keyword_prefix))
2281         x = KEYWORD_STYLE_PREFIX;
2282       else if (scm_is_eq (val, scm_keyword_postfix))
2283         x = KEYWORD_STYLE_POSTFIX;
2284       else
2285         x = KEYWORD_STYLE_HASH_PREFIX;
2286     }
2287   opts->keyword_style = x;
2288
2289 #define RESOLVE_BOOLEAN_OPTION(NAME, name)                              \
2290   do                                                                    \
2291     {                                                                   \
2292       x = READ_OPTION_MASK & (read_options >> READ_OPTION_ ## NAME);    \
2293       if (x == READ_OPTION_INHERIT)                                     \
2294         x = !!SCM_ ## NAME;                                             \
2295           opts->name = x;                                               \
2296     }                                                                   \
2297   while (0)
2298
2299   RESOLVE_BOOLEAN_OPTION (COPY_SOURCE_P,        copy_source_p);
2300   RESOLVE_BOOLEAN_OPTION (RECORD_POSITIONS_P,   record_positions_p);
2301   RESOLVE_BOOLEAN_OPTION (CASE_INSENSITIVE_P,   case_insensitive_p);
2302   RESOLVE_BOOLEAN_OPTION (R6RS_ESCAPES_P,       r6rs_escapes_p);
2303   RESOLVE_BOOLEAN_OPTION (SQUARE_BRACKETS_P,    square_brackets_p);
2304   RESOLVE_BOOLEAN_OPTION (HUNGRY_EOL_ESCAPES_P, hungry_eol_escapes_p);
2305   RESOLVE_BOOLEAN_OPTION (CURLY_INFIX_P,        curly_infix_p);
2306
2307 #undef RESOLVE_BOOLEAN_OPTION
2308
2309   opts->neoteric_p = 0;
2310 }
2311
2312 void
2313 scm_init_read ()
2314 {
2315   SCM read_hash_procs;
2316
2317   read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
2318
2319   scm_i_read_hash_procedures =
2320     SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
2321
2322   scm_init_opts (scm_read_options, scm_read_opts);
2323 #include "libguile/read.x"
2324 }
2325
2326 /*
2327   Local Variables:
2328   c-file-style: "gnu"
2329   End:
2330 */