libguile/read.c

   1 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
   2  *   2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public License
   6  * as published by the Free Software Foundation; either version 3 of
   7  * the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful, but
  10  * WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  17  * 02110-1301 USA
  18  */
  19
  20
  21 \f
  22
  23 #ifdef HAVE_CONFIG_H
  24 # include <config.h>
  25 #endif
  26
  27 #include <stdio.h>
  28 #include <ctype.h>
  29 #include <string.h>
  30 #include <unistd.h>
  31 #include <unicase.h>
  32 #include <unictype.h>
  33 #include <c-strcase.h>
  34
  35 #include "libguile/_scm.h"
  36 #include "libguile/bytevectors.h"
  37 #include "libguile/chars.h"
  38 #include "libguile/eval.h"
  39 #include "libguile/arrays.h"
  40 #include "libguile/bitvectors.h"
  41 #include "libguile/keywords.h"
  42 #include "libguile/alist.h"
  43 #include "libguile/srcprop.h"
  44 #include "libguile/hashtab.h"
  45 #include "libguile/hash.h"
  46 #include "libguile/ports.h"
  47 #include "libguile/fports.h"
  48 #include "libguile/root.h"
  49 #include "libguile/strings.h"
  50 #include "libguile/strports.h"
  51 #include "libguile/vectors.h"
  52 #include "libguile/validate.h"
  53 #include "libguile/srfi-4.h"
  54 #include "libguile/srfi-13.h"
  55
  56 #include "libguile/read.h"
  57 #include "libguile/private-options.h"
  58
  59
  60 \f
  61
  62 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
  63 SCM_SYMBOL (scm_keyword_prefix, "prefix");
  64 SCM_SYMBOL (scm_keyword_postfix, "postfix");
  65 SCM_SYMBOL (sym_nil, "nil");
  66
  67 /* SRFI-105 curly infix expression support */
  68 SCM_SYMBOL (sym_nfx, "$nfx$");
  69 SCM_SYMBOL (sym_bracket_list, "$bracket-list$");
  70 SCM_SYMBOL (sym_bracket_apply, "$bracket-apply$");
  71
  72 scm_t_option scm_read_opts[] =
  73   {
  74     { SCM_OPTION_BOOLEAN, "copy", 0,
  75       "Copy source code expressions." },
  76     { SCM_OPTION_BOOLEAN, "positions", 1,
  77       "Record positions of source code expressions." },
  78     { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
  79       "Convert symbols to lower case."},
  80     { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
  81       "Style of keyword recognition: #f, 'prefix or 'postfix."},
  82     { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
  83       "Use R6RS variable-length character and string hex escapes."},
  84     { SCM_OPTION_BOOLEAN, "square-brackets", 1,
  85       "Treat `[' and `]' as parentheses, for R6RS compatibility."},
  86     { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
  87       "In strings, consume leading whitespace after an escaped end-of-line."},
  88     { SCM_OPTION_BOOLEAN, "curly-infix", 0,
  89       "Support SRFI-105 curly infix expressions."},
  90     { 0, },
  91   };
  92
  93 /* Internal read options structure.  This is initialized by 'scm_read'
  94    from the global and per-port read options, and a pointer is passed
  95    down to all helper functions. */
  96
  97 enum t_keyword_style
  98   {
  99     KEYWORD_STYLE_HASH_PREFIX,
 100     KEYWORD_STYLE_PREFIX,
 101     KEYWORD_STYLE_POSTFIX
 102   };
 103
 104 struct t_read_opts
 105 {
 106   enum t_keyword_style keyword_style;
 107   unsigned int copy_source_p        : 1;
 108   unsigned int record_positions_p   : 1;
 109   unsigned int case_insensitive_p   : 1;
 110   unsigned int r6rs_escapes_p       : 1;
 111   unsigned int square_brackets_p    : 1;
 112   unsigned int hungry_eol_escapes_p : 1;
 113   unsigned int curly_infix_p        : 1;
 114   unsigned int neoteric_p           : 1;
 115 };
 116
 117 typedef struct t_read_opts scm_t_read_opts;
 118
 119
 120 /*
 121   Give meaningful error messages for errors
 122
 123   We use the format
 124
 125   FILE:LINE:COL: MESSAGE
 126   This happened in ....
 127
 128   This is not standard GNU format, but the test-suite likes the real
 129   message to be in front.
 130
 131  */
 132
 133
 134 void
 135 scm_i_input_error (char const *function,
 136                    SCM port, const char *message, SCM arg)
 137 {
 138   SCM fn = (scm_is_string (SCM_FILENAME(port))
 139             ? SCM_FILENAME(port)
 140             : scm_from_locale_string ("#<unknown port>"));
 141
 142   SCM string_port = scm_open_output_string ();
 143   SCM string = SCM_EOL;
 144   scm_simple_format (string_port,
 145                      scm_from_locale_string ("~A:~S:~S: ~A"),
 146                      scm_list_4 (fn,
 147                                  scm_from_long (SCM_LINUM (port) + 1),
 148                                  scm_from_int (SCM_COL (port) + 1),
 149                                  scm_from_locale_string (message)));
 150
 151   string = scm_get_output_string (string_port);
 152   scm_close_output_port (string_port);
 153   scm_error_scm (scm_from_latin1_symbol ("read-error"),
 154                  function? scm_from_locale_string (function) : SCM_BOOL_F,
 155                  string,
 156                  arg,
 157                  SCM_BOOL_F);
 158 }
 159
 160
 161 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
 162             (SCM setting),
 163             "Option interface for the read options. Instead of using\n"
 164             "this procedure directly, use the procedures @code{read-enable},\n"
 165             "@code{read-disable}, @code{read-set!} and @code{read-options}.")
 166 #define FUNC_NAME s_scm_read_options
 167 {
 168   SCM ans = scm_options (setting,
 169                          scm_read_opts,
 170                          FUNC_NAME);
 171   if (SCM_COPY_SOURCE_P)
 172     SCM_RECORD_POSITIONS_P = 1;
 173   return ans;
 174 }
 175 #undef FUNC_NAME
 176
 177 /* A fluid referring to an association list mapping extra hash
 178    characters to procedures.  */
 179 static SCM *scm_i_read_hash_procedures;
 180
 181 static SCM
 182 scm_i_read_hash_procedures_ref (void)
 183 {
 184   return scm_fluid_ref (*scm_i_read_hash_procedures);
 185 }
 186
 187 static void
 188 scm_i_read_hash_procedures_set_x (SCM value)
 189 {
 190   scm_fluid_set_x (*scm_i_read_hash_procedures, value);
 191 }
 192
 193 \f
 194 /* Token readers.  */
 195
 196
 197 /* Size of the C buffer used to read symbols and numbers.  */
 198 #define READER_BUFFER_SIZE            128
 199
 200 /* Number of 32-bit codepoints in the buffer used to read strings.  */
 201 #define READER_STRING_BUFFER_SIZE     128
 202
 203 /* The maximum size of Scheme character names.  */
 204 #define READER_CHAR_NAME_MAX_SIZE      50
 205
 206 /* The maximum size of reader directive names.  */
 207 #define READER_DIRECTIVE_NAME_MAX_SIZE 50
 208
 209
 210 /* `isblank' is only in C99.  */
 211 #define CHAR_IS_BLANK_(_chr)                                    \
 212   (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n')      \
 213    || ((_chr) == '\f') || ((_chr) == '\r'))
 214
 215 #ifdef MSDOS
 216 # define CHAR_IS_BLANK(_chr)                    \
 217   ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
 218 #else
 219 # define CHAR_IS_BLANK CHAR_IS_BLANK_
 220 #endif
 221
 222
 223 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
 224    structure'').  */
 225 #define CHAR_IS_R5RS_DELIMITER(c)                               \
 226   (CHAR_IS_BLANK (c)                                            \
 227    || (c) == ')' || (c) == '(' || (c) == ';' || (c) == '"')
 228
 229 #define CHAR_IS_DELIMITER(c)                                    \
 230   (CHAR_IS_R5RS_DELIMITER (c)                                   \
 231    || (((c) == ']' || (c) == '[') && (opts->square_brackets_p   \
 232                                       || opts->curly_infix_p))  \
 233    || (((c) == '}' || (c) == '{') && opts->curly_infix_p))
 234
 235 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
 236    Structure''.  */
 237 #define CHAR_IS_EXPONENT_MARKER(_chr)                           \
 238   (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f')        \
 239    || ((_chr) == 'd') || ((_chr) == 'l'))
 240
 241 /* Read an SCSH block comment.  */
 242 static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
 243 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
 244 static SCM scm_read_commented_expression (scm_t_wchar, SCM, scm_t_read_opts *);
 245 static SCM scm_read_shebang (scm_t_wchar, SCM, scm_t_read_opts *);
 246 static SCM scm_get_hash_procedure (int);
 247
 248 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 249    result in the pre-allocated buffer BUF.  Return zero if the whole token has
 250    fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
 251    bytes actually read.  */
 252 static int
 253 read_token (SCM port, scm_t_read_opts *opts,
 254             char *buf, size_t buf_size, size_t *read)
 255 {
 256    *read = 0;
 257
 258    while (*read < buf_size)
 259      {
 260        int chr;
 261
 262        chr = scm_get_byte_or_eof (port);
 263
 264        if (chr == EOF)
 265         return 0;
 266       else if (CHAR_IS_DELIMITER (chr))
 267         {
 268           scm_unget_byte (chr, port);
 269           return 0;
 270         }
 271       else
 272         {
 273           *buf = (char) chr;
 274           buf++, (*read)++;
 275         }
 276      }
 277
 278    return 1;
 279  }
 280
 281 /* Like `read_token', but return either BUFFER, or a GC-allocated buffer
 282    if the token doesn't fit in BUFFER_SIZE bytes.  */
 283 static char *
 284 read_complete_token (SCM port, scm_t_read_opts *opts,
 285                      char *buffer, size_t buffer_size, size_t *read)
 286 {
 287   int overflow = 0;
 288   size_t bytes_read, overflow_size = 0;
 289   char *overflow_buffer = NULL;
 290
 291   do
 292     {
 293       overflow = read_token (port, opts, buffer, buffer_size, &bytes_read);
 294       if (bytes_read == 0)
 295         break;
 296       if (overflow || overflow_size != 0)
 297         {
 298           if (overflow_size == 0)
 299             {
 300               overflow_buffer = scm_gc_malloc_pointerless (bytes_read, "read");
 301               memcpy (overflow_buffer, buffer, bytes_read);
 302               overflow_size = bytes_read;
 303             }
 304           else
 305             {
 306               char *new_buf =
 307                 scm_gc_malloc_pointerless (overflow_size + bytes_read, "read");
 308
 309               memcpy (new_buf, overflow_buffer, overflow_size);
 310               memcpy (new_buf + overflow_size, buffer, bytes_read);
 311
 312               overflow_buffer = new_buf;
 313               overflow_size += bytes_read;
 314             }
 315         }
 316     }
 317   while (overflow);
 318
 319   if (overflow_size)
 320     *read = overflow_size;
 321   else
 322     *read = bytes_read;
 323
 324   return (overflow_size > 0 ? overflow_buffer : buffer);
 325 }
 326
 327 /* Skip whitespace from PORT and return the first non-whitespace character
 328    read.  Raise an error on end-of-file.  */
 329 static int
 330 flush_ws (SCM port, scm_t_read_opts *opts, const char *eoferr)
 331 {
 332   scm_t_wchar c;
 333   while (1)
 334     switch (c = scm_getc (port))
 335       {
 336       case EOF:
 337       goteof:
 338         if (eoferr)
 339           {
 340             scm_i_input_error (eoferr,
 341                                port,
 342                                "end of file",
 343                                SCM_EOL);
 344           }
 345         return c;
 346
 347       case ';':
 348       lp:
 349         switch (c = scm_getc (port))
 350           {
 351           case EOF:
 352             goto goteof;
 353           default:
 354             goto lp;
 355           case SCM_LINE_INCREMENTORS:
 356             break;
 357           }
 358         break;
 359
 360       case '#':
 361         switch (c = scm_getc (port))
 362           {
 363           case EOF:
 364             eoferr = "read_sharp";
 365             goto goteof;
 366           case '!':
 367             scm_read_shebang (c, port, opts);
 368             break;
 369           case ';':
 370             scm_read_commented_expression (c, port, opts);
 371             break;
 372           case '|':
 373             if (scm_is_false (scm_get_hash_procedure (c)))
 374               {
 375                 scm_read_r6rs_block_comment (c, port);
 376                 break;
 377               }
 378             /* fall through */
 379           default:
 380             scm_ungetc (c, port);
 381             return '#';
 382           }
 383         break;
 384
 385       case SCM_LINE_INCREMENTORS:
 386       case SCM_SINGLE_SPACES:
 387       case '\t':
 388         break;
 389
 390       default:
 391         return c;
 392       }
 393
 394   return 0;
 395 }
 396
 397
 398 \f
 399 /* Token readers.  */
 400
 401 static SCM scm_read_expression (SCM port, scm_t_read_opts *opts);
 402 static SCM scm_read_sharp (int chr, SCM port, scm_t_read_opts *opts,
 403                            long line, int column);
 404
 405
 406 static SCM
 407 maybe_annotate_source (SCM x, SCM port, scm_t_read_opts *opts,
 408                        long line, int column)
 409 {
 410   if (opts->record_positions_p)
 411     scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
 412   return x;
 413 }
 414
 415 static SCM
 416 scm_read_sexp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 417 #define FUNC_NAME "scm_i_lreadparen"
 418 {
 419   int c;
 420   SCM tmp, tl, ans = SCM_EOL;
 421   const int curly_list_p = (chr == '{') && opts->curly_infix_p;
 422   const int terminating_char = ((chr == '{') ? '}'
 423                                 : ((chr == '[') ? ']'
 424                                    : ')'));
 425
 426   /* Need to capture line and column numbers here. */
 427   long line = SCM_LINUM (port);
 428   int column = SCM_COL (port) - 1;
 429
 430   c = flush_ws (port, opts, FUNC_NAME);
 431   if (terminating_char == c)
 432     return SCM_EOL;
 433
 434   scm_ungetc (c, port);
 435   tmp = scm_read_expression (port, opts);
 436
 437   /* Note that it is possible for scm_read_expression to return
 438      scm_sym_dot, but not as part of a dotted pair: as in #{.}#.  So
 439      check that it's a real dot by checking `c'.  */
 440   if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
 441     {
 442       ans = scm_read_expression (port, opts);
 443       if (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
 444         scm_i_input_error (FUNC_NAME, port, "missing close paren",
 445                            SCM_EOL);
 446       return ans;
 447     }
 448
 449   /* Build the head of the list structure. */
 450   ans = tl = scm_cons (tmp, SCM_EOL);
 451
 452   while (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
 453     {
 454       SCM new_tail;
 455
 456       if (c == ')' || (c == ']' && opts->square_brackets_p)
 457           || ((c == '}' || c == ']') && opts->curly_infix_p))
 458         scm_i_input_error (FUNC_NAME, port,
 459                            "in pair: mismatched close paren: ~A",
 460                            scm_list_1 (SCM_MAKE_CHAR (c)));
 461
 462       scm_ungetc (c, port);
 463       tmp = scm_read_expression (port, opts);
 464
 465       /* See above note about scm_sym_dot.  */
 466       if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
 467         {
 468           SCM_SETCDR (tl, scm_read_expression (port, opts));
 469
 470           c = flush_ws (port, opts, FUNC_NAME);
 471           if (terminating_char != c)
 472             scm_i_input_error (FUNC_NAME, port,
 473                                "in pair: missing close paren", SCM_EOL);
 474           break;
 475         }
 476
 477       new_tail = scm_cons (tmp, SCM_EOL);
 478       SCM_SETCDR (tl, new_tail);
 479       tl = new_tail;
 480     }
 481
 482   if (curly_list_p)
 483     {
 484       /* In addition to finding the length, 'scm_ilength' checks for
 485          improper or circular lists, in which case it returns -1. */
 486       int len = scm_ilength (ans);
 487
 488       /* The (len == 0) case is handled above */
 489       if (len == 1)
 490         /* Return directly to avoid re-annotating the element's source
 491            location with the position of the outer brace.  Also, it
 492            might not be possible to annotate the element. */
 493         return scm_car (ans);  /* {e} => e */
 494       else if (len == 2)
 495         ;  /* Leave the list unchanged: {e1 e2} => (e1 e2) */
 496       else if (len >= 3 && (len & 1))
 497         {
 498           /* It's a proper list whose length is odd and at least 3.  If
 499              the elements at odd indices (the infix operator positions)
 500              are all 'equal?', then it's a simple curly-infix list.
 501              Otherwise it's a mixed curly-infix list. */
 502           SCM op = scm_cadr (ans);
 503
 504           /* Check to see if the elements at odd indices are 'equal?' */
 505           for (tl = scm_cdddr (ans); ; tl = scm_cddr (tl))
 506             {
 507               if (scm_is_null (tl))
 508                 {
 509                   /* Convert simple curly-infix list to prefix:
 510                      {a <op> b <op> ...} => (<op> a b ...) */
 511                   tl = ans;
 512                   while (scm_is_pair (scm_cdr (tl)))
 513                     {
 514                       tmp = scm_cddr (tl);
 515                       SCM_SETCDR (tl, tmp);
 516                       tl = tmp;
 517                     }
 518                   ans = scm_cons (op, ans);
 519                   break;
 520                 }
 521               else if (scm_is_false (scm_equal_p (op, scm_car (tl))))
 522                 {
 523                   /* Mixed curly-infix list: {e ...} => ($nfx$ e ...) */
 524                   ans = scm_cons (sym_nfx, ans);
 525                   break;
 526                 }
 527             }
 528         }
 529       else
 530         /* Mixed curly-infix (possibly improper) list:
 531            {e . tail} => ($nfx$ e . tail) */
 532         ans = scm_cons (sym_nfx, ans);
 533     }
 534
 535   return maybe_annotate_source (ans, port, opts, line, column);
 536 }
 537 #undef FUNC_NAME
 538
 539
 540 /* Read a hexadecimal number NDIGITS in length.  Put its value into the variable
 541    C.  If TERMINATOR is non-null, terminate early if the TERMINATOR character is
 542    found.  */
 543 #define SCM_READ_HEX_ESCAPE(ndigits, terminator)                   \
 544   do                                                               \
 545     {                                                              \
 546       scm_t_wchar a;                                               \
 547       size_t i = 0;                                                \
 548       c = 0;                                                       \
 549       while (i < ndigits)                                          \
 550         {                                                          \
 551           a = scm_getc (port);                                     \
 552           if (a == EOF)                                            \
 553             goto str_eof;                                          \
 554           if (terminator                                           \
 555               && (a == (scm_t_wchar) terminator)                   \
 556               && (i > 0))                                          \
 557             break;                                                 \
 558           if ('0' <= a && a <= '9')                                \
 559             a -= '0';                                              \
 560           else if ('A' <= a && a <= 'F')                           \
 561             a = a - 'A' + 10;                                      \
 562           else if ('a' <= a && a <= 'f')                           \
 563             a = a - 'a' + 10;                                      \
 564           else                                                     \
 565             {                                                      \
 566               c = a;                                               \
 567               goto bad_escaped;                                    \
 568             }                                                      \
 569           c = c * 16 + a;                                          \
 570           i ++;                                                    \
 571         }                                                          \
 572     } while (0)
 573
 574 static void
 575 skip_intraline_whitespace (SCM port)
 576 {
 577   scm_t_wchar c;
 578
 579   do
 580     {
 581       c = scm_getc (port);
 582       if (c == EOF)
 583         return;
 584     }
 585   while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
 586
 587   scm_ungetc (c, port);
 588 }
 589
 590 static SCM
 591 scm_read_string (int chr, SCM port, scm_t_read_opts *opts)
 592 #define FUNC_NAME "scm_lreadr"
 593 {
 594   /* For strings smaller than C_STR, this function creates only one Scheme
 595      object (the string returned).  */
 596
 597   SCM str = SCM_EOL;
 598   size_t c_str_len = 0;
 599   scm_t_wchar c, c_str[READER_STRING_BUFFER_SIZE];
 600
 601   /* Need to capture line and column numbers here. */
 602   long line = SCM_LINUM (port);
 603   int column = SCM_COL (port) - 1;
 604
 605   while ('"' != (c = scm_getc (port)))
 606     {
 607       if (c == EOF)
 608         {
 609         str_eof:
 610           scm_i_input_error (FUNC_NAME, port,
 611                              "end of file in string constant", SCM_EOL);
 612         }
 613
 614       if (c_str_len + 1 >= READER_STRING_BUFFER_SIZE)
 615         {
 616           str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
 617           c_str_len = 0;
 618         }
 619
 620       if (c == '\\')
 621         {
 622           switch (c = scm_getc (port))
 623             {
 624             case EOF:
 625               goto str_eof;
 626             case '"':
 627             case '\\':
 628               break;
 629             case '\n':
 630               if (opts->hungry_eol_escapes_p)
 631                 skip_intraline_whitespace (port);
 632               continue;
 633             case '0':
 634               c = '\0';
 635               break;
 636             case 'f':
 637               c = '\f';
 638               break;
 639             case 'n':
 640               c = '\n';
 641               break;
 642             case 'r':
 643               c = '\r';
 644               break;
 645             case 't':
 646               c = '\t';
 647               break;
 648             case 'a':
 649               c = '\007';
 650               break;
 651             case 'v':
 652               c = '\v';
 653               break;
 654             case 'b':
 655               c = '\010';
 656               break;
 657             case 'x':
 658               if (opts->r6rs_escapes_p)
 659                 SCM_READ_HEX_ESCAPE (10, ';');
 660               else
 661                 SCM_READ_HEX_ESCAPE (2, '\0');
 662               break;
 663             case 'u':
 664               if (!opts->r6rs_escapes_p)
 665                 {
 666                   SCM_READ_HEX_ESCAPE (4, '\0');
 667                   break;
 668                 }
 669             case 'U':
 670               if (!opts->r6rs_escapes_p)
 671                 {
 672                   SCM_READ_HEX_ESCAPE (6, '\0');
 673                   break;
 674                 }
 675             default:
 676             bad_escaped:
 677               scm_i_input_error (FUNC_NAME, port,
 678                                  "illegal character in escape sequence: ~S",
 679                                  scm_list_1 (SCM_MAKE_CHAR (c)));
 680             }
 681         }
 682
 683       c_str[c_str_len++] = c;
 684     }
 685
 686   if (scm_is_null (str))
 687     /* Fast path: we got a string that fits in C_STR.  */
 688     str = scm_from_utf32_stringn (c_str, c_str_len);
 689   else
 690     {
 691       if (c_str_len > 0)
 692         str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
 693
 694       str = scm_string_concatenate_reverse (str, SCM_UNDEFINED, SCM_UNDEFINED);
 695     }
 696
 697   return maybe_annotate_source (str, port, opts, line, column);
 698 }
 699 #undef FUNC_NAME
 700
 701
 702 static SCM
 703 scm_read_number (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 704 {
 705   SCM result, str = SCM_EOL;
 706   char local_buffer[READER_BUFFER_SIZE], *buffer;
 707   size_t bytes_read;
 708   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 709
 710   /* Need to capture line and column numbers here. */
 711   long line = SCM_LINUM (port);
 712   int column = SCM_COL (port) - 1;
 713
 714   scm_ungetc (chr, port);
 715   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 716                                 &bytes_read);
 717
 718   str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
 719
 720   result = scm_string_to_number (str, SCM_UNDEFINED);
 721   if (scm_is_false (result))
 722     {
 723       /* Return a symbol instead of a number */
 724       if (opts->case_insensitive_p)
 725         str = scm_string_downcase_x (str);
 726       result = scm_string_to_symbol (str);
 727     }
 728   else if (SCM_NIMP (result))
 729     result = maybe_annotate_source (result, port, opts, line, column);
 730
 731   SCM_COL (port) += scm_i_string_length (str);
 732   return result;
 733 }
 734
 735 static SCM
 736 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 737 {
 738   SCM result;
 739   int ends_with_colon = 0;
 740   size_t bytes_read;
 741   int postfix = (opts->keyword_style == KEYWORD_STYLE_POSTFIX);
 742   char local_buffer[READER_BUFFER_SIZE], *buffer;
 743   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 744   SCM str;
 745
 746   scm_ungetc (chr, port);
 747   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 748                                 &bytes_read);
 749   if (bytes_read > 0)
 750     ends_with_colon = buffer[bytes_read - 1] == ':';
 751
 752   if (postfix && ends_with_colon && (bytes_read > 1))
 753     {
 754       str = scm_from_stringn (buffer, bytes_read - 1,
 755                               pt->encoding, pt->ilseq_handler);
 756
 757       if (opts->case_insensitive_p)
 758         str = scm_string_downcase_x (str);
 759       result = scm_symbol_to_keyword (scm_string_to_symbol (str));
 760     }
 761   else
 762     {
 763       str = scm_from_stringn (buffer, bytes_read,
 764                               pt->encoding, pt->ilseq_handler);
 765
 766       if (opts->case_insensitive_p)
 767         str = scm_string_downcase_x (str);
 768       result = scm_string_to_symbol (str);
 769     }
 770
 771   SCM_COL (port) += scm_i_string_length (str);
 772   return result;
 773 }
 774
 775 static SCM
 776 scm_read_number_and_radix (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 777 #define FUNC_NAME "scm_lreadr"
 778 {
 779   SCM result;
 780   size_t read;
 781   char local_buffer[READER_BUFFER_SIZE], *buffer;
 782   unsigned int radix;
 783   SCM str;
 784   scm_t_port *pt;
 785
 786   switch (chr)
 787     {
 788     case 'B':
 789     case 'b':
 790       radix = 2;
 791       break;
 792
 793     case 'o':
 794     case 'O':
 795       radix = 8;
 796       break;
 797
 798     case 'd':
 799     case 'D':
 800       radix = 10;
 801       break;
 802
 803     case 'x':
 804     case 'X':
 805       radix = 16;
 806       break;
 807
 808     default:
 809       scm_ungetc (chr, port);
 810       scm_ungetc ('#', port);
 811       radix = 10;
 812     }
 813
 814   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
 815                                 &read);
 816
 817   pt = SCM_PTAB_ENTRY (port);
 818   str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
 819
 820   result = scm_string_to_number (str, scm_from_uint (radix));
 821
 822   SCM_COL (port) += scm_i_string_length (str);
 823
 824   if (scm_is_true (result))
 825     return result;
 826
 827   scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
 828
 829   return SCM_BOOL_F;
 830 }
 831 #undef FUNC_NAME
 832
 833 static SCM
 834 scm_read_quote (int chr, SCM port, scm_t_read_opts *opts)
 835 {
 836   SCM p;
 837   long line = SCM_LINUM (port);
 838   int column = SCM_COL (port) - 1;
 839
 840   switch (chr)
 841     {
 842     case '`':
 843       p = scm_sym_quasiquote;
 844       break;
 845
 846     case '\'':
 847       p = scm_sym_quote;
 848       break;
 849
 850     case ',':
 851       {
 852         scm_t_wchar c;
 853
 854         c = scm_getc (port);
 855         if ('@' == c)
 856           p = scm_sym_uq_splicing;
 857         else
 858           {
 859             scm_ungetc (c, port);
 860             p = scm_sym_unquote;
 861           }
 862         break;
 863       }
 864
 865     default:
 866       fprintf (stderr, "%s: unhandled quote character (%i)\n",
 867                "scm_read_quote", chr);
 868       abort ();
 869     }
 870
 871   p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
 872   return maybe_annotate_source (p, port, opts, line, column);
 873 }
 874
 875 SCM_SYMBOL (sym_syntax, "syntax");
 876 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
 877 SCM_SYMBOL (sym_unsyntax, "unsyntax");
 878 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
 879
 880 static SCM
 881 scm_read_syntax (int chr, SCM port, scm_t_read_opts *opts)
 882 {
 883   SCM p;
 884   long line = SCM_LINUM (port);
 885   int column = SCM_COL (port) - 1;
 886
 887   switch (chr)
 888     {
 889     case '`':
 890       p = sym_quasisyntax;
 891       break;
 892
 893     case '\'':
 894       p = sym_syntax;
 895       break;
 896
 897     case ',':
 898       {
 899         int c;
 900
 901         c = scm_getc (port);
 902         if ('@' == c)
 903           p = sym_unsyntax_splicing;
 904         else
 905           {
 906             scm_ungetc (c, port);
 907             p = sym_unsyntax;
 908           }
 909         break;
 910       }
 911
 912     default:
 913       fprintf (stderr, "%s: unhandled syntax character (%i)\n",
 914                "scm_read_syntax", chr);
 915       abort ();
 916     }
 917
 918   p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
 919   return maybe_annotate_source (p, port, opts, line, column);
 920 }
 921
 922 static SCM
 923 scm_read_nil (int chr, SCM port, scm_t_read_opts *opts)
 924 {
 925   SCM id = scm_read_mixed_case_symbol (chr, port, opts);
 926
 927   if (!scm_is_eq (id, sym_nil))
 928     scm_i_input_error ("scm_read_nil", port,
 929                        "unexpected input while reading #nil: ~a",
 930                        scm_list_1 (id));
 931
 932   return SCM_ELISP_NIL;
 933 }
 934
 935 static SCM
 936 scm_read_semicolon_comment (int chr, SCM port)
 937 {
 938   int c;
 939
 940   /* We use the get_byte here because there is no need to get the
 941      locale correct with comment input. This presumes that newline
 942      always represents itself no matter what the encoding is.  */
 943   for (c = scm_get_byte_or_eof (port);
 944        (c != EOF) && (c != '\n');
 945        c = scm_get_byte_or_eof (port));
 946
 947   return SCM_UNSPECIFIED;
 948 }
 949
 950 \f
 951 /* Sharp readers, i.e. readers called after a `#' sign has been read.  */
 952
 953 static SCM
 954 scm_read_boolean (int chr, SCM port)
 955 {
 956   switch (chr)
 957     {
 958     case 't':
 959     case 'T':
 960       return SCM_BOOL_T;
 961
 962     case 'f':
 963     case 'F':
 964       return SCM_BOOL_F;
 965     }
 966
 967   return SCM_UNSPECIFIED;
 968 }
 969
 970 static SCM
 971 scm_read_character (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
 972 #define FUNC_NAME "scm_lreadr"
 973 {
 974   char buffer[READER_CHAR_NAME_MAX_SIZE];
 975   SCM charname;
 976   size_t charname_len, bytes_read;
 977   scm_t_wchar cp;
 978   int overflow;
 979   scm_t_port *pt;
 980
 981   overflow = read_token (port, opts, buffer, READER_CHAR_NAME_MAX_SIZE,
 982                          &bytes_read);
 983   if (overflow)
 984     scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
 985
 986   if (bytes_read == 0)
 987     {
 988       chr = scm_getc (port);
 989       if (chr == EOF)
 990         scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
 991                            "while reading character", SCM_EOL);
 992
 993       /* CHR must be a token delimiter, like a whitespace.  */
 994       return (SCM_MAKE_CHAR (chr));
 995     }
 996
 997   pt = SCM_PTAB_ENTRY (port);
 998
 999   /* Simple ASCII characters can be processed immediately.  Also, simple
1000      ISO-8859-1 characters can be processed immediately if the encoding for this
1001      port is ISO-8859-1.  */
1002   if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
1003     {
1004       SCM_COL (port) += 1;
1005       return SCM_MAKE_CHAR (buffer[0]);
1006     }
1007
1008   /* Otherwise, convert the buffer into a proper scheme string for
1009      processing.  */
1010   charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
1011                                pt->ilseq_handler);
1012   charname_len = scm_i_string_length (charname);
1013   SCM_COL (port) += charname_len;
1014   cp = scm_i_string_ref (charname, 0);
1015   if (charname_len == 1)
1016     return SCM_MAKE_CHAR (cp);
1017
1018   /* Ignore dotted circles, which may be used to keep combining characters from
1019      combining with the backslash in #\charname.  */
1020   if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
1021     return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
1022
1023   if (cp >= '0' && cp < '8')
1024     {
1025       /* Dirk:FIXME::  This type of character syntax is not R5RS
1026        * compliant.  Further, it should be verified that the constant
1027        * does only consist of octal digits.  */
1028       SCM p = scm_string_to_number (charname, scm_from_uint (8));
1029       if (SCM_I_INUMP (p))
1030         {
1031           scm_t_wchar c = scm_to_uint32 (p);
1032           if (SCM_IS_UNICODE_CHAR (c))
1033             return SCM_MAKE_CHAR (c);
1034           else
1035             scm_i_input_error (FUNC_NAME, port,
1036                                "out-of-range octal character escape: ~a",
1037                                scm_list_1 (charname));
1038         }
1039     }
1040
1041   if (cp == 'x' && (charname_len > 1))
1042     {
1043       SCM p;
1044
1045       /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1046       p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
1047                                 scm_from_uint (16));
1048       if (SCM_I_INUMP (p))
1049         {
1050           scm_t_wchar c = scm_to_uint32 (p);
1051           if (SCM_IS_UNICODE_CHAR (c))
1052             return SCM_MAKE_CHAR (c);
1053           else
1054             scm_i_input_error (FUNC_NAME, port,
1055                                "out-of-range hex character escape: ~a",
1056                                scm_list_1 (charname));
1057         }
1058     }
1059
1060   /* The names of characters should never have non-Latin1
1061      characters.  */
1062   if (scm_i_is_narrow_string (charname)
1063       || scm_i_try_narrow_string (charname))
1064     { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
1065                                        charname_len);
1066       if (scm_is_true (ch))
1067         return ch;
1068     }
1069
1070   scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
1071                      scm_list_1 (charname));
1072
1073   return SCM_UNSPECIFIED;
1074 }
1075 #undef FUNC_NAME
1076
1077 static SCM
1078 scm_read_keyword (int chr, SCM port, scm_t_read_opts *opts)
1079 {
1080   SCM symbol;
1081
1082   /* Read the symbol that comprises the keyword.  Doing this instead of
1083      invoking a specific symbol reader function allows `scm_read_keyword ()'
1084      to adapt to the delimiters currently valid of symbols.
1085
1086      XXX: This implementation allows sloppy syntaxes like `#:  key'.  */
1087   symbol = scm_read_expression (port, opts);
1088   if (!scm_is_symbol (symbol))
1089     scm_i_input_error ("scm_read_keyword", port,
1090                        "keyword prefix `~a' not followed by a symbol: ~s",
1091                        scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1092
1093   return (scm_symbol_to_keyword (symbol));
1094 }
1095
1096 static SCM
1097 scm_read_vector (int chr, SCM port, scm_t_read_opts *opts,
1098                  long line, int column)
1099 {
1100   /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1101      guarantee that it's going to do what we want.  After all, this is an
1102      implementation detail of `scm_read_vector ()', not a desirable
1103      property.  */
1104   return maybe_annotate_source (scm_vector (scm_read_sexp (chr, port, opts)),
1105                                 port, opts, line, column);
1106 }
1107
1108 /* Helper used by scm_read_array */
1109 static int
1110 read_decimal_integer (SCM port, int c, ssize_t *resp)
1111 {
1112   ssize_t sign = 1;
1113   ssize_t res = 0;
1114   int got_it = 0;
1115
1116   if (c == '-')
1117     {
1118       sign = -1;
1119       c = scm_getc (port);
1120     }
1121
1122   while ('0' <= c && c <= '9')
1123     {
1124       res = 10*res + c-'0';
1125       got_it = 1;
1126       c = scm_getc (port);
1127     }
1128
1129   if (got_it)
1130     *resp = sign * res;
1131   return c;
1132 }
1133
1134 /* Read an array.  This function can also read vectors and uniform
1135    vectors.  Also, the conflict between '#f' and '#f32' and '#f64' is
1136    handled here.
1137
1138    C is the first character read after the '#'. */
1139 static SCM
1140 scm_read_array (int c, SCM port, scm_t_read_opts *opts, long line, int column)
1141 {
1142   ssize_t rank;
1143   scm_t_wchar tag_buf[8];
1144   int tag_len;
1145
1146   SCM tag, shape = SCM_BOOL_F, elements, array;
1147
1148   /* XXX - shortcut for ordinary vectors.  Shouldn't be necessary but
1149      the array code can not deal with zero-length dimensions yet, and
1150      we want to allow zero-length vectors, of course. */
1151   if (c == '(')
1152     return scm_read_vector (c, port, opts, line, column);
1153
1154   /* Disambiguate between '#f' and uniform floating point vectors. */
1155   if (c == 'f')
1156     {
1157       c = scm_getc (port);
1158       if (c != '3' && c != '6')
1159         {
1160           if (c != EOF)
1161             scm_ungetc (c, port);
1162           return SCM_BOOL_F;
1163         }
1164       rank = 1;
1165       tag_buf[0] = 'f';
1166       tag_len = 1;
1167       goto continue_reading_tag;
1168     }
1169
1170   /* Read rank. */
1171   rank = 1;
1172   c = read_decimal_integer (port, c, &rank);
1173   if (rank < 0)
1174     scm_i_input_error (NULL, port, "array rank must be non-negative",
1175                        SCM_EOL);
1176
1177   /* Read tag. */
1178   tag_len = 0;
1179  continue_reading_tag:
1180   while (c != EOF && c != '(' && c != '@' && c != ':'
1181          && tag_len < sizeof tag_buf / sizeof tag_buf[0])
1182     {
1183       tag_buf[tag_len++] = c;
1184       c = scm_getc (port);
1185     }
1186   if (tag_len == 0)
1187     tag = SCM_BOOL_T;
1188   else
1189     {
1190       tag = scm_string_to_symbol (scm_from_utf32_stringn (tag_buf, tag_len));
1191       if (tag_len == sizeof tag_buf / sizeof tag_buf[0])
1192         scm_i_input_error (NULL, port, "invalid array tag, starting with: ~a",
1193                            scm_list_1 (tag));
1194     }
1195
1196   /* Read shape. */
1197   if (c == '@' || c == ':')
1198     {
1199       shape = SCM_EOL;
1200
1201       do
1202         {
1203           ssize_t lbnd = 0, len = 0;
1204           SCM s;
1205
1206           if (c == '@')
1207             {
1208               c = scm_getc (port);
1209               c = read_decimal_integer (port, c, &lbnd);
1210             }
1211
1212           s = scm_from_ssize_t (lbnd);
1213
1214           if (c == ':')
1215             {
1216               c = scm_getc (port);
1217               c = read_decimal_integer (port, c, &len);
1218               if (len < 0)
1219                 scm_i_input_error (NULL, port,
1220                                    "array length must be non-negative",
1221                                    SCM_EOL);
1222
1223               s = scm_list_2 (s, scm_from_ssize_t (lbnd+len-1));
1224             }
1225
1226           shape = scm_cons (s, shape);
1227         } while (c == '@' || c == ':');
1228
1229       shape = scm_reverse_x (shape, SCM_EOL);
1230     }
1231
1232   /* Read nested lists of elements. */
1233   if (c != '(')
1234     scm_i_input_error (NULL, port,
1235                        "missing '(' in vector or array literal",
1236                        SCM_EOL);
1237   elements = scm_read_sexp (c, port, opts);
1238
1239   if (scm_is_false (shape))
1240     shape = scm_from_ssize_t (rank);
1241   else if (scm_ilength (shape) != rank)
1242     scm_i_input_error
1243       (NULL, port,
1244        "the number of shape specifications must match the array rank",
1245        SCM_EOL);
1246
1247   /* Handle special print syntax of rank zero arrays; see
1248      scm_i_print_array for a rationale. */
1249   if (rank == 0)
1250     {
1251       if (!scm_is_pair (elements))
1252         scm_i_input_error (NULL, port,
1253                            "too few elements in array literal, need 1",
1254                            SCM_EOL);
1255       if (!scm_is_null (SCM_CDR (elements)))
1256         scm_i_input_error (NULL, port,
1257                            "too many elements in array literal, want 1",
1258                            SCM_EOL);
1259       elements = SCM_CAR (elements);
1260     }
1261
1262   /* Construct array, annotate with source location, and return. */
1263   array = scm_list_to_typed_array (tag, shape, elements);
1264   return maybe_annotate_source (array, port, opts, line, column);
1265 }
1266
1267 static SCM
1268 scm_read_srfi4_vector (int chr, SCM port, scm_t_read_opts *opts,
1269                        long line, int column)
1270 {
1271   return scm_read_array (chr, port, opts, line, column);
1272 }
1273
1274 static SCM
1275 scm_read_bytevector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1276                      long line, int column)
1277 {
1278   chr = scm_getc (port);
1279   if (chr != 'u')
1280     goto syntax;
1281
1282   chr = scm_getc (port);
1283   if (chr != '8')
1284     goto syntax;
1285
1286   chr = scm_getc (port);
1287   if (chr != '(')
1288     goto syntax;
1289
1290   return maybe_annotate_source
1291     (scm_u8_list_to_bytevector (scm_read_sexp (chr, port, opts)),
1292      port, opts, line, column);
1293
1294  syntax:
1295   scm_i_input_error ("read_bytevector", port,
1296                      "invalid bytevector prefix",
1297                      SCM_MAKE_CHAR (chr));
1298   return SCM_UNSPECIFIED;
1299 }
1300
1301 static SCM
1302 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1303                            long line, int column)
1304 {
1305   /* Read the `#*10101'-style read syntax for bit vectors in Guile.  This is
1306      terribly inefficient but who cares?  */
1307   SCM s_bits = SCM_EOL;
1308
1309   for (chr = scm_getc (port);
1310        (chr != EOF) && ((chr == '0') || (chr == '1'));
1311        chr = scm_getc (port))
1312     {
1313       s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1314     }
1315
1316   if (chr != EOF)
1317     scm_ungetc (chr, port);
1318
1319   return maybe_annotate_source
1320     (scm_bitvector (scm_reverse_x (s_bits, SCM_EOL)),
1321      port, opts, line, column);
1322 }
1323
1324 static SCM
1325 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1326 {
1327   int bang_seen = 0;
1328
1329   for (;;)
1330     {
1331       int c = scm_getc (port);
1332
1333       if (c == EOF)
1334         scm_i_input_error ("skip_block_comment", port,
1335                            "unterminated `#! ... !#' comment", SCM_EOL);
1336
1337       if (c == '!')
1338         bang_seen = 1;
1339       else if (c == '#' && bang_seen)
1340         break;
1341       else
1342         bang_seen = 0;
1343     }
1344
1345   return SCM_UNSPECIFIED;
1346 }
1347
1348 static void set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts,
1349                                          int value);
1350 static void set_port_square_brackets_p (SCM port, scm_t_read_opts *opts,
1351                                         int value);
1352 static void set_port_curly_infix_p (SCM port, scm_t_read_opts *opts,
1353                                     int value);
1354
1355 static SCM
1356 scm_read_shebang (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
1357 {
1358   char name[READER_DIRECTIVE_NAME_MAX_SIZE + 1];
1359   int c;
1360   int i = 0;
1361
1362   while (i <= READER_DIRECTIVE_NAME_MAX_SIZE)
1363     {
1364       c = scm_getc (port);
1365       if (c == EOF)
1366         scm_i_input_error ("skip_block_comment", port,
1367                            "unterminated `#! ... !#' comment", SCM_EOL);
1368       else if (('a' <= c && c <= 'z') || ('0' <= c && c <= '9') || c == '-')
1369         name[i++] = c;
1370       else if (CHAR_IS_DELIMITER (c))
1371         {
1372           scm_ungetc (c, port);
1373           name[i] = '\0';
1374           if (0 == strcmp ("r6rs", name))
1375             ;  /* Silently ignore */
1376           else if (0 == strcmp ("fold-case", name))
1377             set_port_case_insensitive_p (port, opts, 1);
1378           else if (0 == strcmp ("no-fold-case", name))
1379             set_port_case_insensitive_p (port, opts, 0);
1380           else if (0 == strcmp ("curly-infix", name))
1381             set_port_curly_infix_p (port, opts, 1);
1382           else if (0 == strcmp ("curly-infix-and-bracket-lists", name))
1383             {
1384               set_port_curly_infix_p (port, opts, 1);
1385               set_port_square_brackets_p (port, opts, 0);
1386             }
1387           else
1388             break;
1389
1390           return SCM_UNSPECIFIED;
1391         }
1392       else
1393         {
1394           scm_ungetc (c, port);
1395           break;
1396         }
1397     }
1398   while (i > 0)
1399     scm_ungetc (name[--i], port);
1400   return scm_read_scsh_block_comment (chr, port);
1401 }
1402
1403 static SCM
1404 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1405 {
1406   /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1407      nested.  So care must be taken.  */
1408   int nesting_level = 1;
1409
1410   int a = scm_getc (port);
1411
1412   if (a == EOF)
1413     scm_i_input_error ("scm_read_r6rs_block_comment", port,
1414                        "unterminated `#| ... |#' comment", SCM_EOL);
1415
1416   while (nesting_level > 0)
1417     {
1418       int b = scm_getc (port);
1419
1420       if (b == EOF)
1421         scm_i_input_error ("scm_read_r6rs_block_comment", port,
1422                            "unterminated `#| ... |#' comment", SCM_EOL);
1423
1424       if (a == '|' && b == '#')
1425         {
1426           nesting_level--;
1427           b = EOF;
1428         }
1429       else if (a == '#' && b == '|')
1430         {
1431           nesting_level++;
1432           b = EOF;
1433         }
1434
1435       a = b;
1436     }
1437
1438   return SCM_UNSPECIFIED;
1439 }
1440
1441 static SCM
1442 scm_read_commented_expression (scm_t_wchar chr, SCM port,
1443                                scm_t_read_opts *opts)
1444 {
1445   scm_t_wchar c;
1446
1447   c = flush_ws (port, opts, (char *) NULL);
1448   if (EOF == c)
1449     scm_i_input_error ("read_commented_expression", port,
1450                        "no expression after #; comment", SCM_EOL);
1451   scm_ungetc (c, port);
1452   scm_read_expression (port, opts);
1453   return SCM_UNSPECIFIED;
1454 }
1455
1456 static SCM
1457 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1458 {
1459   /* Guile's extended symbol read syntax looks like this:
1460
1461        #{This is all a symbol name}#
1462
1463      So here, CHR is expected to be `{'.  */
1464   int saw_brace = 0;
1465   size_t len = 0;
1466   SCM buf = scm_i_make_string (1024, NULL, 0);
1467
1468   buf = scm_i_string_start_writing (buf);
1469
1470   while ((chr = scm_getc (port)) != EOF)
1471     {
1472       if (saw_brace)
1473         {
1474           if (chr == '#')
1475             {
1476               break;
1477             }
1478           else
1479             {
1480               saw_brace = 0;
1481               scm_i_string_set_x (buf, len++, '}');
1482             }
1483         }
1484
1485       if (chr == '}')
1486         saw_brace = 1;
1487       else if (chr == '\\')
1488         {
1489           /* It used to be that print.c would print extended-read-syntax
1490              symbols with backslashes before "non-standard" chars, but
1491              this routine wouldn't do anything with those escapes.
1492              Bummer.  What we've done is to change print.c to output
1493              R6RS hex escapes for those characters, relying on the fact
1494              that the extended read syntax would never put a `\' before
1495              an `x'.  For now, we just ignore other instances of
1496              backslash in the string.  */
1497           switch ((chr = scm_getc (port)))
1498             {
1499             case EOF:
1500               goto done;
1501             case 'x':
1502               {
1503                 scm_t_wchar c;
1504
1505                 SCM_READ_HEX_ESCAPE (10, ';');
1506                 scm_i_string_set_x (buf, len++, c);
1507                 break;
1508
1509               str_eof:
1510                 chr = EOF;
1511                 goto done;
1512
1513               bad_escaped:
1514                 scm_i_string_stop_writing ();
1515                 scm_i_input_error ("scm_read_extended_symbol", port,
1516                                    "illegal character in escape sequence: ~S",
1517                                    scm_list_1 (SCM_MAKE_CHAR (c)));
1518                 break;
1519               }
1520             default:
1521               scm_i_string_set_x (buf, len++, chr);
1522               break;
1523             }
1524         }
1525       else
1526         scm_i_string_set_x (buf, len++, chr);
1527
1528       if (len >= scm_i_string_length (buf) - 2)
1529         {
1530           SCM addy;
1531
1532           scm_i_string_stop_writing ();
1533           addy = scm_i_make_string (1024, NULL, 0);
1534           buf = scm_string_append (scm_list_2 (buf, addy));
1535           len = 0;
1536           buf = scm_i_string_start_writing (buf);
1537         }
1538     }
1539
1540  done:
1541   scm_i_string_stop_writing ();
1542   if (chr == EOF)
1543     scm_i_input_error ("scm_read_extended_symbol", port,
1544                        "end of file while reading symbol", SCM_EOL);
1545
1546   return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1547 }
1548
1549
1550 \f
1551 /* Top-level token readers, i.e., dispatchers.  */
1552
1553 static SCM
1554 scm_read_sharp_extension (int chr, SCM port, scm_t_read_opts *opts)
1555 {
1556   SCM proc;
1557
1558   proc = scm_get_hash_procedure (chr);
1559   if (scm_is_true (scm_procedure_p (proc)))
1560     {
1561       long line = SCM_LINUM (port);
1562       int column = SCM_COL (port) - 2;
1563       SCM got;
1564
1565       got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1566
1567       if (opts->record_positions_p && SCM_NIMP (got)
1568           && !scm_i_has_source_properties (got))
1569         scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1570
1571       return got;
1572     }
1573
1574   return SCM_UNSPECIFIED;
1575 }
1576
1577 /* The reader for the sharp `#' character.  It basically dispatches reads
1578    among the above token readers.   */
1579 static SCM
1580 scm_read_sharp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1581                 long line, int column)
1582 #define FUNC_NAME "scm_lreadr"
1583 {
1584   SCM result;
1585
1586   chr = scm_getc (port);
1587
1588   result = scm_read_sharp_extension (chr, port, opts);
1589   if (!scm_is_eq (result, SCM_UNSPECIFIED))
1590     return result;
1591
1592   switch (chr)
1593     {
1594     case '\\':
1595       return (scm_read_character (chr, port, opts));
1596     case '(':
1597       return (scm_read_vector (chr, port, opts, line, column));
1598     case 's':
1599     case 'u':
1600     case 'f':
1601     case 'c':
1602       /* This one may return either a boolean or an SRFI-4 vector.  */
1603       return (scm_read_srfi4_vector (chr, port, opts, line, column));
1604     case 'v':
1605       return (scm_read_bytevector (chr, port, opts, line, column));
1606     case '*':
1607       return (scm_read_guile_bit_vector (chr, port, opts, line, column));
1608     case 't':
1609     case 'T':
1610     case 'F':
1611       return (scm_read_boolean (chr, port));
1612     case ':':
1613       return (scm_read_keyword (chr, port, opts));
1614     case '0': case '1': case '2': case '3': case '4':
1615     case '5': case '6': case '7': case '8': case '9':
1616     case '@':
1617 #if SCM_ENABLE_DEPRECATED
1618       /* See below for 'i' and 'e'. */
1619     case 'a':
1620     case 'y':
1621     case 'h':
1622     case 'l':
1623 #endif
1624       return (scm_read_array (chr, port, opts, line, column));
1625
1626     case 'i':
1627     case 'e':
1628 #if SCM_ENABLE_DEPRECATED
1629       {
1630         /* When next char is '(', it really is an old-style
1631            uniform array. */
1632         scm_t_wchar next_c = scm_getc (port);
1633         if (next_c != EOF)
1634           scm_ungetc (next_c, port);
1635         if (next_c == '(')
1636           return scm_read_array (chr, port, opts, line, column);
1637         /* Fall through. */
1638       }
1639 #endif
1640     case 'b':
1641     case 'B':
1642     case 'o':
1643     case 'O':
1644     case 'd':
1645     case 'D':
1646     case 'x':
1647     case 'X':
1648     case 'I':
1649     case 'E':
1650       return (scm_read_number_and_radix (chr, port, opts));
1651     case '{':
1652       return (scm_read_extended_symbol (chr, port));
1653     case '!':
1654       return (scm_read_shebang (chr, port, opts));
1655     case ';':
1656       return (scm_read_commented_expression (chr, port, opts));
1657     case '`':
1658     case '\'':
1659     case ',':
1660       return (scm_read_syntax (chr, port, opts));
1661     case 'n':
1662       return (scm_read_nil (chr, port, opts));
1663     default:
1664       result = scm_read_sharp_extension (chr, port, opts);
1665       if (scm_is_eq (result, SCM_UNSPECIFIED))
1666         {
1667           /* To remain compatible with 1.8 and earlier, the following
1668              characters have lower precedence than `read-hash-extend'
1669              characters.  */
1670           switch (chr)
1671             {
1672             case '|':
1673               return scm_read_r6rs_block_comment (chr, port);
1674             default:
1675               scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1676                                  scm_list_1 (SCM_MAKE_CHAR (chr)));
1677             }
1678         }
1679       else
1680         return result;
1681     }
1682
1683   return SCM_UNSPECIFIED;
1684 }
1685 #undef FUNC_NAME
1686
1687 static SCM
1688 read_inner_expression (SCM port, scm_t_read_opts *opts)
1689 #define FUNC_NAME "read_inner_expression"
1690 {
1691   while (1)
1692     {
1693       scm_t_wchar chr;
1694
1695       chr = scm_getc (port);
1696
1697       switch (chr)
1698         {
1699         case SCM_WHITE_SPACES:
1700         case SCM_LINE_INCREMENTORS:
1701           break;
1702         case ';':
1703           (void) scm_read_semicolon_comment (chr, port);
1704           break;
1705         case '{':
1706           if (opts->curly_infix_p)
1707             {
1708               if (opts->neoteric_p)
1709                 return scm_read_sexp (chr, port, opts);
1710               else
1711                 {
1712                   SCM expr;
1713
1714                   /* Enable neoteric expressions within curly braces */
1715                   opts->neoteric_p = 1;
1716                   expr = scm_read_sexp (chr, port, opts);
1717                   opts->neoteric_p = 0;
1718                   return expr;
1719                 }
1720             }
1721           else
1722             return scm_read_mixed_case_symbol (chr, port, opts);
1723         case '[':
1724           if (opts->square_brackets_p)
1725             return scm_read_sexp (chr, port, opts);
1726           else if (opts->curly_infix_p)
1727             {
1728               /* The syntax of neoteric expressions requires that '[' be
1729                  a delimiter when curly-infix is enabled, so it cannot
1730                  be part of an unescaped symbol.  We might as well do
1731                  something useful with it, so we adopt Kawa's convention:
1732                  [...] => ($bracket-list$ ...) */
1733               long line = SCM_LINUM (port);
1734               int column = SCM_COL (port) - 1;
1735               return maybe_annotate_source
1736                 (scm_cons (sym_bracket_list, scm_read_sexp (chr, port, opts)),
1737                  port, opts, line, column);
1738             }
1739           else
1740             return scm_read_mixed_case_symbol (chr, port, opts);
1741         case '(':
1742           return (scm_read_sexp (chr, port, opts));
1743         case '"':
1744           return (scm_read_string (chr, port, opts));
1745         case '\'':
1746         case '`':
1747         case ',':
1748           return (scm_read_quote (chr, port, opts));
1749         case '#':
1750           {
1751             long line  = SCM_LINUM (port);
1752             int column = SCM_COL (port) - 1;
1753             SCM result = scm_read_sharp (chr, port, opts, line, column);
1754             if (scm_is_eq (result, SCM_UNSPECIFIED))
1755               /* We read a comment or some such.  */
1756               break;
1757             else
1758               return result;
1759           }
1760         case ')':
1761           scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1762           break;
1763         case '}':
1764           if (opts->curly_infix_p)
1765             scm_i_input_error (FUNC_NAME, port, "unexpected \"}\"", SCM_EOL);
1766           else
1767             return scm_read_mixed_case_symbol (chr, port, opts);
1768         case ']':
1769           if (opts->square_brackets_p)
1770             scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1771           /* otherwise fall through */
1772         case EOF:
1773           return SCM_EOF_VAL;
1774         case ':':
1775           if (opts->keyword_style == KEYWORD_STYLE_PREFIX)
1776             return scm_symbol_to_keyword (scm_read_expression (port, opts));
1777           /* Fall through.  */
1778
1779         default:
1780           {
1781             if (((chr >= '0') && (chr <= '9'))
1782                 || (strchr ("+-.", chr)))
1783               return (scm_read_number (chr, port, opts));
1784             else
1785               return (scm_read_mixed_case_symbol (chr, port, opts));
1786           }
1787         }
1788     }
1789 }
1790 #undef FUNC_NAME
1791
1792 static SCM
1793 scm_read_expression (SCM port, scm_t_read_opts *opts)
1794 #define FUNC_NAME "scm_read_expression"
1795 {
1796   if (!opts->neoteric_p)
1797     return read_inner_expression (port, opts);
1798   else
1799     {
1800       long line = 0;
1801       int column = 0;
1802       SCM expr;
1803
1804       if (opts->record_positions_p)
1805         {
1806           /* We need to get the position of the first non-whitespace
1807              character in order to correctly annotate neoteric
1808              expressions.  For example, for the expression 'f(x)', the
1809              first call to 'read_inner_expression' reads the 'f' (which
1810              cannot be annotated), and then we later read the '(x)' and
1811              use it to construct the new list (f x). */
1812           int c = flush_ws (port, opts, (char *) NULL);
1813           if (c == EOF)
1814             return SCM_EOF_VAL;
1815           scm_ungetc (c, port);
1816           line = SCM_LINUM (port);
1817           column = SCM_COL (port);
1818         }
1819
1820       expr = read_inner_expression (port, opts);
1821
1822       /* 'expr' is the first component of the neoteric expression.  Now
1823          we loop, and as long as the next character is '(', '[', or '{',
1824          (without any intervening whitespace), we use it to construct a
1825          new expression.  For example, f{n - 1}(x) => ((f (- n 1)) x). */
1826       for (;;)
1827         {
1828           int chr = scm_getc (port);
1829
1830           if (chr == '(')
1831             /* e(...) => (e ...) */
1832             expr = scm_cons (expr, scm_read_sexp (chr, port, opts));
1833           else if (chr == '[')
1834             /* e[...] => ($bracket-apply$ e ...) */
1835             expr = scm_cons (sym_bracket_apply,
1836                              scm_cons (expr,
1837                                        scm_read_sexp (chr, port, opts)));
1838           else if (chr == '{')
1839             {
1840               SCM arg = scm_read_sexp (chr, port, opts);
1841
1842               if (scm_is_null (arg))
1843                 expr = scm_list_1 (expr);       /* e{} => (e) */
1844               else
1845                 expr = scm_list_2 (expr, arg);  /* e{...} => (e {...}) */
1846             }
1847           else
1848             {
1849               if (chr != EOF)
1850                 scm_ungetc (chr, port);
1851               break;
1852             }
1853           maybe_annotate_source (expr, port, opts, line, column);
1854         }
1855       return expr;
1856     }
1857 }
1858 #undef FUNC_NAME
1859
1860 \f
1861 /* Actual reader.  */
1862
1863 static void init_read_options (SCM port, scm_t_read_opts *opts);
1864
1865 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1866             (SCM port),
1867             "Read an s-expression from the input port @var{port}, or from\n"
1868             "the current input port if @var{port} is not specified.\n"
1869             "Any whitespace before the next token is discarded.")
1870 #define FUNC_NAME s_scm_read
1871 {
1872   scm_t_read_opts opts;
1873   int c;
1874
1875   if (SCM_UNBNDP (port))
1876     port = scm_current_input_port ();
1877   SCM_VALIDATE_OPINPORT (1, port);
1878
1879   init_read_options (port, &opts);
1880
1881   c = flush_ws (port, &opts, (char *) NULL);
1882   if (EOF == c)
1883     return SCM_EOF_VAL;
1884   scm_ungetc (c, port);
1885
1886   return (scm_read_expression (port, &opts));
1887 }
1888 #undef FUNC_NAME
1889
1890
1891 \f
1892
1893 /* Manipulate the read-hash-procedures alist.  This could be written in
1894    Scheme, but maybe it will also be used by C code during initialisation.  */
1895 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1896             (SCM chr, SCM proc),
1897             "Install the procedure @var{proc} for reading expressions\n"
1898             "starting with the character sequence @code{#} and @var{chr}.\n"
1899             "@var{proc} will be called with two arguments:  the character\n"
1900             "@var{chr} and the port to read further data from. The object\n"
1901             "returned will be the return value of @code{read}. \n"
1902             "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1903             )
1904 #define FUNC_NAME s_scm_read_hash_extend
1905 {
1906   SCM this;
1907   SCM prev;
1908
1909   SCM_VALIDATE_CHAR (1, chr);
1910   SCM_ASSERT (scm_is_false (proc)
1911               || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1912               proc, SCM_ARG2, FUNC_NAME);
1913
1914   /* Check if chr is already in the alist.  */
1915   this = scm_i_read_hash_procedures_ref ();
1916   prev = SCM_BOOL_F;
1917   while (1)
1918     {
1919       if (scm_is_null (this))
1920         {
1921           /* not found, so add it to the beginning.  */
1922           if (scm_is_true (proc))
1923             {
1924               SCM new = scm_cons (scm_cons (chr, proc),
1925                                   scm_i_read_hash_procedures_ref ());
1926               scm_i_read_hash_procedures_set_x (new);
1927             }
1928           break;
1929         }
1930       if (scm_is_eq (chr, SCM_CAAR (this)))
1931         {
1932           /* already in the alist.  */
1933           if (scm_is_false (proc))
1934             {
1935               /* remove it.  */
1936               if (scm_is_false (prev))
1937                 {
1938                   SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1939                   scm_i_read_hash_procedures_set_x (rest);
1940                 }
1941               else
1942                 scm_set_cdr_x (prev, SCM_CDR (this));
1943             }
1944           else
1945             {
1946               /* replace it.  */
1947               scm_set_cdr_x (SCM_CAR (this), proc);
1948             }
1949           break;
1950         }
1951       prev = this;
1952       this = SCM_CDR (this);
1953     }
1954
1955   return SCM_UNSPECIFIED;
1956 }
1957 #undef FUNC_NAME
1958
1959 /* Recover the read-hash procedure corresponding to char c.  */
1960 static SCM
1961 scm_get_hash_procedure (int c)
1962 {
1963   SCM rest = scm_i_read_hash_procedures_ref ();
1964
1965   while (1)
1966     {
1967       if (scm_is_null (rest))
1968         return SCM_BOOL_F;
1969
1970       if (SCM_CHAR (SCM_CAAR (rest)) == c)
1971         return SCM_CDAR (rest);
1972
1973       rest = SCM_CDR (rest);
1974     }
1975 }
1976
1977 #define SCM_ENCODING_SEARCH_SIZE (500)
1978
1979 /* Search the first few hundred characters of a file for an Emacs-like coding
1980    declaration.  Returns either NULL or a string whose storage has been
1981    allocated with `scm_gc_malloc ()'.  */
1982 char *
1983 scm_i_scan_for_encoding (SCM port)
1984 {
1985   scm_t_port *pt;
1986   char header[SCM_ENCODING_SEARCH_SIZE+1];
1987   size_t bytes_read, encoding_length, i;
1988   char *encoding = NULL;
1989   int utf8_bom = 0;
1990   char *pos, *encoding_start;
1991   int in_comment;
1992
1993   pt = SCM_PTAB_ENTRY (port);
1994
1995   if (pt->rw_active == SCM_PORT_WRITE)
1996     scm_flush (port);
1997
1998   if (pt->rw_random)
1999     pt->rw_active = SCM_PORT_READ;
2000
2001   if (pt->read_pos == pt->read_end)
2002     {
2003       /* We can use the read buffer, and thus avoid a seek. */
2004       if (scm_fill_input (port) == EOF)
2005         return NULL;
2006
2007       bytes_read = pt->read_end - pt->read_pos;
2008       if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
2009         bytes_read = SCM_ENCODING_SEARCH_SIZE;
2010
2011       if (bytes_read <= 1)
2012         /* An unbuffered port -- don't scan.  */
2013         return NULL;
2014
2015       memcpy (header, pt->read_pos, bytes_read);
2016       header[bytes_read] = '\0';
2017     }
2018   else
2019     {
2020       /* Try to read some bytes and then seek back.  Not all ports
2021          support seeking back; and indeed some file ports (like
2022          /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
2023          check performed by SCM_FPORT_FDES---but fail to seek
2024          backwards.  Hence this block comes second.  We prefer to use
2025          the read buffer in-place.  */
2026       if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
2027         return NULL;
2028
2029       bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
2030       header[bytes_read] = '\0';
2031       scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
2032     }
2033
2034   if (bytes_read > 3
2035       && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
2036     utf8_bom = 1;
2037
2038   /* search past "coding[:=]" */
2039   pos = header;
2040   while (1)
2041     {
2042       if ((pos = strstr(pos, "coding")) == NULL)
2043         return NULL;
2044
2045       pos += strlen("coding");
2046       if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
2047           (*pos == ':' || *pos == '='))
2048         {
2049           pos ++;
2050           break;
2051         }
2052     }
2053
2054   /* skip spaces */
2055   while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
2056          (*pos == ' ' || *pos == '\t'))
2057     pos ++;
2058
2059   /* grab the next token */
2060   encoding_start = pos;
2061   i = 0;
2062   while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
2063          && encoding_start + i - header < bytes_read
2064          && (isalnum ((int) encoding_start[i])
2065              || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
2066     i++;
2067
2068   encoding_length = i;
2069   if (encoding_length == 0)
2070     return NULL;
2071
2072   encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
2073   for (i = 0; i < encoding_length; i++)
2074     encoding[i] = toupper ((int) encoding[i]);
2075
2076   /* push backwards to make sure we were in a comment */
2077   in_comment = 0;
2078   pos = encoding_start;
2079   while (pos >= header)
2080     {
2081       if (*pos == ';')
2082         {
2083           in_comment = 1;
2084           break;
2085         }
2086       else if (*pos == '\n' || pos == header)
2087         {
2088           /* This wasn't in a semicolon comment. Check for a
2089            hash-bang comment. */
2090           char *beg = strstr (header, "#!");
2091           char *end = strstr (header, "!#");
2092           if (beg < encoding_start && encoding_start + encoding_length <= end)
2093             in_comment = 1;
2094           break;
2095         }
2096       else
2097         {
2098           pos --;
2099           continue;
2100         }
2101     }
2102   if (!in_comment)
2103     /* This wasn't in a comment */
2104     return NULL;
2105
2106   if (utf8_bom && c_strcasecmp(encoding, "UTF-8"))
2107     scm_misc_error (NULL,
2108                     "the port input declares the encoding ~s but is encoded as UTF-8",
2109                     scm_list_1 (scm_from_locale_string (encoding)));
2110
2111   return encoding;
2112 }
2113
2114 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
2115             (SCM port),
2116             "Scans the port for an Emacs-like character coding declaration\n"
2117             "near the top of the contents of a port with random-accessible contents.\n"
2118             "The coding declaration is of the form\n"
2119             "@code{coding: XXXXX} and must appear in a scheme comment.\n"
2120             "\n"
2121             "Returns a string containing the character encoding of the file\n"
2122             "if a declaration was found, or @code{#f} otherwise.\n")
2123 #define FUNC_NAME s_scm_file_encoding
2124 {
2125   char *enc;
2126   SCM s_enc;
2127
2128   SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
2129
2130   enc = scm_i_scan_for_encoding (port);
2131   if (enc == NULL)
2132     return SCM_BOOL_F;
2133   else
2134     {
2135       s_enc = scm_from_locale_string (enc);
2136       return s_enc;
2137     }
2138
2139   return SCM_BOOL_F;
2140 }
2141 #undef FUNC_NAME
2142
2143 \f
2144 /* Per-port read options.
2145
2146    We store per-port read options in the 'port-read-options' key of the
2147    port's alist, which is stored in the internal port structure.  The
2148    value stored in the alist is a single integer that contains a two-bit
2149    field for each read option.
2150
2151    If a bit field contains READ_OPTION_INHERIT (3), that indicates that
2152    the applicable value should be inherited from the corresponding
2153    global read option.  Otherwise, the bit field contains the value of
2154    the read option.  For boolean read options that have been set
2155    per-port, the possible values are 0 or 1.  If the 'keyword_style'
2156    read option has been set per-port, its possible values are those in
2157    'enum t_keyword_style'. */
2158
2159 /* Key to read options in per-port alists. */
2160 SCM_SYMBOL (sym_port_read_options, "port-read-options");
2161
2162 /* Offsets of bit fields for each per-port override */
2163 #define READ_OPTION_COPY_SOURCE_P          0
2164 #define READ_OPTION_RECORD_POSITIONS_P     2
2165 #define READ_OPTION_CASE_INSENSITIVE_P     4
2166 #define READ_OPTION_KEYWORD_STYLE          6
2167 #define READ_OPTION_R6RS_ESCAPES_P         8
2168 #define READ_OPTION_SQUARE_BRACKETS_P     10
2169 #define READ_OPTION_HUNGRY_EOL_ESCAPES_P  12
2170 #define READ_OPTION_CURLY_INFIX_P         14
2171
2172 /* The total width in bits of the per-port overrides */
2173 #define READ_OPTIONS_NUM_BITS             16
2174
2175 #define READ_OPTIONS_INHERIT_ALL  ((1UL << READ_OPTIONS_NUM_BITS) - 1)
2176 #define READ_OPTIONS_MAX_VALUE    READ_OPTIONS_INHERIT_ALL
2177
2178 #define READ_OPTION_MASK     3
2179 #define READ_OPTION_INHERIT  3
2180
2181 static void
2182 set_port_read_option (SCM port, int option, int new_value)
2183 {
2184   SCM alist, scm_read_options;
2185   unsigned int read_options;
2186
2187   new_value &= READ_OPTION_MASK;
2188   alist = scm_i_port_alist (port);
2189   scm_read_options = scm_assq_ref (alist, sym_port_read_options);
2190   if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2191     read_options = scm_to_uint (scm_read_options);
2192   else
2193     read_options = READ_OPTIONS_INHERIT_ALL;
2194   read_options &= ~(READ_OPTION_MASK << option);
2195   read_options |= new_value << option;
2196   scm_read_options = scm_from_uint (read_options);
2197   alist = scm_assq_set_x (alist, sym_port_read_options, scm_read_options);
2198   scm_i_set_port_alist_x (port, alist);
2199 }
2200
2201 /* Set OPTS and PORT's case-insensitivity according to VALUE. */
2202 static void
2203 set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts, int value)
2204 {
2205   value = !!value;
2206   opts->case_insensitive_p = value;
2207   set_port_read_option (port, READ_OPTION_CASE_INSENSITIVE_P, value);
2208 }
2209
2210 /* Set OPTS and PORT's square_brackets_p option according to VALUE. */
2211 static void
2212 set_port_square_brackets_p (SCM port, scm_t_read_opts *opts, int value)
2213 {
2214   value = !!value;
2215   opts->square_brackets_p = value;
2216   set_port_read_option (port, READ_OPTION_SQUARE_BRACKETS_P, value);
2217 }
2218
2219 /* Set OPTS and PORT's curly_infix_p option according to VALUE. */
2220 static void
2221 set_port_curly_infix_p (SCM port, scm_t_read_opts *opts, int value)
2222 {
2223   value = !!value;
2224   opts->curly_infix_p = value;
2225   set_port_read_option (port, READ_OPTION_CURLY_INFIX_P, value);
2226 }
2227
2228 /* Initialize OPTS based on PORT's read options and the global read
2229    options. */
2230 static void
2231 init_read_options (SCM port, scm_t_read_opts *opts)
2232 {
2233   SCM alist, val, scm_read_options;
2234   unsigned int read_options, x;
2235
2236   alist = scm_i_port_alist (port);
2237   scm_read_options = scm_assq_ref (alist, sym_port_read_options);
2238
2239   if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2240     read_options = scm_to_uint (scm_read_options);
2241   else
2242     read_options = READ_OPTIONS_INHERIT_ALL;
2243
2244   x = READ_OPTION_MASK & (read_options >> READ_OPTION_KEYWORD_STYLE);
2245   if (x == READ_OPTION_INHERIT)
2246     {
2247       val = SCM_PACK (SCM_KEYWORD_STYLE);
2248       if (scm_is_eq (val, scm_keyword_prefix))
2249         x = KEYWORD_STYLE_PREFIX;
2250       else if (scm_is_eq (val, scm_keyword_postfix))
2251         x = KEYWORD_STYLE_POSTFIX;
2252       else
2253         x = KEYWORD_STYLE_HASH_PREFIX;
2254     }
2255   opts->keyword_style = x;
2256
2257 #define RESOLVE_BOOLEAN_OPTION(NAME, name)                              \
2258   do                                                                    \
2259     {                                                                   \
2260       x = READ_OPTION_MASK & (read_options >> READ_OPTION_ ## NAME);    \
2261       if (x == READ_OPTION_INHERIT)                                     \
2262         x = !!SCM_ ## NAME;                                             \
2263           opts->name = x;                                               \
2264     }                                                                   \
2265   while (0)
2266
2267   RESOLVE_BOOLEAN_OPTION (COPY_SOURCE_P,        copy_source_p);
2268   RESOLVE_BOOLEAN_OPTION (RECORD_POSITIONS_P,   record_positions_p);
2269   RESOLVE_BOOLEAN_OPTION (CASE_INSENSITIVE_P,   case_insensitive_p);
2270   RESOLVE_BOOLEAN_OPTION (R6RS_ESCAPES_P,       r6rs_escapes_p);
2271   RESOLVE_BOOLEAN_OPTION (SQUARE_BRACKETS_P,    square_brackets_p);
2272   RESOLVE_BOOLEAN_OPTION (HUNGRY_EOL_ESCAPES_P, hungry_eol_escapes_p);
2273   RESOLVE_BOOLEAN_OPTION (CURLY_INFIX_P,        curly_infix_p);
2274
2275 #undef RESOLVE_BOOLEAN_OPTION
2276
2277   opts->neoteric_p = 0;
2278 }
2279
2280 void
2281 scm_init_read ()
2282 {
2283   SCM read_hash_procs;
2284
2285   read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
2286
2287   scm_i_read_hash_procedures =
2288     SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
2289
2290   scm_init_opts (scm_read_options, scm_read_opts);
2291 #include "libguile/read.x"
2292 }
2293
2294 /*
2295   Local Variables:
2296   c-file-style: "gnu"
2297   End:
2298 */