libguile/read.c

   1 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
   2  *   2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public License
   6  * as published by the Free Software Foundation; either version 3 of
   7  * the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful, but
  10  * WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  17  * 02110-1301 USA
  18  */
  19
  20
  21 \f
  22
  23 #ifdef HAVE_CONFIG_H
  24 # include <config.h>
  25 #endif
  26
  27 #include <stdio.h>
  28 #include <ctype.h>
  29 #include <string.h>
  30 #include <unistd.h>
  31 #include <unicase.h>
  32 #include <unictype.h>
  33
  34 #include "libguile/_scm.h"
  35 #include "libguile/bytevectors.h"
  36 #include "libguile/chars.h"
  37 #include "libguile/eval.h"
  38 #include "libguile/arrays.h"
  39 #include "libguile/bitvectors.h"
  40 #include "libguile/keywords.h"
  41 #include "libguile/alist.h"
  42 #include "libguile/srcprop.h"
  43 #include "libguile/hashtab.h"
  44 #include "libguile/hash.h"
  45 #include "libguile/ports.h"
  46 #include "libguile/fports.h"
  47 #include "libguile/root.h"
  48 #include "libguile/strings.h"
  49 #include "libguile/strports.h"
  50 #include "libguile/vectors.h"
  51 #include "libguile/validate.h"
  52 #include "libguile/srfi-4.h"
  53 #include "libguile/srfi-13.h"
  54
  55 #include "libguile/read.h"
  56 #include "libguile/private-options.h"
  57
  58
  59 \f
  60
  61 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
  62 SCM_SYMBOL (scm_keyword_prefix, "prefix");
  63 SCM_SYMBOL (scm_keyword_postfix, "postfix");
  64 SCM_SYMBOL (sym_nil, "nil");
  65
  66 scm_t_option scm_read_opts[] = {
     /* Reader option table; it is handed to `scm_options' by
        `read-options-interface'.  Each entry looks like
        { option type, option name, initial value, description }.
        NOTE(review): confirm the field order against the declaration
        of `scm_t_option'.  */
  67   { SCM_OPTION_BOOLEAN, "copy", 0,
  68     "Copy source code expressions." },
  69   { SCM_OPTION_BOOLEAN, "positions", 1,
  70     "Record positions of source code expressions." },
  71   { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
  72     "Convert symbols to lower case."},
  73   { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
  74     "Style of keyword recognition: #f, 'prefix or 'postfix."},
  75   { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
  76     "Use R6RS variable-length character and string hex escapes."},
  77   { SCM_OPTION_BOOLEAN, "square-brackets", 1,
  78     "Treat `[' and `]' as parentheses, for R6RS compatibility."},
  79   { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
  80     "In strings, consume leading whitespace after an escaped end-of-line."},
     /* All-zero final entry.  NOTE(review): presumably the end-of-table
        sentinel expected by `scm_options' -- confirm.  */
  81   { 0, },
  82 };
  83
  84 /*
  85   Give meaningful error messages for errors
  86
  87   We use the format
  88
  89   FILE:LINE:COL: MESSAGE
  90   This happened in ....
  91
  92   This is not standard GNU format, but the test-suite likes the real
  93   message to be in front.
  94
  95  */
  96
  97
  98 void
  99 scm_i_input_error (char const *function,
 100                    SCM port, const char *message, SCM arg)
 101 {
 102   SCM fn = (scm_is_string (SCM_FILENAME(port))
 103             ? SCM_FILENAME(port)
 104             : scm_from_locale_string ("#<unknown port>"));
 105
 106   SCM string_port = scm_open_output_string ();
 107   SCM string = SCM_EOL;
 108   scm_simple_format (string_port,
 109                      scm_from_locale_string ("~A:~S:~S: ~A"),
 110                      scm_list_4 (fn,
 111                                  scm_from_long (SCM_LINUM (port) + 1),
 112                                  scm_from_int (SCM_COL (port) + 1),
 113                                  scm_from_locale_string (message)));
 114
 115   string = scm_get_output_string (string_port);
 116   scm_close_output_port (string_port);
 117   scm_error_scm (scm_from_latin1_symbol ("read-error"),
 118                  function? scm_from_locale_string (function) : SCM_BOOL_F,
 119                  string,
 120                  arg,
 121                  SCM_BOOL_F);
 122 }
 123
 124
 125 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
 126             (SCM setting),
 127             "Option interface for the read options. Instead of using\n"
 128             "this procedure directly, use the procedures @code{read-enable},\n"
 129             "@code{read-disable}, @code{read-set!} and @code{read-options}.")
 130 #define FUNC_NAME s_scm_read_options
 131 {
 132   SCM ans = scm_options (setting,
 133                          scm_read_opts,
 134                          FUNC_NAME);
 135   if (SCM_COPY_SOURCE_P)
 136     SCM_RECORD_POSITIONS_P = 1;
 137   return ans;
 138 }
 139 #undef FUNC_NAME
 140
 141 /* A fluid referring to an association list mapping extra hash
 142    characters to procedures.  */
 143 static SCM *scm_i_read_hash_procedures;
 144
 145 static SCM
 146 scm_i_read_hash_procedures_ref (void)
 147 {
 148   return scm_fluid_ref (*scm_i_read_hash_procedures);
 149 }
 150
 151 static void
 152 scm_i_read_hash_procedures_set_x (SCM value)
 153 {
 154   scm_fluid_set_x (*scm_i_read_hash_procedures, value);
 155 }
 156
 157 \f
 158 /* Token readers.  */
 159
 160
 161 /* Size of the C buffer used to read symbols and numbers.  */
 162 #define READER_BUFFER_SIZE            128
 163
 164 /* Number of 32-bit codepoints in the buffer used to read strings.  */
 165 #define READER_STRING_BUFFER_SIZE     128
 166
 167 /* The maximum size of Scheme character names.  */
 168 #define READER_CHAR_NAME_MAX_SIZE      50
 169
 170
 171 /* `isblank' is only in C99.  */
 #define CHAR_IS_BLANK_(_chr)                                    \
   (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n')      \
    || ((_chr) == '\f') || ((_chr) == '\r'))

 /* On MS-DOS builds, character code 26 additionally counts as blank.
    The macro must only refer to its own parameter `_chr'; it used to
    expand `chr', silently capturing whatever variable of that name
    happened to be in scope at the call site.  */
 #ifdef MSDOS
 # define CHAR_IS_BLANK(_chr)                    \
   ((CHAR_IS_BLANK_ (_chr)) || ((_chr) == 26))
 #else
 # define CHAR_IS_BLANK CHAR_IS_BLANK_
 #endif


 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
    structure'').  `[' and `]' are delimiters only while the
    `square-brackets' read option is enabled.  The argument is
    parenthesized at every use so that expression arguments are
    compared as a whole.  Note that the argument is evaluated several
    times, so it must not have side effects.  */
 #define CHAR_IS_R5RS_DELIMITER(c)                                       \
   (CHAR_IS_BLANK (c)                                                    \
    || ((c) == ')') || ((c) == '(') || ((c) == ';') || ((c) == '"')      \
    || (SCM_SQUARE_BRACKETS_P && (((c) == '[') || ((c) == ']'))))

 #define CHAR_IS_DELIMITER  CHAR_IS_R5RS_DELIMITER

 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
    Structure''.  Only the lower-case letters are recognized.  */
 #define CHAR_IS_EXPONENT_MARKER(_chr)                           \
   (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f')        \
    || ((_chr) == 'd') || ((_chr) == 'l'))
 198
 199 /* Forward declarations of the block-comment, commented-expression and
        shebang readers, and of the read-hash procedure lookup.  */
 200 static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
 201 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
 202 static SCM scm_read_commented_expression (scm_t_wchar, SCM);
 203 static SCM scm_read_shebang (scm_t_wchar, SCM);
 204 static SCM scm_get_hash_procedure (int);
 205
 206 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 207    result in the pre-allocated buffer BUF.  Return zero if the whole token has
 208    fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
 209    bytes actually read.  */
 210 static int
 211 read_token (SCM port, char *buf, size_t buf_size, size_t *read)
 212 {
 213    *read = 0;
 214
 215    while (*read < buf_size)
 216      {
 217        int chr;
 218
 219        chr = scm_get_byte_or_eof (port);
 220
 221        if (chr == EOF)
 222         return 0;
 223       else if (CHAR_IS_DELIMITER (chr))
 224         {
 225           scm_unget_byte (chr, port);
 226           return 0;
 227         }
 228       else
 229         {
 230           *buf = (char) chr;
 231           buf++, (*read)++;
 232         }
 233      }
 234
 235    return 1;
 236  }
 237
 238 /* Like `read_token', but return either BUFFER, or a GC-allocated buffer
 239    if the token doesn't fit in BUFFER_SIZE bytes.  */
 240 static char *
 241 read_complete_token (SCM port, char *buffer, size_t buffer_size,
 242                      size_t *read)
 243 {
 244   int overflow = 0;
 245   size_t bytes_read, overflow_size = 0;
 246   char *overflow_buffer = NULL;
 247
 248   do
 249     {
 250       overflow = read_token (port, buffer, buffer_size, &bytes_read);
 251       if (bytes_read == 0)
 252         break;
 253       if (overflow || overflow_size != 0)
 254         {
 255           if (overflow_size == 0)
 256             {
 257               overflow_buffer = scm_gc_malloc_pointerless (bytes_read, "read");
 258               memcpy (overflow_buffer, buffer, bytes_read);
 259               overflow_size = bytes_read;
 260             }
 261           else
 262             {
 263               char *new_buf =
 264                 scm_gc_malloc_pointerless (overflow_size + bytes_read, "read");
 265
 266               memcpy (new_buf, overflow_buffer, overflow_size);
 267               memcpy (new_buf + overflow_size, buffer, bytes_read);
 268
 269               overflow_buffer = new_buf;
 270               overflow_size += bytes_read;
 271             }
 272         }
 273     }
 274   while (overflow);
 275
 276   if (overflow_size)
 277     *read = overflow_size;
 278   else
 279     *read = bytes_read;
 280
 281   return (overflow_size > 0 ? overflow_buffer : buffer);
 282 }
 283
 284 /* Skip whitespace from PORT and return the first non-whitespace character
 285    read.  Raise an error on end-of-file.  */
 286 static int
 287 flush_ws (SCM port, const char *eoferr)
 288 {
 289   scm_t_wchar c;
 290   while (1)
 291     switch (c = scm_getc (port))
 292       {
 293       case EOF:
 294       goteof:
 295         if (eoferr)
 296           {
 297             scm_i_input_error (eoferr,
 298                                port,
 299                                "end of file",
 300                                SCM_EOL);
 301           }
 302         return c;
 303
 304       case ';':
 305       lp:
 306         switch (c = scm_getc (port))
 307           {
 308           case EOF:
 309             goto goteof;
 310           default:
 311             goto lp;
 312           case SCM_LINE_INCREMENTORS:
 313             break;
 314           }
 315         break;
 316
 317       case '#':
 318         switch (c = scm_getc (port))
 319           {
 320           case EOF:
 321             eoferr = "read_sharp";
 322             goto goteof;
 323           case '!':
 324             scm_read_shebang (c, port);
 325             break;
 326           case ';':
 327             scm_read_commented_expression (c, port);
 328             break;
 329           case '|':
 330             if (scm_is_false (scm_get_hash_procedure (c)))
 331               {
 332                 scm_read_r6rs_block_comment (c, port);
 333                 break;
 334               }
 335             /* fall through */
 336           default:
 337             scm_ungetc (c, port);
 338             return '#';
 339           }
 340         break;
 341
 342       case SCM_LINE_INCREMENTORS:
 343       case SCM_SINGLE_SPACES:
 344       case '\t':
 345         break;
 346
 347       default:
 348         return c;
 349       }
 350
 351   return 0;
 352 }
 353
 354
 355 \f
 356 /* Token readers.  */
 357
 358 static SCM scm_read_expression (SCM port);
 359 static SCM scm_read_sharp (int chr, SCM port, long line, int column);
 360
 361
 362 static SCM
 363 maybe_annotate_source (SCM x, SCM port, long line, int column)
 364 {
 365   if (SCM_RECORD_POSITIONS_P)
 366     scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
 367   return x;
 368 }
 369
 370 static SCM
 371 scm_read_sexp (scm_t_wchar chr, SCM port)
 372 #define FUNC_NAME "scm_i_lreadparen"
 373 {
 374   int c;
 375   SCM tmp, tl, ans = SCM_EOL;
 376   const int terminating_char = ((chr == '[') ? ']' : ')');
 377
 378   /* Need to capture line and column numbers here. */
 379   long line = SCM_LINUM (port);
 380   int column = SCM_COL (port) - 1;
 381
 382   c = flush_ws (port, FUNC_NAME);
 383   if (terminating_char == c)
 384     return SCM_EOL;
 385
 386   scm_ungetc (c, port);
 387   tmp = scm_read_expression (port);
 388
 389   /* Note that it is possible for scm_read_expression to return
 390      scm_sym_dot, but not as part of a dotted pair: as in #{.}#.  So
 391      check that it's a real dot by checking `c'.  */
 392   if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
 393     {
 394       ans = scm_read_expression (port);
 395       if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
 396         scm_i_input_error (FUNC_NAME, port, "missing close paren",
 397                            SCM_EOL);
 398       return ans;
 399     }
 400
 401   /* Build the head of the list structure. */
 402   ans = tl = scm_cons (tmp, SCM_EOL);
 403
 404   while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
 405     {
 406       SCM new_tail;
 407
 408       if (c == ')' || (SCM_SQUARE_BRACKETS_P && c == ']'))
 409         scm_i_input_error (FUNC_NAME, port,
 410                            "in pair: mismatched close paren: ~A",
 411                            scm_list_1 (SCM_MAKE_CHAR (c)));
 412
 413       scm_ungetc (c, port);
 414       tmp = scm_read_expression (port);
 415
 416       /* See above note about scm_sym_dot.  */
 417       if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
 418         {
 419           SCM_SETCDR (tl, scm_read_expression (port));
 420
 421           c = flush_ws (port, FUNC_NAME);
 422           if (terminating_char != c)
 423             scm_i_input_error (FUNC_NAME, port,
 424                                "in pair: missing close paren", SCM_EOL);
 425           goto exit;
 426         }
 427
 428       new_tail = scm_cons (tmp, SCM_EOL);
 429       SCM_SETCDR (tl, new_tail);
 430       tl = new_tail;
 431     }
 432
 433  exit:
 434   return maybe_annotate_source (ans, port, line, column);
 435 }
 436 #undef FUNC_NAME
 437
 438
 /* Read a hexadecimal number of at most NDIGITS digits from `port' and
    put its value into the variable `c'.  If TERMINATOR is non-null,
    stop early (consuming it) when the TERMINATOR character is found
    after at least one digit.

    This macro relies on its expansion context: a variable `port', an
    lvalue `c', and the labels `str_eof' (jumped to on end of file) and
    `bad_escaped' (jumped to, with `c' set to the offending character,
    when a non-hexadecimal character is read).

    The parameters are parenthesized at every use, and the temporaries
    carry a `hex_' prefix and trailing underscore so that they are
    unlikely to shadow names appearing in the arguments.  */
 #define SCM_READ_HEX_ESCAPE(ndigits, terminator)                   \
   do                                                               \
     {                                                              \
       scm_t_wchar hex_chr_;                                        \
       size_t hex_count_ = 0;                                       \
       c = 0;                                                       \
       while (hex_count_ < (ndigits))                               \
         {                                                          \
           hex_chr_ = scm_getc (port);                              \
           if (hex_chr_ == EOF)                                     \
             goto str_eof;                                          \
           if ((terminator)                                         \
               && (hex_chr_ == (scm_t_wchar) (terminator))          \
               && (hex_count_ > 0))                                 \
             break;                                                 \
           if ('0' <= hex_chr_ && hex_chr_ <= '9')                  \
             hex_chr_ -= '0';                                       \
           else if ('A' <= hex_chr_ && hex_chr_ <= 'F')             \
             hex_chr_ = hex_chr_ - 'A' + 10;                        \
           else if ('a' <= hex_chr_ && hex_chr_ <= 'f')             \
             hex_chr_ = hex_chr_ - 'a' + 10;                        \
           else                                                     \
             {                                                      \
               c = hex_chr_;                                        \
               goto bad_escaped;                                    \
             }                                                      \
           c = c * 16 + hex_chr_;                                   \
           hex_count_ ++;                                           \
         }                                                          \
     } while (0)
 472
 473 static void
 474 skip_intraline_whitespace (SCM port)
 475 {
 476   scm_t_wchar c;
 477
 478   do
 479     {
 480       c = scm_getc (port);
 481       if (c == EOF)
 482         return;
 483     }
 484   while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
 485
 486   scm_ungetc (c, port);
 487 }
 488
 489 static SCM
 490 scm_read_string (int chr, SCM port)
 491 #define FUNC_NAME "scm_lreadr"
 492 {
 493   /* For strings smaller than C_STR, this function creates only one Scheme
 494      object (the string returned).  */
 495
 496   SCM str = SCM_EOL;
 497   size_t c_str_len = 0;
 498   scm_t_wchar c, c_str[READER_STRING_BUFFER_SIZE];
 499
 500   /* Need to capture line and column numbers here. */
 501   long line = SCM_LINUM (port);
 502   int column = SCM_COL (port) - 1;
 503
 504   while ('"' != (c = scm_getc (port)))
 505     {
 506       if (c == EOF)
 507         {
 508         str_eof:
 509           scm_i_input_error (FUNC_NAME, port,
 510                              "end of file in string constant", SCM_EOL);
 511         }
 512
 513       if (c_str_len + 1 >= READER_STRING_BUFFER_SIZE)
 514         {
 515           str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
 516           c_str_len = 0;
 517         }
 518
 519       if (c == '\\')
 520         {
 521           switch (c = scm_getc (port))
 522             {
 523             case EOF:
 524               goto str_eof;
 525             case '"':
 526             case '\\':
 527               break;
 528             case '\n':
 529               if (SCM_HUNGRY_EOL_ESCAPES_P)
 530                 skip_intraline_whitespace (port);
 531               continue;
 532             case '0':
 533               c = '\0';
 534               break;
 535             case 'f':
 536               c = '\f';
 537               break;
 538             case 'n':
 539               c = '\n';
 540               break;
 541             case 'r':
 542               c = '\r';
 543               break;
 544             case 't':
 545               c = '\t';
 546               break;
 547             case 'a':
 548               c = '\007';
 549               break;
 550             case 'v':
 551               c = '\v';
 552               break;
 553             case 'b':
 554               c = '\010';
 555               break;
 556             case 'x':
 557               if (SCM_R6RS_ESCAPES_P)
 558                 SCM_READ_HEX_ESCAPE (10, ';');
 559               else
 560                 SCM_READ_HEX_ESCAPE (2, '\0');
 561               break;
 562             case 'u':
 563               if (!SCM_R6RS_ESCAPES_P)
 564                 {
 565                   SCM_READ_HEX_ESCAPE (4, '\0');
 566                   break;
 567                 }
 568             case 'U':
 569               if (!SCM_R6RS_ESCAPES_P)
 570                 {
 571                   SCM_READ_HEX_ESCAPE (6, '\0');
 572                   break;
 573                 }
 574             default:
 575             bad_escaped:
 576               scm_i_input_error (FUNC_NAME, port,
 577                                  "illegal character in escape sequence: ~S",
 578                                  scm_list_1 (SCM_MAKE_CHAR (c)));
 579             }
 580         }
 581
 582       c_str[c_str_len++] = c;
 583     }
 584
 585   if (scm_is_null (str))
 586     /* Fast path: we got a string that fits in C_STR.  */
 587     str = scm_from_utf32_stringn (c_str, c_str_len);
 588   else
 589     {
 590       if (c_str_len > 0)
 591         str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
 592
 593       str = scm_string_concatenate_reverse (str, SCM_UNDEFINED, SCM_UNDEFINED);
 594     }
 595
 596   return maybe_annotate_source (str, port, line, column);
 597 }
 598 #undef FUNC_NAME
 599
 600
 601 static SCM
 602 scm_read_number (scm_t_wchar chr, SCM port)
 603 {
 604   SCM result, str = SCM_EOL;
 605   char local_buffer[READER_BUFFER_SIZE], *buffer;
 606   size_t bytes_read;
 607   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 608
 609   /* Need to capture line and column numbers here. */
 610   long line = SCM_LINUM (port);
 611   int column = SCM_COL (port) - 1;
 612
 613   scm_ungetc (chr, port);
 614   buffer = read_complete_token (port, local_buffer, sizeof local_buffer,
 615                                 &bytes_read);
 616
 617   str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
 618
 619   result = scm_string_to_number (str, SCM_UNDEFINED);
 620   if (scm_is_false (result))
 621     {
 622       /* Return a symbol instead of a number */
 623       if (SCM_CASE_INSENSITIVE_P)
 624         str = scm_string_downcase_x (str);
 625       result = scm_string_to_symbol (str);
 626     }
 627   else if (SCM_NIMP (result))
 628     result = maybe_annotate_source (result, port, line, column);
 629
 630   SCM_COL (port) += scm_i_string_length (str);
 631   return result;
 632 }
 633
 634 static SCM
 635 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
 636 {
 637   SCM result;
 638   int ends_with_colon = 0;
 639   size_t bytes_read;
 640   int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
 641   char local_buffer[READER_BUFFER_SIZE], *buffer;
 642   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 643   SCM str;
 644
 645   scm_ungetc (chr, port);
 646   buffer = read_complete_token (port, local_buffer, sizeof local_buffer,
 647                                 &bytes_read);
 648   if (bytes_read > 0)
 649     ends_with_colon = buffer[bytes_read - 1] == ':';
 650
 651   if (postfix && ends_with_colon && (bytes_read > 1))
 652     {
 653       str = scm_from_stringn (buffer, bytes_read - 1,
 654                               pt->encoding, pt->ilseq_handler);
 655
 656       if (SCM_CASE_INSENSITIVE_P)
 657         str = scm_string_downcase_x (str);
 658       result = scm_symbol_to_keyword (scm_string_to_symbol (str));
 659     }
 660   else
 661     {
 662       str = scm_from_stringn (buffer, bytes_read,
 663                               pt->encoding, pt->ilseq_handler);
 664
 665       if (SCM_CASE_INSENSITIVE_P)
 666         str = scm_string_downcase_x (str);
 667       result = scm_string_to_symbol (str);
 668     }
 669
 670   SCM_COL (port) += scm_i_string_length (str);
 671   return result;
 672 }
 673
 674 static SCM
 675 scm_read_number_and_radix (scm_t_wchar chr, SCM port)
 676 #define FUNC_NAME "scm_lreadr"
 677 {
 678   SCM result;
 679   size_t read;
 680   char local_buffer[READER_BUFFER_SIZE], *buffer;
 681   unsigned int radix;
 682   SCM str;
 683   scm_t_port *pt;
 684
 685   switch (chr)
 686     {
 687     case 'B':
 688     case 'b':
 689       radix = 2;
 690       break;
 691
 692     case 'o':
 693     case 'O':
 694       radix = 8;
 695       break;
 696
 697     case 'd':
 698     case 'D':
 699       radix = 10;
 700       break;
 701
 702     case 'x':
 703     case 'X':
 704       radix = 16;
 705       break;
 706
 707     default:
 708       scm_ungetc (chr, port);
 709       scm_ungetc ('#', port);
 710       radix = 10;
 711     }
 712
 713   buffer = read_complete_token (port, local_buffer, sizeof local_buffer,
 714                                 &read);
 715
 716   pt = SCM_PTAB_ENTRY (port);
 717   str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
 718
 719   result = scm_string_to_number (str, scm_from_uint (radix));
 720
 721   SCM_COL (port) += scm_i_string_length (str);
 722
 723   if (scm_is_true (result))
 724     return result;
 725
 726   scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
 727
 728   return SCM_BOOL_F;
 729 }
 730 #undef FUNC_NAME
 731
 732 static SCM
 733 scm_read_quote (int chr, SCM port)
 734 {
 735   SCM p;
 736   long line = SCM_LINUM (port);
 737   int column = SCM_COL (port) - 1;
 738
 739   switch (chr)
 740     {
 741     case '`':
 742       p = scm_sym_quasiquote;
 743       break;
 744
 745     case '\'':
 746       p = scm_sym_quote;
 747       break;
 748
 749     case ',':
 750       {
 751         scm_t_wchar c;
 752
 753         c = scm_getc (port);
 754         if ('@' == c)
 755           p = scm_sym_uq_splicing;
 756         else
 757           {
 758             scm_ungetc (c, port);
 759             p = scm_sym_unquote;
 760           }
 761         break;
 762       }
 763
 764     default:
 765       fprintf (stderr, "%s: unhandled quote character (%i)\n",
 766                "scm_read_quote", chr);
 767       abort ();
 768     }
 769
 770   p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
 771   return maybe_annotate_source (p, port, line, column);
 772 }
 773
 774 SCM_SYMBOL (sym_syntax, "syntax");
 775 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
 776 SCM_SYMBOL (sym_unsyntax, "unsyntax");
 777 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
 778
 779 static SCM
 780 scm_read_syntax (int chr, SCM port)
 781 {
 782   SCM p;
 783   long line = SCM_LINUM (port);
 784   int column = SCM_COL (port) - 1;
 785
 786   switch (chr)
 787     {
 788     case '`':
 789       p = sym_quasisyntax;
 790       break;
 791
 792     case '\'':
 793       p = sym_syntax;
 794       break;
 795
 796     case ',':
 797       {
 798         int c;
 799
 800         c = scm_getc (port);
 801         if ('@' == c)
 802           p = sym_unsyntax_splicing;
 803         else
 804           {
 805             scm_ungetc (c, port);
 806             p = sym_unsyntax;
 807           }
 808         break;
 809       }
 810
 811     default:
 812       fprintf (stderr, "%s: unhandled syntax character (%i)\n",
 813                "scm_read_syntax", chr);
 814       abort ();
 815     }
 816
 817   p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
 818   return maybe_annotate_source (p, port, line, column);
 819 }
 820
 821 static SCM
 822 scm_read_nil (int chr, SCM port)
 823 {
 824   SCM id = scm_read_mixed_case_symbol (chr, port);
 825
 826   if (!scm_is_eq (id, sym_nil))
 827     scm_i_input_error ("scm_read_nil", port,
 828                        "unexpected input while reading #nil: ~a",
 829                        scm_list_1 (id));
 830
 831   return SCM_ELISP_NIL;
 832 }
 833
 834 static SCM
 835 scm_read_semicolon_comment (int chr, SCM port)
 836 {
 837   int c;
 838
 839   /* We use the get_byte here because there is no need to get the
 840      locale correct with comment input. This presumes that newline
 841      always represents itself no matter what the encoding is.  */
 842   for (c = scm_get_byte_or_eof (port);
 843        (c != EOF) && (c != '\n');
 844        c = scm_get_byte_or_eof (port));
 845
 846   return SCM_UNSPECIFIED;
 847 }
 848
 849 \f
 850 /* Sharp readers, i.e. readers called after a `#' sign has been read.  */
 851
 852 static SCM
 853 scm_read_boolean (int chr, SCM port)
 854 {
 855   switch (chr)
 856     {
 857     case 't':
 858     case 'T':
 859       return SCM_BOOL_T;
 860
 861     case 'f':
 862     case 'F':
 863       return SCM_BOOL_F;
 864     }
 865
 866   return SCM_UNSPECIFIED;
 867 }
 868
 869 static SCM
 870 scm_read_character (scm_t_wchar chr, SCM port)
 871 #define FUNC_NAME "scm_lreadr"
 872 {
 873   char buffer[READER_CHAR_NAME_MAX_SIZE];
 874   SCM charname;
 875   size_t charname_len, bytes_read;
 876   scm_t_wchar cp;
 877   int overflow;
 878   scm_t_port *pt;
 879
 880   overflow = read_token (port, buffer, READER_CHAR_NAME_MAX_SIZE, &bytes_read);
 881   if (overflow)
 882     scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
 883
 884   if (bytes_read == 0)
 885     {
 886       chr = scm_getc (port);
 887       if (chr == EOF)
 888         scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
 889                            "while reading character", SCM_EOL);
 890
 891       /* CHR must be a token delimiter, like a whitespace.  */
 892       return (SCM_MAKE_CHAR (chr));
 893     }
 894
 895   pt = SCM_PTAB_ENTRY (port);
 896
 897   /* Simple ASCII characters can be processed immediately.  Also, simple
 898      ISO-8859-1 characters can be processed immediately if the encoding for this
 899      port is ISO-8859-1.  */
 900   if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
 901     {
 902       SCM_COL (port) += 1;
 903       return SCM_MAKE_CHAR (buffer[0]);
 904     }
 905
 906   /* Otherwise, convert the buffer into a proper scheme string for
 907      processing.  */
 908   charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
 909                                pt->ilseq_handler);
 910   charname_len = scm_i_string_length (charname);
 911   SCM_COL (port) += charname_len;
 912   cp = scm_i_string_ref (charname, 0);
 913   if (charname_len == 1)
 914     return SCM_MAKE_CHAR (cp);
 915
 916   /* Ignore dotted circles, which may be used to keep combining characters from
 917      combining with the backslash in #\charname.  */
 918   if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
 919     return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
 920
 921   if (cp >= '0' && cp < '8')
 922     {
 923       /* Dirk:FIXME::  This type of character syntax is not R5RS
 924        * compliant.  Further, it should be verified that the constant
 925        * does only consist of octal digits.  */
 926       SCM p = scm_string_to_number (charname, scm_from_uint (8));
 927       if (SCM_I_INUMP (p))
 928         {
 929           scm_t_wchar c = scm_to_uint32 (p);
 930           if (SCM_IS_UNICODE_CHAR (c))
 931             return SCM_MAKE_CHAR (c);
 932           else
 933             scm_i_input_error (FUNC_NAME, port,
 934                                "out-of-range octal character escape: ~a",
 935                                scm_list_1 (charname));
 936         }
 937     }
 938
 939   if (cp == 'x' && (charname_len > 1))
 940     {
 941       SCM p;
 942
 943       /* Convert from hex, skipping the initial 'x' character in CHARNAME */
 944       p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
 945                                 scm_from_uint (16));
 946       if (SCM_I_INUMP (p))
 947         {
 948           scm_t_wchar c = scm_to_uint32 (p);
 949           if (SCM_IS_UNICODE_CHAR (c))
 950             return SCM_MAKE_CHAR (c);
 951           else
 952             scm_i_input_error (FUNC_NAME, port,
 953                                "out-of-range hex character escape: ~a",
 954                                scm_list_1 (charname));
 955         }
 956     }
 957
 958   /* The names of characters should never have non-Latin1
 959      characters.  */
 960   if (scm_i_is_narrow_string (charname)
 961       || scm_i_try_narrow_string (charname))
 962     { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
 963                                        charname_len);
 964       if (scm_is_true (ch))
 965         return ch;
 966     }
 967
 968   scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
 969                      scm_list_1 (charname));
 970
 971   return SCM_UNSPECIFIED;
 972 }
 973 #undef FUNC_NAME
 974
 975 static SCM
 976 scm_read_keyword (int chr, SCM port)
 977 {
 978   SCM symbol;
 979
 980   /* Read the symbol that comprises the keyword.  Doing this instead of
 981      invoking a specific symbol reader function allows `scm_read_keyword ()'
 982      to adapt to the delimiters currently valid of symbols.
 983
 984      XXX: This implementation allows sloppy syntaxes like `#:  key'.  */
 985   symbol = scm_read_expression (port);
 986   if (!scm_is_symbol (symbol))
 987     scm_i_input_error ("scm_read_keyword", port,
 988                        "keyword prefix `~a' not followed by a symbol: ~s",
 989                        scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
 990
 991   return (scm_symbol_to_keyword (symbol));
 992 }
 993
 994 static SCM
 995 scm_read_vector (int chr, SCM port, long line, int column)
 996 {
 997   /* Note: We call `scm_read_sexp ()' rather than READER here in order to
 998      guarantee that it's going to do what we want.  After all, this is an
 999      implementation detail of `scm_read_vector ()', not a desirable
1000      property.  */
1001   return maybe_annotate_source (scm_vector (scm_read_sexp (chr, port)),
1002                                 port, line, column);
1003 }
1004
1005 static SCM
1006 scm_read_array (int chr, SCM port, long line, int column)
1007 {
1008   SCM result = scm_i_read_array (port, chr);
1009   if (scm_is_false (result))
1010     return result;
1011   else
1012     return maybe_annotate_source (result, port, line, column);
1013 }
1014
1015 static SCM
1016 scm_read_srfi4_vector (int chr, SCM port, long line, int column)
1017 {
1018   return scm_read_array (chr, port, line, column);
1019 }
1020
1021 static SCM
1022 scm_read_bytevector (scm_t_wchar chr, SCM port, long line, int column)
1023 {
1024   chr = scm_getc (port);
1025   if (chr != 'u')
1026     goto syntax;
1027
1028   chr = scm_getc (port);
1029   if (chr != '8')
1030     goto syntax;
1031
1032   chr = scm_getc (port);
1033   if (chr != '(')
1034     goto syntax;
1035
1036   return maybe_annotate_source
1037     (scm_u8_list_to_bytevector (scm_read_sexp (chr, port)),
1038      port, line, column);
1039
1040  syntax:
1041   scm_i_input_error ("read_bytevector", port,
1042                      "invalid bytevector prefix",
1043                      SCM_MAKE_CHAR (chr));
1044   return SCM_UNSPECIFIED;
1045 }
1046
1047 static SCM
1048 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port, long line, int column)
1049 {
1050   /* Read the `#*10101'-style read syntax for bit vectors in Guile.  This is
1051      terribly inefficient but who cares?  */
1052   SCM s_bits = SCM_EOL;
1053
1054   for (chr = scm_getc (port);
1055        (chr != EOF) && ((chr == '0') || (chr == '1'));
1056        chr = scm_getc (port))
1057     {
1058       s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1059     }
1060
1061   if (chr != EOF)
1062     scm_ungetc (chr, port);
1063
1064   return maybe_annotate_source
1065     (scm_bitvector (scm_reverse_x (s_bits, SCM_EOL)),
1066      port, line, column);
1067 }
1068
1069 static SCM
1070 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1071 {
1072   int bang_seen = 0;
1073
1074   for (;;)
1075     {
1076       int c = scm_getc (port);
1077
1078       if (c == EOF)
1079         scm_i_input_error ("skip_block_comment", port,
1080                            "unterminated `#! ... !#' comment", SCM_EOL);
1081
1082       if (c == '!')
1083         bang_seen = 1;
1084       else if (c == '#' && bang_seen)
1085         break;
1086       else
1087         bang_seen = 0;
1088     }
1089
1090   return SCM_UNSPECIFIED;
1091 }
1092
1093 static SCM
1094 scm_read_shebang (scm_t_wchar chr, SCM port)
1095 {
1096   int c = 0;
1097   if ((c = scm_get_byte_or_eof (port)) != 'r')
1098     {
1099       scm_ungetc (c, port);
1100       return scm_read_scsh_block_comment (chr, port);
1101     }
1102   if ((c = scm_get_byte_or_eof (port)) != '6')
1103     {
1104       scm_ungetc (c, port);
1105       scm_ungetc ('r', port);
1106       return scm_read_scsh_block_comment (chr, port);
1107     }
1108   if ((c = scm_get_byte_or_eof (port)) != 'r')
1109     {
1110       scm_ungetc (c, port);
1111       scm_ungetc ('6', port);
1112       scm_ungetc ('r', port);
1113       return scm_read_scsh_block_comment (chr, port);
1114     }
1115   if ((c = scm_get_byte_or_eof (port)) != 's')
1116     {
1117       scm_ungetc (c, port);
1118       scm_ungetc ('r', port);
1119       scm_ungetc ('6', port);
1120       scm_ungetc ('r', port);
1121       return scm_read_scsh_block_comment (chr, port);
1122     }
1123
1124   return SCM_UNSPECIFIED;
1125 }
1126
1127 static SCM
1128 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1129 {
1130   /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1131      nested.  So care must be taken.  */
1132   int nesting_level = 1;
1133
1134   int a = scm_getc (port);
1135
1136   if (a == EOF)
1137     scm_i_input_error ("scm_read_r6rs_block_comment", port,
1138                        "unterminated `#| ... |#' comment", SCM_EOL);
1139
1140   while (nesting_level > 0)
1141     {
1142       int b = scm_getc (port);
1143
1144       if (b == EOF)
1145         scm_i_input_error ("scm_read_r6rs_block_comment", port,
1146                            "unterminated `#| ... |#' comment", SCM_EOL);
1147
1148       if (a == '|' && b == '#')
1149         {
1150           nesting_level--;
1151           b = EOF;
1152         }
1153       else if (a == '#' && b == '|')
1154         {
1155           nesting_level++;
1156           b = EOF;
1157         }
1158
1159       a = b;
1160     }
1161
1162   return SCM_UNSPECIFIED;
1163 }
1164
1165 static SCM
1166 scm_read_commented_expression (scm_t_wchar chr, SCM port)
1167 {
1168   scm_t_wchar c;
1169
1170   c = flush_ws (port, (char *) NULL);
1171   if (EOF == c)
1172     scm_i_input_error ("read_commented_expression", port,
1173                        "no expression after #; comment", SCM_EOL);
1174   scm_ungetc (c, port);
1175   scm_read_expression (port);
1176   return SCM_UNSPECIFIED;
1177 }
1178
1179 static SCM
1180 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1181 {
1182   /* Guile's extended symbol read syntax looks like this:
1183
1184        #{This is all a symbol name}#
1185
1186      So here, CHR is expected to be `{'.  */
1187   int saw_brace = 0;
1188   size_t len = 0;
1189   SCM buf = scm_i_make_string (1024, NULL, 0);
1190
1191   buf = scm_i_string_start_writing (buf);
1192
1193   while ((chr = scm_getc (port)) != EOF)
1194     {
1195       if (saw_brace)
1196         {
1197           if (chr == '#')
1198             {
1199               break;
1200             }
1201           else
1202             {
1203               saw_brace = 0;
1204               scm_i_string_set_x (buf, len++, '}');
1205             }
1206         }
1207
1208       if (chr == '}')
1209         saw_brace = 1;
1210       else if (chr == '\\')
1211         {
1212           /* It used to be that print.c would print extended-read-syntax
1213              symbols with backslashes before "non-standard" chars, but
1214              this routine wouldn't do anything with those escapes.
1215              Bummer.  What we've done is to change print.c to output
1216              R6RS hex escapes for those characters, relying on the fact
1217              that the extended read syntax would never put a `\' before
1218              an `x'.  For now, we just ignore other instances of
1219              backslash in the string.  */
1220           switch ((chr = scm_getc (port)))
1221             {
1222             case EOF:
1223               goto done;
1224             case 'x':
1225               {
1226                 scm_t_wchar c;
1227
1228                 SCM_READ_HEX_ESCAPE (10, ';');
1229                 scm_i_string_set_x (buf, len++, c);
1230                 break;
1231
1232               str_eof:
1233                 chr = EOF;
1234                 goto done;
1235
1236               bad_escaped:
1237                 scm_i_string_stop_writing ();
1238                 scm_i_input_error ("scm_read_extended_symbol", port,
1239                                    "illegal character in escape sequence: ~S",
1240                                    scm_list_1 (SCM_MAKE_CHAR (c)));
1241                 break;
1242               }
1243             default:
1244               scm_i_string_set_x (buf, len++, chr);
1245               break;
1246             }
1247         }
1248       else
1249         scm_i_string_set_x (buf, len++, chr);
1250
1251       if (len >= scm_i_string_length (buf) - 2)
1252         {
1253           SCM addy;
1254
1255           scm_i_string_stop_writing ();
1256           addy = scm_i_make_string (1024, NULL, 0);
1257           buf = scm_string_append (scm_list_2 (buf, addy));
1258           len = 0;
1259           buf = scm_i_string_start_writing (buf);
1260         }
1261     }
1262
1263  done:
1264   scm_i_string_stop_writing ();
1265   if (chr == EOF)
1266     scm_i_input_error ("scm_read_extended_symbol", port,
1267                        "end of file while reading symbol", SCM_EOL);
1268
1269   return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1270 }
1271
1272
1273 \f
1274 /* Top-level token readers, i.e., dispatchers.  */
1275
1276 static SCM
1277 scm_read_sharp_extension (int chr, SCM port)
1278 {
1279   SCM proc;
1280
1281   proc = scm_get_hash_procedure (chr);
1282   if (scm_is_true (scm_procedure_p (proc)))
1283     {
1284       long line = SCM_LINUM (port);
1285       int column = SCM_COL (port) - 2;
1286       SCM got;
1287
1288       got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1289
1290       if (scm_is_pair (got) && !scm_i_has_source_properties (got))
1291         scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1292
1293       return got;
1294     }
1295
1296   return SCM_UNSPECIFIED;
1297 }
1298
1299 /* The reader for the sharp `#' character.  It basically dispatches reads
1300    among the above token readers.   */
1301 static SCM
1302 scm_read_sharp (scm_t_wchar chr, SCM port, long line, int column)
1303 #define FUNC_NAME "scm_lreadr"
1304 {
1305   SCM result;
1306
1307   chr = scm_getc (port);
1308
1309   result = scm_read_sharp_extension (chr, port);
1310   if (!scm_is_eq (result, SCM_UNSPECIFIED))
1311     return result;
1312
1313   switch (chr)
1314     {
1315     case '\\':
1316       return (scm_read_character (chr, port));
1317     case '(':
1318       return (scm_read_vector (chr, port, line, column));
1319     case 's':
1320     case 'u':
1321     case 'f':
1322     case 'c':
1323       /* This one may return either a boolean or an SRFI-4 vector.  */
1324       return (scm_read_srfi4_vector (chr, port, line, column));
1325     case 'v':
1326       return (scm_read_bytevector (chr, port, line, column));
1327     case '*':
1328       return (scm_read_guile_bit_vector (chr, port, line, column));
1329     case 't':
1330     case 'T':
1331     case 'F':
1332       return (scm_read_boolean (chr, port));
1333     case ':':
1334       return (scm_read_keyword (chr, port));
1335     case '0': case '1': case '2': case '3': case '4':
1336     case '5': case '6': case '7': case '8': case '9':
1337     case '@':
1338 #if SCM_ENABLE_DEPRECATED
1339       /* See below for 'i' and 'e'. */
1340     case 'a':
1341     case 'y':
1342     case 'h':
1343     case 'l':
1344 #endif
1345       return (scm_read_array (chr, port, line, column));
1346
1347     case 'i':
1348     case 'e':
1349 #if SCM_ENABLE_DEPRECATED
1350       {
1351         /* When next char is '(', it really is an old-style
1352            uniform array. */
1353         scm_t_wchar next_c = scm_getc (port);
1354         if (next_c != EOF)
1355           scm_ungetc (next_c, port);
1356         if (next_c == '(')
1357           return scm_read_array (chr, port, line, column);
1358         /* Fall through. */
1359       }
1360 #endif
1361     case 'b':
1362     case 'B':
1363     case 'o':
1364     case 'O':
1365     case 'd':
1366     case 'D':
1367     case 'x':
1368     case 'X':
1369     case 'I':
1370     case 'E':
1371       return (scm_read_number_and_radix (chr, port));
1372     case '{':
1373       return (scm_read_extended_symbol (chr, port));
1374     case '!':
1375       return (scm_read_shebang (chr, port));
1376     case ';':
1377       return (scm_read_commented_expression (chr, port));
1378     case '`':
1379     case '\'':
1380     case ',':
1381       return (scm_read_syntax (chr, port));
1382     case 'n':
1383       return (scm_read_nil (chr, port));
1384     default:
1385       result = scm_read_sharp_extension (chr, port);
1386       if (scm_is_eq (result, SCM_UNSPECIFIED))
1387         {
1388           /* To remain compatible with 1.8 and earlier, the following
1389              characters have lower precedence than `read-hash-extend'
1390              characters.  */
1391           switch (chr)
1392             {
1393             case '|':
1394               return scm_read_r6rs_block_comment (chr, port);
1395             default:
1396               scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1397                                  scm_list_1 (SCM_MAKE_CHAR (chr)));
1398             }
1399         }
1400       else
1401         return result;
1402     }
1403
1404   return SCM_UNSPECIFIED;
1405 }
1406 #undef FUNC_NAME
1407
1408 static SCM
1409 scm_read_expression (SCM port)
1410 #define FUNC_NAME "scm_read_expression"
1411 {
1412   while (1)
1413     {
1414       scm_t_wchar chr;
1415
1416       chr = scm_getc (port);
1417
1418       switch (chr)
1419         {
1420         case SCM_WHITE_SPACES:
1421         case SCM_LINE_INCREMENTORS:
1422           break;
1423         case ';':
1424           (void) scm_read_semicolon_comment (chr, port);
1425           break;
1426         case '[':
1427           if (!SCM_SQUARE_BRACKETS_P)
1428             return (scm_read_mixed_case_symbol (chr, port));
1429           /* otherwise fall through */
1430         case '(':
1431           return (scm_read_sexp (chr, port));
1432         case '"':
1433           return (scm_read_string (chr, port));
1434         case '\'':
1435         case '`':
1436         case ',':
1437           return (scm_read_quote (chr, port));
1438         case '#':
1439           {
1440             long line  = SCM_LINUM (port);
1441             int column = SCM_COL (port) - 1;
1442             SCM result = scm_read_sharp (chr, port, line, column);
1443             if (scm_is_eq (result, SCM_UNSPECIFIED))
1444               /* We read a comment or some such.  */
1445               break;
1446             else
1447               return result;
1448           }
1449         case ')':
1450           scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1451           break;
1452         case ']':
1453           if (SCM_SQUARE_BRACKETS_P)
1454             scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1455           /* otherwise fall through */
1456         case EOF:
1457           return SCM_EOF_VAL;
1458         case ':':
1459           if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1460             return scm_symbol_to_keyword (scm_read_expression (port));
1461           /* Fall through.  */
1462
1463         default:
1464           {
1465             if (((chr >= '0') && (chr <= '9'))
1466                 || (strchr ("+-.", chr)))
1467               return (scm_read_number (chr, port));
1468             else
1469               return (scm_read_mixed_case_symbol (chr, port));
1470           }
1471         }
1472     }
1473 }
1474 #undef FUNC_NAME
1475
1476 \f
1477 /* Actual reader.  */
1478
1479 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1480             (SCM port),
1481             "Read an s-expression from the input port @var{port}, or from\n"
1482             "the current input port if @var{port} is not specified.\n"
1483             "Any whitespace before the next token is discarded.")
1484 #define FUNC_NAME s_scm_read
1485 {
1486   int c;
1487
1488   if (SCM_UNBNDP (port))
1489     port = scm_current_input_port ();
1490   SCM_VALIDATE_OPINPORT (1, port);
1491
1492   c = flush_ws (port, (char *) NULL);
1493   if (EOF == c)
1494     return SCM_EOF_VAL;
1495   scm_ungetc (c, port);
1496
1497   return (scm_read_expression (port));
1498 }
1499 #undef FUNC_NAME
1500
1501
1502 \f
1503
1504 /* Manipulate the read-hash-procedures alist.  This could be written in
1505    Scheme, but maybe it will also be used by C code during initialisation.  */
1506 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1507             (SCM chr, SCM proc),
1508             "Install the procedure @var{proc} for reading expressions\n"
1509             "starting with the character sequence @code{#} and @var{chr}.\n"
1510             "@var{proc} will be called with two arguments:  the character\n"
1511             "@var{chr} and the port to read further data from. The object\n"
1512             "returned will be the return value of @code{read}. \n"
1513             "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1514             )
1515 #define FUNC_NAME s_scm_read_hash_extend
1516 {
1517   SCM this;
1518   SCM prev;
1519
1520   SCM_VALIDATE_CHAR (1, chr);
1521   SCM_ASSERT (scm_is_false (proc)
1522               || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1523               proc, SCM_ARG2, FUNC_NAME);
1524
1525   /* Check if chr is already in the alist.  */
1526   this = scm_i_read_hash_procedures_ref ();
1527   prev = SCM_BOOL_F;
1528   while (1)
1529     {
1530       if (scm_is_null (this))
1531         {
1532           /* not found, so add it to the beginning.  */
1533           if (scm_is_true (proc))
1534             {
1535               SCM new = scm_cons (scm_cons (chr, proc),
1536                                   scm_i_read_hash_procedures_ref ());
1537               scm_i_read_hash_procedures_set_x (new);
1538             }
1539           break;
1540         }
1541       if (scm_is_eq (chr, SCM_CAAR (this)))
1542         {
1543           /* already in the alist.  */
1544           if (scm_is_false (proc))
1545             {
1546               /* remove it.  */
1547               if (scm_is_false (prev))
1548                 {
1549                   SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1550                   scm_i_read_hash_procedures_set_x (rest);
1551                 }
1552               else
1553                 scm_set_cdr_x (prev, SCM_CDR (this));
1554             }
1555           else
1556             {
1557               /* replace it.  */
1558               scm_set_cdr_x (SCM_CAR (this), proc);
1559             }
1560           break;
1561         }
1562       prev = this;
1563       this = SCM_CDR (this);
1564     }
1565
1566   return SCM_UNSPECIFIED;
1567 }
1568 #undef FUNC_NAME
1569
1570 /* Recover the read-hash procedure corresponding to char c.  */
1571 static SCM
1572 scm_get_hash_procedure (int c)
1573 {
1574   SCM rest = scm_i_read_hash_procedures_ref ();
1575
1576   while (1)
1577     {
1578       if (scm_is_null (rest))
1579         return SCM_BOOL_F;
1580
1581       if (SCM_CHAR (SCM_CAAR (rest)) == c)
1582         return SCM_CDAR (rest);
1583
1584       rest = SCM_CDR (rest);
1585     }
1586 }
1587
1588 #define SCM_ENCODING_SEARCH_SIZE (500)
1589
1590 /* Search the first few hundred characters of a file for an Emacs-like coding
1591    declaration.  Returns either NULL or a string whose storage has been
1592    allocated with `scm_gc_malloc ()'.  */
1593 char *
1594 scm_i_scan_for_encoding (SCM port)
1595 {
1596   scm_t_port *pt;
1597   char header[SCM_ENCODING_SEARCH_SIZE+1];
1598   size_t bytes_read, encoding_length, i;
1599   char *encoding = NULL;
1600   int utf8_bom = 0;
1601   char *pos, *encoding_start;
1602   int in_comment;
1603
1604   pt = SCM_PTAB_ENTRY (port);
1605
1606   if (pt->rw_active == SCM_PORT_WRITE)
1607     scm_flush (port);
1608
1609   if (pt->rw_random)
1610     pt->rw_active = SCM_PORT_READ;
1611
1612   if (pt->read_pos == pt->read_end)
1613     {
1614       /* We can use the read buffer, and thus avoid a seek. */
1615       if (scm_fill_input (port) == EOF)
1616         return NULL;
1617
1618       bytes_read = pt->read_end - pt->read_pos;
1619       if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
1620         bytes_read = SCM_ENCODING_SEARCH_SIZE;
1621
1622       if (bytes_read <= 1)
1623         /* An unbuffered port -- don't scan.  */
1624         return NULL;
1625
1626       memcpy (header, pt->read_pos, bytes_read);
1627       header[bytes_read] = '\0';
1628     }
1629   else
1630     {
1631       /* Try to read some bytes and then seek back.  Not all ports
1632          support seeking back; and indeed some file ports (like
1633          /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
1634          check performed by SCM_FPORT_FDES---but fail to seek
1635          backwards.  Hence this block comes second.  We prefer to use
1636          the read buffer in-place.  */
1637       if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1638         return NULL;
1639
1640       bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1641       header[bytes_read] = '\0';
1642       scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1643     }
1644
1645   if (bytes_read > 3
1646       && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1647     utf8_bom = 1;
1648
1649   /* search past "coding[:=]" */
1650   pos = header;
1651   while (1)
1652     {
1653       if ((pos = strstr(pos, "coding")) == NULL)
1654         return NULL;
1655
1656       pos += strlen("coding");
1657       if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1658           (*pos == ':' || *pos == '='))
1659         {
1660           pos ++;
1661           break;
1662         }
1663     }
1664
1665   /* skip spaces */
1666   while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1667          (*pos == ' ' || *pos == '\t'))
1668     pos ++;
1669
1670   /* grab the next token */
1671   encoding_start = pos;
1672   i = 0;
1673   while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
1674          && encoding_start + i - header < bytes_read
1675          && (isalnum ((int) encoding_start[i])
1676              || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
1677     i++;
1678
1679   encoding_length = i;
1680   if (encoding_length == 0)
1681     return NULL;
1682
1683   encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
1684   for (i = 0; i < encoding_length; i++)
1685     encoding[i] = toupper ((int) encoding[i]);
1686
1687   /* push backwards to make sure we were in a comment */
1688   in_comment = 0;
1689   pos = encoding_start;
1690   while (pos >= header)
1691     {
1692       if (*pos == ';')
1693         {
1694           in_comment = 1;
1695           break;
1696         }
1697       else if (*pos == '\n' || pos == header)
1698         {
1699           /* This wasn't in a semicolon comment. Check for a
1700            hash-bang comment. */
1701           char *beg = strstr (header, "#!");
1702           char *end = strstr (header, "!#");
1703           if (beg < encoding_start && encoding_start + encoding_length <= end)
1704             in_comment = 1;
1705           break;
1706         }
1707       else
1708         {
1709           pos --;
1710           continue;
1711         }
1712     }
1713   if (!in_comment)
1714     /* This wasn't in a comment */
1715     return NULL;
1716
1717   if (utf8_bom && strcmp(encoding, "UTF-8"))
1718     scm_misc_error (NULL,
1719                     "the port input declares the encoding ~s but is encoded as UTF-8",
1720                     scm_list_1 (scm_from_locale_string (encoding)));
1721
1722   return encoding;
1723 }
1724
1725 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1726             (SCM port),
1727             "Scans the port for an Emacs-like character coding declaration\n"
1728             "near the top of the contents of a port with random-accessible contents.\n"
1729             "The coding declaration is of the form\n"
1730             "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1731             "\n"
1732             "Returns a string containing the character encoding of the file\n"
1733             "if a declaration was found, or @code{#f} otherwise.\n")
1734 #define FUNC_NAME s_scm_file_encoding
1735 {
1736   char *enc;
1737   SCM s_enc;
1738
1739   SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
1740
1741   enc = scm_i_scan_for_encoding (port);
1742   if (enc == NULL)
1743     return SCM_BOOL_F;
1744   else
1745     {
1746       s_enc = scm_from_locale_string (enc);
1747       return s_enc;
1748     }
1749
1750   return SCM_BOOL_F;
1751 }
1752 #undef FUNC_NAME
1753
1754 void
1755 scm_init_read ()
1756 {
1757   SCM read_hash_procs;
1758
1759   read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
1760
1761   scm_i_read_hash_procedures =
1762     SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
1763
1764   scm_init_opts (scm_read_options, scm_read_opts);
1765 #include "libguile/read.x"
1766 }
1767
1768 /*
1769   Local Variables:
1770   c-file-style: "gnu"
1771   End:
1772 */