libguile/read.c

   1 /* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
   2  * Foundation, Inc.
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public License
   6  * as published by the Free Software Foundation; either version 3 of
   7  * the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful, but
  10  * WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  17  * 02110-1301 USA
  18  */
  19
  20
  21 \f
  22
  23 #ifdef HAVE_CONFIG_H
  24 # include <config.h>
  25 #endif
  26
  27 #include <stdio.h>
  28 #include <ctype.h>
  29 #include <string.h>
  30 #include <unistd.h>
  31 #include <unicase.h>
  32 #include <unictype.h>
  33
  34 #include "libguile/_scm.h"
  35 #include "libguile/bytevectors.h"
  36 #include "libguile/chars.h"
  37 #include "libguile/eval.h"
  38 #include "libguile/arrays.h"
  39 #include "libguile/bitvectors.h"
  40 #include "libguile/keywords.h"
  41 #include "libguile/alist.h"
  42 #include "libguile/srcprop.h"
  43 #include "libguile/hashtab.h"
  44 #include "libguile/hash.h"
  45 #include "libguile/ports.h"
  46 #include "libguile/fports.h"
  47 #include "libguile/root.h"
  48 #include "libguile/strings.h"
  49 #include "libguile/strports.h"
  50 #include "libguile/vectors.h"
  51 #include "libguile/validate.h"
  52 #include "libguile/srfi-4.h"
  53 #include "libguile/srfi-13.h"
  54
  55 #include "libguile/read.h"
  56 #include "libguile/private-options.h"
  57
  58
  59 \f
  60
  61 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
  62 SCM_SYMBOL (scm_keyword_prefix, "prefix");
  63 SCM_SYMBOL (scm_keyword_postfix, "postfix");
  64 SCM_SYMBOL (sym_nil, "nil");
  65
  66 scm_t_option scm_read_opts[] = {
  67   { SCM_OPTION_BOOLEAN, "copy", 0,
  68     "Copy source code expressions." },
  69   { SCM_OPTION_BOOLEAN, "positions", 1,
  70     "Record positions of source code expressions." },
  71   { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
  72     "Convert symbols to lower case."},
  73   { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
  74     "Style of keyword recognition: #f, 'prefix or 'postfix."},
  75   { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
  76     "Use R6RS variable-length character and string hex escapes."},
  77   { SCM_OPTION_BOOLEAN, "square-brackets", 1,
  78     "Treat `[' and `]' as parentheses, for R6RS compatibility."},
  79   { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
  80     "In strings, consume leading whitespace after an escaped end-of-line."},
  81   { 0, },
  82 };
  83
  84 /*
  85   Give meaningful error messages for errors
  86
  87   We use the format
  88
  89   FILE:LINE:COL: MESSAGE
  90   This happened in ....
  91
  92   This is not standard GNU format, but the test-suite likes the real
  93   message to be in front.
  94
  95  */
  96
  97
  98 void
  99 scm_i_input_error (char const *function,
 100                    SCM port, const char *message, SCM arg)
 101 {
 102   SCM fn = (scm_is_string (SCM_FILENAME(port))
 103             ? SCM_FILENAME(port)
 104             : scm_from_locale_string ("#<unknown port>"));
 105
 106   SCM string_port = scm_open_output_string ();
 107   SCM string = SCM_EOL;
 108   scm_simple_format (string_port,
 109                      scm_from_locale_string ("~A:~S:~S: ~A"),
 110                      scm_list_4 (fn,
 111                                  scm_from_long (SCM_LINUM (port) + 1),
 112                                  scm_from_int (SCM_COL (port) + 1),
 113                                  scm_from_locale_string (message)));
 114
 115   string = scm_get_output_string (string_port);
 116   scm_close_output_port (string_port);
 117   scm_error_scm (scm_from_latin1_symbol ("read-error"),
 118                  function? scm_from_locale_string (function) : SCM_BOOL_F,
 119                  string,
 120                  arg,
 121                  SCM_BOOL_F);
 122 }
 123
 124
 125 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
 126             (SCM setting),
 127             "Option interface for the read options. Instead of using\n"
 128             "this procedure directly, use the procedures @code{read-enable},\n"
 129             "@code{read-disable}, @code{read-set!} and @code{read-options}.")
 130 #define FUNC_NAME s_scm_read_options
 131 {
 132   SCM ans = scm_options (setting,
 133                          scm_read_opts,
 134                          FUNC_NAME);
 135   if (SCM_COPY_SOURCE_P)
 136     SCM_RECORD_POSITIONS_P = 1;
 137   return ans;
 138 }
 139 #undef FUNC_NAME
 140
 141 /* A fluid referring to an association list mapping extra hash
 142    characters to procedures.  */
 143 static SCM *scm_i_read_hash_procedures;
 144
 145 static inline SCM
 146 scm_i_read_hash_procedures_ref (void)
 147 {
 148   return scm_fluid_ref (*scm_i_read_hash_procedures);
 149 }
 150
 151 static inline void
 152 scm_i_read_hash_procedures_set_x (SCM value)
 153 {
 154   scm_fluid_set_x (*scm_i_read_hash_procedures, value);
 155 }
 156
 157 \f
 158 /* Token readers.  */
 159
 160
 161 /* Size of the C buffer used to read symbols and numbers.  */
 162 #define READER_BUFFER_SIZE            128
 163
 164 /* Size of the C buffer used to read strings.  */
 165 #define READER_STRING_BUFFER_SIZE     512
 166
 167 /* The maximum size of Scheme character names.  */
 168 #define READER_CHAR_NAME_MAX_SIZE      50
 169
 170
 171 /* `isblank' is only in C99.  */
 172 #define CHAR_IS_BLANK_(_chr)                                    \
 173   (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n')      \
 174    || ((_chr) == '\f') || ((_chr) == '\r'))
 175
 176 #ifdef MSDOS
 177 # define CHAR_IS_BLANK(_chr)                    \
 178   ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
 179 #else
 180 # define CHAR_IS_BLANK CHAR_IS_BLANK_
 181 #endif
 182
 183
 184 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
 185    structure'').  */
 186 #define CHAR_IS_R5RS_DELIMITER(c)                               \
 187   (CHAR_IS_BLANK (c)                                            \
 188    || (c == ')') || (c == '(') || (c == ';') || (c == '"')      \
 189    || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
 190
 191 #define CHAR_IS_DELIMITER  CHAR_IS_R5RS_DELIMITER
 192
 193 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
 194    Structure''.  */
 195 #define CHAR_IS_EXPONENT_MARKER(_chr)                           \
 196   (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f')        \
 197    || ((_chr) == 'd') || ((_chr) == 'l'))
 198
 199 /* Read an SCSH block comment.  */
 200 static inline SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
 201 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
 202 static SCM scm_read_commented_expression (scm_t_wchar, SCM);
 203 static SCM scm_read_shebang (scm_t_wchar, SCM);
 204 static SCM scm_get_hash_procedure (int);
 205
 206 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 207    result in the pre-allocated buffer BUF.  Return zero if the whole token has
 208    fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
 209    bytes actually read.  */
 210 static inline int
 211 read_token (SCM port, char *buf, const size_t buf_size, size_t *read)
 212  {
 213    *read = 0;
 214
 215    while (*read < buf_size)
 216      {
 217        int chr;
 218
 219        chr = scm_get_byte_or_eof (port);
 220
 221        if (chr == EOF)
 222         return 0;
 223       else if (CHAR_IS_DELIMITER (chr))
 224         {
 225           scm_unget_byte (chr, port);
 226           return 0;
 227         }
 228       else
 229         {
 230           *buf = (char) chr;
 231           buf++, (*read)++;
 232         }
 233      }
 234
 235    return 1;
 236  }
 237
 238 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 239    result in the pre-allocated buffer BUFFER, if the whole token has fewer than
 240    BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
 241    caller.  Return zero if the token fits in BUFFER, non-zero otherwise. READ
 242    will be set the number of bytes actually read.  */
 243 static int
 244 read_complete_token (SCM port, char *buffer, const size_t buffer_size,
 245                            char **overflow_buffer, size_t *read)
 246 {
 247   int overflow = 0;
 248   size_t bytes_read, overflow_size;
 249
 250   *overflow_buffer = NULL;
 251   overflow_size = 0;
 252
 253   do
 254     {
 255       overflow = read_token (port, buffer, buffer_size, &bytes_read);
 256       if (bytes_read == 0)
 257         break;
 258       if (overflow || overflow_size != 0)
 259         {
 260           if (overflow_size == 0)
 261             {
 262               *overflow_buffer = scm_malloc (bytes_read);
 263               memcpy (*overflow_buffer, buffer, bytes_read);
 264               overflow_size = bytes_read;
 265             }
 266           else
 267             {
 268               *overflow_buffer = scm_realloc (*overflow_buffer, overflow_size + bytes_read);
 269               memcpy (*overflow_buffer + overflow_size, buffer, bytes_read);
 270               overflow_size += bytes_read;
 271             }
 272         }
 273     }
 274   while (overflow);
 275
 276   if (overflow_size)
 277     *read = overflow_size;
 278   else
 279     *read = bytes_read;
 280
 281   return (overflow_size != 0);
 282 }
 283
 284 /* Skip whitespace from PORT and return the first non-whitespace character
 285    read.  Raise an error on end-of-file.  */
 286 static int
 287 flush_ws (SCM port, const char *eoferr)
 288 {
 289   register scm_t_wchar c;
 290   while (1)
 291     switch (c = scm_getc (port))
 292       {
 293       case EOF:
 294       goteof:
 295         if (eoferr)
 296           {
 297             scm_i_input_error (eoferr,
 298                                port,
 299                                "end of file",
 300                                SCM_EOL);
 301           }
 302         return c;
 303
 304       case ';':
 305       lp:
 306         switch (c = scm_getc (port))
 307           {
 308           case EOF:
 309             goto goteof;
 310           default:
 311             goto lp;
 312           case SCM_LINE_INCREMENTORS:
 313             break;
 314           }
 315         break;
 316
 317       case '#':
 318         switch (c = scm_getc (port))
 319           {
 320           case EOF:
 321             eoferr = "read_sharp";
 322             goto goteof;
 323           case '!':
 324             scm_read_shebang (c, port);
 325             break;
 326           case ';':
 327             scm_read_commented_expression (c, port);
 328             break;
 329           case '|':
 330             if (scm_is_false (scm_get_hash_procedure (c)))
 331               {
 332                 scm_read_r6rs_block_comment (c, port);
 333                 break;
 334               }
 335             /* fall through */
 336           default:
 337             scm_ungetc (c, port);
 338             return '#';
 339           }
 340         break;
 341
 342       case SCM_LINE_INCREMENTORS:
 343       case SCM_SINGLE_SPACES:
 344       case '\t':
 345         break;
 346
 347       default:
 348         return c;
 349       }
 350
 351   return 0;
 352 }
 353
 354
 355 \f
 356 /* Token readers.  */
 357
 358 static SCM scm_read_expression (SCM port);
 359 static SCM scm_read_sharp (int chr, SCM port);
 360
 361
 362 static SCM
 363 scm_read_sexp (scm_t_wchar chr, SCM port)
 364 #define FUNC_NAME "scm_i_lreadparen"
 365 {
 366   int c;
 367   SCM tmp, tl, ans = SCM_EOL;
 368   const int terminating_char = ((chr == '[') ? ']' : ')');
 369
 370   /* Need to capture line and column numbers here. */
 371   long line = SCM_LINUM (port);
 372   int column = SCM_COL (port) - 1;
 373
 374   c = flush_ws (port, FUNC_NAME);
 375   if (terminating_char == c)
 376     return SCM_EOL;
 377
 378   scm_ungetc (c, port);
 379   if (scm_is_eq (scm_sym_dot,
 380                  (tmp = scm_read_expression (port))))
 381     {
 382       ans = scm_read_expression (port);
 383       if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
 384         scm_i_input_error (FUNC_NAME, port, "missing close paren",
 385                            SCM_EOL);
 386       return ans;
 387     }
 388
 389   /* Build the head of the list structure. */
 390   ans = tl = scm_cons (tmp, SCM_EOL);
 391
 392   while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
 393     {
 394       SCM new_tail;
 395
 396       if (c == ')' || (SCM_SQUARE_BRACKETS_P && c == ']'))
 397         scm_i_input_error (FUNC_NAME, port,
 398                            "in pair: mismatched close paren: ~A",
 399                            scm_list_1 (SCM_MAKE_CHAR (c)));
 400
 401       scm_ungetc (c, port);
 402       tmp = scm_read_expression (port);
 403
 404       if (scm_is_eq (scm_sym_dot, tmp))
 405         {
 406           SCM_SETCDR (tl, tmp = scm_read_expression (port));
 407
 408           c = flush_ws (port, FUNC_NAME);
 409           if (terminating_char != c)
 410             scm_i_input_error (FUNC_NAME, port,
 411                                "in pair: missing close paren", SCM_EOL);
 412           goto exit;
 413         }
 414
 415       new_tail = scm_cons (tmp, SCM_EOL);
 416       SCM_SETCDR (tl, new_tail);
 417       tl = new_tail;
 418     }
 419
 420  exit:
 421   if (SCM_RECORD_POSITIONS_P)
 422     scm_i_set_source_properties_x (ans, line, column, SCM_FILENAME (port));
 423
 424   return ans;
 425 }
 426 #undef FUNC_NAME
 427
 428
 429 /* Read a hexadecimal number NDIGITS in length.  Put its value into the variable
 430    C.  If TERMINATOR is non-null, terminate early if the TERMINATOR character is
 431    found.  */
 432 #define SCM_READ_HEX_ESCAPE(ndigits, terminator)                   \
 433   do                                                               \
 434     {                                                              \
 435       scm_t_wchar a;                                               \
 436       size_t i = 0;                                                \
 437       c = 0;                                                       \
 438       while (i < ndigits)                                          \
 439         {                                                          \
 440           a = scm_getc (port);                                     \
 441           if (a == EOF)                                            \
 442             goto str_eof;                                          \
 443           if (terminator                                           \
 444               && (a == (scm_t_wchar) terminator)                   \
 445               && (i > 0))                                          \
 446             break;                                                 \
 447           if ('0' <= a && a <= '9')                                \
 448             a -= '0';                                              \
 449           else if ('A' <= a && a <= 'F')                           \
 450             a = a - 'A' + 10;                                      \
 451           else if ('a' <= a && a <= 'f')                           \
 452             a = a - 'a' + 10;                                      \
 453           else                                                     \
 454             {                                                      \
 455               c = a;                                               \
 456               goto bad_escaped;                                    \
 457             }                                                      \
 458           c = c * 16 + a;                                          \
 459           i ++;                                                    \
 460         }                                                          \
 461     } while (0)
 462
 463 static void
 464 skip_intraline_whitespace (SCM port)
 465 {
 466   scm_t_wchar c;
 467
 468   do
 469     {
 470       c = scm_getc (port);
 471       if (c == EOF)
 472         return;
 473     }
 474   while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
 475
 476   scm_ungetc (c, port);
 477 }
 478
 479 static SCM
 480 scm_read_string (int chr, SCM port)
 481 #define FUNC_NAME "scm_lreadr"
 482 {
 483   /* For strings smaller than C_STR, this function creates only one Scheme
 484      object (the string returned).  */
 485
 486   SCM str = SCM_BOOL_F;
 487   unsigned c_str_len = 0;
 488   scm_t_wchar c;
 489
 490   str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
 491   while ('"' != (c = scm_getc (port)))
 492     {
 493       if (c == EOF)
 494         {
 495         str_eof:
 496           scm_i_input_error (FUNC_NAME, port,
 497                              "end of file in string constant", SCM_EOL);
 498         }
 499
 500       if (c_str_len + 1 >= scm_i_string_length (str))
 501         {
 502           SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
 503
 504           str = scm_string_append (scm_list_2 (str, addy));
 505         }
 506
 507       if (c == '\\')
 508         {
 509           switch (c = scm_getc (port))
 510             {
 511             case EOF:
 512               goto str_eof;
 513             case '"':
 514             case '\\':
 515               break;
 516             case '\n':
 517               if (SCM_HUNGRY_EOL_ESCAPES_P)
 518                 skip_intraline_whitespace (port);
 519               continue;
 520             case '0':
 521               c = '\0';
 522               break;
 523             case 'f':
 524               c = '\f';
 525               break;
 526             case 'n':
 527               c = '\n';
 528               break;
 529             case 'r':
 530               c = '\r';
 531               break;
 532             case 't':
 533               c = '\t';
 534               break;
 535             case 'a':
 536               c = '\007';
 537               break;
 538             case 'v':
 539               c = '\v';
 540               break;
 541             case 'b':
 542               c = '\010';
 543               break;
 544             case 'x':
 545               if (SCM_R6RS_ESCAPES_P)
 546                 SCM_READ_HEX_ESCAPE (10, ';');
 547               else
 548                 SCM_READ_HEX_ESCAPE (2, '\0');
 549               break;
 550             case 'u':
 551               if (!SCM_R6RS_ESCAPES_P)
 552                 {
 553                   SCM_READ_HEX_ESCAPE (4, '\0');
 554                   break;
 555                 }
 556             case 'U':
 557               if (!SCM_R6RS_ESCAPES_P)
 558                 {
 559                   SCM_READ_HEX_ESCAPE (6, '\0');
 560                   break;
 561                 }
 562             default:
 563             bad_escaped:
 564               scm_i_input_error (FUNC_NAME, port,
 565                                  "illegal character in escape sequence: ~S",
 566                                  scm_list_1 (SCM_MAKE_CHAR (c)));
 567             }
 568         }
 569       str = scm_i_string_start_writing (str);
 570       scm_i_string_set_x (str, c_str_len++, c);
 571       scm_i_string_stop_writing ();
 572     }
 573
 574   if (c_str_len > 0)
 575     {
 576       return scm_i_substring_copy (str, 0, c_str_len);
 577     }
 578
 579   return scm_nullstr;
 580 }
 581 #undef FUNC_NAME
 582
 583
 584 static SCM
 585 scm_read_number (scm_t_wchar chr, SCM port)
 586 {
 587   SCM result, str = SCM_EOL;
 588   char buffer[READER_BUFFER_SIZE];
 589   char *overflow_buffer = NULL;
 590   size_t bytes_read;
 591   int overflow;
 592   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 593
 594   scm_ungetc (chr, port);
 595   overflow = read_complete_token (port, buffer, sizeof (buffer),
 596                                   &overflow_buffer, &bytes_read);
 597
 598   if (!overflow)
 599     str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
 600   else
 601     str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
 602                             pt->ilseq_handler);
 603
 604   result = scm_string_to_number (str, SCM_UNDEFINED);
 605   if (!scm_is_true (result))
 606     {
 607       /* Return a symbol instead of a number */
 608       if (SCM_CASE_INSENSITIVE_P)
 609         str = scm_string_downcase_x (str);
 610       result = scm_string_to_symbol (str);
 611     }
 612
 613   if (overflow)
 614     free (overflow_buffer);
 615   SCM_COL (port) += scm_i_string_length (str);
 616   return result;
 617 }
 618
 619 static SCM
 620 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
 621 {
 622   SCM result;
 623   int ends_with_colon = 0;
 624   size_t bytes_read;
 625   int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
 626   int overflow;
 627   char buffer[READER_BUFFER_SIZE], *overflow_buffer;
 628   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 629   SCM str;
 630
 631   scm_ungetc (chr, port);
 632   overflow = read_complete_token (port, buffer, READER_BUFFER_SIZE,
 633                                   &overflow_buffer, &bytes_read);
 634   if (bytes_read > 0)
 635     {
 636       if (!overflow)
 637         ends_with_colon = buffer[bytes_read - 1] == ':';
 638       else
 639         ends_with_colon = overflow_buffer[bytes_read - 1] == ':';
 640     }
 641
 642   if (postfix && ends_with_colon && (bytes_read > 1))
 643     {
 644       if (!overflow)
 645         str = scm_from_stringn (buffer, bytes_read - 1, pt->encoding, pt->ilseq_handler);
 646       else
 647         str = scm_from_stringn (overflow_buffer, bytes_read - 1, pt->encoding,
 648                                 pt->ilseq_handler);
 649
 650       if (SCM_CASE_INSENSITIVE_P)
 651         str = scm_string_downcase_x (str);
 652       result = scm_symbol_to_keyword (scm_string_to_symbol (str));
 653     }
 654   else
 655     {
 656       if (!overflow)
 657         str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
 658       else
 659         str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
 660                                 pt->ilseq_handler);
 661
 662       if (SCM_CASE_INSENSITIVE_P)
 663         str = scm_string_downcase_x (str);
 664       result = scm_string_to_symbol (str);
 665     }
 666
 667   if (overflow)
 668     free (overflow_buffer);
 669   SCM_COL (port) += scm_i_string_length (str);
 670   return result;
 671 }
 672
 673 static SCM
 674 scm_read_number_and_radix (scm_t_wchar chr, SCM port)
 675 #define FUNC_NAME "scm_lreadr"
 676 {
 677   SCM result;
 678   size_t read;
 679   char buffer[READER_BUFFER_SIZE], *overflow_buffer;
 680   int overflow;
 681   unsigned int radix;
 682   SCM str;
 683   scm_t_port *pt;
 684
 685   switch (chr)
 686     {
 687     case 'B':
 688     case 'b':
 689       radix = 2;
 690       break;
 691
 692     case 'o':
 693     case 'O':
 694       radix = 8;
 695       break;
 696
 697     case 'd':
 698     case 'D':
 699       radix = 10;
 700       break;
 701
 702     case 'x':
 703     case 'X':
 704       radix = 16;
 705       break;
 706
 707     default:
 708       scm_ungetc (chr, port);
 709       scm_ungetc ('#', port);
 710       radix = 10;
 711     }
 712
 713   overflow = read_complete_token (port, buffer, sizeof (buffer),
 714                                   &overflow_buffer, &read);
 715
 716   pt = SCM_PTAB_ENTRY (port);
 717   if (!overflow)
 718     str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
 719   else
 720     str = scm_from_stringn (overflow_buffer, read, pt->encoding,
 721                             pt->ilseq_handler);
 722
 723   result = scm_string_to_number (str, scm_from_uint (radix));
 724
 725   if (overflow)
 726     free (overflow_buffer);
 727
 728   SCM_COL (port) += scm_i_string_length (str);
 729
 730   if (scm_is_true (result))
 731     return result;
 732
 733   scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
 734
 735   return SCM_BOOL_F;
 736 }
 737 #undef FUNC_NAME
 738
 739 static SCM
 740 scm_read_quote (int chr, SCM port)
 741 {
 742   SCM p;
 743   long line = SCM_LINUM (port);
 744   int column = SCM_COL (port) - 1;
 745
 746   switch (chr)
 747     {
 748     case '`':
 749       p = scm_sym_quasiquote;
 750       break;
 751
 752     case '\'':
 753       p = scm_sym_quote;
 754       break;
 755
 756     case ',':
 757       {
 758         scm_t_wchar c;
 759
 760         c = scm_getc (port);
 761         if ('@' == c)
 762           p = scm_sym_uq_splicing;
 763         else
 764           {
 765             scm_ungetc (c, port);
 766             p = scm_sym_unquote;
 767           }
 768         break;
 769       }
 770
 771     default:
 772       fprintf (stderr, "%s: unhandled quote character (%i)\n",
 773                "scm_read_quote", chr);
 774       abort ();
 775     }
 776
 777   p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
 778   if (SCM_RECORD_POSITIONS_P)
 779     scm_i_set_source_properties_x (p, line, column, SCM_FILENAME (port));
 780
 781   return p;
 782 }
 783
 784 SCM_SYMBOL (sym_syntax, "syntax");
 785 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
 786 SCM_SYMBOL (sym_unsyntax, "unsyntax");
 787 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
 788
 789 static SCM
 790 scm_read_syntax (int chr, SCM port)
 791 {
 792   SCM p;
 793   long line = SCM_LINUM (port);
 794   int column = SCM_COL (port) - 1;
 795
 796   switch (chr)
 797     {
 798     case '`':
 799       p = sym_quasisyntax;
 800       break;
 801
 802     case '\'':
 803       p = sym_syntax;
 804       break;
 805
 806     case ',':
 807       {
 808         int c;
 809
 810         c = scm_getc (port);
 811         if ('@' == c)
 812           p = sym_unsyntax_splicing;
 813         else
 814           {
 815             scm_ungetc (c, port);
 816             p = sym_unsyntax;
 817           }
 818         break;
 819       }
 820
 821     default:
 822       fprintf (stderr, "%s: unhandled syntax character (%i)\n",
 823                "scm_read_syntax", chr);
 824       abort ();
 825     }
 826
 827   p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
 828   if (SCM_RECORD_POSITIONS_P)
 829     scm_i_set_source_properties_x (p, line, column, SCM_FILENAME (port));
 830
 831   return p;
 832 }
 833
 834 static inline SCM
 835 scm_read_nil (int chr, SCM port)
 836 {
 837   SCM id = scm_read_mixed_case_symbol (chr, port);
 838
 839   if (!scm_is_eq (id, sym_nil))
 840     scm_i_input_error ("scm_read_nil", port,
 841                        "unexpected input while reading #nil: ~a",
 842                        scm_list_1 (id));
 843
 844   return SCM_ELISP_NIL;
 845 }
 846
 847 static inline SCM
 848 scm_read_semicolon_comment (int chr, SCM port)
 849 {
 850   int c;
 851
 852   /* We use the get_byte here because there is no need to get the
 853      locale correct with comment input. This presumes that newline
 854      always represents itself no matter what the encoding is.  */
 855   for (c = scm_get_byte_or_eof (port);
 856        (c != EOF) && (c != '\n');
 857        c = scm_get_byte_or_eof (port));
 858
 859   return SCM_UNSPECIFIED;
 860 }
 861
 862 \f
 863 /* Sharp readers, i.e. readers called after a `#' sign has been read.  */
 864
 865 static SCM
 866 scm_read_boolean (int chr, SCM port)
 867 {
 868   switch (chr)
 869     {
 870     case 't':
 871     case 'T':
 872       return SCM_BOOL_T;
 873
 874     case 'f':
 875     case 'F':
 876       return SCM_BOOL_F;
 877     }
 878
 879   return SCM_UNSPECIFIED;
 880 }
 881
 882 static SCM
 883 scm_read_character (scm_t_wchar chr, SCM port)
 884 #define FUNC_NAME "scm_lreadr"
 885 {
 886   char buffer[READER_CHAR_NAME_MAX_SIZE];
 887   SCM charname;
 888   size_t charname_len, bytes_read;
 889   scm_t_wchar cp;
 890   int overflow;
 891   scm_t_port *pt;
 892
 893   overflow = read_token (port, buffer, READER_CHAR_NAME_MAX_SIZE, &bytes_read);
 894   if (overflow)
 895     scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
 896
 897   if (bytes_read == 0)
 898     {
 899       chr = scm_getc (port);
 900       if (chr == EOF)
 901         scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
 902                            "while reading character", SCM_EOL);
 903
 904       /* CHR must be a token delimiter, like a whitespace.  */
 905       return (SCM_MAKE_CHAR (chr));
 906     }
 907
 908   pt = SCM_PTAB_ENTRY (port);
 909
 910   /* Simple ASCII characters can be processed immediately.  Also, simple
 911      ISO-8859-1 characters can be processed immediately if the encoding for this
 912      port is ISO-8859-1.  */
 913   if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
 914     {
 915       SCM_COL (port) += 1;
 916       return SCM_MAKE_CHAR (buffer[0]);
 917     }
 918
 919   /* Otherwise, convert the buffer into a proper scheme string for
 920      processing.  */
 921   charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
 922                                pt->ilseq_handler);
 923   charname_len = scm_i_string_length (charname);
 924   SCM_COL (port) += charname_len;
 925   cp = scm_i_string_ref (charname, 0);
 926   if (charname_len == 1)
 927     return SCM_MAKE_CHAR (cp);
 928
 929   /* Ignore dotted circles, which may be used to keep combining characters from
 930      combining with the backslash in #\charname.  */
 931   if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
 932     return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
 933
 934   if (cp >= '0' && cp < '8')
 935     {
 936       /* Dirk:FIXME::  This type of character syntax is not R5RS
 937        * compliant.  Further, it should be verified that the constant
 938        * does only consist of octal digits.  */
 939       SCM p = scm_string_to_number (charname, scm_from_uint (8));
 940       if (SCM_I_INUMP (p))
 941         {
 942           scm_t_wchar c = scm_to_uint32 (p);
 943           if (SCM_IS_UNICODE_CHAR (c))
 944             return SCM_MAKE_CHAR (c);
 945           else
 946             scm_i_input_error (FUNC_NAME, port,
 947                                "out-of-range octal character escape: ~a",
 948                                scm_list_1 (charname));
 949         }
 950     }
 951
 952   if (cp == 'x' && (charname_len > 1))
 953     {
 954       SCM p;
 955
 956       /* Convert from hex, skipping the initial 'x' character in CHARNAME */
 957       p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
 958                                 scm_from_uint (16));
 959       if (SCM_I_INUMP (p))
 960         {
 961           scm_t_wchar c = scm_to_uint32 (p);
 962           if (SCM_IS_UNICODE_CHAR (c))
 963             return SCM_MAKE_CHAR (c);
 964           else
 965             scm_i_input_error (FUNC_NAME, port,
 966                                "out-of-range hex character escape: ~a",
 967                                scm_list_1 (charname));
 968         }
 969     }
 970
 971   /* The names of characters should never have non-Latin1
 972      characters.  */
 973   if (scm_i_is_narrow_string (charname)
 974       || scm_i_try_narrow_string (charname))
 975     { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
 976                                        charname_len);
 977       if (scm_is_true (ch))
 978         return ch;
 979     }
 980
 981   scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
 982                      scm_list_1 (charname));
 983
 984   return SCM_UNSPECIFIED;
 985 }
 986 #undef FUNC_NAME
 987
 988 static inline SCM
 989 scm_read_keyword (int chr, SCM port)
 990 {
 991   SCM symbol;
 992
 993   /* Read the symbol that comprises the keyword.  Doing this instead of
 994      invoking a specific symbol reader function allows `scm_read_keyword ()'
 995      to adapt to the delimiters currently valid of symbols.
 996
 997      XXX: This implementation allows sloppy syntaxes like `#:  key'.  */
 998   symbol = scm_read_expression (port);
 999   if (!scm_is_symbol (symbol))
1000     scm_i_input_error ("scm_read_keyword", port,
1001                        "keyword prefix `~a' not followed by a symbol: ~s",
1002                        scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1003
1004   return (scm_symbol_to_keyword (symbol));
1005 }
1006
1007 static inline SCM
1008 scm_read_vector (int chr, SCM port)
1009 {
1010   /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1011      guarantee that it's going to do what we want.  After all, this is an
1012      implementation detail of `scm_read_vector ()', not a desirable
1013      property.  */
1014   return (scm_vector (scm_read_sexp (chr, port)));
1015 }
1016
1017 static inline SCM
1018 scm_read_srfi4_vector (int chr, SCM port)
1019 {
1020   return scm_i_read_array (port, chr);
1021 }
1022
1023 static SCM
1024 scm_read_bytevector (scm_t_wchar chr, SCM port)
1025 {
1026   chr = scm_getc (port);
1027   if (chr != 'u')
1028     goto syntax;
1029
1030   chr = scm_getc (port);
1031   if (chr != '8')
1032     goto syntax;
1033
1034   chr = scm_getc (port);
1035   if (chr != '(')
1036     goto syntax;
1037
1038   return scm_u8_list_to_bytevector (scm_read_sexp (chr, port));
1039
1040  syntax:
1041   scm_i_input_error ("read_bytevector", port,
1042                      "invalid bytevector prefix",
1043                      SCM_MAKE_CHAR (chr));
1044   return SCM_UNSPECIFIED;
1045 }
1046
1047 static SCM
1048 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port)
1049 {
1050   /* Read the `#*10101'-style read syntax for bit vectors in Guile.  This is
1051      terribly inefficient but who cares?  */
1052   SCM s_bits = SCM_EOL;
1053
1054   for (chr = scm_getc (port);
1055        (chr != EOF) && ((chr == '0') || (chr == '1'));
1056        chr = scm_getc (port))
1057     {
1058       s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1059     }
1060
1061   if (chr != EOF)
1062     scm_ungetc (chr, port);
1063
1064   return scm_bitvector (scm_reverse_x (s_bits, SCM_EOL));
1065 }
1066
1067 static inline SCM
1068 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1069 {
1070   int bang_seen = 0;
1071
1072   for (;;)
1073     {
1074       int c = scm_getc (port);
1075
1076       if (c == EOF)
1077         scm_i_input_error ("skip_block_comment", port,
1078                            "unterminated `#! ... !#' comment", SCM_EOL);
1079
1080       if (c == '!')
1081         bang_seen = 1;
1082       else if (c == '#' && bang_seen)
1083         break;
1084       else
1085         bang_seen = 0;
1086     }
1087
1088   return SCM_UNSPECIFIED;
1089 }
1090
1091 static SCM
1092 scm_read_shebang (scm_t_wchar chr, SCM port)
1093 {
1094   int c = 0;
1095   if ((c = scm_get_byte_or_eof (port)) != 'r')
1096     {
1097       scm_ungetc (c, port);
1098       return scm_read_scsh_block_comment (chr, port);
1099     }
1100   if ((c = scm_get_byte_or_eof (port)) != '6')
1101     {
1102       scm_ungetc (c, port);
1103       scm_ungetc ('r', port);
1104       return scm_read_scsh_block_comment (chr, port);
1105     }
1106   if ((c = scm_get_byte_or_eof (port)) != 'r')
1107     {
1108       scm_ungetc (c, port);
1109       scm_ungetc ('6', port);
1110       scm_ungetc ('r', port);
1111       return scm_read_scsh_block_comment (chr, port);
1112     }
1113   if ((c = scm_get_byte_or_eof (port)) != 's')
1114     {
1115       scm_ungetc (c, port);
1116       scm_ungetc ('r', port);
1117       scm_ungetc ('6', port);
1118       scm_ungetc ('r', port);
1119       return scm_read_scsh_block_comment (chr, port);
1120     }
1121
1122   return SCM_UNSPECIFIED;
1123 }
1124
1125 static SCM
1126 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1127 {
1128   /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1129      nested.  So care must be taken.  */
1130   int nesting_level = 1;
1131   int opening_seen = 0, closing_seen = 0;
1132
1133   while (nesting_level > 0)
1134     {
1135       int c = scm_getc (port);
1136
1137       if (c == EOF)
1138         scm_i_input_error ("scm_read_r6rs_block_comment", port,
1139                            "unterminated `#| ... |#' comment", SCM_EOL);
1140
1141       if (opening_seen)
1142         {
1143           if (c == '|')
1144             nesting_level++;
1145           opening_seen = 0;
1146         }
1147       else if (closing_seen)
1148         {
1149           if (c == '#')
1150             nesting_level--;
1151           closing_seen = 0;
1152         }
1153       else if (c == '|')
1154         closing_seen = 1;
1155       else if (c == '#')
1156         opening_seen = 1;
1157       else
1158         opening_seen = closing_seen = 0;
1159     }
1160
1161   return SCM_UNSPECIFIED;
1162 }
1163
1164 static SCM
1165 scm_read_commented_expression (scm_t_wchar chr, SCM port)
1166 {
1167   scm_t_wchar c;
1168
1169   c = flush_ws (port, (char *) NULL);
1170   if (EOF == c)
1171     scm_i_input_error ("read_commented_expression", port,
1172                        "no expression after #; comment", SCM_EOL);
1173   scm_ungetc (c, port);
1174   scm_read_expression (port);
1175   return SCM_UNSPECIFIED;
1176 }
1177
1178 static SCM
1179 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1180 {
1181   /* Guile's extended symbol read syntax looks like this:
1182
1183        #{This is all a symbol name}#
1184
1185      So here, CHR is expected to be `{'.  */
1186   int saw_brace = 0;
1187   size_t len = 0;
1188   SCM buf = scm_i_make_string (1024, NULL, 0);
1189
1190   buf = scm_i_string_start_writing (buf);
1191
1192   while ((chr = scm_getc (port)) != EOF)
1193     {
1194       if (saw_brace)
1195         {
1196           if (chr == '#')
1197             {
1198               break;
1199             }
1200           else
1201             {
1202               saw_brace = 0;
1203               scm_i_string_set_x (buf, len++, '}');
1204             }
1205         }
1206
1207       if (chr == '}')
1208         saw_brace = 1;
1209       else if (chr == '\\')
1210         {
1211           /* It used to be that print.c would print extended-read-syntax
1212              symbols with backslashes before "non-standard" chars, but
1213              this routine wouldn't do anything with those escapes.
1214              Bummer.  What we've done is to change print.c to output
1215              R6RS hex escapes for those characters, relying on the fact
1216              that the extended read syntax would never put a `\' before
1217              an `x'.  For now, we just ignore other instances of
1218              backslash in the string.  */
1219           switch ((chr = scm_getc (port)))
1220             {
1221             case EOF:
1222               goto done;
1223             case 'x':
1224               {
1225                 scm_t_wchar c;
1226
1227                 SCM_READ_HEX_ESCAPE (10, ';');
1228                 scm_i_string_set_x (buf, len++, c);
1229                 break;
1230
1231               str_eof:
1232                 chr = EOF;
1233                 goto done;
1234
1235               bad_escaped:
1236                 scm_i_string_stop_writing ();
1237                 scm_i_input_error ("scm_read_extended_symbol", port,
1238                                    "illegal character in escape sequence: ~S",
1239                                    scm_list_1 (SCM_MAKE_CHAR (c)));
1240                 break;
1241               }
1242             default:
1243               scm_i_string_set_x (buf, len++, chr);
1244               break;
1245             }
1246         }
1247       else
1248         scm_i_string_set_x (buf, len++, chr);
1249
1250       if (len >= scm_i_string_length (buf) - 2)
1251         {
1252           SCM addy;
1253
1254           scm_i_string_stop_writing ();
1255           addy = scm_i_make_string (1024, NULL, 0);
1256           buf = scm_string_append (scm_list_2 (buf, addy));
1257           len = 0;
1258           buf = scm_i_string_start_writing (buf);
1259         }
1260     }
1261
1262  done:
1263   scm_i_string_stop_writing ();
1264   if (chr == EOF)
1265     scm_i_input_error ("scm_read_extended_symbol", port,
1266                        "end of file while reading symbol", SCM_EOL);
1267
1268   return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1269 }
1270
1271
1272 \f
1273 /* Top-level token readers, i.e., dispatchers.  */
1274
1275 static SCM
1276 scm_read_sharp_extension (int chr, SCM port)
1277 {
1278   SCM proc;
1279
1280   proc = scm_get_hash_procedure (chr);
1281   if (scm_is_true (scm_procedure_p (proc)))
1282     {
1283       long line = SCM_LINUM (port);
1284       int column = SCM_COL (port) - 2;
1285       SCM got;
1286
1287       got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1288
1289       if (scm_is_pair (got) && !scm_i_has_source_properties (got))
1290         scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1291
1292       return got;
1293     }
1294
1295   return SCM_UNSPECIFIED;
1296 }
1297
1298 /* The reader for the sharp `#' character.  It basically dispatches reads
1299    among the above token readers.   */
1300 static SCM
1301 scm_read_sharp (scm_t_wchar chr, SCM port)
1302 #define FUNC_NAME "scm_lreadr"
1303 {
1304   SCM result;
1305
1306   chr = scm_getc (port);
1307
1308   result = scm_read_sharp_extension (chr, port);
1309   if (!scm_is_eq (result, SCM_UNSPECIFIED))
1310     return result;
1311
1312   switch (chr)
1313     {
1314     case '\\':
1315       return (scm_read_character (chr, port));
1316     case '(':
1317       return (scm_read_vector (chr, port));
1318     case 's':
1319     case 'u':
1320     case 'f':
1321     case 'c':
1322       /* This one may return either a boolean or an SRFI-4 vector.  */
1323       return (scm_read_srfi4_vector (chr, port));
1324     case 'v':
1325       return (scm_read_bytevector (chr, port));
1326     case '*':
1327       return (scm_read_guile_bit_vector (chr, port));
1328     case 't':
1329     case 'T':
1330     case 'F':
1331       /* This one may return either a boolean or an SRFI-4 vector.  */
1332       return (scm_read_boolean (chr, port));
1333     case ':':
1334       return (scm_read_keyword (chr, port));
1335     case '0': case '1': case '2': case '3': case '4':
1336     case '5': case '6': case '7': case '8': case '9':
1337     case '@':
1338 #if SCM_ENABLE_DEPRECATED
1339       /* See below for 'i' and 'e'. */
1340     case 'a':
1341     case 'y':
1342     case 'h':
1343     case 'l':
1344 #endif
1345       return (scm_i_read_array (port, chr));
1346
1347     case 'i':
1348     case 'e':
1349 #if SCM_ENABLE_DEPRECATED
1350       {
1351         /* When next char is '(', it really is an old-style
1352            uniform array. */
1353         scm_t_wchar next_c = scm_getc (port);
1354         if (next_c != EOF)
1355           scm_ungetc (next_c, port);
1356         if (next_c == '(')
1357           return scm_i_read_array (port, chr);
1358         /* Fall through. */
1359       }
1360 #endif
1361     case 'b':
1362     case 'B':
1363     case 'o':
1364     case 'O':
1365     case 'd':
1366     case 'D':
1367     case 'x':
1368     case 'X':
1369     case 'I':
1370     case 'E':
1371       return (scm_read_number_and_radix (chr, port));
1372     case '{':
1373       return (scm_read_extended_symbol (chr, port));
1374     case '!':
1375       return (scm_read_shebang (chr, port));
1376     case ';':
1377       return (scm_read_commented_expression (chr, port));
1378     case '`':
1379     case '\'':
1380     case ',':
1381       return (scm_read_syntax (chr, port));
1382     case 'n':
1383       return (scm_read_nil (chr, port));
1384     default:
1385       result = scm_read_sharp_extension (chr, port);
1386       if (scm_is_eq (result, SCM_UNSPECIFIED))
1387         {
1388           /* To remain compatible with 1.8 and earlier, the following
1389              characters have lower precedence than `read-hash-extend'
1390              characters.  */
1391           switch (chr)
1392             {
1393             case '|':
1394               return scm_read_r6rs_block_comment (chr, port);
1395             default:
1396               scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1397                                  scm_list_1 (SCM_MAKE_CHAR (chr)));
1398             }
1399         }
1400       else
1401         return result;
1402     }
1403
1404   return SCM_UNSPECIFIED;
1405 }
1406 #undef FUNC_NAME
1407
1408 static SCM
1409 scm_read_expression (SCM port)
1410 #define FUNC_NAME "scm_read_expression"
1411 {
1412   while (1)
1413     {
1414       register scm_t_wchar chr;
1415
1416       chr = scm_getc (port);
1417
1418       switch (chr)
1419         {
1420         case SCM_WHITE_SPACES:
1421         case SCM_LINE_INCREMENTORS:
1422           break;
1423         case ';':
1424           (void) scm_read_semicolon_comment (chr, port);
1425           break;
1426         case '[':
1427           if (!SCM_SQUARE_BRACKETS_P)
1428             return (scm_read_mixed_case_symbol (chr, port));
1429           /* otherwise fall through */
1430         case '(':
1431           return (scm_read_sexp (chr, port));
1432         case '"':
1433           return (scm_read_string (chr, port));
1434         case '\'':
1435         case '`':
1436         case ',':
1437           return (scm_read_quote (chr, port));
1438         case '#':
1439           {
1440             SCM result;
1441             result = scm_read_sharp (chr, port);
1442             if (scm_is_eq (result, SCM_UNSPECIFIED))
1443               /* We read a comment or some such.  */
1444               break;
1445             else
1446               return result;
1447           }
1448         case ')':
1449           scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1450           break;
1451         case ']':
1452           if (SCM_SQUARE_BRACKETS_P)
1453             scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1454           /* otherwise fall through */
1455         case EOF:
1456           return SCM_EOF_VAL;
1457         case ':':
1458           if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1459             return scm_symbol_to_keyword (scm_read_expression (port));
1460           /* Fall through.  */
1461
1462         default:
1463           {
1464             if (((chr >= '0') && (chr <= '9'))
1465                 || (strchr ("+-.", chr)))
1466               return (scm_read_number (chr, port));
1467             else
1468               return (scm_read_mixed_case_symbol (chr, port));
1469           }
1470         }
1471     }
1472 }
1473 #undef FUNC_NAME
1474
1475 \f
1476 /* Actual reader.  */
1477
1478 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1479             (SCM port),
1480             "Read an s-expression from the input port @var{port}, or from\n"
1481             "the current input port if @var{port} is not specified.\n"
1482             "Any whitespace before the next token is discarded.")
1483 #define FUNC_NAME s_scm_read
1484 {
1485   int c;
1486
1487   if (SCM_UNBNDP (port))
1488     port = scm_current_input_port ();
1489   SCM_VALIDATE_OPINPORT (1, port);
1490
1491   c = flush_ws (port, (char *) NULL);
1492   if (EOF == c)
1493     return SCM_EOF_VAL;
1494   scm_ungetc (c, port);
1495
1496   return (scm_read_expression (port));
1497 }
1498 #undef FUNC_NAME
1499
1500
1501 \f
1502
1503 /* Manipulate the read-hash-procedures alist.  This could be written in
1504    Scheme, but maybe it will also be used by C code during initialisation.  */
1505 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1506             (SCM chr, SCM proc),
1507             "Install the procedure @var{proc} for reading expressions\n"
1508             "starting with the character sequence @code{#} and @var{chr}.\n"
1509             "@var{proc} will be called with two arguments:  the character\n"
1510             "@var{chr} and the port to read further data from. The object\n"
1511             "returned will be the return value of @code{read}. \n"
1512             "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1513             )
1514 #define FUNC_NAME s_scm_read_hash_extend
1515 {
1516   SCM this;
1517   SCM prev;
1518
1519   SCM_VALIDATE_CHAR (1, chr);
1520   SCM_ASSERT (scm_is_false (proc)
1521               || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1522               proc, SCM_ARG2, FUNC_NAME);
1523
1524   /* Check if chr is already in the alist.  */
1525   this = scm_i_read_hash_procedures_ref ();
1526   prev = SCM_BOOL_F;
1527   while (1)
1528     {
1529       if (scm_is_null (this))
1530         {
1531           /* not found, so add it to the beginning.  */
1532           if (scm_is_true (proc))
1533             {
1534               SCM new = scm_cons (scm_cons (chr, proc),
1535                                   scm_i_read_hash_procedures_ref ());
1536               scm_i_read_hash_procedures_set_x (new);
1537             }
1538           break;
1539         }
1540       if (scm_is_eq (chr, SCM_CAAR (this)))
1541         {
1542           /* already in the alist.  */
1543           if (scm_is_false (proc))
1544             {
1545               /* remove it.  */
1546               if (scm_is_false (prev))
1547                 {
1548                   SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1549                   scm_i_read_hash_procedures_set_x (rest);
1550                 }
1551               else
1552                 scm_set_cdr_x (prev, SCM_CDR (this));
1553             }
1554           else
1555             {
1556               /* replace it.  */
1557               scm_set_cdr_x (SCM_CAR (this), proc);
1558             }
1559           break;
1560         }
1561       prev = this;
1562       this = SCM_CDR (this);
1563     }
1564
1565   return SCM_UNSPECIFIED;
1566 }
1567 #undef FUNC_NAME
1568
1569 /* Recover the read-hash procedure corresponding to char c.  */
1570 static SCM
1571 scm_get_hash_procedure (int c)
1572 {
1573   SCM rest = scm_i_read_hash_procedures_ref ();
1574
1575   while (1)
1576     {
1577       if (scm_is_null (rest))
1578         return SCM_BOOL_F;
1579
1580       if (SCM_CHAR (SCM_CAAR (rest)) == c)
1581         return SCM_CDAR (rest);
1582
1583       rest = SCM_CDR (rest);
1584     }
1585 }
1586
1587 #define SCM_ENCODING_SEARCH_SIZE (500)
1588
1589 /* Search the first few hundred characters of a file for an Emacs-like coding
1590    declaration.  Returns either NULL or a string whose storage has been
1591    allocated with `scm_gc_malloc ()'.  */
1592 char *
1593 scm_i_scan_for_encoding (SCM port)
1594 {
1595   scm_t_port *pt;
1596   char header[SCM_ENCODING_SEARCH_SIZE+1];
1597   size_t bytes_read, encoding_length, i;
1598   char *encoding = NULL;
1599   int utf8_bom = 0;
1600   char *pos, *encoding_start;
1601   int in_comment;
1602
1603   pt = SCM_PTAB_ENTRY (port);
1604
1605   if (pt->rw_active == SCM_PORT_WRITE)
1606     scm_flush (port);
1607
1608   if (pt->rw_random)
1609     pt->rw_active = SCM_PORT_READ;
1610
1611   if (pt->read_pos == pt->read_end)
1612     {
1613       /* We can use the read buffer, and thus avoid a seek. */
1614       if (scm_fill_input (port) == EOF)
1615         return NULL;
1616
1617       bytes_read = pt->read_end - pt->read_pos;
1618       if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
1619         bytes_read = SCM_ENCODING_SEARCH_SIZE;
1620
1621       if (bytes_read <= 1)
1622         /* An unbuffered port -- don't scan.  */
1623         return NULL;
1624
1625       memcpy (header, pt->read_pos, bytes_read);
1626       header[bytes_read] = '\0';
1627     }
1628   else
1629     {
1630       /* Try to read some bytes and then seek back.  Not all ports
1631          support seeking back; and indeed some file ports (like
1632          /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
1633          check performed by SCM_FPORT_FDES---but fail to seek
1634          backwards.  Hence this block comes second.  We prefer to use
1635          the read buffer in-place.  */
1636       if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1637         return NULL;
1638
1639       bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1640       header[bytes_read] = '\0';
1641       scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1642     }
1643
1644   if (bytes_read > 3
1645       && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1646     utf8_bom = 1;
1647
1648   /* search past "coding[:=]" */
1649   pos = header;
1650   while (1)
1651     {
1652       if ((pos = strstr(pos, "coding")) == NULL)
1653         return NULL;
1654
1655       pos += strlen("coding");
1656       if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1657           (*pos == ':' || *pos == '='))
1658         {
1659           pos ++;
1660           break;
1661         }
1662     }
1663
1664   /* skip spaces */
1665   while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1666          (*pos == ' ' || *pos == '\t'))
1667     pos ++;
1668
1669   /* grab the next token */
1670   encoding_start = pos;
1671   i = 0;
1672   while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
1673          && encoding_start + i - header < bytes_read
1674          && (isalnum ((int) encoding_start[i])
1675              || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
1676     i++;
1677
1678   encoding_length = i;
1679   if (encoding_length == 0)
1680     return NULL;
1681
1682   encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
1683   for (i = 0; i < encoding_length; i++)
1684     encoding[i] = toupper ((int) encoding[i]);
1685
1686   /* push backwards to make sure we were in a comment */
1687   in_comment = 0;
1688   pos = encoding_start;
1689   while (pos >= header)
1690     {
1691       if (*pos == ';')
1692         {
1693           in_comment = 1;
1694           break;
1695         }
1696       else if (*pos == '\n' || pos == header)
1697         {
1698           /* This wasn't in a semicolon comment. Check for a
1699            hash-bang comment. */
1700           char *beg = strstr (header, "#!");
1701           char *end = strstr (header, "!#");
1702           if (beg < encoding_start && encoding_start + encoding_length <= end)
1703             in_comment = 1;
1704           break;
1705         }
1706       else
1707         {
1708           pos --;
1709           continue;
1710         }
1711     }
1712   if (!in_comment)
1713     /* This wasn't in a comment */
1714     return NULL;
1715
1716   if (utf8_bom && strcmp(encoding, "UTF-8"))
1717     scm_misc_error (NULL,
1718                     "the port input declares the encoding ~s but is encoded as UTF-8",
1719                     scm_list_1 (scm_from_locale_string (encoding)));
1720
1721   return encoding;
1722 }
1723
1724 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1725             (SCM port),
1726             "Scans the port for an Emacs-like character coding declaration\n"
1727             "near the top of the contents of a port with random-accessible contents.\n"
1728             "The coding declaration is of the form\n"
1729             "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1730             "\n"
1731             "Returns a string containing the character encoding of the file\n"
1732             "if a declaration was found, or @code{#f} otherwise.\n")
1733 #define FUNC_NAME s_scm_file_encoding
1734 {
1735   char *enc;
1736   SCM s_enc;
1737
1738   SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
1739
1740   enc = scm_i_scan_for_encoding (port);
1741   if (enc == NULL)
1742     return SCM_BOOL_F;
1743   else
1744     {
1745       s_enc = scm_from_locale_string (enc);
1746       return s_enc;
1747     }
1748
1749   return SCM_BOOL_F;
1750 }
1751 #undef FUNC_NAME
1752
1753 void
1754 scm_init_read ()
1755 {
1756   SCM read_hash_procs;
1757
1758   read_hash_procs = scm_make_fluid ();
1759   scm_fluid_set_x (read_hash_procs, SCM_EOL);
1760
1761   scm_i_read_hash_procedures =
1762     SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
1763
1764   scm_init_opts (scm_read_options, scm_read_opts);
1765 #include "libguile/read.x"
1766 }
1767
1768 /*
1769   Local Variables:
1770   c-file-style: "gnu"
1771   End:
1772 */