libguile/read.c

   1 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
   2  *   2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public License
   6  * as published by the Free Software Foundation; either version 3 of
   7  * the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful, but
  10  * WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  17  * 02110-1301 USA
  18  */
  19
  20
  21 \f
  22
  23 #ifdef HAVE_CONFIG_H
  24 # include <config.h>
  25 #endif
  26
  27 #include <stdio.h>
  28 #include <ctype.h>
  29 #include <string.h>
  30 #include <unistd.h>
  31 #include <unicase.h>
  32 #include <unictype.h>
  33
  34 #include "libguile/_scm.h"
  35 #include "libguile/bytevectors.h"
  36 #include "libguile/chars.h"
  37 #include "libguile/eval.h"
  38 #include "libguile/arrays.h"
  39 #include "libguile/bitvectors.h"
  40 #include "libguile/keywords.h"
  41 #include "libguile/alist.h"
  42 #include "libguile/srcprop.h"
  43 #include "libguile/hashtab.h"
  44 #include "libguile/hash.h"
  45 #include "libguile/ports.h"
  46 #include "libguile/fports.h"
  47 #include "libguile/root.h"
  48 #include "libguile/strings.h"
  49 #include "libguile/strports.h"
  50 #include "libguile/vectors.h"
  51 #include "libguile/validate.h"
  52 #include "libguile/srfi-4.h"
  53 #include "libguile/srfi-13.h"
  54
  55 #include "libguile/read.h"
  56 #include "libguile/private-options.h"
  57
  58
  59 \f
  60
  61 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
  62 SCM_SYMBOL (scm_keyword_prefix, "prefix");
  63 SCM_SYMBOL (scm_keyword_postfix, "postfix");
  64 SCM_SYMBOL (sym_nil, "nil");
  65
  66 scm_t_option scm_read_opts[] = {
  67   { SCM_OPTION_BOOLEAN, "copy", 0,
  68     "Copy source code expressions." },
  69   { SCM_OPTION_BOOLEAN, "positions", 1,
  70     "Record positions of source code expressions." },
  71   { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
  72     "Convert symbols to lower case."},
  73   { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
  74     "Style of keyword recognition: #f, 'prefix or 'postfix."},
  75   { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
  76     "Use R6RS variable-length character and string hex escapes."},
  77   { SCM_OPTION_BOOLEAN, "square-brackets", 1,
  78     "Treat `[' and `]' as parentheses, for R6RS compatibility."},
  79   { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
  80     "In strings, consume leading whitespace after an escaped end-of-line."},
  81   { 0, },
  82 };
  83
  84 /*
  85   Give meaningful error messages for errors
  86
  87   We use the format
  88
  89   FILE:LINE:COL: MESSAGE
  90   This happened in ....
  91
  92   This is not standard GNU format, but the test-suite likes the real
  93   message to be in front.
  94
  95  */
  96
  97
  98 void
  99 scm_i_input_error (char const *function,
 100                    SCM port, const char *message, SCM arg)
 101 {
 102   SCM fn = (scm_is_string (SCM_FILENAME(port))
 103             ? SCM_FILENAME(port)
 104             : scm_from_locale_string ("#<unknown port>"));
 105
 106   SCM string_port = scm_open_output_string ();
 107   SCM string = SCM_EOL;
 108   scm_simple_format (string_port,
 109                      scm_from_locale_string ("~A:~S:~S: ~A"),
 110                      scm_list_4 (fn,
 111                                  scm_from_long (SCM_LINUM (port) + 1),
 112                                  scm_from_int (SCM_COL (port) + 1),
 113                                  scm_from_locale_string (message)));
 114
 115   string = scm_get_output_string (string_port);
 116   scm_close_output_port (string_port);
 117   scm_error_scm (scm_from_latin1_symbol ("read-error"),
 118                  function? scm_from_locale_string (function) : SCM_BOOL_F,
 119                  string,
 120                  arg,
 121                  SCM_BOOL_F);
 122 }
 123
 124
 125 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
 126             (SCM setting),
 127             "Option interface for the read options. Instead of using\n"
 128             "this procedure directly, use the procedures @code{read-enable},\n"
 129             "@code{read-disable}, @code{read-set!} and @code{read-options}.")
 130 #define FUNC_NAME s_scm_read_options
 131 {
 132   SCM ans = scm_options (setting,
 133                          scm_read_opts,
 134                          FUNC_NAME);
 135   if (SCM_COPY_SOURCE_P)
 136     SCM_RECORD_POSITIONS_P = 1;
 137   return ans;
 138 }
 139 #undef FUNC_NAME
 140
 141 /* A fluid referring to an association list mapping extra hash
 142    characters to procedures.  */
 143 static SCM *scm_i_read_hash_procedures;
 144
 145 static SCM
 146 scm_i_read_hash_procedures_ref (void)
 147 {
 148   return scm_fluid_ref (*scm_i_read_hash_procedures);
 149 }
 150
 151 static void
 152 scm_i_read_hash_procedures_set_x (SCM value)
 153 {
 154   scm_fluid_set_x (*scm_i_read_hash_procedures, value);
 155 }
 156
 157 \f
 158 /* Token readers.  */
 159
 160
 161 /* Size of the C buffer used to read symbols and numbers.  */
 162 #define READER_BUFFER_SIZE            128
 163
 164 /* Size of the C buffer used to read strings.  */
 165 #define READER_STRING_BUFFER_SIZE     512
 166
 167 /* The maximum size of Scheme character names.  */
 168 #define READER_CHAR_NAME_MAX_SIZE      50
 169
 170
 171 /* `isblank' is only in C99.  */
 172 #define CHAR_IS_BLANK_(_chr)                                    \
 173   (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n')      \
 174    || ((_chr) == '\f') || ((_chr) == '\r'))
 175
 176 #ifdef MSDOS
 177 # define CHAR_IS_BLANK(_chr)                    \
 178   ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
 179 #else
 180 # define CHAR_IS_BLANK CHAR_IS_BLANK_
 181 #endif
 182
 183
 184 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
 185    structure'').  */
 186 #define CHAR_IS_R5RS_DELIMITER(c)                               \
 187   (CHAR_IS_BLANK (c)                                            \
 188    || (c == ')') || (c == '(') || (c == ';') || (c == '"')      \
 189    || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
 190
 191 #define CHAR_IS_DELIMITER  CHAR_IS_R5RS_DELIMITER
 192
 193 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
 194    Structure''.  */
 195 #define CHAR_IS_EXPONENT_MARKER(_chr)                           \
 196   (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f')        \
 197    || ((_chr) == 'd') || ((_chr) == 'l'))
 198
 199 /* Read an SCSH block comment.  */
 200 static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
 201 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
 202 static SCM scm_read_commented_expression (scm_t_wchar, SCM);
 203 static SCM scm_read_shebang (scm_t_wchar, SCM);
 204 static SCM scm_get_hash_procedure (int);
 205
 206 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 207    result in the pre-allocated buffer BUF.  Return zero if the whole token has
 208    fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
 209    bytes actually read.  */
 210 static int
 211 read_token (SCM port, char *buf, const size_t buf_size, size_t *read)
 212  {
 213    *read = 0;
 214
 215    while (*read < buf_size)
 216      {
 217        int chr;
 218
 219        chr = scm_get_byte_or_eof (port);
 220
 221        if (chr == EOF)
 222         return 0;
 223       else if (CHAR_IS_DELIMITER (chr))
 224         {
 225           scm_unget_byte (chr, port);
 226           return 0;
 227         }
 228       else
 229         {
 230           *buf = (char) chr;
 231           buf++, (*read)++;
 232         }
 233      }
 234
 235    return 1;
 236  }
 237
 238 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 239    result in the pre-allocated buffer BUFFER, if the whole token has fewer than
 240    BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
 241    caller.  Return zero if the token fits in BUFFER, non-zero otherwise. READ
 242    will be set the number of bytes actually read.  */
 243 static int
 244 read_complete_token (SCM port, char *buffer, const size_t buffer_size,
 245                            char **overflow_buffer, size_t *read)
 246 {
 247   int overflow = 0;
 248   size_t bytes_read, overflow_size;
 249
 250   *overflow_buffer = NULL;
 251   overflow_size = 0;
 252
 253   do
 254     {
 255       overflow = read_token (port, buffer, buffer_size, &bytes_read);
 256       if (bytes_read == 0)
 257         break;
 258       if (overflow || overflow_size != 0)
 259         {
 260           if (overflow_size == 0)
 261             {
 262               *overflow_buffer = scm_malloc (bytes_read);
 263               memcpy (*overflow_buffer, buffer, bytes_read);
 264               overflow_size = bytes_read;
 265             }
 266           else
 267             {
 268               *overflow_buffer = scm_realloc (*overflow_buffer, overflow_size + bytes_read);
 269               memcpy (*overflow_buffer + overflow_size, buffer, bytes_read);
 270               overflow_size += bytes_read;
 271             }
 272         }
 273     }
 274   while (overflow);
 275
 276   if (overflow_size)
 277     *read = overflow_size;
 278   else
 279     *read = bytes_read;
 280
 281   return (overflow_size != 0);
 282 }
 283
 284 /* Skip whitespace from PORT and return the first non-whitespace character
 285    read.  Raise an error on end-of-file.  */
 286 static int
 287 flush_ws (SCM port, const char *eoferr)
 288 {
 289   scm_t_wchar c;
 290   while (1)
 291     switch (c = scm_getc (port))
 292       {
 293       case EOF:
 294       goteof:
 295         if (eoferr)
 296           {
 297             scm_i_input_error (eoferr,
 298                                port,
 299                                "end of file",
 300                                SCM_EOL);
 301           }
 302         return c;
 303
 304       case ';':
 305       lp:
 306         switch (c = scm_getc (port))
 307           {
 308           case EOF:
 309             goto goteof;
 310           default:
 311             goto lp;
 312           case SCM_LINE_INCREMENTORS:
 313             break;
 314           }
 315         break;
 316
 317       case '#':
 318         switch (c = scm_getc (port))
 319           {
 320           case EOF:
 321             eoferr = "read_sharp";
 322             goto goteof;
 323           case '!':
 324             scm_read_shebang (c, port);
 325             break;
 326           case ';':
 327             scm_read_commented_expression (c, port);
 328             break;
 329           case '|':
 330             if (scm_is_false (scm_get_hash_procedure (c)))
 331               {
 332                 scm_read_r6rs_block_comment (c, port);
 333                 break;
 334               }
 335             /* fall through */
 336           default:
 337             scm_ungetc (c, port);
 338             return '#';
 339           }
 340         break;
 341
 342       case SCM_LINE_INCREMENTORS:
 343       case SCM_SINGLE_SPACES:
 344       case '\t':
 345         break;
 346
 347       default:
 348         return c;
 349       }
 350
 351   return 0;
 352 }
 353
 354
 355 \f
 356 /* Token readers.  */
 357
 358 static SCM scm_read_expression (SCM port);
 359 static SCM scm_read_sharp (int chr, SCM port);
 360
 361
 362 static SCM
 363 scm_read_sexp (scm_t_wchar chr, SCM port)
 364 #define FUNC_NAME "scm_i_lreadparen"
 365 {
 366   int c;
 367   SCM tmp, tl, ans = SCM_EOL;
 368   const int terminating_char = ((chr == '[') ? ']' : ')');
 369
 370   /* Need to capture line and column numbers here. */
 371   long line = SCM_LINUM (port);
 372   int column = SCM_COL (port) - 1;
 373
 374   c = flush_ws (port, FUNC_NAME);
 375   if (terminating_char == c)
 376     return SCM_EOL;
 377
 378   scm_ungetc (c, port);
 379   tmp = scm_read_expression (port);
 380
 381   /* Note that it is possible for scm_read_expression to return
 382      scm_sym_dot, but not as part of a dotted pair: as in #{.}#.  So
 383      check that it's a real dot by checking `c'.  */
 384   if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
 385     {
 386       ans = scm_read_expression (port);
 387       if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
 388         scm_i_input_error (FUNC_NAME, port, "missing close paren",
 389                            SCM_EOL);
 390       return ans;
 391     }
 392
 393   /* Build the head of the list structure. */
 394   ans = tl = scm_cons (tmp, SCM_EOL);
 395
 396   while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
 397     {
 398       SCM new_tail;
 399
 400       if (c == ')' || (SCM_SQUARE_BRACKETS_P && c == ']'))
 401         scm_i_input_error (FUNC_NAME, port,
 402                            "in pair: mismatched close paren: ~A",
 403                            scm_list_1 (SCM_MAKE_CHAR (c)));
 404
 405       scm_ungetc (c, port);
 406       tmp = scm_read_expression (port);
 407
 408       /* See above note about scm_sym_dot.  */
 409       if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
 410         {
 411           SCM_SETCDR (tl, scm_read_expression (port));
 412
 413           c = flush_ws (port, FUNC_NAME);
 414           if (terminating_char != c)
 415             scm_i_input_error (FUNC_NAME, port,
 416                                "in pair: missing close paren", SCM_EOL);
 417           goto exit;
 418         }
 419
 420       new_tail = scm_cons (tmp, SCM_EOL);
 421       SCM_SETCDR (tl, new_tail);
 422       tl = new_tail;
 423     }
 424
 425  exit:
 426   if (SCM_RECORD_POSITIONS_P)
 427     scm_i_set_source_properties_x (ans, line, column, SCM_FILENAME (port));
 428
 429   return ans;
 430 }
 431 #undef FUNC_NAME
 432
 433
 434 /* Read a hexadecimal number NDIGITS in length.  Put its value into the variable
 435    C.  If TERMINATOR is non-null, terminate early if the TERMINATOR character is
 436    found.  */
 437 #define SCM_READ_HEX_ESCAPE(ndigits, terminator)                   \
 438   do                                                               \
 439     {                                                              \
 440       scm_t_wchar a;                                               \
 441       size_t i = 0;                                                \
 442       c = 0;                                                       \
 443       while (i < ndigits)                                          \
 444         {                                                          \
 445           a = scm_getc (port);                                     \
 446           if (a == EOF)                                            \
 447             goto str_eof;                                          \
 448           if (terminator                                           \
 449               && (a == (scm_t_wchar) terminator)                   \
 450               && (i > 0))                                          \
 451             break;                                                 \
 452           if ('0' <= a && a <= '9')                                \
 453             a -= '0';                                              \
 454           else if ('A' <= a && a <= 'F')                           \
 455             a = a - 'A' + 10;                                      \
 456           else if ('a' <= a && a <= 'f')                           \
 457             a = a - 'a' + 10;                                      \
 458           else                                                     \
 459             {                                                      \
 460               c = a;                                               \
 461               goto bad_escaped;                                    \
 462             }                                                      \
 463           c = c * 16 + a;                                          \
 464           i ++;                                                    \
 465         }                                                          \
 466     } while (0)
 467
 468 static void
 469 skip_intraline_whitespace (SCM port)
 470 {
 471   scm_t_wchar c;
 472
 473   do
 474     {
 475       c = scm_getc (port);
 476       if (c == EOF)
 477         return;
 478     }
 479   while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
 480
 481   scm_ungetc (c, port);
 482 }
 483
 484 static SCM
 485 scm_read_string (int chr, SCM port)
 486 #define FUNC_NAME "scm_lreadr"
 487 {
 488   /* For strings smaller than C_STR, this function creates only one Scheme
 489      object (the string returned).  */
 490
 491   SCM str = SCM_BOOL_F;
 492   unsigned c_str_len = 0;
 493   scm_t_wchar c;
 494
 495   str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
 496   while ('"' != (c = scm_getc (port)))
 497     {
 498       if (c == EOF)
 499         {
 500         str_eof:
 501           scm_i_input_error (FUNC_NAME, port,
 502                              "end of file in string constant", SCM_EOL);
 503         }
 504
 505       if (c_str_len + 1 >= scm_i_string_length (str))
 506         {
 507           SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
 508
 509           str = scm_string_append (scm_list_2 (str, addy));
 510         }
 511
 512       if (c == '\\')
 513         {
 514           switch (c = scm_getc (port))
 515             {
 516             case EOF:
 517               goto str_eof;
 518             case '"':
 519             case '\\':
 520               break;
 521             case '\n':
 522               if (SCM_HUNGRY_EOL_ESCAPES_P)
 523                 skip_intraline_whitespace (port);
 524               continue;
 525             case '0':
 526               c = '\0';
 527               break;
 528             case 'f':
 529               c = '\f';
 530               break;
 531             case 'n':
 532               c = '\n';
 533               break;
 534             case 'r':
 535               c = '\r';
 536               break;
 537             case 't':
 538               c = '\t';
 539               break;
 540             case 'a':
 541               c = '\007';
 542               break;
 543             case 'v':
 544               c = '\v';
 545               break;
 546             case 'b':
 547               c = '\010';
 548               break;
 549             case 'x':
 550               if (SCM_R6RS_ESCAPES_P)
 551                 SCM_READ_HEX_ESCAPE (10, ';');
 552               else
 553                 SCM_READ_HEX_ESCAPE (2, '\0');
 554               break;
 555             case 'u':
 556               if (!SCM_R6RS_ESCAPES_P)
 557                 {
 558                   SCM_READ_HEX_ESCAPE (4, '\0');
 559                   break;
 560                 }
 561             case 'U':
 562               if (!SCM_R6RS_ESCAPES_P)
 563                 {
 564                   SCM_READ_HEX_ESCAPE (6, '\0');
 565                   break;
 566                 }
 567             default:
 568             bad_escaped:
 569               scm_i_input_error (FUNC_NAME, port,
 570                                  "illegal character in escape sequence: ~S",
 571                                  scm_list_1 (SCM_MAKE_CHAR (c)));
 572             }
 573         }
 574       str = scm_i_string_start_writing (str);
 575       scm_i_string_set_x (str, c_str_len++, c);
 576       scm_i_string_stop_writing ();
 577     }
 578
 579   if (c_str_len > 0)
 580     {
 581       return scm_i_substring_copy (str, 0, c_str_len);
 582     }
 583
 584   return scm_nullstr;
 585 }
 586 #undef FUNC_NAME
 587
 588
 589 static SCM
 590 scm_read_number (scm_t_wchar chr, SCM port)
 591 {
 592   SCM result, str = SCM_EOL;
 593   char buffer[READER_BUFFER_SIZE];
 594   char *overflow_buffer = NULL;
 595   size_t bytes_read;
 596   int overflow;
 597   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 598
 599   scm_ungetc (chr, port);
 600   overflow = read_complete_token (port, buffer, sizeof (buffer),
 601                                   &overflow_buffer, &bytes_read);
 602
 603   if (!overflow)
 604     str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
 605   else
 606     str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
 607                             pt->ilseq_handler);
 608
 609   result = scm_string_to_number (str, SCM_UNDEFINED);
 610   if (!scm_is_true (result))
 611     {
 612       /* Return a symbol instead of a number */
 613       if (SCM_CASE_INSENSITIVE_P)
 614         str = scm_string_downcase_x (str);
 615       result = scm_string_to_symbol (str);
 616     }
 617
 618   if (overflow)
 619     free (overflow_buffer);
 620   SCM_COL (port) += scm_i_string_length (str);
 621   return result;
 622 }
 623
 624 static SCM
 625 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
 626 {
 627   SCM result;
 628   int ends_with_colon = 0;
 629   size_t bytes_read;
 630   int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
 631   int overflow;
 632   char buffer[READER_BUFFER_SIZE], *overflow_buffer;
 633   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 634   SCM str;
 635
 636   scm_ungetc (chr, port);
 637   overflow = read_complete_token (port, buffer, READER_BUFFER_SIZE,
 638                                   &overflow_buffer, &bytes_read);
 639   if (bytes_read > 0)
 640     {
 641       if (!overflow)
 642         ends_with_colon = buffer[bytes_read - 1] == ':';
 643       else
 644         ends_with_colon = overflow_buffer[bytes_read - 1] == ':';
 645     }
 646
 647   if (postfix && ends_with_colon && (bytes_read > 1))
 648     {
 649       if (!overflow)
 650         str = scm_from_stringn (buffer, bytes_read - 1, pt->encoding, pt->ilseq_handler);
 651       else
 652         str = scm_from_stringn (overflow_buffer, bytes_read - 1, pt->encoding,
 653                                 pt->ilseq_handler);
 654
 655       if (SCM_CASE_INSENSITIVE_P)
 656         str = scm_string_downcase_x (str);
 657       result = scm_symbol_to_keyword (scm_string_to_symbol (str));
 658     }
 659   else
 660     {
 661       if (!overflow)
 662         str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
 663       else
 664         str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
 665                                 pt->ilseq_handler);
 666
 667       if (SCM_CASE_INSENSITIVE_P)
 668         str = scm_string_downcase_x (str);
 669       result = scm_string_to_symbol (str);
 670     }
 671
 672   if (overflow)
 673     free (overflow_buffer);
 674   SCM_COL (port) += scm_i_string_length (str);
 675   return result;
 676 }
 677
 678 static SCM
 679 scm_read_number_and_radix (scm_t_wchar chr, SCM port)
 680 #define FUNC_NAME "scm_lreadr"
 681 {
 682   SCM result;
 683   size_t read;
 684   char buffer[READER_BUFFER_SIZE], *overflow_buffer;
 685   int overflow;
 686   unsigned int radix;
 687   SCM str;
 688   scm_t_port *pt;
 689
 690   switch (chr)
 691     {
 692     case 'B':
 693     case 'b':
 694       radix = 2;
 695       break;
 696
 697     case 'o':
 698     case 'O':
 699       radix = 8;
 700       break;
 701
 702     case 'd':
 703     case 'D':
 704       radix = 10;
 705       break;
 706
 707     case 'x':
 708     case 'X':
 709       radix = 16;
 710       break;
 711
 712     default:
 713       scm_ungetc (chr, port);
 714       scm_ungetc ('#', port);
 715       radix = 10;
 716     }
 717
 718   overflow = read_complete_token (port, buffer, sizeof (buffer),
 719                                   &overflow_buffer, &read);
 720
 721   pt = SCM_PTAB_ENTRY (port);
 722   if (!overflow)
 723     str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
 724   else
 725     str = scm_from_stringn (overflow_buffer, read, pt->encoding,
 726                             pt->ilseq_handler);
 727
 728   result = scm_string_to_number (str, scm_from_uint (radix));
 729
 730   if (overflow)
 731     free (overflow_buffer);
 732
 733   SCM_COL (port) += scm_i_string_length (str);
 734
 735   if (scm_is_true (result))
 736     return result;
 737
 738   scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
 739
 740   return SCM_BOOL_F;
 741 }
 742 #undef FUNC_NAME
 743
 744 static SCM
 745 scm_read_quote (int chr, SCM port)
 746 {
 747   SCM p;
 748   long line = SCM_LINUM (port);
 749   int column = SCM_COL (port) - 1;
 750
 751   switch (chr)
 752     {
 753     case '`':
 754       p = scm_sym_quasiquote;
 755       break;
 756
 757     case '\'':
 758       p = scm_sym_quote;
 759       break;
 760
 761     case ',':
 762       {
 763         scm_t_wchar c;
 764
 765         c = scm_getc (port);
 766         if ('@' == c)
 767           p = scm_sym_uq_splicing;
 768         else
 769           {
 770             scm_ungetc (c, port);
 771             p = scm_sym_unquote;
 772           }
 773         break;
 774       }
 775
 776     default:
 777       fprintf (stderr, "%s: unhandled quote character (%i)\n",
 778                "scm_read_quote", chr);
 779       abort ();
 780     }
 781
 782   p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
 783   if (SCM_RECORD_POSITIONS_P)
 784     scm_i_set_source_properties_x (p, line, column, SCM_FILENAME (port));
 785
 786   return p;
 787 }
 788
 789 SCM_SYMBOL (sym_syntax, "syntax");
 790 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
 791 SCM_SYMBOL (sym_unsyntax, "unsyntax");
 792 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
 793
 794 static SCM
 795 scm_read_syntax (int chr, SCM port)
 796 {
 797   SCM p;
 798   long line = SCM_LINUM (port);
 799   int column = SCM_COL (port) - 1;
 800
 801   switch (chr)
 802     {
 803     case '`':
 804       p = sym_quasisyntax;
 805       break;
 806
 807     case '\'':
 808       p = sym_syntax;
 809       break;
 810
 811     case ',':
 812       {
 813         int c;
 814
 815         c = scm_getc (port);
 816         if ('@' == c)
 817           p = sym_unsyntax_splicing;
 818         else
 819           {
 820             scm_ungetc (c, port);
 821             p = sym_unsyntax;
 822           }
 823         break;
 824       }
 825
 826     default:
 827       fprintf (stderr, "%s: unhandled syntax character (%i)\n",
 828                "scm_read_syntax", chr);
 829       abort ();
 830     }
 831
 832   p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
 833   if (SCM_RECORD_POSITIONS_P)
 834     scm_i_set_source_properties_x (p, line, column, SCM_FILENAME (port));
 835
 836   return p;
 837 }
 838
 839 static SCM
 840 scm_read_nil (int chr, SCM port)
 841 {
 842   SCM id = scm_read_mixed_case_symbol (chr, port);
 843
 844   if (!scm_is_eq (id, sym_nil))
 845     scm_i_input_error ("scm_read_nil", port,
 846                        "unexpected input while reading #nil: ~a",
 847                        scm_list_1 (id));
 848
 849   return SCM_ELISP_NIL;
 850 }
 851
 852 static SCM
 853 scm_read_semicolon_comment (int chr, SCM port)
 854 {
 855   int c;
 856
 857   /* We use the get_byte here because there is no need to get the
 858      locale correct with comment input. This presumes that newline
 859      always represents itself no matter what the encoding is.  */
 860   for (c = scm_get_byte_or_eof (port);
 861        (c != EOF) && (c != '\n');
 862        c = scm_get_byte_or_eof (port));
 863
 864   return SCM_UNSPECIFIED;
 865 }
 866
 867 \f
 868 /* Sharp readers, i.e. readers called after a `#' sign has been read.  */
 869
 870 static SCM
 871 scm_read_boolean (int chr, SCM port)
 872 {
 873   switch (chr)
 874     {
 875     case 't':
 876     case 'T':
 877       return SCM_BOOL_T;
 878
 879     case 'f':
 880     case 'F':
 881       return SCM_BOOL_F;
 882     }
 883
 884   return SCM_UNSPECIFIED;
 885 }
 886
 887 static SCM
 888 scm_read_character (scm_t_wchar chr, SCM port)
 889 #define FUNC_NAME "scm_lreadr"
 890 {
 891   char buffer[READER_CHAR_NAME_MAX_SIZE];
 892   SCM charname;
 893   size_t charname_len, bytes_read;
 894   scm_t_wchar cp;
 895   int overflow;
 896   scm_t_port *pt;
 897
 898   overflow = read_token (port, buffer, READER_CHAR_NAME_MAX_SIZE, &bytes_read);
 899   if (overflow)
 900     scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
 901
 902   if (bytes_read == 0)
 903     {
 904       chr = scm_getc (port);
 905       if (chr == EOF)
 906         scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
 907                            "while reading character", SCM_EOL);
 908
 909       /* CHR must be a token delimiter, like a whitespace.  */
 910       return (SCM_MAKE_CHAR (chr));
 911     }
 912
 913   pt = SCM_PTAB_ENTRY (port);
 914
 915   /* Simple ASCII characters can be processed immediately.  Also, simple
 916      ISO-8859-1 characters can be processed immediately if the encoding for this
 917      port is ISO-8859-1.  */
 918   if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
 919     {
 920       SCM_COL (port) += 1;
 921       return SCM_MAKE_CHAR (buffer[0]);
 922     }
 923
 924   /* Otherwise, convert the buffer into a proper scheme string for
 925      processing.  */
 926   charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
 927                                pt->ilseq_handler);
 928   charname_len = scm_i_string_length (charname);
 929   SCM_COL (port) += charname_len;
 930   cp = scm_i_string_ref (charname, 0);
 931   if (charname_len == 1)
 932     return SCM_MAKE_CHAR (cp);
 933
 934   /* Ignore dotted circles, which may be used to keep combining characters from
 935      combining with the backslash in #\charname.  */
 936   if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
 937     return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
 938
 939   if (cp >= '0' && cp < '8')
 940     {
 941       /* Dirk:FIXME::  This type of character syntax is not R5RS
 942        * compliant.  Further, it should be verified that the constant
 943        * does only consist of octal digits.  */
 944       SCM p = scm_string_to_number (charname, scm_from_uint (8));
 945       if (SCM_I_INUMP (p))
 946         {
 947           scm_t_wchar c = scm_to_uint32 (p);
 948           if (SCM_IS_UNICODE_CHAR (c))
 949             return SCM_MAKE_CHAR (c);
 950           else
 951             scm_i_input_error (FUNC_NAME, port,
 952                                "out-of-range octal character escape: ~a",
 953                                scm_list_1 (charname));
 954         }
 955     }
 956
 957   if (cp == 'x' && (charname_len > 1))
 958     {
 959       SCM p;
 960
 961       /* Convert from hex, skipping the initial 'x' character in CHARNAME */
 962       p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
 963                                 scm_from_uint (16));
 964       if (SCM_I_INUMP (p))
 965         {
 966           scm_t_wchar c = scm_to_uint32 (p);
 967           if (SCM_IS_UNICODE_CHAR (c))
 968             return SCM_MAKE_CHAR (c);
 969           else
 970             scm_i_input_error (FUNC_NAME, port,
 971                                "out-of-range hex character escape: ~a",
 972                                scm_list_1 (charname));
 973         }
 974     }
 975
 976   /* The names of characters should never have non-Latin1
 977      characters.  */
 978   if (scm_i_is_narrow_string (charname)
 979       || scm_i_try_narrow_string (charname))
 980     { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
 981                                        charname_len);
 982       if (scm_is_true (ch))
 983         return ch;
 984     }
 985
 986   scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
 987                      scm_list_1 (charname));
 988
 989   return SCM_UNSPECIFIED;
 990 }
 991 #undef FUNC_NAME
 992
 993 static SCM
 994 scm_read_keyword (int chr, SCM port)
 995 {
 996   SCM symbol;
 997
 998   /* Read the symbol that comprises the keyword.  Doing this instead of
 999      invoking a specific symbol reader function allows `scm_read_keyword ()'
1000      to adapt to the delimiters currently valid of symbols.
1001
1002      XXX: This implementation allows sloppy syntaxes like `#:  key'.  */
1003   symbol = scm_read_expression (port);
1004   if (!scm_is_symbol (symbol))
1005     scm_i_input_error ("scm_read_keyword", port,
1006                        "keyword prefix `~a' not followed by a symbol: ~s",
1007                        scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1008
1009   return (scm_symbol_to_keyword (symbol));
1010 }
1011
1012 static SCM
1013 scm_read_vector (int chr, SCM port)
1014 {
1015   /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1016      guarantee that it's going to do what we want.  After all, this is an
1017      implementation detail of `scm_read_vector ()', not a desirable
1018      property.  */
1019   return (scm_vector (scm_read_sexp (chr, port)));
1020 }
1021
1022 static SCM
1023 scm_read_srfi4_vector (int chr, SCM port)
1024 {
1025   return scm_i_read_array (port, chr);
1026 }
1027
1028 static SCM
1029 scm_read_bytevector (scm_t_wchar chr, SCM port)
1030 {
1031   chr = scm_getc (port);
1032   if (chr != 'u')
1033     goto syntax;
1034
1035   chr = scm_getc (port);
1036   if (chr != '8')
1037     goto syntax;
1038
1039   chr = scm_getc (port);
1040   if (chr != '(')
1041     goto syntax;
1042
1043   return scm_u8_list_to_bytevector (scm_read_sexp (chr, port));
1044
1045  syntax:
1046   scm_i_input_error ("read_bytevector", port,
1047                      "invalid bytevector prefix",
1048                      SCM_MAKE_CHAR (chr));
1049   return SCM_UNSPECIFIED;
1050 }
1051
1052 static SCM
1053 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port)
1054 {
1055   /* Read the `#*10101'-style read syntax for bit vectors in Guile.  This is
1056      terribly inefficient but who cares?  */
1057   SCM s_bits = SCM_EOL;
1058
1059   for (chr = scm_getc (port);
1060        (chr != EOF) && ((chr == '0') || (chr == '1'));
1061        chr = scm_getc (port))
1062     {
1063       s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1064     }
1065
1066   if (chr != EOF)
1067     scm_ungetc (chr, port);
1068
1069   return scm_bitvector (scm_reverse_x (s_bits, SCM_EOL));
1070 }
1071
1072 static SCM
1073 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1074 {
1075   int bang_seen = 0;
1076
1077   for (;;)
1078     {
1079       int c = scm_getc (port);
1080
1081       if (c == EOF)
1082         scm_i_input_error ("skip_block_comment", port,
1083                            "unterminated `#! ... !#' comment", SCM_EOL);
1084
1085       if (c == '!')
1086         bang_seen = 1;
1087       else if (c == '#' && bang_seen)
1088         break;
1089       else
1090         bang_seen = 0;
1091     }
1092
1093   return SCM_UNSPECIFIED;
1094 }
1095
1096 static SCM
1097 scm_read_shebang (scm_t_wchar chr, SCM port)
1098 {
1099   int c = 0;
1100   if ((c = scm_get_byte_or_eof (port)) != 'r')
1101     {
1102       scm_ungetc (c, port);
1103       return scm_read_scsh_block_comment (chr, port);
1104     }
1105   if ((c = scm_get_byte_or_eof (port)) != '6')
1106     {
1107       scm_ungetc (c, port);
1108       scm_ungetc ('r', port);
1109       return scm_read_scsh_block_comment (chr, port);
1110     }
1111   if ((c = scm_get_byte_or_eof (port)) != 'r')
1112     {
1113       scm_ungetc (c, port);
1114       scm_ungetc ('6', port);
1115       scm_ungetc ('r', port);
1116       return scm_read_scsh_block_comment (chr, port);
1117     }
1118   if ((c = scm_get_byte_or_eof (port)) != 's')
1119     {
1120       scm_ungetc (c, port);
1121       scm_ungetc ('r', port);
1122       scm_ungetc ('6', port);
1123       scm_ungetc ('r', port);
1124       return scm_read_scsh_block_comment (chr, port);
1125     }
1126
1127   return SCM_UNSPECIFIED;
1128 }
1129
1130 static SCM
1131 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1132 {
1133   /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1134      nested.  So care must be taken.  */
1135   int nesting_level = 1;
1136
1137   int a = scm_getc (port);
1138
1139   if (a == EOF)
1140     scm_i_input_error ("scm_read_r6rs_block_comment", port,
1141                        "unterminated `#| ... |#' comment", SCM_EOL);
1142
1143   while (nesting_level > 0)
1144     {
1145       int b = scm_getc (port);
1146
1147       if (b == EOF)
1148         scm_i_input_error ("scm_read_r6rs_block_comment", port,
1149                            "unterminated `#| ... |#' comment", SCM_EOL);
1150
1151       if (a == '|' && b == '#')
1152         {
1153           nesting_level--;
1154           b = EOF;
1155         }
1156       else if (a == '#' && b == '|')
1157         {
1158           nesting_level++;
1159           b = EOF;
1160         }
1161
1162       a = b;
1163     }
1164
1165   return SCM_UNSPECIFIED;
1166 }
1167
1168 static SCM
1169 scm_read_commented_expression (scm_t_wchar chr, SCM port)
1170 {
1171   scm_t_wchar c;
1172
1173   c = flush_ws (port, (char *) NULL);
1174   if (EOF == c)
1175     scm_i_input_error ("read_commented_expression", port,
1176                        "no expression after #; comment", SCM_EOL);
1177   scm_ungetc (c, port);
1178   scm_read_expression (port);
1179   return SCM_UNSPECIFIED;
1180 }
1181
1182 static SCM
1183 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1184 {
1185   /* Guile's extended symbol read syntax looks like this:
1186
1187        #{This is all a symbol name}#
1188
1189      So here, CHR is expected to be `{'.  */
1190   int saw_brace = 0;
1191   size_t len = 0;
1192   SCM buf = scm_i_make_string (1024, NULL, 0);
1193
1194   buf = scm_i_string_start_writing (buf);
1195
1196   while ((chr = scm_getc (port)) != EOF)
1197     {
1198       if (saw_brace)
1199         {
1200           if (chr == '#')
1201             {
1202               break;
1203             }
1204           else
1205             {
1206               saw_brace = 0;
1207               scm_i_string_set_x (buf, len++, '}');
1208             }
1209         }
1210
1211       if (chr == '}')
1212         saw_brace = 1;
1213       else if (chr == '\\')
1214         {
1215           /* It used to be that print.c would print extended-read-syntax
1216              symbols with backslashes before "non-standard" chars, but
1217              this routine wouldn't do anything with those escapes.
1218              Bummer.  What we've done is to change print.c to output
1219              R6RS hex escapes for those characters, relying on the fact
1220              that the extended read syntax would never put a `\' before
1221              an `x'.  For now, we just ignore other instances of
1222              backslash in the string.  */
1223           switch ((chr = scm_getc (port)))
1224             {
1225             case EOF:
1226               goto done;
1227             case 'x':
1228               {
1229                 scm_t_wchar c;
1230
1231                 SCM_READ_HEX_ESCAPE (10, ';');
1232                 scm_i_string_set_x (buf, len++, c);
1233                 break;
1234
1235               str_eof:
1236                 chr = EOF;
1237                 goto done;
1238
1239               bad_escaped:
1240                 scm_i_string_stop_writing ();
1241                 scm_i_input_error ("scm_read_extended_symbol", port,
1242                                    "illegal character in escape sequence: ~S",
1243                                    scm_list_1 (SCM_MAKE_CHAR (c)));
1244                 break;
1245               }
1246             default:
1247               scm_i_string_set_x (buf, len++, chr);
1248               break;
1249             }
1250         }
1251       else
1252         scm_i_string_set_x (buf, len++, chr);
1253
1254       if (len >= scm_i_string_length (buf) - 2)
1255         {
1256           SCM addy;
1257
1258           scm_i_string_stop_writing ();
1259           addy = scm_i_make_string (1024, NULL, 0);
1260           buf = scm_string_append (scm_list_2 (buf, addy));
1261           len = 0;
1262           buf = scm_i_string_start_writing (buf);
1263         }
1264     }
1265
1266  done:
1267   scm_i_string_stop_writing ();
1268   if (chr == EOF)
1269     scm_i_input_error ("scm_read_extended_symbol", port,
1270                        "end of file while reading symbol", SCM_EOL);
1271
1272   return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1273 }
1274
1275
1276 \f
1277 /* Top-level token readers, i.e., dispatchers.  */
1278
1279 static SCM
1280 scm_read_sharp_extension (int chr, SCM port)
1281 {
1282   SCM proc;
1283
1284   proc = scm_get_hash_procedure (chr);
1285   if (scm_is_true (scm_procedure_p (proc)))
1286     {
1287       long line = SCM_LINUM (port);
1288       int column = SCM_COL (port) - 2;
1289       SCM got;
1290
1291       got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1292
1293       if (scm_is_pair (got) && !scm_i_has_source_properties (got))
1294         scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1295
1296       return got;
1297     }
1298
1299   return SCM_UNSPECIFIED;
1300 }
1301
1302 /* The reader for the sharp `#' character.  It basically dispatches reads
1303    among the above token readers.   */
1304 static SCM
1305 scm_read_sharp (scm_t_wchar chr, SCM port)
1306 #define FUNC_NAME "scm_lreadr"
1307 {
1308   SCM result;
1309
1310   chr = scm_getc (port);
1311
1312   result = scm_read_sharp_extension (chr, port);
1313   if (!scm_is_eq (result, SCM_UNSPECIFIED))
1314     return result;
1315
1316   switch (chr)
1317     {
1318     case '\\':
1319       return (scm_read_character (chr, port));
1320     case '(':
1321       return (scm_read_vector (chr, port));
1322     case 's':
1323     case 'u':
1324     case 'f':
1325     case 'c':
1326       /* This one may return either a boolean or an SRFI-4 vector.  */
1327       return (scm_read_srfi4_vector (chr, port));
1328     case 'v':
1329       return (scm_read_bytevector (chr, port));
1330     case '*':
1331       return (scm_read_guile_bit_vector (chr, port));
1332     case 't':
1333     case 'T':
1334     case 'F':
1335       return (scm_read_boolean (chr, port));
1336     case ':':
1337       return (scm_read_keyword (chr, port));
1338     case '0': case '1': case '2': case '3': case '4':
1339     case '5': case '6': case '7': case '8': case '9':
1340     case '@':
1341 #if SCM_ENABLE_DEPRECATED
1342       /* See below for 'i' and 'e'. */
1343     case 'a':
1344     case 'y':
1345     case 'h':
1346     case 'l':
1347 #endif
1348       return (scm_i_read_array (port, chr));
1349
1350     case 'i':
1351     case 'e':
1352 #if SCM_ENABLE_DEPRECATED
1353       {
1354         /* When next char is '(', it really is an old-style
1355            uniform array. */
1356         scm_t_wchar next_c = scm_getc (port);
1357         if (next_c != EOF)
1358           scm_ungetc (next_c, port);
1359         if (next_c == '(')
1360           return scm_i_read_array (port, chr);
1361         /* Fall through. */
1362       }
1363 #endif
1364     case 'b':
1365     case 'B':
1366     case 'o':
1367     case 'O':
1368     case 'd':
1369     case 'D':
1370     case 'x':
1371     case 'X':
1372     case 'I':
1373     case 'E':
1374       return (scm_read_number_and_radix (chr, port));
1375     case '{':
1376       return (scm_read_extended_symbol (chr, port));
1377     case '!':
1378       return (scm_read_shebang (chr, port));
1379     case ';':
1380       return (scm_read_commented_expression (chr, port));
1381     case '`':
1382     case '\'':
1383     case ',':
1384       return (scm_read_syntax (chr, port));
1385     case 'n':
1386       return (scm_read_nil (chr, port));
1387     default:
1388       result = scm_read_sharp_extension (chr, port);
1389       if (scm_is_eq (result, SCM_UNSPECIFIED))
1390         {
1391           /* To remain compatible with 1.8 and earlier, the following
1392              characters have lower precedence than `read-hash-extend'
1393              characters.  */
1394           switch (chr)
1395             {
1396             case '|':
1397               return scm_read_r6rs_block_comment (chr, port);
1398             default:
1399               scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1400                                  scm_list_1 (SCM_MAKE_CHAR (chr)));
1401             }
1402         }
1403       else
1404         return result;
1405     }
1406
1407   return SCM_UNSPECIFIED;
1408 }
1409 #undef FUNC_NAME
1410
1411 static SCM
1412 scm_read_expression (SCM port)
1413 #define FUNC_NAME "scm_read_expression"
1414 {
1415   while (1)
1416     {
1417       scm_t_wchar chr;
1418
1419       chr = scm_getc (port);
1420
1421       switch (chr)
1422         {
1423         case SCM_WHITE_SPACES:
1424         case SCM_LINE_INCREMENTORS:
1425           break;
1426         case ';':
1427           (void) scm_read_semicolon_comment (chr, port);
1428           break;
1429         case '[':
1430           if (!SCM_SQUARE_BRACKETS_P)
1431             return (scm_read_mixed_case_symbol (chr, port));
1432           /* otherwise fall through */
1433         case '(':
1434           return (scm_read_sexp (chr, port));
1435         case '"':
1436           return (scm_read_string (chr, port));
1437         case '\'':
1438         case '`':
1439         case ',':
1440           return (scm_read_quote (chr, port));
1441         case '#':
1442           {
1443             SCM result;
1444             result = scm_read_sharp (chr, port);
1445             if (scm_is_eq (result, SCM_UNSPECIFIED))
1446               /* We read a comment or some such.  */
1447               break;
1448             else
1449               return result;
1450           }
1451         case ')':
1452           scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1453           break;
1454         case ']':
1455           if (SCM_SQUARE_BRACKETS_P)
1456             scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1457           /* otherwise fall through */
1458         case EOF:
1459           return SCM_EOF_VAL;
1460         case ':':
1461           if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1462             return scm_symbol_to_keyword (scm_read_expression (port));
1463           /* Fall through.  */
1464
1465         default:
1466           {
1467             if (((chr >= '0') && (chr <= '9'))
1468                 || (strchr ("+-.", chr)))
1469               return (scm_read_number (chr, port));
1470             else
1471               return (scm_read_mixed_case_symbol (chr, port));
1472           }
1473         }
1474     }
1475 }
1476 #undef FUNC_NAME
1477
1478 \f
1479 /* Actual reader.  */
1480
1481 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1482             (SCM port),
1483             "Read an s-expression from the input port @var{port}, or from\n"
1484             "the current input port if @var{port} is not specified.\n"
1485             "Any whitespace before the next token is discarded.")
1486 #define FUNC_NAME s_scm_read
1487 {
1488   int c;
1489
1490   if (SCM_UNBNDP (port))
1491     port = scm_current_input_port ();
1492   SCM_VALIDATE_OPINPORT (1, port);
1493
1494   c = flush_ws (port, (char *) NULL);
1495   if (EOF == c)
1496     return SCM_EOF_VAL;
1497   scm_ungetc (c, port);
1498
1499   return (scm_read_expression (port));
1500 }
1501 #undef FUNC_NAME
1502
1503
1504 \f
1505
1506 /* Manipulate the read-hash-procedures alist.  This could be written in
1507    Scheme, but maybe it will also be used by C code during initialisation.  */
1508 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1509             (SCM chr, SCM proc),
1510             "Install the procedure @var{proc} for reading expressions\n"
1511             "starting with the character sequence @code{#} and @var{chr}.\n"
1512             "@var{proc} will be called with two arguments:  the character\n"
1513             "@var{chr} and the port to read further data from. The object\n"
1514             "returned will be the return value of @code{read}. \n"
1515             "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1516             )
1517 #define FUNC_NAME s_scm_read_hash_extend
1518 {
1519   SCM this;
1520   SCM prev;
1521
1522   SCM_VALIDATE_CHAR (1, chr);
1523   SCM_ASSERT (scm_is_false (proc)
1524               || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1525               proc, SCM_ARG2, FUNC_NAME);
1526
1527   /* Check if chr is already in the alist.  */
1528   this = scm_i_read_hash_procedures_ref ();
1529   prev = SCM_BOOL_F;
1530   while (1)
1531     {
1532       if (scm_is_null (this))
1533         {
1534           /* not found, so add it to the beginning.  */
1535           if (scm_is_true (proc))
1536             {
1537               SCM new = scm_cons (scm_cons (chr, proc),
1538                                   scm_i_read_hash_procedures_ref ());
1539               scm_i_read_hash_procedures_set_x (new);
1540             }
1541           break;
1542         }
1543       if (scm_is_eq (chr, SCM_CAAR (this)))
1544         {
1545           /* already in the alist.  */
1546           if (scm_is_false (proc))
1547             {
1548               /* remove it.  */
1549               if (scm_is_false (prev))
1550                 {
1551                   SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1552                   scm_i_read_hash_procedures_set_x (rest);
1553                 }
1554               else
1555                 scm_set_cdr_x (prev, SCM_CDR (this));
1556             }
1557           else
1558             {
1559               /* replace it.  */
1560               scm_set_cdr_x (SCM_CAR (this), proc);
1561             }
1562           break;
1563         }
1564       prev = this;
1565       this = SCM_CDR (this);
1566     }
1567
1568   return SCM_UNSPECIFIED;
1569 }
1570 #undef FUNC_NAME
1571
1572 /* Recover the read-hash procedure corresponding to char c.  */
1573 static SCM
1574 scm_get_hash_procedure (int c)
1575 {
1576   SCM rest = scm_i_read_hash_procedures_ref ();
1577
1578   while (1)
1579     {
1580       if (scm_is_null (rest))
1581         return SCM_BOOL_F;
1582
1583       if (SCM_CHAR (SCM_CAAR (rest)) == c)
1584         return SCM_CDAR (rest);
1585
1586       rest = SCM_CDR (rest);
1587     }
1588 }
1589
1590 #define SCM_ENCODING_SEARCH_SIZE (500)
1591
1592 /* Search the first few hundred characters of a file for an Emacs-like coding
1593    declaration.  Returns either NULL or a string whose storage has been
1594    allocated with `scm_gc_malloc ()'.  */
1595 char *
1596 scm_i_scan_for_encoding (SCM port)
1597 {
1598   scm_t_port *pt;
1599   char header[SCM_ENCODING_SEARCH_SIZE+1];
1600   size_t bytes_read, encoding_length, i;
1601   char *encoding = NULL;
1602   int utf8_bom = 0;
1603   char *pos, *encoding_start;
1604   int in_comment;
1605
1606   pt = SCM_PTAB_ENTRY (port);
1607
1608   if (pt->rw_active == SCM_PORT_WRITE)
1609     scm_flush (port);
1610
1611   if (pt->rw_random)
1612     pt->rw_active = SCM_PORT_READ;
1613
1614   if (pt->read_pos == pt->read_end)
1615     {
1616       /* We can use the read buffer, and thus avoid a seek. */
1617       if (scm_fill_input (port) == EOF)
1618         return NULL;
1619
1620       bytes_read = pt->read_end - pt->read_pos;
1621       if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
1622         bytes_read = SCM_ENCODING_SEARCH_SIZE;
1623
1624       if (bytes_read <= 1)
1625         /* An unbuffered port -- don't scan.  */
1626         return NULL;
1627
1628       memcpy (header, pt->read_pos, bytes_read);
1629       header[bytes_read] = '\0';
1630     }
1631   else
1632     {
1633       /* Try to read some bytes and then seek back.  Not all ports
1634          support seeking back; and indeed some file ports (like
1635          /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
1636          check performed by SCM_FPORT_FDES---but fail to seek
1637          backwards.  Hence this block comes second.  We prefer to use
1638          the read buffer in-place.  */
1639       if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1640         return NULL;
1641
1642       bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1643       header[bytes_read] = '\0';
1644       scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1645     }
1646
1647   if (bytes_read > 3
1648       && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1649     utf8_bom = 1;
1650
1651   /* search past "coding[:=]" */
1652   pos = header;
1653   while (1)
1654     {
1655       if ((pos = strstr(pos, "coding")) == NULL)
1656         return NULL;
1657
1658       pos += strlen("coding");
1659       if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1660           (*pos == ':' || *pos == '='))
1661         {
1662           pos ++;
1663           break;
1664         }
1665     }
1666
1667   /* skip spaces */
1668   while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1669          (*pos == ' ' || *pos == '\t'))
1670     pos ++;
1671
1672   /* grab the next token */
1673   encoding_start = pos;
1674   i = 0;
1675   while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
1676          && encoding_start + i - header < bytes_read
1677          && (isalnum ((int) encoding_start[i])
1678              || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
1679     i++;
1680
1681   encoding_length = i;
1682   if (encoding_length == 0)
1683     return NULL;
1684
1685   encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
1686   for (i = 0; i < encoding_length; i++)
1687     encoding[i] = toupper ((int) encoding[i]);
1688
1689   /* push backwards to make sure we were in a comment */
1690   in_comment = 0;
1691   pos = encoding_start;
1692   while (pos >= header)
1693     {
1694       if (*pos == ';')
1695         {
1696           in_comment = 1;
1697           break;
1698         }
1699       else if (*pos == '\n' || pos == header)
1700         {
1701           /* This wasn't in a semicolon comment. Check for a
1702            hash-bang comment. */
1703           char *beg = strstr (header, "#!");
1704           char *end = strstr (header, "!#");
1705           if (beg < encoding_start && encoding_start + encoding_length <= end)
1706             in_comment = 1;
1707           break;
1708         }
1709       else
1710         {
1711           pos --;
1712           continue;
1713         }
1714     }
1715   if (!in_comment)
1716     /* This wasn't in a comment */
1717     return NULL;
1718
1719   if (utf8_bom && strcmp(encoding, "UTF-8"))
1720     scm_misc_error (NULL,
1721                     "the port input declares the encoding ~s but is encoded as UTF-8",
1722                     scm_list_1 (scm_from_locale_string (encoding)));
1723
1724   return encoding;
1725 }
1726
1727 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1728             (SCM port),
1729             "Scans the port for an Emacs-like character coding declaration\n"
1730             "near the top of the contents of a port with random-accessible contents.\n"
1731             "The coding declaration is of the form\n"
1732             "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1733             "\n"
1734             "Returns a string containing the character encoding of the file\n"
1735             "if a declaration was found, or @code{#f} otherwise.\n")
1736 #define FUNC_NAME s_scm_file_encoding
1737 {
1738   char *enc;
1739   SCM s_enc;
1740
1741   SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
1742
1743   enc = scm_i_scan_for_encoding (port);
1744   if (enc == NULL)
1745     return SCM_BOOL_F;
1746   else
1747     {
1748       s_enc = scm_from_locale_string (enc);
1749       return s_enc;
1750     }
1751
1752   return SCM_BOOL_F;
1753 }
1754 #undef FUNC_NAME
1755
1756 void
1757 scm_init_read ()
1758 {
1759   SCM read_hash_procs;
1760
1761   read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
1762
1763   scm_i_read_hash_procedures =
1764     SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
1765
1766   scm_init_opts (scm_read_options, scm_read_opts);
1767 #include "libguile/read.x"
1768 }
1769
1770 /*
1771   Local Variables:
1772   c-file-style: "gnu"
1773   End:
1774 */