libguile/read.c

   1 /* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009, 2010 Free Software
   2  * Foundation, Inc.
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public License
   6  * as published by the Free Software Foundation; either version 3 of
   7  * the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful, but
  10  * WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  17  * 02110-1301 USA
  18  */
  19
  20
  21 \f
  22
  23 #ifdef HAVE_CONFIG_H
  24 # include <config.h>
  25 #endif
  26
  27 #include <stdio.h>
  28 #include <ctype.h>
  29 #include <string.h>
  30 #include <unistd.h>
  31 #include <unicase.h>
  32
  33 #include "libguile/_scm.h"
  34 #include "libguile/bytevectors.h"
  35 #include "libguile/chars.h"
  36 #include "libguile/eval.h"
  37 #include "libguile/arrays.h"
  38 #include "libguile/bitvectors.h"
  39 #include "libguile/keywords.h"
  40 #include "libguile/alist.h"
  41 #include "libguile/srcprop.h"
  42 #include "libguile/hashtab.h"
  43 #include "libguile/hash.h"
  44 #include "libguile/ports.h"
  45 #include "libguile/fports.h"
  46 #include "libguile/root.h"
  47 #include "libguile/strings.h"
  48 #include "libguile/strports.h"
  49 #include "libguile/vectors.h"
  50 #include "libguile/validate.h"
  51 #include "libguile/srfi-4.h"
  52 #include "libguile/srfi-13.h"
  53
  54 #include "libguile/read.h"
  55 #include "libguile/private-options.h"
  56
  57
  58 \f
  59
  60 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
  61 SCM_SYMBOL (scm_keyword_prefix, "prefix");
  62 SCM_SYMBOL (scm_keyword_postfix, "postfix");
  63 SCM_SYMBOL (sym_nil, "nil");
  64
  65 scm_t_option scm_read_opts[] = {
  66   { SCM_OPTION_BOOLEAN, "copy", 0,
  67     "Copy source code expressions." },
  68   { SCM_OPTION_BOOLEAN, "positions", 0,
  69     "Record positions of source code expressions." },
  70   { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
  71     "Convert symbols to lower case."},
  72   { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F,
  73     "Style of keyword recognition: #f, 'prefix or 'postfix."},
  74   { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
  75     "Use R6RS variable-length character and string hex escapes."},
  76   { SCM_OPTION_BOOLEAN, "square-brackets", 1,
  77     "Treat `[' and `]' as parentheses, for R6RS compatibility."},
  78   { 0, },
  79 };
  80
  81 /*
  82   Give meaningful error messages for errors
  83
  84   We use the format
  85
  86   FILE:LINE:COL: MESSAGE
  87   This happened in ....
  88
  89   This is not standard GNU format, but the test-suite likes the real
  90   message to be in front.
  91
  92  */
  93
  94
  95 void
  96 scm_i_input_error (char const *function,
  97                    SCM port, const char *message, SCM arg)
  98 {
  99   SCM fn = (scm_is_string (SCM_FILENAME(port))
 100             ? SCM_FILENAME(port)
 101             : scm_from_locale_string ("#<unknown port>"));
 102
 103   SCM string_port = scm_open_output_string ();
 104   SCM string = SCM_EOL;
 105   scm_simple_format (string_port,
 106                      scm_from_locale_string ("~A:~S:~S: ~A"),
 107                      scm_list_4 (fn,
 108                                  scm_from_long (SCM_LINUM (port) + 1),
 109                                  scm_from_int (SCM_COL (port) + 1),
 110                                  scm_from_locale_string (message)));
 111
 112   string = scm_get_output_string (string_port);
 113   scm_close_output_port (string_port);
 114   scm_error_scm (scm_from_locale_symbol ("read-error"),
 115                  function? scm_from_locale_string (function) : SCM_BOOL_F,
 116                  string,
 117                  arg,
 118                  SCM_BOOL_F);
 119 }
 120
 121
 122 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
 123             (SCM setting),
 124             "Option interface for the read options. Instead of using\n"
 125             "this procedure directly, use the procedures @code{read-enable},\n"
 126             "@code{read-disable}, @code{read-set!} and @code{read-options}.")
 127 #define FUNC_NAME s_scm_read_options
 128 {
 129   SCM ans = scm_options (setting,
 130                          scm_read_opts,
 131                          FUNC_NAME);
 132   if (SCM_COPY_SOURCE_P)
 133     SCM_RECORD_POSITIONS_P = 1;
 134   return ans;
 135 }
 136 #undef FUNC_NAME
 137
 138 /* A fluid referring to an association list mapping extra hash
 139    characters to procedures.  */
 140 static SCM *scm_i_read_hash_procedures;
 141
 142 static inline SCM
 143 scm_i_read_hash_procedures_ref (void)
 144 {
 145   return scm_fluid_ref (*scm_i_read_hash_procedures);
 146 }
 147
 148 static inline void
 149 scm_i_read_hash_procedures_set_x (SCM value)
 150 {
 151   scm_fluid_set_x (*scm_i_read_hash_procedures, value);
 152 }
 153
 154 \f
 155 /* Token readers.  */
 156
 157
 158 /* Size of the C buffer used to read symbols and numbers.  */
 159 #define READER_BUFFER_SIZE            128
 160
 161 /* Size of the C buffer used to read strings.  */
 162 #define READER_STRING_BUFFER_SIZE     512
 163
 164 /* The maximum size of Scheme character names.  */
 165 #define READER_CHAR_NAME_MAX_SIZE      50
 166
 167
 168 /* `isblank' is only in C99.  */
 169 #define CHAR_IS_BLANK_(_chr)                                    \
 170   (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n')      \
 171    || ((_chr) == '\f') || ((_chr) == '\r'))
 172
 173 #ifdef MSDOS
 174 # define CHAR_IS_BLANK(_chr)                    \
 175   ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
 176 #else
 177 # define CHAR_IS_BLANK CHAR_IS_BLANK_
 178 #endif
 179
 180
 181 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
 182    structure'').  */
 183 #define CHAR_IS_R5RS_DELIMITER(c)                               \
 184   (CHAR_IS_BLANK (c)                                            \
 185    || (c == ')') || (c == '(') || (c == ';') || (c == '"')      \
 186    || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
 187
 188 #define CHAR_IS_DELIMITER  CHAR_IS_R5RS_DELIMITER
 189
 190 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
 191    Structure''.  */
 192 #define CHAR_IS_EXPONENT_MARKER(_chr)                           \
 193   (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f')        \
 194    || ((_chr) == 'd') || ((_chr) == 'l'))
 195
 196 /* Read an SCSH block comment.  */
 197 static inline SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
 198 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
 199 static SCM scm_read_commented_expression (scm_t_wchar, SCM);
 200 static SCM scm_read_shebang (scm_t_wchar, SCM);
 201 static SCM scm_get_hash_procedure (int);
 202
 203 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 204    result in the pre-allocated buffer BUF.  Return zero if the whole token has
 205    fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
 206    bytes actually read.  */
 207 static inline int
 208 read_token (SCM port, char *buf, const size_t buf_size, size_t *read)
 209  {
 210    *read = 0;
 211
 212    while (*read < buf_size)
 213      {
 214        int chr;
 215
 216        chr = scm_get_byte_or_eof (port);
 217
 218        if (chr == EOF)
 219         return 0;
 220       else if (CHAR_IS_DELIMITER (chr))
 221         {
 222           scm_unget_byte (chr, port);
 223           return 0;
 224         }
 225       else
 226         {
 227           *buf = (char) chr;
 228           buf++, (*read)++;
 229         }
 230      }
 231
 232    return 1;
 233  }
 234
 235 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 236    result in the pre-allocated buffer BUFFER, if the whole token has fewer than
 237    BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
 238    caller.  Return zero if the token fits in BUFFER, non-zero otherwise. READ
 239    will be set the number of bytes actually read.  */
 240 static int
 241 read_complete_token (SCM port, char *buffer, const size_t buffer_size,
 242                            char **overflow_buffer, size_t *read)
 243 {
 244   int overflow = 0;
 245   size_t bytes_read, overflow_size;
 246
 247   *overflow_buffer = NULL;
 248   overflow_size = 0;
 249
 250   do
 251     {
 252       overflow = read_token (port, buffer, buffer_size, &bytes_read);
 253       if (bytes_read == 0)
 254         break;
 255       if (overflow || overflow_size != 0)
 256         {
 257           if (overflow_size == 0)
 258             {
 259               *overflow_buffer = scm_malloc (bytes_read);
 260               memcpy (*overflow_buffer, buffer, bytes_read);
 261               overflow_size = bytes_read;
 262             }
 263           else
 264             {
 265               *overflow_buffer = scm_realloc (*overflow_buffer, overflow_size + bytes_read);
 266               memcpy (*overflow_buffer + overflow_size, buffer, bytes_read);
 267               overflow_size += bytes_read;
 268             }
 269         }
 270     }
 271   while (overflow);
 272
 273   if (overflow_size)
 274     *read = overflow_size;
 275   else
 276     *read = bytes_read;
 277
 278   return (overflow_size != 0);
 279 }
 280
 281 /* Skip whitespace from PORT and return the first non-whitespace character
 282    read.  Raise an error on end-of-file.  */
 283 static int
 284 flush_ws (SCM port, const char *eoferr)
 285 {
 286   register scm_t_wchar c;
 287   while (1)
 288     switch (c = scm_getc (port))
 289       {
 290       case EOF:
 291       goteof:
 292         if (eoferr)
 293           {
 294             scm_i_input_error (eoferr,
 295                                port,
 296                                "end of file",
 297                                SCM_EOL);
 298           }
 299         return c;
 300
 301       case ';':
 302       lp:
 303         switch (c = scm_getc (port))
 304           {
 305           case EOF:
 306             goto goteof;
 307           default:
 308             goto lp;
 309           case SCM_LINE_INCREMENTORS:
 310             break;
 311           }
 312         break;
 313
 314       case '#':
 315         switch (c = scm_getc (port))
 316           {
 317           case EOF:
 318             eoferr = "read_sharp";
 319             goto goteof;
 320           case '!':
 321             scm_read_shebang (c, port);
 322             break;
 323           case ';':
 324             scm_read_commented_expression (c, port);
 325             break;
 326           case '|':
 327             if (scm_is_false (scm_get_hash_procedure (c)))
 328               {
 329                 scm_read_r6rs_block_comment (c, port);
 330                 break;
 331               }
 332             /* fall through */
 333           default:
 334             scm_ungetc (c, port);
 335             return '#';
 336           }
 337         break;
 338
 339       case SCM_LINE_INCREMENTORS:
 340       case SCM_SINGLE_SPACES:
 341       case '\t':
 342         break;
 343
 344       default:
 345         return c;
 346       }
 347
 348   return 0;
 349 }
 350
 351
 352 \f
 353 /* Token readers.  */
 354
 355 static SCM scm_read_expression (SCM port);
 356 static SCM scm_read_sharp (int chr, SCM port);
 357 static SCM recsexpr (SCM obj, long line, int column, SCM filename);
 358
 359
 360 static SCM
 361 scm_read_sexp (scm_t_wchar chr, SCM port)
 362 #define FUNC_NAME "scm_i_lreadparen"
 363 {
 364   register int c;
 365   register SCM tmp;
 366   register SCM tl, ans = SCM_EOL;
 367   SCM tl2 = SCM_EOL, ans2 = SCM_EOL, copy = SCM_BOOL_F;
 368   const int terminating_char = ((chr == '[') ? ']' : ')');
 369
 370   /* Need to capture line and column numbers here. */
 371   long line = SCM_LINUM (port);
 372   int column = SCM_COL (port) - 1;
 373
 374
 375   c = flush_ws (port, FUNC_NAME);
 376   if (terminating_char == c)
 377     return SCM_EOL;
 378
 379   scm_ungetc (c, port);
 380   if (scm_is_eq (scm_sym_dot,
 381                  (tmp = scm_read_expression (port))))
 382     {
 383       ans = scm_read_expression (port);
 384       if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
 385         scm_i_input_error (FUNC_NAME, port, "missing close paren",
 386                            SCM_EOL);
 387       return ans;
 388     }
 389
 390   /* Build the head of the list structure. */
 391   ans = tl = scm_cons (tmp, SCM_EOL);
 392
 393   if (SCM_COPY_SOURCE_P)
 394     ans2 = tl2 = scm_cons (scm_is_pair (tmp)
 395                            ? copy
 396                            : tmp,
 397                            SCM_EOL);
 398
 399   while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
 400     {
 401       SCM new_tail;
 402
 403       if (c == ')' || (SCM_SQUARE_BRACKETS_P && c == ']'))
 404         scm_i_input_error (FUNC_NAME, port,
 405                            "in pair: mismatched close paren: ~A",
 406                            scm_list_1 (SCM_MAKE_CHAR (c)));
 407
 408       scm_ungetc (c, port);
 409       tmp = scm_read_expression (port);
 410
 411       if (scm_is_eq (scm_sym_dot, tmp))
 412         {
 413           SCM_SETCDR (tl, tmp = scm_read_expression (port));
 414
 415           if (SCM_COPY_SOURCE_P)
 416             SCM_SETCDR (tl2, scm_cons (scm_is_pair (tmp) ? copy : tmp,
 417                                        SCM_EOL));
 418
 419           c = flush_ws (port, FUNC_NAME);
 420           if (terminating_char != c)
 421             scm_i_input_error (FUNC_NAME, port,
 422                                "in pair: missing close paren", SCM_EOL);
 423           goto exit;
 424         }
 425
 426       new_tail = scm_cons (tmp, SCM_EOL);
 427       SCM_SETCDR (tl, new_tail);
 428       tl = new_tail;
 429
 430       if (SCM_COPY_SOURCE_P)
 431         {
 432           SCM new_tail2 = scm_cons (scm_is_pair (tmp)
 433                                     ? copy
 434                                     : tmp, SCM_EOL);
 435           SCM_SETCDR (tl2, new_tail2);
 436           tl2 = new_tail2;
 437         }
 438     }
 439
 440  exit:
 441   if (SCM_RECORD_POSITIONS_P)
 442     scm_whash_insert (scm_source_whash,
 443                       ans,
 444                       scm_make_srcprops (line, column,
 445                                          SCM_FILENAME (port),
 446                                          SCM_COPY_SOURCE_P
 447                                          ? ans2
 448                                          : SCM_UNDEFINED,
 449                                          SCM_EOL));
 450   return ans;
 451 }
 452 #undef FUNC_NAME
 453
 454
 455 /* Read a hexadecimal number NDIGITS in length.  Put its value into the variable
 456    C.  If TERMINATOR is non-null, terminate early if the TERMINATOR character is
 457    found.  */
 458 #define SCM_READ_HEX_ESCAPE(ndigits, terminator)                   \
 459   do                                                               \
 460     {                                                              \
 461       scm_t_wchar a;                                               \
 462       size_t i = 0;                                                \
 463       c = 0;                                                       \
 464       while (i < ndigits)                                          \
 465         {                                                          \
 466           a = scm_getc (port);                                     \
 467           if (a == EOF)                                            \
 468             goto str_eof;                                          \
 469           if (terminator                                           \
 470               && (a == (scm_t_wchar) terminator)                   \
 471               && (i > 0))                                          \
 472             break;                                                 \
 473           if ('0' <= a && a <= '9')                                \
 474             a -= '0';                                              \
 475           else if ('A' <= a && a <= 'F')                           \
 476             a = a - 'A' + 10;                                      \
 477           else if ('a' <= a && a <= 'f')                           \
 478             a = a - 'a' + 10;                                      \
 479           else                                                     \
 480             {                                                      \
 481               c = a;                                               \
 482               goto bad_escaped;                                    \
 483             }                                                      \
 484           c = c * 16 + a;                                          \
 485           i ++;                                                    \
 486         }                                                          \
 487     } while (0)
 488
 489 static SCM
 490 scm_read_string (int chr, SCM port)
 491 #define FUNC_NAME "scm_lreadr"
 492 {
 493   /* For strings smaller than C_STR, this function creates only one Scheme
 494      object (the string returned).  */
 495
 496   SCM str = SCM_BOOL_F;
 497   unsigned c_str_len = 0;
 498   scm_t_wchar c;
 499
 500   str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
 501   while ('"' != (c = scm_getc (port)))
 502     {
 503       if (c == EOF)
 504         {
 505         str_eof:
 506           scm_i_input_error (FUNC_NAME, port,
 507                              "end of file in string constant", SCM_EOL);
 508         }
 509
 510       if (c_str_len + 1 >= scm_i_string_length (str))
 511         {
 512           SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
 513
 514           str = scm_string_append (scm_list_2 (str, addy));
 515         }
 516
 517       if (c == '\\')
 518         {
 519           switch (c = scm_getc (port))
 520             {
 521             case EOF:
 522               goto str_eof;
 523             case '"':
 524             case '\\':
 525               break;
 526             case '\n':
 527               continue;
 528             case '0':
 529               c = '\0';
 530               break;
 531             case 'f':
 532               c = '\f';
 533               break;
 534             case 'n':
 535               c = '\n';
 536               break;
 537             case 'r':
 538               c = '\r';
 539               break;
 540             case 't':
 541               c = '\t';
 542               break;
 543             case 'a':
 544               c = '\007';
 545               break;
 546             case 'v':
 547               c = '\v';
 548               break;
 549             case 'b':
 550               c = '\010';
 551               break;
 552             case 'x':
 553               if (SCM_R6RS_ESCAPES_P)
 554                 SCM_READ_HEX_ESCAPE (10, ';');
 555               else
 556                 SCM_READ_HEX_ESCAPE (2, '\0');
 557               break;
 558             case 'u':
 559               if (!SCM_R6RS_ESCAPES_P)
 560                 {
 561                   SCM_READ_HEX_ESCAPE (4, '\0');
 562                   break;
 563                 }
 564             case 'U':
 565               if (!SCM_R6RS_ESCAPES_P)
 566                 {
 567                   SCM_READ_HEX_ESCAPE (6, '\0');
 568                   break;
 569                 }
 570             default:
 571             bad_escaped:
 572               scm_i_input_error (FUNC_NAME, port,
 573                                  "illegal character in escape sequence: ~S",
 574                                  scm_list_1 (SCM_MAKE_CHAR (c)));
 575             }
 576         }
 577       str = scm_i_string_start_writing (str);
 578       scm_i_string_set_x (str, c_str_len++, c);
 579       scm_i_string_stop_writing ();
 580     }
 581
 582   if (c_str_len > 0)
 583     {
 584       return scm_i_substring_copy (str, 0, c_str_len);
 585     }
 586
 587   return scm_nullstr;
 588 }
 589 #undef FUNC_NAME
 590
 591
 592 static SCM
 593 scm_read_number (scm_t_wchar chr, SCM port)
 594 {
 595   SCM result, str = SCM_EOL;
 596   char buffer[READER_BUFFER_SIZE];
 597   char *overflow_buffer = NULL;
 598   size_t bytes_read;
 599   int overflow;
 600   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 601
 602   scm_ungetc (chr, port);
 603   overflow = read_complete_token (port, buffer, sizeof (buffer),
 604                                   &overflow_buffer, &bytes_read);
 605
 606   if (!overflow)
 607     str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
 608   else
 609     str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
 610                             pt->ilseq_handler);
 611
 612   result = scm_string_to_number (str, SCM_UNDEFINED);
 613   if (!scm_is_true (result))
 614     {
 615       /* Return a symbol instead of a number */
 616       if (SCM_CASE_INSENSITIVE_P)
 617         str = scm_string_downcase_x (str);
 618       result = scm_string_to_symbol (str);
 619     }
 620
 621   if (overflow)
 622     free (overflow_buffer);
 623   SCM_COL (port) += scm_i_string_length (str);
 624   return result;
 625 }
 626
 627 static SCM
 628 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
 629 {
 630   SCM result;
 631   int ends_with_colon = 0;
 632   size_t bytes_read;
 633   int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
 634   int overflow;
 635   char buffer[READER_BUFFER_SIZE], *overflow_buffer;
 636   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 637   SCM str;
 638
 639   scm_ungetc (chr, port);
 640   overflow = read_complete_token (port, buffer, READER_BUFFER_SIZE,
 641                                   &overflow_buffer, &bytes_read);
 642   if (bytes_read > 0)
 643     {
 644       if (!overflow)
 645         ends_with_colon = buffer[bytes_read - 1] == ':';
 646       else
 647         ends_with_colon = overflow_buffer[bytes_read - 1] == ':';
 648     }
 649
 650   if (postfix && ends_with_colon && (bytes_read > 1))
 651     {
 652       if (!overflow)
 653         str = scm_from_stringn (buffer, bytes_read - 1, pt->encoding, pt->ilseq_handler);
 654       else
 655         str = scm_from_stringn (overflow_buffer, bytes_read - 1, pt->encoding,
 656                                 pt->ilseq_handler);
 657
 658       if (SCM_CASE_INSENSITIVE_P)
 659         str = scm_string_downcase_x (str);
 660       result = scm_symbol_to_keyword (scm_string_to_symbol (str));
 661     }
 662   else
 663     {
 664       if (!overflow)
 665         str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
 666       else
 667         str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
 668                                 pt->ilseq_handler);
 669
 670       if (SCM_CASE_INSENSITIVE_P)
 671         str = scm_string_downcase_x (str);
 672       result = scm_string_to_symbol (str);
 673     }
 674
 675   if (overflow)
 676     free (overflow_buffer);
 677   SCM_COL (port) += scm_i_string_length (str);
 678   return result;
 679 }
 680
 681 static SCM
 682 scm_read_number_and_radix (scm_t_wchar chr, SCM port)
 683 #define FUNC_NAME "scm_lreadr"
 684 {
 685   SCM result;
 686   size_t read;
 687   char buffer[READER_BUFFER_SIZE], *overflow_buffer;
 688   int overflow;
 689   unsigned int radix;
 690   SCM str;
 691   scm_t_port *pt;
 692
 693   switch (chr)
 694     {
 695     case 'B':
 696     case 'b':
 697       radix = 2;
 698       break;
 699
 700     case 'o':
 701     case 'O':
 702       radix = 8;
 703       break;
 704
 705     case 'd':
 706     case 'D':
 707       radix = 10;
 708       break;
 709
 710     case 'x':
 711     case 'X':
 712       radix = 16;
 713       break;
 714
 715     default:
 716       scm_ungetc (chr, port);
 717       scm_ungetc ('#', port);
 718       radix = 10;
 719     }
 720
 721   overflow = read_complete_token (port, buffer, sizeof (buffer),
 722                                   &overflow_buffer, &read);
 723
 724   pt = SCM_PTAB_ENTRY (port);
 725   if (!overflow)
 726     str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
 727   else
 728     str = scm_from_stringn (overflow_buffer, read, pt->encoding,
 729                             pt->ilseq_handler);
 730
 731   result = scm_string_to_number (str, scm_from_uint (radix));
 732
 733   if (overflow)
 734     free (overflow_buffer);
 735
 736   SCM_COL (port) += scm_i_string_length (str);
 737
 738   if (scm_is_true (result))
 739     return result;
 740
 741   scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
 742
 743   return SCM_BOOL_F;
 744 }
 745 #undef FUNC_NAME
 746
 747 static SCM
 748 scm_read_quote (int chr, SCM port)
 749 {
 750   SCM p;
 751   long line = SCM_LINUM (port);
 752   int column = SCM_COL (port) - 1;
 753
 754   switch (chr)
 755     {
 756     case '`':
 757       p = scm_sym_quasiquote;
 758       break;
 759
 760     case '\'':
 761       p = scm_sym_quote;
 762       break;
 763
 764     case ',':
 765       {
 766         scm_t_wchar c;
 767
 768         c = scm_getc (port);
 769         if ('@' == c)
 770           p = scm_sym_uq_splicing;
 771         else
 772           {
 773             scm_ungetc (c, port);
 774             p = scm_sym_unquote;
 775           }
 776         break;
 777       }
 778
 779     default:
 780       fprintf (stderr, "%s: unhandled quote character (%i)\n",
 781                "scm_read_quote", chr);
 782       abort ();
 783     }
 784
 785   p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
 786   if (SCM_RECORD_POSITIONS_P)
 787     scm_whash_insert (scm_source_whash, p,
 788                       scm_make_srcprops (line, column,
 789                                          SCM_FILENAME (port),
 790                                          SCM_COPY_SOURCE_P
 791                                          ? (scm_cons2 (SCM_CAR (p),
 792                                                        SCM_CAR (SCM_CDR (p)),
 793                                                        SCM_EOL))
 794                                          : SCM_UNDEFINED,
 795                                          SCM_EOL));
 796
 797
 798   return p;
 799 }
 800
 801 SCM_SYMBOL (sym_syntax, "syntax");
 802 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
 803 SCM_SYMBOL (sym_unsyntax, "unsyntax");
 804 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
 805
 806 static SCM
 807 scm_read_syntax (int chr, SCM port)
 808 {
 809   SCM p;
 810   long line = SCM_LINUM (port);
 811   int column = SCM_COL (port) - 1;
 812
 813   switch (chr)
 814     {
 815     case '`':
 816       p = sym_quasisyntax;
 817       break;
 818
 819     case '\'':
 820       p = sym_syntax;
 821       break;
 822
 823     case ',':
 824       {
 825         int c;
 826
 827         c = scm_getc (port);
 828         if ('@' == c)
 829           p = sym_unsyntax_splicing;
 830         else
 831           {
 832             scm_ungetc (c, port);
 833             p = sym_unsyntax;
 834           }
 835         break;
 836       }
 837
 838     default:
 839       fprintf (stderr, "%s: unhandled syntax character (%i)\n",
 840                "scm_read_syntax", chr);
 841       abort ();
 842     }
 843
 844   p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
 845   if (SCM_RECORD_POSITIONS_P)
 846     scm_whash_insert (scm_source_whash, p,
 847                       scm_make_srcprops (line, column,
 848                                          SCM_FILENAME (port),
 849                                          SCM_COPY_SOURCE_P
 850                                          ? (scm_cons2 (SCM_CAR (p),
 851                                                        SCM_CAR (SCM_CDR (p)),
 852                                                        SCM_EOL))
 853                                          : SCM_UNDEFINED,
 854                                          SCM_EOL));
 855
 856
 857   return p;
 858 }
 859
 860 static inline SCM
 861 scm_read_nil (int chr, SCM port)
 862 {
 863   SCM id = scm_read_mixed_case_symbol (chr, port);
 864
 865   if (!scm_is_eq (id, sym_nil))
 866     scm_i_input_error ("scm_read_nil", port,
 867                        "unexpected input while reading #nil: ~a",
 868                        scm_list_1 (id));
 869
 870   return SCM_ELISP_NIL;
 871 }
 872
 873 static inline SCM
 874 scm_read_semicolon_comment (int chr, SCM port)
 875 {
 876   int c;
 877
 878   /* We use the get_byte here because there is no need to get the
 879      locale correct with comment input. This presumes that newline
 880      always represents itself no matter what the encoding is.  */
 881   for (c = scm_get_byte_or_eof (port);
 882        (c != EOF) && (c != '\n');
 883        c = scm_get_byte_or_eof (port));
 884
 885   return SCM_UNSPECIFIED;
 886 }
 887
 888 \f
 889 /* Sharp readers, i.e. readers called after a `#' sign has been read.  */
 890
 891 static SCM
 892 scm_read_boolean (int chr, SCM port)
 893 {
 894   switch (chr)
 895     {
 896     case 't':
 897     case 'T':
 898       return SCM_BOOL_T;
 899
 900     case 'f':
 901     case 'F':
 902       return SCM_BOOL_F;
 903     }
 904
 905   return SCM_UNSPECIFIED;
 906 }
 907
 908 static SCM
 909 scm_read_character (scm_t_wchar chr, SCM port)
 910 #define FUNC_NAME "scm_lreadr"
 911 {
 912   char buffer[READER_CHAR_NAME_MAX_SIZE];
 913   SCM charname;
 914   size_t charname_len, bytes_read;
 915   scm_t_wchar cp;
 916   int overflow;
 917   scm_t_port *pt;
 918
 919   overflow = read_token (port, buffer, READER_CHAR_NAME_MAX_SIZE, &bytes_read);
 920   if (overflow)
 921     scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
 922
 923   if (bytes_read == 0)
 924     {
 925       chr = scm_getc (port);
 926       if (chr == EOF)
 927         scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
 928                            "while reading character", SCM_EOL);
 929
 930       /* CHR must be a token delimiter, like a whitespace.  */
 931       return (SCM_MAKE_CHAR (chr));
 932     }
 933
 934   pt = SCM_PTAB_ENTRY (port);
 935
 936   /* Simple ASCII characters can be processed immediately.  Also, simple
 937      ISO-8859-1 characters can be processed immediately if the encoding for this
 938      port is ISO-8859-1.  */
 939   if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
 940     {
 941       SCM_COL (port) += 1;
 942       return SCM_MAKE_CHAR (buffer[0]);
 943     }
 944
 945   /* Otherwise, convert the buffer into a proper scheme string for
 946      processing.  */
 947   charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
 948                                pt->ilseq_handler);
 949   charname_len = scm_i_string_length (charname);
 950   SCM_COL (port) += charname_len;
 951   cp = scm_i_string_ref (charname, 0);
 952   if (charname_len == 1)
 953     return SCM_MAKE_CHAR (cp);
 954
 955   /* Ignore dotted circles, which may be used to keep combining characters from
 956      combining with the backslash in #\charname.  */
 957   if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
 958     return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
 959
 960   if (cp >= '0' && cp < '8')
 961     {
 962       /* Dirk:FIXME::  This type of character syntax is not R5RS
 963        * compliant.  Further, it should be verified that the constant
 964        * does only consist of octal digits.  */
 965       SCM p = scm_string_to_number (charname, scm_from_uint (8));
 966       if (SCM_I_INUMP (p))
 967         {
 968           scm_t_wchar c = scm_to_uint32 (p);
 969           if (SCM_IS_UNICODE_CHAR (c))
 970             return SCM_MAKE_CHAR (c);
 971           else
 972             scm_i_input_error (FUNC_NAME, port,
 973                                "out-of-range octal character escape: ~a",
 974                                scm_list_1 (charname));
 975         }
 976     }
 977
 978   if (cp == 'x' && (charname_len > 1))
 979     {
 980       SCM p;
 981
 982       /* Convert from hex, skipping the initial 'x' character in CHARNAME */
 983       p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
 984                                 scm_from_uint (16));
 985       if (SCM_I_INUMP (p))
 986         {
 987           scm_t_wchar c = scm_to_uint32 (p);
 988           if (SCM_IS_UNICODE_CHAR (c))
 989             return SCM_MAKE_CHAR (c);
 990           else
 991             scm_i_input_error (FUNC_NAME, port,
 992                                "out-of-range hex character escape: ~a",
 993                                scm_list_1 (charname));
 994         }
 995     }
 996
 997   /* The names of characters should never have non-Latin1
 998      characters.  */
 999   if (scm_i_is_narrow_string (charname)
1000       || scm_i_try_narrow_string (charname))
1001     { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
1002                                        charname_len);
1003       if (scm_is_true (ch))
1004         return ch;
1005     }
1006
1007   scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
1008                      scm_list_1 (charname));
1009
1010   return SCM_UNSPECIFIED;
1011 }
1012 #undef FUNC_NAME
1013
1014 static inline SCM
1015 scm_read_keyword (int chr, SCM port)
1016 {
1017   SCM symbol;
1018
1019   /* Read the symbol that comprises the keyword.  Doing this instead of
1020      invoking a specific symbol reader function allows `scm_read_keyword ()'
1021      to adapt to the delimiters currently valid of symbols.
1022
1023      XXX: This implementation allows sloppy syntaxes like `#:  key'.  */
1024   symbol = scm_read_expression (port);
1025   if (!scm_is_symbol (symbol))
1026     scm_i_input_error ("scm_read_keyword", port,
1027                        "keyword prefix `~a' not followed by a symbol: ~s",
1028                        scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1029
1030   return (scm_symbol_to_keyword (symbol));
1031 }
1032
1033 static inline SCM
1034 scm_read_vector (int chr, SCM port)
1035 {
1036   /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1037      guarantee that it's going to do what we want.  After all, this is an
1038      implementation detail of `scm_read_vector ()', not a desirable
1039      property.  */
1040   return (scm_vector (scm_read_sexp (chr, port)));
1041 }
1042
1043 static inline SCM
1044 scm_read_srfi4_vector (int chr, SCM port)
1045 {
1046   return scm_i_read_array (port, chr);
1047 }
1048
1049 static SCM
1050 scm_read_bytevector (scm_t_wchar chr, SCM port)
1051 {
1052   chr = scm_getc (port);
1053   if (chr != 'u')
1054     goto syntax;
1055
1056   chr = scm_getc (port);
1057   if (chr != '8')
1058     goto syntax;
1059
1060   chr = scm_getc (port);
1061   if (chr != '(')
1062     goto syntax;
1063
1064   return scm_u8_list_to_bytevector (scm_read_sexp (chr, port));
1065
1066  syntax:
1067   scm_i_input_error ("read_bytevector", port,
1068                      "invalid bytevector prefix",
1069                      SCM_MAKE_CHAR (chr));
1070   return SCM_UNSPECIFIED;
1071 }
1072
1073 static SCM
1074 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port)
1075 {
1076   /* Read the `#*10101'-style read syntax for bit vectors in Guile.  This is
1077      terribly inefficient but who cares?  */
1078   SCM s_bits = SCM_EOL;
1079
1080   for (chr = scm_getc (port);
1081        (chr != EOF) && ((chr == '0') || (chr == '1'));
1082        chr = scm_getc (port))
1083     {
1084       s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1085     }
1086
1087   if (chr != EOF)
1088     scm_ungetc (chr, port);
1089
1090   return scm_bitvector (scm_reverse_x (s_bits, SCM_EOL));
1091 }
1092
1093 static inline SCM
1094 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1095 {
1096   int bang_seen = 0;
1097
1098   /* We can use the get_byte here because there is no need to get the
1099      locale correct when reading comments. This presumes that
1100      hash and exclamation points always represent themselves no
1101      matter what the source encoding is.*/
1102   for (;;)
1103     {
1104       int c = scm_get_byte_or_eof (port);
1105
1106       if (c == EOF)
1107         scm_i_input_error ("skip_block_comment", port,
1108                            "unterminated `#! ... !#' comment", SCM_EOL);
1109
1110       if (c == '!')
1111         bang_seen = 1;
1112       else if (c == '#' && bang_seen)
1113         break;
1114       else
1115         bang_seen = 0;
1116     }
1117
1118   return SCM_UNSPECIFIED;
1119 }
1120
1121 static inline SCM
1122 scm_read_shebang (scm_t_wchar chr, SCM port)
1123 {
1124   int c = 0;
1125   if ((c = scm_get_byte_or_eof (port)) != 'r')
1126     {
1127       scm_ungetc (c, port);
1128       return scm_read_scsh_block_comment (chr, port);
1129     }
1130   if ((c = scm_get_byte_or_eof (port)) != '6')
1131     {
1132       scm_ungetc (c, port);
1133       scm_ungetc ('r', port);
1134       return scm_read_scsh_block_comment (chr, port);
1135     }
1136   if ((c = scm_get_byte_or_eof (port)) != 'r')
1137     {
1138       scm_ungetc (c, port);
1139       scm_ungetc ('6', port);
1140       scm_ungetc ('r', port);
1141       return scm_read_scsh_block_comment (chr, port);
1142     }
1143   if ((c = scm_get_byte_or_eof (port)) != 's')
1144     {
1145       scm_ungetc (c, port);
1146       scm_ungetc ('r', port);
1147       scm_ungetc ('6', port);
1148       scm_ungetc ('r', port);
1149       return scm_read_scsh_block_comment (chr, port);
1150     }
1151
1152   return SCM_UNSPECIFIED;
1153 }
1154
1155 static SCM
1156 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1157 {
1158   /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1159      nested.  So care must be taken.  */
1160   int nesting_level = 1;
1161   int opening_seen = 0, closing_seen = 0;
1162
1163   while (nesting_level > 0)
1164     {
1165       int c = scm_getc (port);
1166
1167       if (c == EOF)
1168         scm_i_input_error ("scm_read_r6rs_block_comment", port,
1169                            "unterminated `#| ... |#' comment", SCM_EOL);
1170
1171       if (opening_seen)
1172         {
1173           if (c == '|')
1174             nesting_level++;
1175           opening_seen = 0;
1176         }
1177       else if (closing_seen)
1178         {
1179           if (c == '#')
1180             nesting_level--;
1181           closing_seen = 0;
1182         }
1183       else if (c == '|')
1184         closing_seen = 1;
1185       else if (c == '#')
1186         opening_seen = 1;
1187       else
1188         opening_seen = closing_seen = 0;
1189     }
1190
1191   return SCM_UNSPECIFIED;
1192 }
1193
1194 static SCM
1195 scm_read_commented_expression (scm_t_wchar chr, SCM port)
1196 {
1197   scm_t_wchar c;
1198
1199   c = flush_ws (port, (char *) NULL);
1200   if (EOF == c)
1201     scm_i_input_error ("read_commented_expression", port,
1202                        "no expression after #; comment", SCM_EOL);
1203   scm_ungetc (c, port);
1204   scm_read_expression (port);
1205   return SCM_UNSPECIFIED;
1206 }
1207
1208 static SCM
1209 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1210 {
1211   /* Guile's extended symbol read syntax looks like this:
1212
1213        #{This is all a symbol name}#
1214
1215      So here, CHR is expected to be `{'.  */
1216   int saw_brace = 0, finished = 0;
1217   size_t len = 0;
1218   SCM buf = scm_i_make_string (1024, NULL);
1219
1220   buf = scm_i_string_start_writing (buf);
1221
1222   while ((chr = scm_getc (port)) != EOF)
1223     {
1224       if (saw_brace)
1225         {
1226           if (chr == '#')
1227             {
1228               finished = 1;
1229               break;
1230             }
1231           else
1232             {
1233               saw_brace = 0;
1234               scm_i_string_set_x (buf, len++, '}');
1235               scm_i_string_set_x (buf, len++, chr);
1236             }
1237         }
1238       else if (chr == '}')
1239         saw_brace = 1;
1240       else
1241         scm_i_string_set_x (buf, len++, chr);
1242
1243       if (len >= scm_i_string_length (buf) - 2)
1244         {
1245           SCM addy;
1246
1247           scm_i_string_stop_writing ();
1248           addy = scm_i_make_string (1024, NULL);
1249           buf = scm_string_append (scm_list_2 (buf, addy));
1250           len = 0;
1251           buf = scm_i_string_start_writing (buf);
1252         }
1253
1254       if (finished)
1255         break;
1256     }
1257   scm_i_string_stop_writing ();
1258
1259   return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1260 }
1261
1262
1263 \f
1264 /* Top-level token readers, i.e., dispatchers.  */
1265
1266 static SCM
1267 scm_read_sharp_extension (int chr, SCM port)
1268 {
1269   SCM proc;
1270
1271   proc = scm_get_hash_procedure (chr);
1272   if (scm_is_true (scm_procedure_p (proc)))
1273     {
1274       long line = SCM_LINUM (port);
1275       int column = SCM_COL (port) - 2;
1276       SCM got;
1277
1278       got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1279       if (!scm_is_eq (got, SCM_UNSPECIFIED))
1280         {
1281           if (SCM_RECORD_POSITIONS_P)
1282             return (recsexpr (got, line, column,
1283                               SCM_FILENAME (port)));
1284           else
1285             return got;
1286         }
1287     }
1288
1289   return SCM_UNSPECIFIED;
1290 }
1291
1292 /* The reader for the sharp `#' character.  It basically dispatches reads
1293    among the above token readers.   */
1294 static SCM
1295 scm_read_sharp (scm_t_wchar chr, SCM port)
1296 #define FUNC_NAME "scm_lreadr"
1297 {
1298   SCM result;
1299
1300   chr = scm_getc (port);
1301
1302   result = scm_read_sharp_extension (chr, port);
1303   if (!scm_is_eq (result, SCM_UNSPECIFIED))
1304     return result;
1305
1306   switch (chr)
1307     {
1308     case '\\':
1309       return (scm_read_character (chr, port));
1310     case '(':
1311       return (scm_read_vector (chr, port));
1312     case 's':
1313     case 'u':
1314     case 'f':
1315       /* This one may return either a boolean or an SRFI-4 vector.  */
1316       return (scm_read_srfi4_vector (chr, port));
1317     case 'v':
1318       return (scm_read_bytevector (chr, port));
1319     case '*':
1320       return (scm_read_guile_bit_vector (chr, port));
1321     case 't':
1322     case 'T':
1323     case 'F':
1324       /* This one may return either a boolean or an SRFI-4 vector.  */
1325       return (scm_read_boolean (chr, port));
1326     case ':':
1327       return (scm_read_keyword (chr, port));
1328     case '0': case '1': case '2': case '3': case '4':
1329     case '5': case '6': case '7': case '8': case '9':
1330     case '@':
1331 #if SCM_ENABLE_DEPRECATED
1332       /* See below for 'i' and 'e'. */
1333     case 'a':
1334     case 'c':
1335     case 'y':
1336     case 'h':
1337     case 'l':
1338 #endif
1339       return (scm_i_read_array (port, chr));
1340
1341     case 'i':
1342     case 'e':
1343 #if SCM_ENABLE_DEPRECATED
1344       {
1345         /* When next char is '(', it really is an old-style
1346            uniform array. */
1347         scm_t_wchar next_c = scm_getc (port);
1348         if (next_c != EOF)
1349           scm_ungetc (next_c, port);
1350         if (next_c == '(')
1351           return scm_i_read_array (port, chr);
1352         /* Fall through. */
1353       }
1354 #endif
1355     case 'b':
1356     case 'B':
1357     case 'o':
1358     case 'O':
1359     case 'd':
1360     case 'D':
1361     case 'x':
1362     case 'X':
1363     case 'I':
1364     case 'E':
1365       return (scm_read_number_and_radix (chr, port));
1366     case '{':
1367       return (scm_read_extended_symbol (chr, port));
1368     case '!':
1369       return (scm_read_shebang (chr, port));
1370     case ';':
1371       return (scm_read_commented_expression (chr, port));
1372     case '`':
1373     case '\'':
1374     case ',':
1375       return (scm_read_syntax (chr, port));
1376     case 'n':
1377       return (scm_read_nil (chr, port));
1378     default:
1379       result = scm_read_sharp_extension (chr, port);
1380       if (scm_is_eq (result, SCM_UNSPECIFIED))
1381         {
1382           /* To remain compatible with 1.8 and earlier, the following
1383              characters have lower precedence than `read-hash-extend'
1384              characters.  */
1385           switch (chr)
1386             {
1387             case '|':
1388               return scm_read_r6rs_block_comment (chr, port);
1389             default:
1390               scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1391                                  scm_list_1 (SCM_MAKE_CHAR (chr)));
1392             }
1393         }
1394       else
1395         return result;
1396     }
1397
1398   return SCM_UNSPECIFIED;
1399 }
1400 #undef FUNC_NAME
1401
1402 static SCM
1403 scm_read_expression (SCM port)
1404 #define FUNC_NAME "scm_read_expression"
1405 {
1406   while (1)
1407     {
1408       register scm_t_wchar chr;
1409
1410       chr = scm_getc (port);
1411
1412       switch (chr)
1413         {
1414         case SCM_WHITE_SPACES:
1415         case SCM_LINE_INCREMENTORS:
1416           break;
1417         case ';':
1418           (void) scm_read_semicolon_comment (chr, port);
1419           break;
1420         case '[':
1421           if (!SCM_SQUARE_BRACKETS_P)
1422             return (scm_read_mixed_case_symbol (chr, port));
1423           /* otherwise fall through */
1424         case '(':
1425           return (scm_read_sexp (chr, port));
1426         case '"':
1427           return (scm_read_string (chr, port));
1428         case '\'':
1429         case '`':
1430         case ',':
1431           return (scm_read_quote (chr, port));
1432         case '#':
1433           {
1434             SCM result;
1435             result = scm_read_sharp (chr, port);
1436             if (scm_is_eq (result, SCM_UNSPECIFIED))
1437               /* We read a comment or some such.  */
1438               break;
1439             else
1440               return result;
1441           }
1442         case ')':
1443           scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1444           break;
1445         case ']':
1446           if (SCM_SQUARE_BRACKETS_P)
1447             scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1448           /* otherwise fall through */
1449         case EOF:
1450           return SCM_EOF_VAL;
1451         case ':':
1452           if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1453             return scm_symbol_to_keyword (scm_read_expression (port));
1454           /* Fall through.  */
1455
1456         default:
1457           {
1458             if (((chr >= '0') && (chr <= '9'))
1459                 || (strchr ("+-.", chr)))
1460               return (scm_read_number (chr, port));
1461             else
1462               return (scm_read_mixed_case_symbol (chr, port));
1463           }
1464         }
1465     }
1466 }
1467 #undef FUNC_NAME
1468
1469 \f
1470 /* Actual reader.  */
1471
1472 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1473             (SCM port),
1474             "Read an s-expression from the input port @var{port}, or from\n"
1475             "the current input port if @var{port} is not specified.\n"
1476             "Any whitespace before the next token is discarded.")
1477 #define FUNC_NAME s_scm_read
1478 {
1479   int c;
1480
1481   if (SCM_UNBNDP (port))
1482     port = scm_current_input_port ();
1483   SCM_VALIDATE_OPINPORT (1, port);
1484
1485   c = flush_ws (port, (char *) NULL);
1486   if (EOF == c)
1487     return SCM_EOF_VAL;
1488   scm_ungetc (c, port);
1489
1490   return (scm_read_expression (port));
1491 }
1492 #undef FUNC_NAME
1493
1494
1495 \f
1496
1497 /* Used when recording expressions constructed by `scm_read_sharp ()'.  */
1498 static SCM
1499 recsexpr (SCM obj, long line, int column, SCM filename)
1500 {
1501   if (!scm_is_pair(obj)) {
1502     return obj;
1503   } else {
1504     SCM tmp = obj, copy;
1505     /* If this sexpr is visible in the read:sharp source, we want to
1506        keep that information, so only record non-constant cons cells
1507        which haven't previously been read by the reader. */
1508     if (scm_is_false (scm_whash_lookup (scm_source_whash, obj)))
1509       {
1510         if (SCM_COPY_SOURCE_P)
1511           {
1512             copy = scm_cons (recsexpr (SCM_CAR (obj), line, column, filename),
1513                              SCM_UNDEFINED);
1514             while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1515               {
1516                 SCM_SETCDR (copy, scm_cons (recsexpr (SCM_CAR (tmp),
1517                                                       line,
1518                                                       column,
1519                                                       filename),
1520                                             SCM_UNDEFINED));
1521                 copy = SCM_CDR (copy);
1522               }
1523             SCM_SETCDR (copy, tmp);
1524           }
1525         else
1526           {
1527             recsexpr (SCM_CAR (obj), line, column, filename);
1528             while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1529               recsexpr (SCM_CAR (tmp), line, column, filename);
1530             copy = SCM_UNDEFINED;
1531           }
1532         scm_whash_insert (scm_source_whash,
1533                           obj,
1534                           scm_make_srcprops (line,
1535                                              column,
1536                                              filename,
1537                                              copy,
1538                                              SCM_EOL));
1539       }
1540     return obj;
1541   }
1542 }
1543
1544 /* Manipulate the read-hash-procedures alist.  This could be written in
1545    Scheme, but maybe it will also be used by C code during initialisation.  */
1546 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1547             (SCM chr, SCM proc),
1548             "Install the procedure @var{proc} for reading expressions\n"
1549             "starting with the character sequence @code{#} and @var{chr}.\n"
1550             "@var{proc} will be called with two arguments:  the character\n"
1551             "@var{chr} and the port to read further data from. The object\n"
1552             "returned will be the return value of @code{read}. \n"
1553             "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1554             )
1555 #define FUNC_NAME s_scm_read_hash_extend
1556 {
1557   SCM this;
1558   SCM prev;
1559
1560   SCM_VALIDATE_CHAR (1, chr);
1561   SCM_ASSERT (scm_is_false (proc)
1562               || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1563               proc, SCM_ARG2, FUNC_NAME);
1564
1565   /* Check if chr is already in the alist.  */
1566   this = scm_i_read_hash_procedures_ref ();
1567   prev = SCM_BOOL_F;
1568   while (1)
1569     {
1570       if (scm_is_null (this))
1571         {
1572           /* not found, so add it to the beginning.  */
1573           if (scm_is_true (proc))
1574             {
1575               SCM new = scm_cons (scm_cons (chr, proc),
1576                                   scm_i_read_hash_procedures_ref ());
1577               scm_i_read_hash_procedures_set_x (new);
1578             }
1579           break;
1580         }
1581       if (scm_is_eq (chr, SCM_CAAR (this)))
1582         {
1583           /* already in the alist.  */
1584           if (scm_is_false (proc))
1585             {
1586               /* remove it.  */
1587               if (scm_is_false (prev))
1588                 {
1589                   SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1590                   scm_i_read_hash_procedures_set_x (rest);
1591                 }
1592               else
1593                 scm_set_cdr_x (prev, SCM_CDR (this));
1594             }
1595           else
1596             {
1597               /* replace it.  */
1598               scm_set_cdr_x (SCM_CAR (this), proc);
1599             }
1600           break;
1601         }
1602       prev = this;
1603       this = SCM_CDR (this);
1604     }
1605
1606   return SCM_UNSPECIFIED;
1607 }
1608 #undef FUNC_NAME
1609
1610 /* Recover the read-hash procedure corresponding to char c.  */
1611 static SCM
1612 scm_get_hash_procedure (int c)
1613 {
1614   SCM rest = scm_i_read_hash_procedures_ref ();
1615
1616   while (1)
1617     {
1618       if (scm_is_null (rest))
1619         return SCM_BOOL_F;
1620
1621       if (SCM_CHAR (SCM_CAAR (rest)) == c)
1622         return SCM_CDAR (rest);
1623
1624       rest = SCM_CDR (rest);
1625     }
1626 }
1627
1628 #define SCM_ENCODING_SEARCH_SIZE (500)
1629
1630 /* Search the first few hundred characters of a file for an Emacs-like coding
1631    declaration.  Returns either NULL or a string whose storage has been
1632    allocated with `scm_gc_malloc ()'.  */
1633 char *
1634 scm_i_scan_for_encoding (SCM port)
1635 {
1636   char header[SCM_ENCODING_SEARCH_SIZE+1];
1637   size_t bytes_read, encoding_length, i;
1638   char *encoding = NULL;
1639   int utf8_bom = 0;
1640   char *pos, *encoding_start;
1641   int in_comment;
1642
1643   if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1644     /* PORT is a non-seekable file port (e.g., as created by Bash when using
1645        "guile <(echo '(display "hello")')") so bail out.  */
1646     return NULL;
1647
1648   bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1649   header[bytes_read] = '\0';
1650
1651   scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1652
1653   if (bytes_read > 3
1654       && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1655     utf8_bom = 1;
1656
1657   /* search past "coding[:=]" */
1658   pos = header;
1659   while (1)
1660     {
1661       if ((pos = strstr(pos, "coding")) == NULL)
1662         return NULL;
1663
1664       pos += strlen("coding");
1665       if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1666           (*pos == ':' || *pos == '='))
1667         {
1668           pos ++;
1669           break;
1670         }
1671     }
1672
1673   /* skip spaces */
1674   while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1675          (*pos == ' ' || *pos == '\t'))
1676     pos ++;
1677
1678   /* grab the next token */
1679   encoding_start = pos;
1680   i = 0;
1681   while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
1682          && encoding_start + i - header < bytes_read
1683          && (isalnum ((int) encoding_start[i])
1684              || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
1685     i++;
1686
1687   encoding_length = i;
1688   if (encoding_length == 0)
1689     return NULL;
1690
1691   encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
1692   for (i = 0; i < encoding_length; i++)
1693     encoding[i] = toupper ((int) encoding[i]);
1694
1695   /* push backwards to make sure we were in a comment */
1696   in_comment = 0;
1697   pos = encoding_start;
1698   while (pos >= header)
1699     {
1700       if (*pos == '\n')
1701         {
1702           /* This wasn't in a semicolon comment. Check for a
1703            hash-bang comment. */
1704           char *beg = strstr (header, "#!");
1705           char *end = strstr (header, "!#");
1706           if (beg < encoding_start && encoding_start + encoding_length < end)
1707             in_comment = 1;
1708           break;
1709         }
1710       if (*pos == ';')
1711         {
1712           in_comment = 1;
1713           break;
1714         }
1715       pos --;
1716     }
1717   if (!in_comment)
1718     /* This wasn't in a comment */
1719     return NULL;
1720
1721   if (utf8_bom && strcmp(encoding, "UTF-8"))
1722     scm_misc_error (NULL,
1723                     "the port input declares the encoding ~s but is encoded as UTF-8",
1724                     scm_list_1 (scm_from_locale_string (encoding)));
1725
1726   return encoding;
1727 }
1728
1729 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1730             (SCM port),
1731             "Scans the port for an Emacs-like character coding declaration\n"
1732             "near the top of the contents of a port with random-acessible contents.\n"
1733             "The coding declaration is of the form\n"
1734             "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1735             "\n"
1736             "Returns a string containing the character encoding of the file\n"
1737             "if a declaration was found, or @code{#f} otherwise.\n")
1738 #define FUNC_NAME s_scm_file_encoding
1739 {
1740   char *enc;
1741   SCM s_enc;
1742
1743   enc = scm_i_scan_for_encoding (port);
1744   if (enc == NULL)
1745     return SCM_BOOL_F;
1746   else
1747     {
1748       s_enc = scm_from_locale_string (enc);
1749       return s_enc;
1750     }
1751
1752   return SCM_BOOL_F;
1753 }
1754 #undef FUNC_NAME
1755
1756 void
1757 scm_init_read ()
1758 {
1759   SCM read_hash_procs;
1760
1761   read_hash_procs = scm_make_fluid ();
1762   scm_fluid_set_x (read_hash_procs, SCM_EOL);
1763
1764   scm_i_read_hash_procedures =
1765     SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
1766
1767   scm_init_opts (scm_read_options, scm_read_opts);
1768 #include "libguile/read.x"
1769 }
1770
1771 /*
1772   Local Variables:
1773   c-file-style: "gnu"
1774   End:
1775 */