libguile/read.c

   1 /* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009, 2010 Free Software
   2  * Foundation, Inc.
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public License
   6  * as published by the Free Software Foundation; either version 3 of
   7  * the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful, but
  10  * WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  17  * 02110-1301 USA
  18  */
  19
  20
  21 \f
  22
  23 #ifdef HAVE_CONFIG_H
  24 # include <config.h>
  25 #endif
  26
  27 #include <stdio.h>
  28 #include <ctype.h>
  29 #include <string.h>
  30 #include <unistd.h>
  31 #include <unicase.h>
  32
  33 #include "libguile/_scm.h"
  34 #include "libguile/bytevectors.h"
  35 #include "libguile/chars.h"
  36 #include "libguile/eval.h"
  37 #include "libguile/arrays.h"
  38 #include "libguile/bitvectors.h"
  39 #include "libguile/keywords.h"
  40 #include "libguile/alist.h"
  41 #include "libguile/srcprop.h"
  42 #include "libguile/hashtab.h"
  43 #include "libguile/hash.h"
  44 #include "libguile/ports.h"
  45 #include "libguile/fports.h"
  46 #include "libguile/root.h"
  47 #include "libguile/strings.h"
  48 #include "libguile/strports.h"
  49 #include "libguile/vectors.h"
  50 #include "libguile/validate.h"
  51 #include "libguile/srfi-4.h"
  52 #include "libguile/srfi-13.h"
  53
  54 #include "libguile/read.h"
  55 #include "libguile/private-options.h"
  56
  57
  58 \f
  59
  60 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
  61 SCM_SYMBOL (scm_keyword_prefix, "prefix");
  62 SCM_SYMBOL (scm_keyword_postfix, "postfix");
  63 SCM_SYMBOL (sym_nil, "nil");
  64
  65 scm_t_option scm_read_opts[] = {
  66   { SCM_OPTION_BOOLEAN, "copy", 0,
  67     "Copy source code expressions." },
  68   { SCM_OPTION_BOOLEAN, "positions", 0,
  69     "Record positions of source code expressions." },
  70   { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
  71     "Convert symbols to lower case."},
  72   { SCM_OPTION_SCM, "keywords", (unsigned long) SCM_BOOL_F,
  73     "Style of keyword recognition: #f, 'prefix or 'postfix."},
  74   { SCM_OPTION_BOOLEAN, "elisp-vectors", 0,
  75     "Support Elisp vector syntax, namely `[...]'."},
  76   { SCM_OPTION_BOOLEAN, "elisp-strings", 0,
  77     "Support `\\(' and `\\)' in strings."},
  78   { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
  79     "Use R6RS variable-length character and string hex escapes."},
  80   { SCM_OPTION_BOOLEAN, "square-brackets", 1,
  81     "Treat `[' and `]' as parentheses, for R6RS compatibility."},
  82   { 0, },
  83 };
  84
  85 /*
  86   Give meaningful error messages for errors
  87
  88   We use the format
  89
  90   FILE:LINE:COL: MESSAGE
  91   This happened in ....
  92
  93   This is not standard GNU format, but the test-suite likes the real
  94   message to be in front.
  95
  96  */
  97
  98
  99 void
 100 scm_i_input_error (char const *function,
 101                    SCM port, const char *message, SCM arg)
 102 {
 103   SCM fn = (scm_is_string (SCM_FILENAME(port))
 104             ? SCM_FILENAME(port)
 105             : scm_from_locale_string ("#<unknown port>"));
 106
 107   SCM string_port = scm_open_output_string ();
 108   SCM string = SCM_EOL;
 109   scm_simple_format (string_port,
 110                      scm_from_locale_string ("~A:~S:~S: ~A"),
 111                      scm_list_4 (fn,
 112                                  scm_from_long (SCM_LINUM (port) + 1),
 113                                  scm_from_int (SCM_COL (port) + 1),
 114                                  scm_from_locale_string (message)));
 115
 116   string = scm_get_output_string (string_port);
 117   scm_close_output_port (string_port);
 118   scm_error_scm (scm_from_locale_symbol ("read-error"),
 119                  function? scm_from_locale_string (function) : SCM_BOOL_F,
 120                  string,
 121                  arg,
 122                  SCM_BOOL_F);
 123 }
 124
 125
 126 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
 127             (SCM setting),
 128             "Option interface for the read options. Instead of using\n"
 129             "this procedure directly, use the procedures @code{read-enable},\n"
 130             "@code{read-disable}, @code{read-set!} and @code{read-options}.")
 131 #define FUNC_NAME s_scm_read_options
 132 {
 133   SCM ans = scm_options (setting,
 134                          scm_read_opts,
 135                          FUNC_NAME);
 136   if (SCM_COPY_SOURCE_P)
 137     SCM_RECORD_POSITIONS_P = 1;
 138   return ans;
 139 }
 140 #undef FUNC_NAME
 141
 142 /* An association list mapping extra hash characters to procedures.  */
 143 static SCM *scm_read_hash_procedures;
 144
 145
 146 \f
 147 /* Token readers.  */
 148
 149
 150 /* Size of the C buffer used to read symbols and numbers.  */
 151 #define READER_BUFFER_SIZE            128
 152
 153 /* Size of the C buffer used to read strings.  */
 154 #define READER_STRING_BUFFER_SIZE     512
 155
 156 /* The maximum size of Scheme character names.  */
 157 #define READER_CHAR_NAME_MAX_SIZE      50
 158
 159
 160 /* `isblank' is only in C99.  */
 161 #define CHAR_IS_BLANK_(_chr)                                    \
 162   (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n')      \
 163    || ((_chr) == '\f') || ((_chr) == '\r'))
 164
 165 #ifdef MSDOS
 166 # define CHAR_IS_BLANK(_chr)                    \
 167   ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
 168 #else
 169 # define CHAR_IS_BLANK CHAR_IS_BLANK_
 170 #endif
 171
 172
 173 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
 174    structure'').  */
 175 #define CHAR_IS_R5RS_DELIMITER(c)                               \
 176   (CHAR_IS_BLANK (c)                                            \
 177    || (c == ')') || (c == '(') || (c == ';') || (c == '"')      \
 178    || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
 179
 180 #define CHAR_IS_DELIMITER  CHAR_IS_R5RS_DELIMITER
 181
 182 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
 183    Structure''.  */
 184 #define CHAR_IS_EXPONENT_MARKER(_chr)                           \
 185   (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f')        \
 186    || ((_chr) == 'd') || ((_chr) == 'l'))
 187
 188 /* Read an SCSH block comment.  */
 189 static inline SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
 190 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
 191 static SCM scm_read_commented_expression (scm_t_wchar, SCM);
 192 static SCM scm_read_shebang (scm_t_wchar, SCM);
 193 static SCM scm_get_hash_procedure (int);
 194
 195 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 196    result in the pre-allocated buffer BUF.  Return zero if the whole token has
 197    fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
 198    bytes actually read.  */
 199 static inline int
 200 read_token (SCM port, char *buf, const size_t buf_size, size_t *read)
 201  {
 202    *read = 0;
 203
 204    while (*read < buf_size)
 205      {
 206        int chr;
 207
 208        chr = scm_get_byte_or_eof (port);
 209
 210        if (chr == EOF)
 211         return 0;
 212       else if (CHAR_IS_DELIMITER (chr))
 213         {
 214           scm_unget_byte (chr, port);
 215           return 0;
 216         }
 217       else
 218         {
 219           *buf = (char) chr;
 220           buf++, (*read)++;
 221         }
 222      }
 223
 224    return 1;
 225  }
 226
 227 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 228    result in the pre-allocated buffer BUFFER, if the whole token has fewer than
 229    BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
 230    caller.  Return zero if the token fits in BUFFER, non-zero otherwise. READ
 231    will be set the number of bytes actually read.  */
 232 static int
 233 read_complete_token (SCM port, char *buffer, const size_t buffer_size,
 234                            char **overflow_buffer, size_t *read)
 235 {
 236   int overflow = 0;
 237   size_t bytes_read, overflow_size;
 238
 239   *overflow_buffer = NULL;
 240   overflow_size = 0;
 241
 242   do
 243     {
 244       overflow = read_token (port, buffer, buffer_size, &bytes_read);
 245       if (bytes_read == 0)
 246         break;
 247       if (overflow || overflow_size != 0)
 248         {
 249           if (overflow_size == 0)
 250             {
 251               *overflow_buffer = scm_malloc (bytes_read);
 252               memcpy (*overflow_buffer, buffer, bytes_read);
 253               overflow_size = bytes_read;
 254             }
 255           else
 256             {
 257               *overflow_buffer = scm_realloc (*overflow_buffer, overflow_size + bytes_read);
 258               memcpy (*overflow_buffer + overflow_size, buffer, bytes_read);
 259               overflow_size += bytes_read;
 260             }
 261         }
 262     }
 263   while (overflow);
 264
 265   if (overflow_size)
 266     *read = overflow_size;
 267   else
 268     *read = bytes_read;
 269
 270   return (overflow_size != 0);
 271 }
 272
 273 /* Skip whitespace from PORT and return the first non-whitespace character
 274    read.  Raise an error on end-of-file.  */
 275 static int
 276 flush_ws (SCM port, const char *eoferr)
 277 {
 278   register scm_t_wchar c;
 279   while (1)
 280     switch (c = scm_getc (port))
 281       {
 282       case EOF:
 283       goteof:
 284         if (eoferr)
 285           {
 286             scm_i_input_error (eoferr,
 287                                port,
 288                                "end of file",
 289                                SCM_EOL);
 290           }
 291         return c;
 292
 293       case ';':
 294       lp:
 295         switch (c = scm_getc (port))
 296           {
 297           case EOF:
 298             goto goteof;
 299           default:
 300             goto lp;
 301           case SCM_LINE_INCREMENTORS:
 302             break;
 303           }
 304         break;
 305
 306       case '#':
 307         switch (c = scm_getc (port))
 308           {
 309           case EOF:
 310             eoferr = "read_sharp";
 311             goto goteof;
 312           case '!':
 313             scm_read_shebang (c, port);
 314             break;
 315           case ';':
 316             scm_read_commented_expression (c, port);
 317             break;
 318           case '|':
 319             if (scm_is_false (scm_get_hash_procedure (c)))
 320               {
 321                 scm_read_r6rs_block_comment (c, port);
 322                 break;
 323               }
 324             /* fall through */
 325           default:
 326             scm_ungetc (c, port);
 327             return '#';
 328           }
 329         break;
 330
 331       case SCM_LINE_INCREMENTORS:
 332       case SCM_SINGLE_SPACES:
 333       case '\t':
 334         break;
 335
 336       default:
 337         return c;
 338       }
 339
 340   return 0;
 341 }
 342
 343
 344 \f
 345 /* Token readers.  */
 346
 347 static SCM scm_read_expression (SCM port);
 348 static SCM scm_read_sharp (int chr, SCM port);
 349 static SCM recsexpr (SCM obj, long line, int column, SCM filename);
 350
 351
 352 static SCM
 353 scm_read_sexp (scm_t_wchar chr, SCM port)
 354 #define FUNC_NAME "scm_i_lreadparen"
 355 {
 356   register int c;
 357   register SCM tmp;
 358   register SCM tl, ans = SCM_EOL;
 359   SCM tl2 = SCM_EOL, ans2 = SCM_EOL, copy = SCM_BOOL_F;
 360   const int terminating_char = ((chr == '[') ? ']' : ')');
 361
 362   /* Need to capture line and column numbers here. */
 363   long line = SCM_LINUM (port);
 364   int column = SCM_COL (port) - 1;
 365
 366
 367   c = flush_ws (port, FUNC_NAME);
 368   if (terminating_char == c)
 369     return SCM_EOL;
 370
 371   scm_ungetc (c, port);
 372   if (scm_is_eq (scm_sym_dot,
 373                  (tmp = scm_read_expression (port))))
 374     {
 375       ans = scm_read_expression (port);
 376       if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
 377         scm_i_input_error (FUNC_NAME, port, "missing close paren",
 378                            SCM_EOL);
 379       return ans;
 380     }
 381
 382   /* Build the head of the list structure. */
 383   ans = tl = scm_cons (tmp, SCM_EOL);
 384
 385   if (SCM_COPY_SOURCE_P)
 386     ans2 = tl2 = scm_cons (scm_is_pair (tmp)
 387                            ? copy
 388                            : tmp,
 389                            SCM_EOL);
 390
 391   while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
 392     {
 393       SCM new_tail;
 394
 395       if (c == ')' || (SCM_SQUARE_BRACKETS_P && c == ']'))
 396         scm_i_input_error (FUNC_NAME, port,
 397                            "in pair: mismatched close paren: ~A",
 398                            scm_list_1 (SCM_MAKE_CHAR (c)));
 399
 400       scm_ungetc (c, port);
 401       tmp = scm_read_expression (port);
 402
 403       if (scm_is_eq (scm_sym_dot, tmp))
 404         {
 405           SCM_SETCDR (tl, tmp = scm_read_expression (port));
 406
 407           if (SCM_COPY_SOURCE_P)
 408             SCM_SETCDR (tl2, scm_cons (scm_is_pair (tmp) ? copy : tmp,
 409                                        SCM_EOL));
 410
 411           c = flush_ws (port, FUNC_NAME);
 412           if (terminating_char != c)
 413             scm_i_input_error (FUNC_NAME, port,
 414                                "in pair: missing close paren", SCM_EOL);
 415           goto exit;
 416         }
 417
 418       new_tail = scm_cons (tmp, SCM_EOL);
 419       SCM_SETCDR (tl, new_tail);
 420       tl = new_tail;
 421
 422       if (SCM_COPY_SOURCE_P)
 423         {
 424           SCM new_tail2 = scm_cons (scm_is_pair (tmp)
 425                                     ? copy
 426                                     : tmp, SCM_EOL);
 427           SCM_SETCDR (tl2, new_tail2);
 428           tl2 = new_tail2;
 429         }
 430     }
 431
 432  exit:
 433   if (SCM_RECORD_POSITIONS_P)
 434     scm_whash_insert (scm_source_whash,
 435                       ans,
 436                       scm_make_srcprops (line, column,
 437                                          SCM_FILENAME (port),
 438                                          SCM_COPY_SOURCE_P
 439                                          ? ans2
 440                                          : SCM_UNDEFINED,
 441                                          SCM_EOL));
 442   return ans;
 443 }
 444 #undef FUNC_NAME
 445
 446
 447 /* Read a hexadecimal number NDIGITS in length.  Put its value into the variable
 448    C.  If TERMINATOR is non-null, terminate early if the TERMINATOR character is
 449    found.  */
 450 #define SCM_READ_HEX_ESCAPE(ndigits, terminator)                   \
 451   do                                                               \
 452     {                                                              \
 453       scm_t_wchar a;                                               \
 454       size_t i = 0;                                                \
 455       c = 0;                                                       \
 456       while (i < ndigits)                                          \
 457         {                                                          \
 458           a = scm_getc (port);                                     \
 459           if (a == EOF)                                            \
 460             goto str_eof;                                          \
 461           if (terminator                                           \
 462               && (a == (scm_t_wchar) terminator)                   \
 463               && (i > 0))                                          \
 464             break;                                                 \
 465           if ('0' <= a && a <= '9')                                \
 466             a -= '0';                                              \
 467           else if ('A' <= a && a <= 'F')                           \
 468             a = a - 'A' + 10;                                      \
 469           else if ('a' <= a && a <= 'f')                           \
 470             a = a - 'a' + 10;                                      \
 471           else                                                     \
 472             {                                                      \
 473               c = a;                                               \
 474               goto bad_escaped;                                    \
 475             }                                                      \
 476           c = c * 16 + a;                                          \
 477           i ++;                                                    \
 478         }                                                          \
 479     } while (0)
 480
 481 static SCM
 482 scm_read_string (int chr, SCM port)
 483 #define FUNC_NAME "scm_lreadr"
 484 {
 485   /* For strings smaller than C_STR, this function creates only one Scheme
 486      object (the string returned).  */
 487
 488   SCM str = SCM_BOOL_F;
 489   unsigned c_str_len = 0;
 490   scm_t_wchar c;
 491
 492   str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
 493   while ('"' != (c = scm_getc (port)))
 494     {
 495       if (c == EOF)
 496         {
 497         str_eof:
 498           scm_i_input_error (FUNC_NAME, port,
 499                              "end of file in string constant", SCM_EOL);
 500         }
 501
 502       if (c_str_len + 1 >= scm_i_string_length (str))
 503         {
 504           SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
 505
 506           str = scm_string_append (scm_list_2 (str, addy));
 507         }
 508
 509       if (c == '\\')
 510         {
 511           switch (c = scm_getc (port))
 512             {
 513             case EOF:
 514               goto str_eof;
 515             case '"':
 516             case '\\':
 517               break;
 518             case '(':
 519             case ')':
 520               if (SCM_ESCAPED_PARENS_P)
 521                 break;
 522               goto bad_escaped;
 523             case '\n':
 524               continue;
 525             case '0':
 526               c = '\0';
 527               break;
 528             case 'f':
 529               c = '\f';
 530               break;
 531             case 'n':
 532               c = '\n';
 533               break;
 534             case 'r':
 535               c = '\r';
 536               break;
 537             case 't':
 538               c = '\t';
 539               break;
 540             case 'a':
 541               c = '\007';
 542               break;
 543             case 'v':
 544               c = '\v';
 545               break;
 546             case 'b':
 547               c = '\010';
 548               break;
 549             case 'x':
 550               if (SCM_R6RS_ESCAPES_P)
 551                 SCM_READ_HEX_ESCAPE (10, ';');
 552               else
 553                 SCM_READ_HEX_ESCAPE (2, '\0');
 554               break;
 555             case 'u':
 556               if (!SCM_R6RS_ESCAPES_P)
 557                 {
 558                   SCM_READ_HEX_ESCAPE (4, '\0');
 559                   break;
 560                 }
 561             case 'U':
 562               if (!SCM_R6RS_ESCAPES_P)
 563                 {
 564                   SCM_READ_HEX_ESCAPE (6, '\0');
 565                   break;
 566                 }
 567             default:
 568             bad_escaped:
 569               scm_i_input_error (FUNC_NAME, port,
 570                                  "illegal character in escape sequence: ~S",
 571                                  scm_list_1 (SCM_MAKE_CHAR (c)));
 572             }
 573         }
 574       str = scm_i_string_start_writing (str);
 575       scm_i_string_set_x (str, c_str_len++, c);
 576       scm_i_string_stop_writing ();
 577     }
 578
 579   if (c_str_len > 0)
 580     {
 581       return scm_i_substring_copy (str, 0, c_str_len);
 582     }
 583
 584   return scm_nullstr;
 585 }
 586 #undef FUNC_NAME
 587
 588
 589 static SCM
 590 scm_read_number (scm_t_wchar chr, SCM port)
 591 {
 592   SCM result, str = SCM_EOL;
 593   char buffer[READER_BUFFER_SIZE];
 594   char *overflow_buffer = NULL;
 595   size_t bytes_read;
 596   int overflow;
 597   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 598
 599   scm_ungetc (chr, port);
 600   overflow = read_complete_token (port, buffer, sizeof (buffer),
 601                                   &overflow_buffer, &bytes_read);
 602
 603   if (!overflow)
 604     str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
 605   else
 606     str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
 607                             pt->ilseq_handler);
 608
 609   result = scm_string_to_number (str, SCM_UNDEFINED);
 610   if (!scm_is_true (result))
 611     {
 612       /* Return a symbol instead of a number */
 613       if (SCM_CASE_INSENSITIVE_P)
 614         str = scm_string_downcase_x (str);
 615       result = scm_string_to_symbol (str);
 616     }
 617
 618   if (overflow)
 619     free (overflow_buffer);
 620   SCM_COL (port) += scm_i_string_length (str);
 621   return result;
 622 }
 623
 624 static SCM
 625 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
 626 {
 627   SCM result;
 628   int ends_with_colon = 0;
 629   size_t bytes_read;
 630   int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
 631   int overflow;
 632   char buffer[READER_BUFFER_SIZE], *overflow_buffer;
 633   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 634   SCM str;
 635
 636   scm_ungetc (chr, port);
 637   overflow = read_complete_token (port, buffer, READER_BUFFER_SIZE,
 638                                   &overflow_buffer, &bytes_read);
 639   if (bytes_read > 0)
 640     {
 641       if (!overflow)
 642         ends_with_colon = buffer[bytes_read - 1] == ':';
 643       else
 644         ends_with_colon = overflow_buffer[bytes_read - 1] == ':';
 645     }
 646
 647   if (postfix && ends_with_colon && (bytes_read > 1))
 648     {
 649       if (!overflow)
 650         str = scm_from_stringn (buffer, bytes_read - 1, pt->encoding, pt->ilseq_handler);
 651       else
 652         str = scm_from_stringn (overflow_buffer, bytes_read - 1, pt->encoding,
 653                                 pt->ilseq_handler);
 654
 655       if (SCM_CASE_INSENSITIVE_P)
 656         str = scm_string_downcase_x (str);
 657       result = scm_symbol_to_keyword (scm_string_to_symbol (str));
 658     }
 659   else
 660     {
 661       if (!overflow)
 662         str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
 663       else
 664         str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
 665                                 pt->ilseq_handler);
 666
 667       if (SCM_CASE_INSENSITIVE_P)
 668         str = scm_string_downcase_x (str);
 669       result = scm_string_to_symbol (str);
 670     }
 671
 672   if (overflow)
 673     free (overflow_buffer);
 674   SCM_COL (port) += scm_i_string_length (str);
 675   return result;
 676 }
 677
 678 static SCM
 679 scm_read_number_and_radix (scm_t_wchar chr, SCM port)
 680 #define FUNC_NAME "scm_lreadr"
 681 {
 682   SCM result;
 683   size_t read;
 684   char buffer[READER_BUFFER_SIZE], *overflow_buffer;
 685   int overflow;
 686   unsigned int radix;
 687   SCM str;
 688   scm_t_port *pt;
 689
 690   switch (chr)
 691     {
 692     case 'B':
 693     case 'b':
 694       radix = 2;
 695       break;
 696
 697     case 'o':
 698     case 'O':
 699       radix = 8;
 700       break;
 701
 702     case 'd':
 703     case 'D':
 704       radix = 10;
 705       break;
 706
 707     case 'x':
 708     case 'X':
 709       radix = 16;
 710       break;
 711
 712     default:
 713       scm_ungetc (chr, port);
 714       scm_ungetc ('#', port);
 715       radix = 10;
 716     }
 717
 718   overflow = read_complete_token (port, buffer, sizeof (buffer),
 719                                   &overflow_buffer, &read);
 720
 721   pt = SCM_PTAB_ENTRY (port);
 722   if (!overflow)
 723     str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
 724   else
 725     str = scm_from_stringn (overflow_buffer, read, pt->encoding,
 726                             pt->ilseq_handler);
 727
 728   result = scm_string_to_number (str, scm_from_uint (radix));
 729
 730   if (overflow)
 731     free (overflow_buffer);
 732
 733   SCM_COL (port) += scm_i_string_length (str);
 734
 735   if (scm_is_true (result))
 736     return result;
 737
 738   scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
 739
 740   return SCM_BOOL_F;
 741 }
 742 #undef FUNC_NAME
 743
 744 static SCM
 745 scm_read_quote (int chr, SCM port)
 746 {
 747   SCM p;
 748   long line = SCM_LINUM (port);
 749   int column = SCM_COL (port) - 1;
 750
 751   switch (chr)
 752     {
 753     case '`':
 754       p = scm_sym_quasiquote;
 755       break;
 756
 757     case '\'':
 758       p = scm_sym_quote;
 759       break;
 760
 761     case ',':
 762       {
 763         scm_t_wchar c;
 764
 765         c = scm_getc (port);
 766         if ('@' == c)
 767           p = scm_sym_uq_splicing;
 768         else
 769           {
 770             scm_ungetc (c, port);
 771             p = scm_sym_unquote;
 772           }
 773         break;
 774       }
 775
 776     default:
 777       fprintf (stderr, "%s: unhandled quote character (%i)\n",
 778                "scm_read_quote", chr);
 779       abort ();
 780     }
 781
 782   p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
 783   if (SCM_RECORD_POSITIONS_P)
 784     scm_whash_insert (scm_source_whash, p,
 785                       scm_make_srcprops (line, column,
 786                                          SCM_FILENAME (port),
 787                                          SCM_COPY_SOURCE_P
 788                                          ? (scm_cons2 (SCM_CAR (p),
 789                                                        SCM_CAR (SCM_CDR (p)),
 790                                                        SCM_EOL))
 791                                          : SCM_UNDEFINED,
 792                                          SCM_EOL));
 793
 794
 795   return p;
 796 }
 797
 798 SCM_SYMBOL (sym_syntax, "syntax");
 799 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
 800 SCM_SYMBOL (sym_unsyntax, "unsyntax");
 801 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
 802
 803 static SCM
 804 scm_read_syntax (int chr, SCM port)
 805 {
 806   SCM p;
 807   long line = SCM_LINUM (port);
 808   int column = SCM_COL (port) - 1;
 809
 810   switch (chr)
 811     {
 812     case '`':
 813       p = sym_quasisyntax;
 814       break;
 815
 816     case '\'':
 817       p = sym_syntax;
 818       break;
 819
 820     case ',':
 821       {
 822         int c;
 823
 824         c = scm_getc (port);
 825         if ('@' == c)
 826           p = sym_unsyntax_splicing;
 827         else
 828           {
 829             scm_ungetc (c, port);
 830             p = sym_unsyntax;
 831           }
 832         break;
 833       }
 834
 835     default:
 836       fprintf (stderr, "%s: unhandled syntax character (%i)\n",
 837                "scm_read_syntax", chr);
 838       abort ();
 839     }
 840
 841   p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
 842   if (SCM_RECORD_POSITIONS_P)
 843     scm_whash_insert (scm_source_whash, p,
 844                       scm_make_srcprops (line, column,
 845                                          SCM_FILENAME (port),
 846                                          SCM_COPY_SOURCE_P
 847                                          ? (scm_cons2 (SCM_CAR (p),
 848                                                        SCM_CAR (SCM_CDR (p)),
 849                                                        SCM_EOL))
 850                                          : SCM_UNDEFINED,
 851                                          SCM_EOL));
 852
 853
 854   return p;
 855 }
 856
 857 static inline SCM
 858 scm_read_nil (int chr, SCM port)
 859 {
 860   SCM id = scm_read_mixed_case_symbol (chr, port);
 861
 862   if (!scm_is_eq (id, sym_nil))
 863     scm_i_input_error ("scm_read_nil", port,
 864                        "unexpected input while reading #nil: ~a",
 865                        scm_list_1 (id));
 866
 867   return SCM_ELISP_NIL;
 868 }
 869
 870 static inline SCM
 871 scm_read_semicolon_comment (int chr, SCM port)
 872 {
 873   int c;
 874
 875   /* We use the get_byte here because there is no need to get the
 876      locale correct with comment input. This presumes that newline
 877      always represents itself no matter what the encoding is.  */
 878   for (c = scm_get_byte_or_eof (port);
 879        (c != EOF) && (c != '\n');
 880        c = scm_get_byte_or_eof (port));
 881
 882   return SCM_UNSPECIFIED;
 883 }
 884
 885 \f
 886 /* Sharp readers, i.e. readers called after a `#' sign has been read.  */
 887
 888 static SCM
 889 scm_read_boolean (int chr, SCM port)
 890 {
 891   switch (chr)
 892     {
 893     case 't':
 894     case 'T':
 895       return SCM_BOOL_T;
 896
 897     case 'f':
 898     case 'F':
 899       return SCM_BOOL_F;
 900     }
 901
 902   return SCM_UNSPECIFIED;
 903 }
 904
 905 static SCM
 906 scm_read_character (scm_t_wchar chr, SCM port)
 907 #define FUNC_NAME "scm_lreadr"
 908 {
 909   char buffer[READER_CHAR_NAME_MAX_SIZE];
 910   SCM charname;
 911   size_t charname_len, bytes_read;
 912   scm_t_wchar cp;
 913   int overflow;
 914   scm_t_port *pt;
 915
 916   overflow = read_token (port, buffer, READER_CHAR_NAME_MAX_SIZE, &bytes_read);
 917   if (overflow)
 918     goto char_error;
 919
 920   if (bytes_read == 0)
 921     {
 922       chr = scm_getc (port);
 923       if (chr == EOF)
 924         scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
 925                            "while reading character", SCM_EOL);
 926
 927       /* CHR must be a token delimiter, like a whitespace.  */
 928       return (SCM_MAKE_CHAR (chr));
 929     }
 930
 931   pt = SCM_PTAB_ENTRY (port);
 932
 933   /* Simple ASCII characters can be processed immediately.  Also, simple
 934      ISO-8859-1 characters can be processed immediately if the encoding for this
 935      port is ISO-8859-1.  */
 936   if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
 937     {
 938       SCM_COL (port) += 1;
 939       return SCM_MAKE_CHAR (buffer[0]);
 940     }
 941
 942   /* Otherwise, convert the buffer into a proper scheme string for
 943      processing.  */
 944   charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
 945                                pt->ilseq_handler);
 946   charname_len = scm_i_string_length (charname);
 947   SCM_COL (port) += charname_len;
 948   cp = scm_i_string_ref (charname, 0);
 949   if (charname_len == 1)
 950     return SCM_MAKE_CHAR (cp);
 951
 952   /* Ignore dotted circles, which may be used to keep combining characters from
 953      combining with the backslash in #\charname.  */
 954   if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
 955     return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
 956
 957   if (cp >= '0' && cp < '8')
 958     {
 959       /* Dirk:FIXME::  This type of character syntax is not R5RS
 960        * compliant.  Further, it should be verified that the constant
 961        * does only consist of octal digits.  */
 962       SCM p = scm_string_to_number (charname, scm_from_uint (8));
 963       if (SCM_I_INUMP (p))
 964         {
 965           scm_t_wchar c = SCM_I_INUM (p);
 966           if (SCM_IS_UNICODE_CHAR (c))
 967             return SCM_MAKE_CHAR (c);
 968           else
 969             scm_i_input_error (FUNC_NAME, port,
 970                                "out-of-range octal character escape: ~a",
 971                                scm_list_1 (charname));
 972         }
 973     }
 974
 975   if (cp == 'x' && (charname_len > 1))
 976     {
 977       SCM p;
 978
 979       /* Convert from hex, skipping the initial 'x' character in CHARNAME */
 980       p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
 981                                 scm_from_uint (16));
 982       if (SCM_I_INUMP (p))
 983         {
 984           scm_t_wchar c = SCM_I_INUM (p);
 985           if (SCM_IS_UNICODE_CHAR (c))
 986             return SCM_MAKE_CHAR (c);
 987           else
 988             scm_i_input_error (FUNC_NAME, port,
 989                                "out-of-range hex character escape: ~a",
 990                                scm_list_1 (charname));
 991         }
 992     }
 993
 994   /* The names of characters should never have non-Latin1
 995      characters.  */
 996   if (scm_i_is_narrow_string (charname)
 997       || scm_i_try_narrow_string (charname))
 998     { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
 999                                        charname_len);
1000       if (scm_is_true (ch))
1001         return ch;
1002     }
1003
1004  char_error:
1005   scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
1006                      scm_list_1 (charname));
1007
1008   return SCM_UNSPECIFIED;
1009 }
1010 #undef FUNC_NAME
1011
1012 static inline SCM
1013 scm_read_keyword (int chr, SCM port)
1014 {
1015   SCM symbol;
1016
1017   /* Read the symbol that comprises the keyword.  Doing this instead of
1018      invoking a specific symbol reader function allows `scm_read_keyword ()'
1019      to adapt to the delimiters currently valid of symbols.
1020
1021      XXX: This implementation allows sloppy syntaxes like `#:  key'.  */
1022   symbol = scm_read_expression (port);
1023   if (!scm_is_symbol (symbol))
1024     scm_i_input_error ("scm_read_keyword", port,
1025                        "keyword prefix `~a' not followed by a symbol: ~s",
1026                        scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1027
1028   return (scm_symbol_to_keyword (symbol));
1029 }
1030
1031 static inline SCM
1032 scm_read_vector (int chr, SCM port)
1033 {
1034   /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1035      guarantee that it's going to do what we want.  After all, this is an
1036      implementation detail of `scm_read_vector ()', not a desirable
1037      property.  */
1038   return (scm_vector (scm_read_sexp (chr, port)));
1039 }
1040
1041 static inline SCM
1042 scm_read_srfi4_vector (int chr, SCM port)
1043 {
1044   return scm_i_read_array (port, chr);
1045 }
1046
1047 static SCM
1048 scm_read_bytevector (scm_t_wchar chr, SCM port)
1049 {
1050   chr = scm_getc (port);
1051   if (chr != 'u')
1052     goto syntax;
1053
1054   chr = scm_getc (port);
1055   if (chr != '8')
1056     goto syntax;
1057
1058   chr = scm_getc (port);
1059   if (chr != '(')
1060     goto syntax;
1061
1062   return scm_u8_list_to_bytevector (scm_read_sexp (chr, port));
1063
1064  syntax:
1065   scm_i_input_error ("read_bytevector", port,
1066                      "invalid bytevector prefix",
1067                      SCM_MAKE_CHAR (chr));
1068   return SCM_UNSPECIFIED;
1069 }
1070
1071 static SCM
1072 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port)
1073 {
1074   /* Read the `#*10101'-style read syntax for bit vectors in Guile.  This is
1075      terribly inefficient but who cares?  */
1076   SCM s_bits = SCM_EOL;
1077
1078   for (chr = scm_getc (port);
1079        (chr != EOF) && ((chr == '0') || (chr == '1'));
1080        chr = scm_getc (port))
1081     {
1082       s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1083     }
1084
1085   if (chr != EOF)
1086     scm_ungetc (chr, port);
1087
1088   return scm_bitvector (scm_reverse_x (s_bits, SCM_EOL));
1089 }
1090
1091 static inline SCM
1092 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1093 {
1094   int bang_seen = 0;
1095
1096   /* We can use the get_byte here because there is no need to get the
1097      locale correct when reading comments. This presumes that
1098      hash and exclamation points always represent themselves no
1099      matter what the source encoding is.*/
1100   for (;;)
1101     {
1102       int c = scm_get_byte_or_eof (port);
1103
1104       if (c == EOF)
1105         scm_i_input_error ("skip_block_comment", port,
1106                            "unterminated `#! ... !#' comment", SCM_EOL);
1107
1108       if (c == '!')
1109         bang_seen = 1;
1110       else if (c == '#' && bang_seen)
1111         break;
1112       else
1113         bang_seen = 0;
1114     }
1115
1116   return SCM_UNSPECIFIED;
1117 }
1118
1119 static inline SCM
1120 scm_read_shebang (scm_t_wchar chr, SCM port)
1121 {
1122   int c = 0;
1123   if ((c = scm_get_byte_or_eof (port)) != 'r')
1124     {
1125       scm_ungetc (c, port);
1126       return scm_read_scsh_block_comment (chr, port);
1127     }
1128   if ((c = scm_get_byte_or_eof (port)) != '6')
1129     {
1130       scm_ungetc (c, port);
1131       scm_ungetc ('r', port);
1132       return scm_read_scsh_block_comment (chr, port);
1133     }
1134   if ((c = scm_get_byte_or_eof (port)) != 'r')
1135     {
1136       scm_ungetc (c, port);
1137       scm_ungetc ('6', port);
1138       scm_ungetc ('r', port);
1139       return scm_read_scsh_block_comment (chr, port);
1140     }
1141   if ((c = scm_get_byte_or_eof (port)) != 's')
1142     {
1143       scm_ungetc (c, port);
1144       scm_ungetc ('r', port);
1145       scm_ungetc ('6', port);
1146       scm_ungetc ('r', port);
1147       return scm_read_scsh_block_comment (chr, port);
1148     }
1149
1150   return SCM_UNSPECIFIED;
1151 }
1152
1153 static SCM
1154 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1155 {
1156   /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1157      nested.  So care must be taken.  */
1158   int nesting_level = 1;
1159   int opening_seen = 0, closing_seen = 0;
1160
1161   while (nesting_level > 0)
1162     {
1163       int c = scm_getc (port);
1164
1165       if (c == EOF)
1166         scm_i_input_error ("scm_read_r6rs_block_comment", port,
1167                            "unterminated `#| ... |#' comment", SCM_EOL);
1168
1169       if (opening_seen)
1170         {
1171           if (c == '|')
1172             nesting_level++;
1173           opening_seen = 0;
1174         }
1175       else if (closing_seen)
1176         {
1177           if (c == '#')
1178             nesting_level--;
1179           closing_seen = 0;
1180         }
1181       else if (c == '|')
1182         closing_seen = 1;
1183       else if (c == '#')
1184         opening_seen = 1;
1185       else
1186         opening_seen = closing_seen = 0;
1187     }
1188
1189   return SCM_UNSPECIFIED;
1190 }
1191
1192 static SCM
1193 scm_read_commented_expression (scm_t_wchar chr, SCM port)
1194 {
1195   scm_t_wchar c;
1196
1197   c = flush_ws (port, (char *) NULL);
1198   if (EOF == c)
1199     scm_i_input_error ("read_commented_expression", port,
1200                        "no expression after #; comment", SCM_EOL);
1201   scm_ungetc (c, port);
1202   scm_read_expression (port);
1203   return SCM_UNSPECIFIED;
1204 }
1205
1206 static SCM
1207 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1208 {
1209   /* Guile's extended symbol read syntax looks like this:
1210
1211        #{This is all a symbol name}#
1212
1213      So here, CHR is expected to be `{'.  */
1214   int saw_brace = 0, finished = 0;
1215   size_t len = 0;
1216   SCM buf = scm_i_make_string (1024, NULL);
1217
1218   buf = scm_i_string_start_writing (buf);
1219
1220   while ((chr = scm_getc (port)) != EOF)
1221     {
1222       if (saw_brace)
1223         {
1224           if (chr == '#')
1225             {
1226               finished = 1;
1227               break;
1228             }
1229           else
1230             {
1231               saw_brace = 0;
1232               scm_i_string_set_x (buf, len++, '}');
1233               scm_i_string_set_x (buf, len++, chr);
1234             }
1235         }
1236       else if (chr == '}')
1237         saw_brace = 1;
1238       else
1239         scm_i_string_set_x (buf, len++, chr);
1240
1241       if (len >= scm_i_string_length (buf) - 2)
1242         {
1243           SCM addy;
1244
1245           scm_i_string_stop_writing ();
1246           addy = scm_i_make_string (1024, NULL);
1247           buf = scm_string_append (scm_list_2 (buf, addy));
1248           len = 0;
1249           buf = scm_i_string_start_writing (buf);
1250         }
1251
1252       if (finished)
1253         break;
1254     }
1255   scm_i_string_stop_writing ();
1256
1257   return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1258 }
1259
1260
1261 \f
1262 /* Top-level token readers, i.e., dispatchers.  */
1263
1264 static SCM
1265 scm_read_sharp_extension (int chr, SCM port)
1266 {
1267   SCM proc;
1268
1269   proc = scm_get_hash_procedure (chr);
1270   if (scm_is_true (scm_procedure_p (proc)))
1271     {
1272       long line = SCM_LINUM (port);
1273       int column = SCM_COL (port) - 2;
1274       SCM got;
1275
1276       got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1277       if (!scm_is_eq (got, SCM_UNSPECIFIED))
1278         {
1279           if (SCM_RECORD_POSITIONS_P)
1280             return (recsexpr (got, line, column,
1281                               SCM_FILENAME (port)));
1282           else
1283             return got;
1284         }
1285     }
1286
1287   return SCM_UNSPECIFIED;
1288 }
1289
1290 /* The reader for the sharp `#' character.  It basically dispatches reads
1291    among the above token readers.   */
1292 static SCM
1293 scm_read_sharp (scm_t_wchar chr, SCM port)
1294 #define FUNC_NAME "scm_lreadr"
1295 {
1296   SCM result;
1297
1298   chr = scm_getc (port);
1299
1300   result = scm_read_sharp_extension (chr, port);
1301   if (!scm_is_eq (result, SCM_UNSPECIFIED))
1302     return result;
1303
1304   switch (chr)
1305     {
1306     case '\\':
1307       return (scm_read_character (chr, port));
1308     case '(':
1309       return (scm_read_vector (chr, port));
1310     case 's':
1311     case 'u':
1312     case 'f':
1313       /* This one may return either a boolean or an SRFI-4 vector.  */
1314       return (scm_read_srfi4_vector (chr, port));
1315     case 'v':
1316       return (scm_read_bytevector (chr, port));
1317     case '*':
1318       return (scm_read_guile_bit_vector (chr, port));
1319     case 't':
1320     case 'T':
1321     case 'F':
1322       /* This one may return either a boolean or an SRFI-4 vector.  */
1323       return (scm_read_boolean (chr, port));
1324     case ':':
1325       return (scm_read_keyword (chr, port));
1326     case '0': case '1': case '2': case '3': case '4':
1327     case '5': case '6': case '7': case '8': case '9':
1328     case '@':
1329 #if SCM_ENABLE_DEPRECATED
1330       /* See below for 'i' and 'e'. */
1331     case 'a':
1332     case 'c':
1333     case 'y':
1334     case 'h':
1335     case 'l':
1336 #endif
1337       return (scm_i_read_array (port, chr));
1338
1339     case 'i':
1340     case 'e':
1341 #if SCM_ENABLE_DEPRECATED
1342       {
1343         /* When next char is '(', it really is an old-style
1344            uniform array. */
1345         scm_t_wchar next_c = scm_getc (port);
1346         if (next_c != EOF)
1347           scm_ungetc (next_c, port);
1348         if (next_c == '(')
1349           return scm_i_read_array (port, chr);
1350         /* Fall through. */
1351       }
1352 #endif
1353     case 'b':
1354     case 'B':
1355     case 'o':
1356     case 'O':
1357     case 'd':
1358     case 'D':
1359     case 'x':
1360     case 'X':
1361     case 'I':
1362     case 'E':
1363       return (scm_read_number_and_radix (chr, port));
1364     case '{':
1365       return (scm_read_extended_symbol (chr, port));
1366     case '!':
1367       return (scm_read_shebang (chr, port));
1368     case ';':
1369       return (scm_read_commented_expression (chr, port));
1370     case '`':
1371     case '\'':
1372     case ',':
1373       return (scm_read_syntax (chr, port));
1374     case 'n':
1375       return (scm_read_nil (chr, port));
1376     default:
1377       result = scm_read_sharp_extension (chr, port);
1378       if (scm_is_eq (result, SCM_UNSPECIFIED))
1379         {
1380           /* To remain compatible with 1.8 and earlier, the following
1381              characters have lower precedence than `read-hash-extend'
1382              characters.  */
1383           switch (chr)
1384             {
1385             case '|':
1386               return scm_read_r6rs_block_comment (chr, port);
1387             default:
1388               scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1389                                  scm_list_1 (SCM_MAKE_CHAR (chr)));
1390             }
1391         }
1392       else
1393         return result;
1394     }
1395
1396   return SCM_UNSPECIFIED;
1397 }
1398 #undef FUNC_NAME
1399
1400 static SCM
1401 scm_read_expression (SCM port)
1402 #define FUNC_NAME "scm_read_expression"
1403 {
1404   while (1)
1405     {
1406       register scm_t_wchar chr;
1407
1408       chr = scm_getc (port);
1409
1410       switch (chr)
1411         {
1412         case SCM_WHITE_SPACES:
1413         case SCM_LINE_INCREMENTORS:
1414           break;
1415         case ';':
1416           (void) scm_read_semicolon_comment (chr, port);
1417           break;
1418         case '[':
1419           if (!SCM_SQUARE_BRACKETS_P)
1420             return (scm_read_mixed_case_symbol (chr, port));
1421           /* otherwise fall through */
1422         case '(':
1423           return (scm_read_sexp (chr, port));
1424         case '"':
1425           return (scm_read_string (chr, port));
1426         case '\'':
1427         case '`':
1428         case ',':
1429           return (scm_read_quote (chr, port));
1430         case '#':
1431           {
1432             SCM result;
1433             result = scm_read_sharp (chr, port);
1434             if (scm_is_eq (result, SCM_UNSPECIFIED))
1435               /* We read a comment or some such.  */
1436               break;
1437             else
1438               return result;
1439           }
1440         case ')':
1441           scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1442           break;
1443         case EOF:
1444           return SCM_EOF_VAL;
1445         case ':':
1446           if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1447             return scm_symbol_to_keyword (scm_read_expression (port));
1448           /* Fall through.  */
1449
1450         default:
1451           {
1452             if (((chr >= '0') && (chr <= '9'))
1453                 || (strchr ("+-.", chr)))
1454               return (scm_read_number (chr, port));
1455             else
1456               return (scm_read_mixed_case_symbol (chr, port));
1457           }
1458         }
1459     }
1460 }
1461 #undef FUNC_NAME
1462
1463 \f
1464 /* Actual reader.  */
1465
1466 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1467             (SCM port),
1468             "Read an s-expression from the input port @var{port}, or from\n"
1469             "the current input port if @var{port} is not specified.\n"
1470             "Any whitespace before the next token is discarded.")
1471 #define FUNC_NAME s_scm_read
1472 {
1473   int c;
1474
1475   if (SCM_UNBNDP (port))
1476     port = scm_current_input_port ();
1477   SCM_VALIDATE_OPINPORT (1, port);
1478
1479   c = flush_ws (port, (char *) NULL);
1480   if (EOF == c)
1481     return SCM_EOF_VAL;
1482   scm_ungetc (c, port);
1483
1484   return (scm_read_expression (port));
1485 }
1486 #undef FUNC_NAME
1487
1488
1489 \f
1490
1491 /* Used when recording expressions constructed by `scm_read_sharp ()'.  */
1492 static SCM
1493 recsexpr (SCM obj, long line, int column, SCM filename)
1494 {
1495   if (!scm_is_pair(obj)) {
1496     return obj;
1497   } else {
1498     SCM tmp = obj, copy;
1499     /* If this sexpr is visible in the read:sharp source, we want to
1500        keep that information, so only record non-constant cons cells
1501        which haven't previously been read by the reader. */
1502     if (scm_is_false (scm_whash_lookup (scm_source_whash, obj)))
1503       {
1504         if (SCM_COPY_SOURCE_P)
1505           {
1506             copy = scm_cons (recsexpr (SCM_CAR (obj), line, column, filename),
1507                              SCM_UNDEFINED);
1508             while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1509               {
1510                 SCM_SETCDR (copy, scm_cons (recsexpr (SCM_CAR (tmp),
1511                                                       line,
1512                                                       column,
1513                                                       filename),
1514                                             SCM_UNDEFINED));
1515                 copy = SCM_CDR (copy);
1516               }
1517             SCM_SETCDR (copy, tmp);
1518           }
1519         else
1520           {
1521             recsexpr (SCM_CAR (obj), line, column, filename);
1522             while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1523               recsexpr (SCM_CAR (tmp), line, column, filename);
1524             copy = SCM_UNDEFINED;
1525           }
1526         scm_whash_insert (scm_source_whash,
1527                           obj,
1528                           scm_make_srcprops (line,
1529                                              column,
1530                                              filename,
1531                                              copy,
1532                                              SCM_EOL));
1533       }
1534     return obj;
1535   }
1536 }
1537
1538 /* Manipulate the read-hash-procedures alist.  This could be written in
1539    Scheme, but maybe it will also be used by C code during initialisation.  */
1540 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1541             (SCM chr, SCM proc),
1542             "Install the procedure @var{proc} for reading expressions\n"
1543             "starting with the character sequence @code{#} and @var{chr}.\n"
1544             "@var{proc} will be called with two arguments:  the character\n"
1545             "@var{chr} and the port to read further data from. The object\n"
1546             "returned will be the return value of @code{read}. \n"
1547             "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1548             )
1549 #define FUNC_NAME s_scm_read_hash_extend
1550 {
1551   SCM this;
1552   SCM prev;
1553
1554   SCM_VALIDATE_CHAR (1, chr);
1555   SCM_ASSERT (scm_is_false (proc)
1556               || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1557               proc, SCM_ARG2, FUNC_NAME);
1558
1559   /* Check if chr is already in the alist.  */
1560   this = *scm_read_hash_procedures;
1561   prev = SCM_BOOL_F;
1562   while (1)
1563     {
1564       if (scm_is_null (this))
1565         {
1566           /* not found, so add it to the beginning.  */
1567           if (scm_is_true (proc))
1568             {
1569               *scm_read_hash_procedures =
1570                 scm_cons (scm_cons (chr, proc), *scm_read_hash_procedures);
1571             }
1572           break;
1573         }
1574       if (scm_is_eq (chr, SCM_CAAR (this)))
1575         {
1576           /* already in the alist.  */
1577           if (scm_is_false (proc))
1578             {
1579               /* remove it.  */
1580               if (scm_is_false (prev))
1581                 {
1582                   *scm_read_hash_procedures =
1583                     SCM_CDR (*scm_read_hash_procedures);
1584                 }
1585               else
1586                 scm_set_cdr_x (prev, SCM_CDR (this));
1587             }
1588           else
1589             {
1590               /* replace it.  */
1591               scm_set_cdr_x (SCM_CAR (this), proc);
1592             }
1593           break;
1594         }
1595       prev = this;
1596       this = SCM_CDR (this);
1597     }
1598
1599   return SCM_UNSPECIFIED;
1600 }
1601 #undef FUNC_NAME
1602
1603 /* Recover the read-hash procedure corresponding to char c.  */
1604 static SCM
1605 scm_get_hash_procedure (int c)
1606 {
1607   SCM rest = *scm_read_hash_procedures;
1608
1609   while (1)
1610     {
1611       if (scm_is_null (rest))
1612         return SCM_BOOL_F;
1613
1614       if (SCM_CHAR (SCM_CAAR (rest)) == c)
1615         return SCM_CDAR (rest);
1616
1617       rest = SCM_CDR (rest);
1618     }
1619 }
1620
1621 #define SCM_ENCODING_SEARCH_SIZE (500)
1622
1623 /* Search the first few hundred characters of a file for an Emacs-like coding
1624    declaration.  Returns either NULL or a string whose storage has been
1625    allocated with `scm_gc_malloc ()'.  */
1626 char *
1627 scm_i_scan_for_encoding (SCM port)
1628 {
1629   char header[SCM_ENCODING_SEARCH_SIZE+1];
1630   size_t bytes_read, encoding_length, i;
1631   char *encoding = NULL;
1632   int utf8_bom = 0;
1633   char *pos, *encoding_start;
1634   int in_comment;
1635
1636   if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1637     /* PORT is a non-seekable file port (e.g., as created by Bash when using
1638        "guile <(echo '(display "hello")')") so bail out.  */
1639     return NULL;
1640
1641   bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1642
1643   scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1644
1645   if (bytes_read > 3
1646       && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1647     utf8_bom = 1;
1648
1649   /* search past "coding[:=]" */
1650   pos = header;
1651   while (1)
1652     {
1653       if ((pos = strstr(pos, "coding")) == NULL)
1654         return NULL;
1655
1656       pos += strlen("coding");
1657       if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1658           (*pos == ':' || *pos == '='))
1659         {
1660           pos ++;
1661           break;
1662         }
1663     }
1664
1665   /* skip spaces */
1666   while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1667          (*pos == ' ' || *pos == '\t'))
1668     pos ++;
1669
1670   /* grab the next token */
1671   encoding_start = pos;
1672   i = 0;
1673   while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
1674          && encoding_start + i - header < bytes_read
1675          && (isalnum ((int) encoding_start[i])
1676              || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
1677     i++;
1678
1679   encoding_length = i;
1680   if (encoding_length == 0)
1681     return NULL;
1682
1683   encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
1684   for (i = 0; i < encoding_length; i++)
1685     encoding[i] = toupper ((int) encoding[i]);
1686
1687   /* push backwards to make sure we were in a comment */
1688   in_comment = 0;
1689   pos = encoding_start;
1690   while (pos >= header)
1691     {
1692       if (*pos == '\n')
1693         {
1694           /* This wasn't in a semicolon comment. Check for a
1695            hash-bang comment. */
1696           char *beg = strstr (header, "#!");
1697           char *end = strstr (header, "!#");
1698           if (beg < encoding_start && encoding_start + encoding_length < end)
1699             in_comment = 1;
1700           break;
1701         }
1702       if (*pos == ';')
1703         {
1704           in_comment = 1;
1705           break;
1706         }
1707       pos --;
1708     }
1709   if (!in_comment)
1710     /* This wasn't in a comment */
1711     return NULL;
1712
1713   if (utf8_bom && strcmp(encoding, "UTF-8"))
1714     scm_misc_error (NULL,
1715                     "the port input declares the encoding ~s but is encoded as UTF-8",
1716                     scm_list_1 (scm_from_locale_string (encoding)));
1717
1718   return encoding;
1719 }
1720
1721 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1722             (SCM port),
1723             "Scans the port for an Emacs-like character coding declaration\n"
1724             "near the top of the contents of a port with random-acessible contents.\n"
1725             "The coding declaration is of the form\n"
1726             "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1727             "\n"
1728             "Returns a string containing the character encoding of the file\n"
1729             "if a declaration was found, or @code{#f} otherwise.\n")
1730 #define FUNC_NAME s_scm_file_encoding
1731 {
1732   char *enc;
1733   SCM s_enc;
1734
1735   enc = scm_i_scan_for_encoding (port);
1736   if (enc == NULL)
1737     return SCM_BOOL_F;
1738   else
1739     {
1740       s_enc = scm_from_locale_string (enc);
1741       return s_enc;
1742     }
1743
1744   return SCM_BOOL_F;
1745 }
1746 #undef FUNC_NAME
1747
1748 void
1749 scm_init_read ()
1750 {
1751   scm_read_hash_procedures =
1752     SCM_VARIABLE_LOC (scm_c_define ("read-hash-procedures", SCM_EOL));
1753
1754   scm_init_opts (scm_read_options, scm_read_opts);
1755 #include "libguile/read.x"
1756 }
1757
1758 /*
1759   Local Variables:
1760   c-file-style: "gnu"
1761   End:
1762 */