libguile/read.c

   1 /* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
   2  * Foundation, Inc.
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public License
   6  * as published by the Free Software Foundation; either version 3 of
   7  * the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful, but
  10  * WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  17  * 02110-1301 USA
  18  */
  19
  20
  21 \f
  22
  23 #ifdef HAVE_CONFIG_H
  24 # include <config.h>
  25 #endif
  26
  27 #include <stdio.h>
  28 #include <ctype.h>
  29 #include <string.h>
  30 #include <unistd.h>
  31 #include <unicase.h>
  32 #include <unictype.h>
  33
  34 #include "libguile/_scm.h"
  35 #include "libguile/bytevectors.h"
  36 #include "libguile/chars.h"
  37 #include "libguile/eval.h"
  38 #include "libguile/arrays.h"
  39 #include "libguile/bitvectors.h"
  40 #include "libguile/keywords.h"
  41 #include "libguile/alist.h"
  42 #include "libguile/srcprop.h"
  43 #include "libguile/hashtab.h"
  44 #include "libguile/hash.h"
  45 #include "libguile/ports.h"
  46 #include "libguile/fports.h"
  47 #include "libguile/root.h"
  48 #include "libguile/strings.h"
  49 #include "libguile/strports.h"
  50 #include "libguile/vectors.h"
  51 #include "libguile/validate.h"
  52 #include "libguile/srfi-4.h"
  53 #include "libguile/srfi-13.h"
  54
  55 #include "libguile/read.h"
  56 #include "libguile/private-options.h"
  57
  58
  59 \f
  60
  61 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
  62 SCM_SYMBOL (scm_keyword_prefix, "prefix");
  63 SCM_SYMBOL (scm_keyword_postfix, "postfix");
  64 SCM_SYMBOL (sym_nil, "nil");
  65
  66 scm_t_option scm_read_opts[] = {
  67   { SCM_OPTION_BOOLEAN, "copy", 0,
  68     "Copy source code expressions." },
  69   { SCM_OPTION_BOOLEAN, "positions", 1,
  70     "Record positions of source code expressions." },
  71   { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
  72     "Convert symbols to lower case."},
  73   { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F,
  74     "Style of keyword recognition: #f, 'prefix or 'postfix."},
  75   { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
  76     "Use R6RS variable-length character and string hex escapes."},
  77   { SCM_OPTION_BOOLEAN, "square-brackets", 1,
  78     "Treat `[' and `]' as parentheses, for R6RS compatibility."},
  79   { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
  80     "In strings, consume leading whitespace after an escaped end-of-line."},
  81   { 0, },
  82 };
  83
  84 /*
  85   Give meaningful error messages for errors
  86
  87   We use the format
  88
  89   FILE:LINE:COL: MESSAGE
  90   This happened in ....
  91
  92   This is not standard GNU format, but the test-suite likes the real
  93   message to be in front.
  94
  95  */
  96
  97
  98 void
  99 scm_i_input_error (char const *function,
 100                    SCM port, const char *message, SCM arg)
 101 {
 102   SCM fn = (scm_is_string (SCM_FILENAME(port))
 103             ? SCM_FILENAME(port)
 104             : scm_from_locale_string ("#<unknown port>"));
 105
 106   SCM string_port = scm_open_output_string ();
 107   SCM string = SCM_EOL;
 108   scm_simple_format (string_port,
 109                      scm_from_locale_string ("~A:~S:~S: ~A"),
 110                      scm_list_4 (fn,
 111                                  scm_from_long (SCM_LINUM (port) + 1),
 112                                  scm_from_int (SCM_COL (port) + 1),
 113                                  scm_from_locale_string (message)));
 114
 115   string = scm_get_output_string (string_port);
 116   scm_close_output_port (string_port);
 117   scm_error_scm (scm_from_latin1_symbol ("read-error"),
 118                  function? scm_from_locale_string (function) : SCM_BOOL_F,
 119                  string,
 120                  arg,
 121                  SCM_BOOL_F);
 122 }
 123
 124
 125 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
 126             (SCM setting),
 127             "Option interface for the read options. Instead of using\n"
 128             "this procedure directly, use the procedures @code{read-enable},\n"
 129             "@code{read-disable}, @code{read-set!} and @code{read-options}.")
 130 #define FUNC_NAME s_scm_read_options
 131 {
 132   SCM ans = scm_options (setting,
 133                          scm_read_opts,
 134                          FUNC_NAME);
 135   if (SCM_COPY_SOURCE_P)
 136     SCM_RECORD_POSITIONS_P = 1;
 137   return ans;
 138 }
 139 #undef FUNC_NAME
 140
 141 /* A fluid referring to an association list mapping extra hash
 142    characters to procedures.  */
 143 static SCM *scm_i_read_hash_procedures;
 144
 145 static inline SCM
 146 scm_i_read_hash_procedures_ref (void)
 147 {
 148   return scm_fluid_ref (*scm_i_read_hash_procedures);
 149 }
 150
 151 static inline void
 152 scm_i_read_hash_procedures_set_x (SCM value)
 153 {
 154   scm_fluid_set_x (*scm_i_read_hash_procedures, value);
 155 }
 156
 157 \f
 158 /* Token readers.  */
 159
 160
 161 /* Size of the C buffer used to read symbols and numbers.  */
 162 #define READER_BUFFER_SIZE            128
 163
 164 /* Size of the C buffer used to read strings.  */
 165 #define READER_STRING_BUFFER_SIZE     512
 166
 167 /* The maximum size of Scheme character names.  */
 168 #define READER_CHAR_NAME_MAX_SIZE      50
 169
 170
 171 /* `isblank' is only in C99.  */
 172 #define CHAR_IS_BLANK_(_chr)                                    \
 173   (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n')      \
 174    || ((_chr) == '\f') || ((_chr) == '\r'))
 175
 176 #ifdef MSDOS
 177 # define CHAR_IS_BLANK(_chr)                    \
 178   ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
 179 #else
 180 # define CHAR_IS_BLANK CHAR_IS_BLANK_
 181 #endif
 182
 183
 184 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
 185    structure'').  */
 186 #define CHAR_IS_R5RS_DELIMITER(c)                               \
 187   (CHAR_IS_BLANK (c)                                            \
 188    || (c == ')') || (c == '(') || (c == ';') || (c == '"')      \
 189    || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
 190
 191 #define CHAR_IS_DELIMITER  CHAR_IS_R5RS_DELIMITER
 192
 193 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
 194    Structure''.  */
 195 #define CHAR_IS_EXPONENT_MARKER(_chr)                           \
 196   (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f')        \
 197    || ((_chr) == 'd') || ((_chr) == 'l'))
 198
 199 /* Read an SCSH block comment.  */
 200 static inline SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
 201 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
 202 static SCM scm_read_commented_expression (scm_t_wchar, SCM);
 203 static SCM scm_read_shebang (scm_t_wchar, SCM);
 204 static SCM scm_get_hash_procedure (int);
 205
 206 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 207    result in the pre-allocated buffer BUF.  Return zero if the whole token has
 208    fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
 209    bytes actually read.  */
 210 static inline int
 211 read_token (SCM port, char *buf, const size_t buf_size, size_t *read)
 212  {
 213    *read = 0;
 214
 215    while (*read < buf_size)
 216      {
 217        int chr;
 218
 219        chr = scm_get_byte_or_eof (port);
 220
 221        if (chr == EOF)
 222         return 0;
 223       else if (CHAR_IS_DELIMITER (chr))
 224         {
 225           scm_unget_byte (chr, port);
 226           return 0;
 227         }
 228       else
 229         {
 230           *buf = (char) chr;
 231           buf++, (*read)++;
 232         }
 233      }
 234
 235    return 1;
 236  }
 237
 238 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 239    result in the pre-allocated buffer BUFFER, if the whole token has fewer than
 240    BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
 241    caller.  Return zero if the token fits in BUFFER, non-zero otherwise. READ
 242    will be set the number of bytes actually read.  */
 243 static int
 244 read_complete_token (SCM port, char *buffer, const size_t buffer_size,
 245                            char **overflow_buffer, size_t *read)
 246 {
 247   int overflow = 0;
 248   size_t bytes_read, overflow_size;
 249
 250   *overflow_buffer = NULL;
 251   overflow_size = 0;
 252
 253   do
 254     {
 255       overflow = read_token (port, buffer, buffer_size, &bytes_read);
 256       if (bytes_read == 0)
 257         break;
 258       if (overflow || overflow_size != 0)
 259         {
 260           if (overflow_size == 0)
 261             {
 262               *overflow_buffer = scm_malloc (bytes_read);
 263               memcpy (*overflow_buffer, buffer, bytes_read);
 264               overflow_size = bytes_read;
 265             }
 266           else
 267             {
 268               *overflow_buffer = scm_realloc (*overflow_buffer, overflow_size + bytes_read);
 269               memcpy (*overflow_buffer + overflow_size, buffer, bytes_read);
 270               overflow_size += bytes_read;
 271             }
 272         }
 273     }
 274   while (overflow);
 275
 276   if (overflow_size)
 277     *read = overflow_size;
 278   else
 279     *read = bytes_read;
 280
 281   return (overflow_size != 0);
 282 }
 283
 284 /* Skip whitespace from PORT and return the first non-whitespace character
 285    read.  Raise an error on end-of-file.  */
 286 static int
 287 flush_ws (SCM port, const char *eoferr)
 288 {
 289   register scm_t_wchar c;
 290   while (1)
 291     switch (c = scm_getc (port))
 292       {
 293       case EOF:
 294       goteof:
 295         if (eoferr)
 296           {
 297             scm_i_input_error (eoferr,
 298                                port,
 299                                "end of file",
 300                                SCM_EOL);
 301           }
 302         return c;
 303
 304       case ';':
 305       lp:
 306         switch (c = scm_getc (port))
 307           {
 308           case EOF:
 309             goto goteof;
 310           default:
 311             goto lp;
 312           case SCM_LINE_INCREMENTORS:
 313             break;
 314           }
 315         break;
 316
 317       case '#':
 318         switch (c = scm_getc (port))
 319           {
 320           case EOF:
 321             eoferr = "read_sharp";
 322             goto goteof;
 323           case '!':
 324             scm_read_shebang (c, port);
 325             break;
 326           case ';':
 327             scm_read_commented_expression (c, port);
 328             break;
 329           case '|':
 330             if (scm_is_false (scm_get_hash_procedure (c)))
 331               {
 332                 scm_read_r6rs_block_comment (c, port);
 333                 break;
 334               }
 335             /* fall through */
 336           default:
 337             scm_ungetc (c, port);
 338             return '#';
 339           }
 340         break;
 341
 342       case SCM_LINE_INCREMENTORS:
 343       case SCM_SINGLE_SPACES:
 344       case '\t':
 345         break;
 346
 347       default:
 348         return c;
 349       }
 350
 351   return 0;
 352 }
 353
 354
 355 \f
 356 /* Token readers.  */
 357
 358 static SCM scm_read_expression (SCM port);
 359 static SCM scm_read_sharp (int chr, SCM port);
 360 static SCM recsexpr (SCM obj, long line, int column, SCM filename);
 361
 362
 363 static SCM
 364 scm_read_sexp (scm_t_wchar chr, SCM port)
 365 #define FUNC_NAME "scm_i_lreadparen"
 366 {
 367   register int c;
 368   register SCM tmp;
 369   register SCM tl, ans = SCM_EOL;
 370   SCM tl2 = SCM_EOL, ans2 = SCM_EOL, copy = SCM_BOOL_F;
 371   const int terminating_char = ((chr == '[') ? ']' : ')');
 372
 373   /* Need to capture line and column numbers here. */
 374   long line = SCM_LINUM (port);
 375   int column = SCM_COL (port) - 1;
 376
 377
 378   c = flush_ws (port, FUNC_NAME);
 379   if (terminating_char == c)
 380     return SCM_EOL;
 381
 382   scm_ungetc (c, port);
 383   if (scm_is_eq (scm_sym_dot,
 384                  (tmp = scm_read_expression (port))))
 385     {
 386       ans = scm_read_expression (port);
 387       if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
 388         scm_i_input_error (FUNC_NAME, port, "missing close paren",
 389                            SCM_EOL);
 390       return ans;
 391     }
 392
 393   /* Build the head of the list structure. */
 394   ans = tl = scm_cons (tmp, SCM_EOL);
 395
 396   if (SCM_COPY_SOURCE_P)
 397     ans2 = tl2 = scm_cons (scm_is_pair (tmp)
 398                            ? copy
 399                            : tmp,
 400                            SCM_EOL);
 401
 402   while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
 403     {
 404       SCM new_tail;
 405
 406       if (c == ')' || (SCM_SQUARE_BRACKETS_P && c == ']'))
 407         scm_i_input_error (FUNC_NAME, port,
 408                            "in pair: mismatched close paren: ~A",
 409                            scm_list_1 (SCM_MAKE_CHAR (c)));
 410
 411       scm_ungetc (c, port);
 412       tmp = scm_read_expression (port);
 413
 414       if (scm_is_eq (scm_sym_dot, tmp))
 415         {
 416           SCM_SETCDR (tl, tmp = scm_read_expression (port));
 417
 418           if (SCM_COPY_SOURCE_P)
 419             SCM_SETCDR (tl2, scm_cons (scm_is_pair (tmp) ? copy : tmp,
 420                                        SCM_EOL));
 421
 422           c = flush_ws (port, FUNC_NAME);
 423           if (terminating_char != c)
 424             scm_i_input_error (FUNC_NAME, port,
 425                                "in pair: missing close paren", SCM_EOL);
 426           goto exit;
 427         }
 428
 429       new_tail = scm_cons (tmp, SCM_EOL);
 430       SCM_SETCDR (tl, new_tail);
 431       tl = new_tail;
 432
 433       if (SCM_COPY_SOURCE_P)
 434         {
 435           SCM new_tail2 = scm_cons (scm_is_pair (tmp)
 436                                     ? copy
 437                                     : tmp, SCM_EOL);
 438           SCM_SETCDR (tl2, new_tail2);
 439           tl2 = new_tail2;
 440         }
 441     }
 442
 443  exit:
 444   if (SCM_RECORD_POSITIONS_P)
 445     scm_whash_insert (scm_source_whash,
 446                       ans,
 447                       scm_make_srcprops (line, column,
 448                                          SCM_FILENAME (port),
 449                                          SCM_COPY_SOURCE_P
 450                                          ? ans2
 451                                          : SCM_UNDEFINED,
 452                                          SCM_EOL));
 453   return ans;
 454 }
 455 #undef FUNC_NAME
 456
 457
 458 /* Read a hexadecimal number NDIGITS in length.  Put its value into the variable
 459    C.  If TERMINATOR is non-null, terminate early if the TERMINATOR character is
 460    found.  */
 461 #define SCM_READ_HEX_ESCAPE(ndigits, terminator)                   \
 462   do                                                               \
 463     {                                                              \
 464       scm_t_wchar a;                                               \
 465       size_t i = 0;                                                \
 466       c = 0;                                                       \
 467       while (i < ndigits)                                          \
 468         {                                                          \
 469           a = scm_getc (port);                                     \
 470           if (a == EOF)                                            \
 471             goto str_eof;                                          \
 472           if (terminator                                           \
 473               && (a == (scm_t_wchar) terminator)                   \
 474               && (i > 0))                                          \
 475             break;                                                 \
 476           if ('0' <= a && a <= '9')                                \
 477             a -= '0';                                              \
 478           else if ('A' <= a && a <= 'F')                           \
 479             a = a - 'A' + 10;                                      \
 480           else if ('a' <= a && a <= 'f')                           \
 481             a = a - 'a' + 10;                                      \
 482           else                                                     \
 483             {                                                      \
 484               c = a;                                               \
 485               goto bad_escaped;                                    \
 486             }                                                      \
 487           c = c * 16 + a;                                          \
 488           i ++;                                                    \
 489         }                                                          \
 490     } while (0)
 491
 492 static void
 493 skip_intraline_whitespace (SCM port)
 494 {
 495   scm_t_wchar c;
 496
 497   do
 498     {
 499       c = scm_getc (port);
 500       if (c == EOF)
 501         return;
 502     }
 503   while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
 504
 505   scm_ungetc (c, port);
 506 }
 507
 508 static SCM
 509 scm_read_string (int chr, SCM port)
 510 #define FUNC_NAME "scm_lreadr"
 511 {
 512   /* For strings smaller than C_STR, this function creates only one Scheme
 513      object (the string returned).  */
 514
 515   SCM str = SCM_BOOL_F;
 516   unsigned c_str_len = 0;
 517   scm_t_wchar c;
 518
 519   str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
 520   while ('"' != (c = scm_getc (port)))
 521     {
 522       if (c == EOF)
 523         {
 524         str_eof:
 525           scm_i_input_error (FUNC_NAME, port,
 526                              "end of file in string constant", SCM_EOL);
 527         }
 528
 529       if (c_str_len + 1 >= scm_i_string_length (str))
 530         {
 531           SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
 532
 533           str = scm_string_append (scm_list_2 (str, addy));
 534         }
 535
 536       if (c == '\\')
 537         {
 538           switch (c = scm_getc (port))
 539             {
 540             case EOF:
 541               goto str_eof;
 542             case '"':
 543             case '\\':
 544               break;
 545             case '\n':
 546               if (SCM_HUNGRY_EOL_ESCAPES_P)
 547                 skip_intraline_whitespace (port);
 548               continue;
 549             case '0':
 550               c = '\0';
 551               break;
 552             case 'f':
 553               c = '\f';
 554               break;
 555             case 'n':
 556               c = '\n';
 557               break;
 558             case 'r':
 559               c = '\r';
 560               break;
 561             case 't':
 562               c = '\t';
 563               break;
 564             case 'a':
 565               c = '\007';
 566               break;
 567             case 'v':
 568               c = '\v';
 569               break;
 570             case 'b':
 571               c = '\010';
 572               break;
 573             case 'x':
 574               if (SCM_R6RS_ESCAPES_P)
 575                 SCM_READ_HEX_ESCAPE (10, ';');
 576               else
 577                 SCM_READ_HEX_ESCAPE (2, '\0');
 578               break;
 579             case 'u':
 580               if (!SCM_R6RS_ESCAPES_P)
 581                 {
 582                   SCM_READ_HEX_ESCAPE (4, '\0');
 583                   break;
 584                 }
 585             case 'U':
 586               if (!SCM_R6RS_ESCAPES_P)
 587                 {
 588                   SCM_READ_HEX_ESCAPE (6, '\0');
 589                   break;
 590                 }
 591             default:
 592             bad_escaped:
 593               scm_i_input_error (FUNC_NAME, port,
 594                                  "illegal character in escape sequence: ~S",
 595                                  scm_list_1 (SCM_MAKE_CHAR (c)));
 596             }
 597         }
 598       str = scm_i_string_start_writing (str);
 599       scm_i_string_set_x (str, c_str_len++, c);
 600       scm_i_string_stop_writing ();
 601     }
 602
 603   if (c_str_len > 0)
 604     {
 605       return scm_i_substring_copy (str, 0, c_str_len);
 606     }
 607
 608   return scm_nullstr;
 609 }
 610 #undef FUNC_NAME
 611
 612
 613 static SCM
 614 scm_read_number (scm_t_wchar chr, SCM port)
 615 {
 616   SCM result, str = SCM_EOL;
 617   char buffer[READER_BUFFER_SIZE];
 618   char *overflow_buffer = NULL;
 619   size_t bytes_read;
 620   int overflow;
 621   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 622
 623   scm_ungetc (chr, port);
 624   overflow = read_complete_token (port, buffer, sizeof (buffer),
 625                                   &overflow_buffer, &bytes_read);
 626
 627   if (!overflow)
 628     str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
 629   else
 630     str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
 631                             pt->ilseq_handler);
 632
 633   result = scm_string_to_number (str, SCM_UNDEFINED);
 634   if (!scm_is_true (result))
 635     {
 636       /* Return a symbol instead of a number */
 637       if (SCM_CASE_INSENSITIVE_P)
 638         str = scm_string_downcase_x (str);
 639       result = scm_string_to_symbol (str);
 640     }
 641
 642   if (overflow)
 643     free (overflow_buffer);
 644   SCM_COL (port) += scm_i_string_length (str);
 645   return result;
 646 }
 647
 648 static SCM
 649 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
 650 {
 651   SCM result;
 652   int ends_with_colon = 0;
 653   size_t bytes_read;
 654   int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
 655   int overflow;
 656   char buffer[READER_BUFFER_SIZE], *overflow_buffer;
 657   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 658   SCM str;
 659
 660   scm_ungetc (chr, port);
 661   overflow = read_complete_token (port, buffer, READER_BUFFER_SIZE,
 662                                   &overflow_buffer, &bytes_read);
 663   if (bytes_read > 0)
 664     {
 665       if (!overflow)
 666         ends_with_colon = buffer[bytes_read - 1] == ':';
 667       else
 668         ends_with_colon = overflow_buffer[bytes_read - 1] == ':';
 669     }
 670
 671   if (postfix && ends_with_colon && (bytes_read > 1))
 672     {
 673       if (!overflow)
 674         str = scm_from_stringn (buffer, bytes_read - 1, pt->encoding, pt->ilseq_handler);
 675       else
 676         str = scm_from_stringn (overflow_buffer, bytes_read - 1, pt->encoding,
 677                                 pt->ilseq_handler);
 678
 679       if (SCM_CASE_INSENSITIVE_P)
 680         str = scm_string_downcase_x (str);
 681       result = scm_symbol_to_keyword (scm_string_to_symbol (str));
 682     }
 683   else
 684     {
 685       if (!overflow)
 686         str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
 687       else
 688         str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
 689                                 pt->ilseq_handler);
 690
 691       if (SCM_CASE_INSENSITIVE_P)
 692         str = scm_string_downcase_x (str);
 693       result = scm_string_to_symbol (str);
 694     }
 695
 696   if (overflow)
 697     free (overflow_buffer);
 698   SCM_COL (port) += scm_i_string_length (str);
 699   return result;
 700 }
 701
 702 static SCM
 703 scm_read_number_and_radix (scm_t_wchar chr, SCM port)
 704 #define FUNC_NAME "scm_lreadr"
 705 {
 706   SCM result;
 707   size_t read;
 708   char buffer[READER_BUFFER_SIZE], *overflow_buffer;
 709   int overflow;
 710   unsigned int radix;
 711   SCM str;
 712   scm_t_port *pt;
 713
 714   switch (chr)
 715     {
 716     case 'B':
 717     case 'b':
 718       radix = 2;
 719       break;
 720
 721     case 'o':
 722     case 'O':
 723       radix = 8;
 724       break;
 725
 726     case 'd':
 727     case 'D':
 728       radix = 10;
 729       break;
 730
 731     case 'x':
 732     case 'X':
 733       radix = 16;
 734       break;
 735
 736     default:
 737       scm_ungetc (chr, port);
 738       scm_ungetc ('#', port);
 739       radix = 10;
 740     }
 741
 742   overflow = read_complete_token (port, buffer, sizeof (buffer),
 743                                   &overflow_buffer, &read);
 744
 745   pt = SCM_PTAB_ENTRY (port);
 746   if (!overflow)
 747     str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
 748   else
 749     str = scm_from_stringn (overflow_buffer, read, pt->encoding,
 750                             pt->ilseq_handler);
 751
 752   result = scm_string_to_number (str, scm_from_uint (radix));
 753
 754   if (overflow)
 755     free (overflow_buffer);
 756
 757   SCM_COL (port) += scm_i_string_length (str);
 758
 759   if (scm_is_true (result))
 760     return result;
 761
 762   scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
 763
 764   return SCM_BOOL_F;
 765 }
 766 #undef FUNC_NAME
 767
 768 static SCM
 769 scm_read_quote (int chr, SCM port)
 770 {
 771   SCM p;
 772   long line = SCM_LINUM (port);
 773   int column = SCM_COL (port) - 1;
 774
 775   switch (chr)
 776     {
 777     case '`':
 778       p = scm_sym_quasiquote;
 779       break;
 780
 781     case '\'':
 782       p = scm_sym_quote;
 783       break;
 784
 785     case ',':
 786       {
 787         scm_t_wchar c;
 788
 789         c = scm_getc (port);
 790         if ('@' == c)
 791           p = scm_sym_uq_splicing;
 792         else
 793           {
 794             scm_ungetc (c, port);
 795             p = scm_sym_unquote;
 796           }
 797         break;
 798       }
 799
 800     default:
 801       fprintf (stderr, "%s: unhandled quote character (%i)\n",
 802                "scm_read_quote", chr);
 803       abort ();
 804     }
 805
 806   p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
 807   if (SCM_RECORD_POSITIONS_P)
 808     scm_whash_insert (scm_source_whash, p,
 809                       scm_make_srcprops (line, column,
 810                                          SCM_FILENAME (port),
 811                                          SCM_COPY_SOURCE_P
 812                                          ? (scm_cons2 (SCM_CAR (p),
 813                                                        SCM_CAR (SCM_CDR (p)),
 814                                                        SCM_EOL))
 815                                          : SCM_UNDEFINED,
 816                                          SCM_EOL));
 817
 818
 819   return p;
 820 }
 821
 822 SCM_SYMBOL (sym_syntax, "syntax");
 823 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
 824 SCM_SYMBOL (sym_unsyntax, "unsyntax");
 825 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
 826
 827 static SCM
 828 scm_read_syntax (int chr, SCM port)
 829 {
 830   SCM p;
 831   long line = SCM_LINUM (port);
 832   int column = SCM_COL (port) - 1;
 833
 834   switch (chr)
 835     {
 836     case '`':
 837       p = sym_quasisyntax;
 838       break;
 839
 840     case '\'':
 841       p = sym_syntax;
 842       break;
 843
 844     case ',':
 845       {
 846         int c;
 847
 848         c = scm_getc (port);
 849         if ('@' == c)
 850           p = sym_unsyntax_splicing;
 851         else
 852           {
 853             scm_ungetc (c, port);
 854             p = sym_unsyntax;
 855           }
 856         break;
 857       }
 858
 859     default:
 860       fprintf (stderr, "%s: unhandled syntax character (%i)\n",
 861                "scm_read_syntax", chr);
 862       abort ();
 863     }
 864
 865   p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
 866   if (SCM_RECORD_POSITIONS_P)
 867     scm_whash_insert (scm_source_whash, p,
 868                       scm_make_srcprops (line, column,
 869                                          SCM_FILENAME (port),
 870                                          SCM_COPY_SOURCE_P
 871                                          ? (scm_cons2 (SCM_CAR (p),
 872                                                        SCM_CAR (SCM_CDR (p)),
 873                                                        SCM_EOL))
 874                                          : SCM_UNDEFINED,
 875                                          SCM_EOL));
 876
 877
 878   return p;
 879 }
 880
 881 static inline SCM
 882 scm_read_nil (int chr, SCM port)
 883 {
 884   SCM id = scm_read_mixed_case_symbol (chr, port);
 885
 886   if (!scm_is_eq (id, sym_nil))
 887     scm_i_input_error ("scm_read_nil", port,
 888                        "unexpected input while reading #nil: ~a",
 889                        scm_list_1 (id));
 890
 891   return SCM_ELISP_NIL;
 892 }
 893
 894 static inline SCM
 895 scm_read_semicolon_comment (int chr, SCM port)
 896 {
 897   int c;
 898
 899   /* We use the get_byte here because there is no need to get the
 900      locale correct with comment input. This presumes that newline
 901      always represents itself no matter what the encoding is.  */
 902   for (c = scm_get_byte_or_eof (port);
 903        (c != EOF) && (c != '\n');
 904        c = scm_get_byte_or_eof (port));
 905
 906   return SCM_UNSPECIFIED;
 907 }
 908
 909 \f
 910 /* Sharp readers, i.e. readers called after a `#' sign has been read.  */
 911
 912 static SCM
 913 scm_read_boolean (int chr, SCM port)
 914 {
 915   switch (chr)
 916     {
 917     case 't':
 918     case 'T':
 919       return SCM_BOOL_T;
 920
 921     case 'f':
 922     case 'F':
 923       return SCM_BOOL_F;
 924     }
 925
 926   return SCM_UNSPECIFIED;
 927 }
 928
 929 static SCM
 930 scm_read_character (scm_t_wchar chr, SCM port)
 931 #define FUNC_NAME "scm_lreadr"
 932 {
 933   char buffer[READER_CHAR_NAME_MAX_SIZE];
 934   SCM charname;
 935   size_t charname_len, bytes_read;
 936   scm_t_wchar cp;
 937   int overflow;
 938   scm_t_port *pt;
 939
 940   overflow = read_token (port, buffer, READER_CHAR_NAME_MAX_SIZE, &bytes_read);
 941   if (overflow)
 942     scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
 943
 944   if (bytes_read == 0)
 945     {
 946       chr = scm_getc (port);
 947       if (chr == EOF)
 948         scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
 949                            "while reading character", SCM_EOL);
 950
 951       /* CHR must be a token delimiter, like a whitespace.  */
 952       return (SCM_MAKE_CHAR (chr));
 953     }
 954
 955   pt = SCM_PTAB_ENTRY (port);
 956
 957   /* Simple ASCII characters can be processed immediately.  Also, simple
 958      ISO-8859-1 characters can be processed immediately if the encoding for this
 959      port is ISO-8859-1.  */
 960   if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
 961     {
 962       SCM_COL (port) += 1;
 963       return SCM_MAKE_CHAR (buffer[0]);
 964     }
 965
 966   /* Otherwise, convert the buffer into a proper scheme string for
 967      processing.  */
 968   charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
 969                                pt->ilseq_handler);
 970   charname_len = scm_i_string_length (charname);
 971   SCM_COL (port) += charname_len;
 972   cp = scm_i_string_ref (charname, 0);
 973   if (charname_len == 1)
 974     return SCM_MAKE_CHAR (cp);
 975
 976   /* Ignore dotted circles, which may be used to keep combining characters from
 977      combining with the backslash in #\charname.  */
 978   if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
 979     return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
 980
 981   if (cp >= '0' && cp < '8')
 982     {
 983       /* Dirk:FIXME::  This type of character syntax is not R5RS
 984        * compliant.  Further, it should be verified that the constant
 985        * does only consist of octal digits.  */
 986       SCM p = scm_string_to_number (charname, scm_from_uint (8));
 987       if (SCM_I_INUMP (p))
 988         {
 989           scm_t_wchar c = scm_to_uint32 (p);
 990           if (SCM_IS_UNICODE_CHAR (c))
 991             return SCM_MAKE_CHAR (c);
 992           else
 993             scm_i_input_error (FUNC_NAME, port,
 994                                "out-of-range octal character escape: ~a",
 995                                scm_list_1 (charname));
 996         }
 997     }
 998
 999   if (cp == 'x' && (charname_len > 1))
1000     {
1001       SCM p;
1002
1003       /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1004       p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
1005                                 scm_from_uint (16));
1006       if (SCM_I_INUMP (p))
1007         {
1008           scm_t_wchar c = scm_to_uint32 (p);
1009           if (SCM_IS_UNICODE_CHAR (c))
1010             return SCM_MAKE_CHAR (c);
1011           else
1012             scm_i_input_error (FUNC_NAME, port,
1013                                "out-of-range hex character escape: ~a",
1014                                scm_list_1 (charname));
1015         }
1016     }
1017
1018   /* The names of characters should never have non-Latin1
1019      characters.  */
1020   if (scm_i_is_narrow_string (charname)
1021       || scm_i_try_narrow_string (charname))
1022     { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
1023                                        charname_len);
1024       if (scm_is_true (ch))
1025         return ch;
1026     }
1027
1028   scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
1029                      scm_list_1 (charname));
1030
1031   return SCM_UNSPECIFIED;
1032 }
1033 #undef FUNC_NAME
1034
1035 static inline SCM
1036 scm_read_keyword (int chr, SCM port)
1037 {
1038   SCM symbol;
1039
1040   /* Read the symbol that comprises the keyword.  Doing this instead of
1041      invoking a specific symbol reader function allows `scm_read_keyword ()'
1042      to adapt to the delimiters currently valid of symbols.
1043
1044      XXX: This implementation allows sloppy syntaxes like `#:  key'.  */
1045   symbol = scm_read_expression (port);
1046   if (!scm_is_symbol (symbol))
1047     scm_i_input_error ("scm_read_keyword", port,
1048                        "keyword prefix `~a' not followed by a symbol: ~s",
1049                        scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1050
1051   return (scm_symbol_to_keyword (symbol));
1052 }
1053
1054 static inline SCM
1055 scm_read_vector (int chr, SCM port)
1056 {
1057   /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1058      guarantee that it's going to do what we want.  After all, this is an
1059      implementation detail of `scm_read_vector ()', not a desirable
1060      property.  */
1061   return (scm_vector (scm_read_sexp (chr, port)));
1062 }
1063
1064 static inline SCM
1065 scm_read_srfi4_vector (int chr, SCM port)
1066 {
1067   return scm_i_read_array (port, chr);
1068 }
1069
1070 static SCM
1071 scm_read_bytevector (scm_t_wchar chr, SCM port)
1072 {
1073   chr = scm_getc (port);
1074   if (chr != 'u')
1075     goto syntax;
1076
1077   chr = scm_getc (port);
1078   if (chr != '8')
1079     goto syntax;
1080
1081   chr = scm_getc (port);
1082   if (chr != '(')
1083     goto syntax;
1084
1085   return scm_u8_list_to_bytevector (scm_read_sexp (chr, port));
1086
1087  syntax:
1088   scm_i_input_error ("read_bytevector", port,
1089                      "invalid bytevector prefix",
1090                      SCM_MAKE_CHAR (chr));
1091   return SCM_UNSPECIFIED;
1092 }
1093
1094 static SCM
1095 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port)
1096 {
1097   /* Read the `#*10101'-style read syntax for bit vectors in Guile.  This is
1098      terribly inefficient but who cares?  */
1099   SCM s_bits = SCM_EOL;
1100
1101   for (chr = scm_getc (port);
1102        (chr != EOF) && ((chr == '0') || (chr == '1'));
1103        chr = scm_getc (port))
1104     {
1105       s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1106     }
1107
1108   if (chr != EOF)
1109     scm_ungetc (chr, port);
1110
1111   return scm_bitvector (scm_reverse_x (s_bits, SCM_EOL));
1112 }
1113
1114 static inline SCM
1115 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1116 {
1117   int bang_seen = 0;
1118
1119   for (;;)
1120     {
1121       int c = scm_getc (port);
1122
1123       if (c == EOF)
1124         scm_i_input_error ("skip_block_comment", port,
1125                            "unterminated `#! ... !#' comment", SCM_EOL);
1126
1127       if (c == '!')
1128         bang_seen = 1;
1129       else if (c == '#' && bang_seen)
1130         break;
1131       else
1132         bang_seen = 0;
1133     }
1134
1135   return SCM_UNSPECIFIED;
1136 }
1137
1138 static inline SCM
1139 scm_read_shebang (scm_t_wchar chr, SCM port)
1140 {
1141   int c = 0;
1142   if ((c = scm_get_byte_or_eof (port)) != 'r')
1143     {
1144       scm_ungetc (c, port);
1145       return scm_read_scsh_block_comment (chr, port);
1146     }
1147   if ((c = scm_get_byte_or_eof (port)) != '6')
1148     {
1149       scm_ungetc (c, port);
1150       scm_ungetc ('r', port);
1151       return scm_read_scsh_block_comment (chr, port);
1152     }
1153   if ((c = scm_get_byte_or_eof (port)) != 'r')
1154     {
1155       scm_ungetc (c, port);
1156       scm_ungetc ('6', port);
1157       scm_ungetc ('r', port);
1158       return scm_read_scsh_block_comment (chr, port);
1159     }
1160   if ((c = scm_get_byte_or_eof (port)) != 's')
1161     {
1162       scm_ungetc (c, port);
1163       scm_ungetc ('r', port);
1164       scm_ungetc ('6', port);
1165       scm_ungetc ('r', port);
1166       return scm_read_scsh_block_comment (chr, port);
1167     }
1168
1169   return SCM_UNSPECIFIED;
1170 }
1171
1172 static SCM
1173 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1174 {
1175   /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1176      nested.  So care must be taken.  */
1177   int nesting_level = 1;
1178   int opening_seen = 0, closing_seen = 0;
1179
1180   while (nesting_level > 0)
1181     {
1182       int c = scm_getc (port);
1183
1184       if (c == EOF)
1185         scm_i_input_error ("scm_read_r6rs_block_comment", port,
1186                            "unterminated `#| ... |#' comment", SCM_EOL);
1187
1188       if (opening_seen)
1189         {
1190           if (c == '|')
1191             nesting_level++;
1192           opening_seen = 0;
1193         }
1194       else if (closing_seen)
1195         {
1196           if (c == '#')
1197             nesting_level--;
1198           closing_seen = 0;
1199         }
1200       else if (c == '|')
1201         closing_seen = 1;
1202       else if (c == '#')
1203         opening_seen = 1;
1204       else
1205         opening_seen = closing_seen = 0;
1206     }
1207
1208   return SCM_UNSPECIFIED;
1209 }
1210
1211 static SCM
1212 scm_read_commented_expression (scm_t_wchar chr, SCM port)
1213 {
1214   scm_t_wchar c;
1215
1216   c = flush_ws (port, (char *) NULL);
1217   if (EOF == c)
1218     scm_i_input_error ("read_commented_expression", port,
1219                        "no expression after #; comment", SCM_EOL);
1220   scm_ungetc (c, port);
1221   scm_read_expression (port);
1222   return SCM_UNSPECIFIED;
1223 }
1224
1225 static SCM
1226 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1227 {
1228   /* Guile's extended symbol read syntax looks like this:
1229
1230        #{This is all a symbol name}#
1231
1232      So here, CHR is expected to be `{'.  */
1233   int saw_brace = 0, finished = 0;
1234   size_t len = 0;
1235   SCM buf = scm_i_make_string (1024, NULL, 0);
1236
1237   buf = scm_i_string_start_writing (buf);
1238
1239   while ((chr = scm_getc (port)) != EOF)
1240     {
1241       if (saw_brace)
1242         {
1243           if (chr == '#')
1244             {
1245               finished = 1;
1246               break;
1247             }
1248           else
1249             {
1250               saw_brace = 0;
1251               scm_i_string_set_x (buf, len++, '}');
1252               scm_i_string_set_x (buf, len++, chr);
1253             }
1254         }
1255       else if (chr == '}')
1256         saw_brace = 1;
1257       else
1258         scm_i_string_set_x (buf, len++, chr);
1259
1260       if (len >= scm_i_string_length (buf) - 2)
1261         {
1262           SCM addy;
1263
1264           scm_i_string_stop_writing ();
1265           addy = scm_i_make_string (1024, NULL, 0);
1266           buf = scm_string_append (scm_list_2 (buf, addy));
1267           len = 0;
1268           buf = scm_i_string_start_writing (buf);
1269         }
1270
1271       if (finished)
1272         break;
1273     }
1274   scm_i_string_stop_writing ();
1275
1276   return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1277 }
1278
1279
1280 \f
1281 /* Top-level token readers, i.e., dispatchers.  */
1282
1283 static SCM
1284 scm_read_sharp_extension (int chr, SCM port)
1285 {
1286   SCM proc;
1287
1288   proc = scm_get_hash_procedure (chr);
1289   if (scm_is_true (scm_procedure_p (proc)))
1290     {
1291       long line = SCM_LINUM (port);
1292       int column = SCM_COL (port) - 2;
1293       SCM got;
1294
1295       got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1296       if (!scm_is_eq (got, SCM_UNSPECIFIED))
1297         {
1298           if (SCM_RECORD_POSITIONS_P)
1299             return (recsexpr (got, line, column,
1300                               SCM_FILENAME (port)));
1301           else
1302             return got;
1303         }
1304     }
1305
1306   return SCM_UNSPECIFIED;
1307 }
1308
1309 /* The reader for the sharp `#' character.  It basically dispatches reads
1310    among the above token readers.   */
1311 static SCM
1312 scm_read_sharp (scm_t_wchar chr, SCM port)
1313 #define FUNC_NAME "scm_lreadr"
1314 {
1315   SCM result;
1316
1317   chr = scm_getc (port);
1318
1319   result = scm_read_sharp_extension (chr, port);
1320   if (!scm_is_eq (result, SCM_UNSPECIFIED))
1321     return result;
1322
1323   switch (chr)
1324     {
1325     case '\\':
1326       return (scm_read_character (chr, port));
1327     case '(':
1328       return (scm_read_vector (chr, port));
1329     case 's':
1330     case 'u':
1331     case 'f':
1332       /* This one may return either a boolean or an SRFI-4 vector.  */
1333       return (scm_read_srfi4_vector (chr, port));
1334     case 'v':
1335       return (scm_read_bytevector (chr, port));
1336     case '*':
1337       return (scm_read_guile_bit_vector (chr, port));
1338     case 't':
1339     case 'T':
1340     case 'F':
1341       /* This one may return either a boolean or an SRFI-4 vector.  */
1342       return (scm_read_boolean (chr, port));
1343     case ':':
1344       return (scm_read_keyword (chr, port));
1345     case '0': case '1': case '2': case '3': case '4':
1346     case '5': case '6': case '7': case '8': case '9':
1347     case '@':
1348 #if SCM_ENABLE_DEPRECATED
1349       /* See below for 'i' and 'e'. */
1350     case 'a':
1351     case 'c':
1352     case 'y':
1353     case 'h':
1354     case 'l':
1355 #endif
1356       return (scm_i_read_array (port, chr));
1357
1358     case 'i':
1359     case 'e':
1360 #if SCM_ENABLE_DEPRECATED
1361       {
1362         /* When next char is '(', it really is an old-style
1363            uniform array. */
1364         scm_t_wchar next_c = scm_getc (port);
1365         if (next_c != EOF)
1366           scm_ungetc (next_c, port);
1367         if (next_c == '(')
1368           return scm_i_read_array (port, chr);
1369         /* Fall through. */
1370       }
1371 #endif
1372     case 'b':
1373     case 'B':
1374     case 'o':
1375     case 'O':
1376     case 'd':
1377     case 'D':
1378     case 'x':
1379     case 'X':
1380     case 'I':
1381     case 'E':
1382       return (scm_read_number_and_radix (chr, port));
1383     case '{':
1384       return (scm_read_extended_symbol (chr, port));
1385     case '!':
1386       return (scm_read_shebang (chr, port));
1387     case ';':
1388       return (scm_read_commented_expression (chr, port));
1389     case '`':
1390     case '\'':
1391     case ',':
1392       return (scm_read_syntax (chr, port));
1393     case 'n':
1394       return (scm_read_nil (chr, port));
1395     default:
1396       result = scm_read_sharp_extension (chr, port);
1397       if (scm_is_eq (result, SCM_UNSPECIFIED))
1398         {
1399           /* To remain compatible with 1.8 and earlier, the following
1400              characters have lower precedence than `read-hash-extend'
1401              characters.  */
1402           switch (chr)
1403             {
1404             case '|':
1405               return scm_read_r6rs_block_comment (chr, port);
1406             default:
1407               scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1408                                  scm_list_1 (SCM_MAKE_CHAR (chr)));
1409             }
1410         }
1411       else
1412         return result;
1413     }
1414
1415   return SCM_UNSPECIFIED;
1416 }
1417 #undef FUNC_NAME
1418
1419 static SCM
1420 scm_read_expression (SCM port)
1421 #define FUNC_NAME "scm_read_expression"
1422 {
1423   while (1)
1424     {
1425       register scm_t_wchar chr;
1426
1427       chr = scm_getc (port);
1428
1429       switch (chr)
1430         {
1431         case SCM_WHITE_SPACES:
1432         case SCM_LINE_INCREMENTORS:
1433           break;
1434         case ';':
1435           (void) scm_read_semicolon_comment (chr, port);
1436           break;
1437         case '[':
1438           if (!SCM_SQUARE_BRACKETS_P)
1439             return (scm_read_mixed_case_symbol (chr, port));
1440           /* otherwise fall through */
1441         case '(':
1442           return (scm_read_sexp (chr, port));
1443         case '"':
1444           return (scm_read_string (chr, port));
1445         case '\'':
1446         case '`':
1447         case ',':
1448           return (scm_read_quote (chr, port));
1449         case '#':
1450           {
1451             SCM result;
1452             result = scm_read_sharp (chr, port);
1453             if (scm_is_eq (result, SCM_UNSPECIFIED))
1454               /* We read a comment or some such.  */
1455               break;
1456             else
1457               return result;
1458           }
1459         case ')':
1460           scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1461           break;
1462         case ']':
1463           if (SCM_SQUARE_BRACKETS_P)
1464             scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1465           /* otherwise fall through */
1466         case EOF:
1467           return SCM_EOF_VAL;
1468         case ':':
1469           if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1470             return scm_symbol_to_keyword (scm_read_expression (port));
1471           /* Fall through.  */
1472
1473         default:
1474           {
1475             if (((chr >= '0') && (chr <= '9'))
1476                 || (strchr ("+-.", chr)))
1477               return (scm_read_number (chr, port));
1478             else
1479               return (scm_read_mixed_case_symbol (chr, port));
1480           }
1481         }
1482     }
1483 }
1484 #undef FUNC_NAME
1485
1486 \f
1487 /* Actual reader.  */
1488
1489 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1490             (SCM port),
1491             "Read an s-expression from the input port @var{port}, or from\n"
1492             "the current input port if @var{port} is not specified.\n"
1493             "Any whitespace before the next token is discarded.")
1494 #define FUNC_NAME s_scm_read
1495 {
1496   int c;
1497
1498   if (SCM_UNBNDP (port))
1499     port = scm_current_input_port ();
1500   SCM_VALIDATE_OPINPORT (1, port);
1501
1502   c = flush_ws (port, (char *) NULL);
1503   if (EOF == c)
1504     return SCM_EOF_VAL;
1505   scm_ungetc (c, port);
1506
1507   return (scm_read_expression (port));
1508 }
1509 #undef FUNC_NAME
1510
1511
1512 \f
1513
1514 /* Used when recording expressions constructed by `scm_read_sharp ()'.  */
1515 static SCM
1516 recsexpr (SCM obj, long line, int column, SCM filename)
1517 {
1518   if (!scm_is_pair(obj)) {
1519     return obj;
1520   } else {
1521     SCM tmp = obj, copy;
1522     /* If this sexpr is visible in the read:sharp source, we want to
1523        keep that information, so only record non-constant cons cells
1524        which haven't previously been read by the reader. */
1525     if (scm_is_false (scm_whash_lookup (scm_source_whash, obj)))
1526       {
1527         if (SCM_COPY_SOURCE_P)
1528           {
1529             copy = scm_cons (recsexpr (SCM_CAR (obj), line, column, filename),
1530                              SCM_UNDEFINED);
1531             while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1532               {
1533                 SCM_SETCDR (copy, scm_cons (recsexpr (SCM_CAR (tmp),
1534                                                       line,
1535                                                       column,
1536                                                       filename),
1537                                             SCM_UNDEFINED));
1538                 copy = SCM_CDR (copy);
1539               }
1540             SCM_SETCDR (copy, tmp);
1541           }
1542         else
1543           {
1544             recsexpr (SCM_CAR (obj), line, column, filename);
1545             while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1546               recsexpr (SCM_CAR (tmp), line, column, filename);
1547             copy = SCM_UNDEFINED;
1548           }
1549         scm_whash_insert (scm_source_whash,
1550                           obj,
1551                           scm_make_srcprops (line,
1552                                              column,
1553                                              filename,
1554                                              copy,
1555                                              SCM_EOL));
1556       }
1557     return obj;
1558   }
1559 }
1560
1561 /* Manipulate the read-hash-procedures alist.  This could be written in
1562    Scheme, but maybe it will also be used by C code during initialisation.  */
1563 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1564             (SCM chr, SCM proc),
1565             "Install the procedure @var{proc} for reading expressions\n"
1566             "starting with the character sequence @code{#} and @var{chr}.\n"
1567             "@var{proc} will be called with two arguments:  the character\n"
1568             "@var{chr} and the port to read further data from. The object\n"
1569             "returned will be the return value of @code{read}. \n"
1570             "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1571             )
1572 #define FUNC_NAME s_scm_read_hash_extend
1573 {
1574   SCM this;
1575   SCM prev;
1576
1577   SCM_VALIDATE_CHAR (1, chr);
1578   SCM_ASSERT (scm_is_false (proc)
1579               || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1580               proc, SCM_ARG2, FUNC_NAME);
1581
1582   /* Check if chr is already in the alist.  */
1583   this = scm_i_read_hash_procedures_ref ();
1584   prev = SCM_BOOL_F;
1585   while (1)
1586     {
1587       if (scm_is_null (this))
1588         {
1589           /* not found, so add it to the beginning.  */
1590           if (scm_is_true (proc))
1591             {
1592               SCM new = scm_cons (scm_cons (chr, proc),
1593                                   scm_i_read_hash_procedures_ref ());
1594               scm_i_read_hash_procedures_set_x (new);
1595             }
1596           break;
1597         }
1598       if (scm_is_eq (chr, SCM_CAAR (this)))
1599         {
1600           /* already in the alist.  */
1601           if (scm_is_false (proc))
1602             {
1603               /* remove it.  */
1604               if (scm_is_false (prev))
1605                 {
1606                   SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1607                   scm_i_read_hash_procedures_set_x (rest);
1608                 }
1609               else
1610                 scm_set_cdr_x (prev, SCM_CDR (this));
1611             }
1612           else
1613             {
1614               /* replace it.  */
1615               scm_set_cdr_x (SCM_CAR (this), proc);
1616             }
1617           break;
1618         }
1619       prev = this;
1620       this = SCM_CDR (this);
1621     }
1622
1623   return SCM_UNSPECIFIED;
1624 }
1625 #undef FUNC_NAME
1626
1627 /* Recover the read-hash procedure corresponding to char c.  */
1628 static SCM
1629 scm_get_hash_procedure (int c)
1630 {
1631   SCM rest = scm_i_read_hash_procedures_ref ();
1632
1633   while (1)
1634     {
1635       if (scm_is_null (rest))
1636         return SCM_BOOL_F;
1637
1638       if (SCM_CHAR (SCM_CAAR (rest)) == c)
1639         return SCM_CDAR (rest);
1640
1641       rest = SCM_CDR (rest);
1642     }
1643 }
1644
1645 #define SCM_ENCODING_SEARCH_SIZE (500)
1646
1647 /* Search the first few hundred characters of a file for an Emacs-like coding
1648    declaration.  Returns either NULL or a string whose storage has been
1649    allocated with `scm_gc_malloc ()'.  */
1650 char *
1651 scm_i_scan_for_encoding (SCM port)
1652 {
1653   scm_t_port *pt;
1654   char header[SCM_ENCODING_SEARCH_SIZE+1];
1655   size_t bytes_read, encoding_length, i;
1656   char *encoding = NULL;
1657   int utf8_bom = 0;
1658   char *pos, *encoding_start;
1659   int in_comment;
1660
1661   pt = SCM_PTAB_ENTRY (port);
1662
1663   if (pt->rw_active == SCM_PORT_WRITE)
1664     scm_flush (port);
1665
1666   if (pt->rw_random)
1667     pt->rw_active = SCM_PORT_READ;
1668
1669   if (pt->read_pos == pt->read_end)
1670     {
1671       /* We can use the read buffer, and thus avoid a seek. */
1672       if (scm_fill_input (port) == EOF)
1673         return NULL;
1674
1675       bytes_read = pt->read_end - pt->read_pos;
1676       if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
1677         bytes_read = SCM_ENCODING_SEARCH_SIZE;
1678
1679       if (bytes_read <= 1)
1680         /* An unbuffered port -- don't scan.  */
1681         return NULL;
1682
1683       memcpy (header, pt->read_pos, bytes_read);
1684       header[bytes_read] = '\0';
1685     }
1686   else
1687     {
1688       /* Try to read some bytes and then seek back.  Not all ports
1689          support seeking back; and indeed some file ports (like
1690          /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
1691          check performed by SCM_FPORT_FDES---but fail to seek
1692          backwards.  Hence this block comes second.  We prefer to use
1693          the read buffer in-place.  */
1694       if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1695         return NULL;
1696
1697       bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1698       header[bytes_read] = '\0';
1699       scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1700     }
1701
1702   if (bytes_read > 3
1703       && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1704     utf8_bom = 1;
1705
1706   /* search past "coding[:=]" */
1707   pos = header;
1708   while (1)
1709     {
1710       if ((pos = strstr(pos, "coding")) == NULL)
1711         return NULL;
1712
1713       pos += strlen("coding");
1714       if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1715           (*pos == ':' || *pos == '='))
1716         {
1717           pos ++;
1718           break;
1719         }
1720     }
1721
1722   /* skip spaces */
1723   while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1724          (*pos == ' ' || *pos == '\t'))
1725     pos ++;
1726
1727   /* grab the next token */
1728   encoding_start = pos;
1729   i = 0;
1730   while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
1731          && encoding_start + i - header < bytes_read
1732          && (isalnum ((int) encoding_start[i])
1733              || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
1734     i++;
1735
1736   encoding_length = i;
1737   if (encoding_length == 0)
1738     return NULL;
1739
1740   encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
1741   for (i = 0; i < encoding_length; i++)
1742     encoding[i] = toupper ((int) encoding[i]);
1743
1744   /* push backwards to make sure we were in a comment */
1745   in_comment = 0;
1746   pos = encoding_start;
1747   while (pos >= header)
1748     {
1749       if (*pos == '\n')
1750         {
1751           /* This wasn't in a semicolon comment. Check for a
1752            hash-bang comment. */
1753           char *beg = strstr (header, "#!");
1754           char *end = strstr (header, "!#");
1755           if (beg < encoding_start && encoding_start + encoding_length < end)
1756             in_comment = 1;
1757           break;
1758         }
1759       if (*pos == ';')
1760         {
1761           in_comment = 1;
1762           break;
1763         }
1764       pos --;
1765     }
1766   if (!in_comment)
1767     /* This wasn't in a comment */
1768     return NULL;
1769
1770   if (utf8_bom && strcmp(encoding, "UTF-8"))
1771     scm_misc_error (NULL,
1772                     "the port input declares the encoding ~s but is encoded as UTF-8",
1773                     scm_list_1 (scm_from_locale_string (encoding)));
1774
1775   return encoding;
1776 }
1777
1778 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1779             (SCM port),
1780             "Scans the port for an Emacs-like character coding declaration\n"
1781             "near the top of the contents of a port with random-accessible contents.\n"
1782             "The coding declaration is of the form\n"
1783             "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1784             "\n"
1785             "Returns a string containing the character encoding of the file\n"
1786             "if a declaration was found, or @code{#f} otherwise.\n")
1787 #define FUNC_NAME s_scm_file_encoding
1788 {
1789   char *enc;
1790   SCM s_enc;
1791
1792   SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
1793
1794   enc = scm_i_scan_for_encoding (port);
1795   if (enc == NULL)
1796     return SCM_BOOL_F;
1797   else
1798     {
1799       s_enc = scm_from_locale_string (enc);
1800       return s_enc;
1801     }
1802
1803   return SCM_BOOL_F;
1804 }
1805 #undef FUNC_NAME
1806
1807 void
1808 scm_init_read ()
1809 {
1810   SCM read_hash_procs;
1811
1812   read_hash_procs = scm_make_fluid ();
1813   scm_fluid_set_x (read_hash_procs, SCM_EOL);
1814
1815   scm_i_read_hash_procedures =
1816     SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
1817
1818   scm_init_opts (scm_read_options, scm_read_opts);
1819 #include "libguile/read.x"
1820 }
1821
1822 /*
1823   Local Variables:
1824   c-file-style: "gnu"
1825   End:
1826 */