libguile/read.c

   1 /* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009, 2010 Free Software
   2  * Foundation, Inc.
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public License
   6  * as published by the Free Software Foundation; either version 3 of
   7  * the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful, but
  10  * WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  17  * 02110-1301 USA
  18  */
  19
  20
  21 \f
  22
  23 #ifdef HAVE_CONFIG_H
  24 # include <config.h>
  25 #endif
  26
  27 #include <stdio.h>
  28 #include <ctype.h>
  29 #include <string.h>
  30 #include <unistd.h>
  31 #include <unicase.h>
  32
  33 #include "libguile/_scm.h"
  34 #include "libguile/bytevectors.h"
  35 #include "libguile/chars.h"
  36 #include "libguile/eval.h"
  37 #include "libguile/arrays.h"
  38 #include "libguile/bitvectors.h"
  39 #include "libguile/keywords.h"
  40 #include "libguile/alist.h"
  41 #include "libguile/srcprop.h"
  42 #include "libguile/hashtab.h"
  43 #include "libguile/hash.h"
  44 #include "libguile/ports.h"
  45 #include "libguile/fports.h"
  46 #include "libguile/root.h"
  47 #include "libguile/strings.h"
  48 #include "libguile/strports.h"
  49 #include "libguile/vectors.h"
  50 #include "libguile/validate.h"
  51 #include "libguile/srfi-4.h"
  52 #include "libguile/srfi-13.h"
  53
  54 #include "libguile/read.h"
  55 #include "libguile/private-options.h"
  56
  57
  58 \f
  59
  /* Symbols used by the reader.  `.' is compared against freshly read
     expressions in scm_read_sexp to detect dotted pairs, `postfix' is
     compared against the keyword style in scm_read_mixed_case_symbol, and
     `nil' is the symbol scm_read_nil expects.  scm_keyword_prefix is not
     referenced in the part of the file visible here.  */
  60 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
  61 SCM_SYMBOL (scm_keyword_prefix, "prefix");
  62 SCM_SYMBOL (scm_keyword_postfix, "postfix");
  63 SCM_SYMBOL (sym_nil, "nil");
  64
  /* Table of reader options, handed to scm_options by
     `read-options-interface'.  Each entry gives the option type, its
     name, a value (presumably the initial setting -- TODO confirm) and a
     documentation string.  The entry starting with 0 ends the table.
     NOTE(review): the SCM_..._P accessors used in this file (for example
     SCM_COPY_SOURCE_P) presumably refer to entries by position, so the
     order of entries likely matters; confirm against private-options.h.  */
  65 scm_t_option scm_read_opts[] = {
  66   { SCM_OPTION_BOOLEAN, "copy", 0,
  67     "Copy source code expressions." },
  68   { SCM_OPTION_BOOLEAN, "positions", 0,
  69     "Record positions of source code expressions." },
  70   { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
  71     "Convert symbols to lower case."},
  72   { SCM_OPTION_SCM, "keywords", (unsigned long) SCM_BOOL_F,
  73     "Style of keyword recognition: #f, 'prefix or 'postfix."},
  74   { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
  75     "Use R6RS variable-length character and string hex escapes."},
  76   { SCM_OPTION_BOOLEAN, "square-brackets", 1,
  77     "Treat `[' and `]' as parentheses, for R6RS compatibility."},
  78   { 0, },
  79 };
  80
  81 /*
  82   Give meaningful error messages for read errors.
  83
  84   We use the format
  85
  86   FILE:LINE:COL: MESSAGE
  87   This happened in ....
  88
  89   This is not standard GNU format, but the test-suite likes the real
  90   message to be in front.
  91
       FUNCTION is the name reported as the origin of the error; it may be
       NULL, in which case #f is passed on instead.  FILE is the file name
       of PORT when that is a string and "#<unknown port>" otherwise.  LINE
       and COL are the line and column of PORT, each plus one.  The
       formatted text is then used as the message of a `read-error' raised
       through scm_error_scm, with ARG as its argument list, so any `~'
       directives contained in MESSAGE are left for that later formatting
       step.
  92  */
  93
  94
  95 void
  96 scm_i_input_error (char const *function,
  97                    SCM port, const char *message, SCM arg)
  98 {
  99   SCM fn = (scm_is_string (SCM_FILENAME(port))
 100             ? SCM_FILENAME(port)
 101             : scm_from_locale_string ("#<unknown port>"));
 102
      /* Build the "FILE:LINE:COL: MESSAGE" prefix in a string port.  */
 103   SCM string_port = scm_open_output_string ();
 104   SCM string = SCM_EOL;
 105   scm_simple_format (string_port,
 106                      scm_from_locale_string ("~A:~S:~S: ~A"),
 107                      scm_list_4 (fn,
 108                                  scm_from_long (SCM_LINUM (port) + 1),
 109                                  scm_from_int (SCM_COL (port) + 1),
 110                                  scm_from_locale_string (message)));
 111
 112   string = scm_get_output_string (string_port);
 113   scm_close_output_port (string_port);
      /* Several callers in this file continue with a dummy `return' after
         calling this function, so scm_error_scm is presumably expected not
         to return -- TODO confirm.  */
 114   scm_error_scm (scm_from_locale_symbol ("read-error"),
 115                  function? scm_from_locale_string (function) : SCM_BOOL_F,
 116                  string,
 117                  arg,
 118                  SCM_BOOL_F);
 119 }
 120
 121
 /* Scheme procedure `read-options-interface': pass SETTING together with
    the scm_read_opts table to scm_options and return its result.  */
 122 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
 123             (SCM setting),
 124             "Option interface for the read options. Instead of using\n"
 125             "this procedure directly, use the procedures @code{read-enable},\n"
 126             "@code{read-disable}, @code{read-set!} and @code{read-options}.")
 127 #define FUNC_NAME s_scm_read_options
 128 {
 129   SCM ans = scm_options (setting,
 130                          scm_read_opts,
 131                          FUNC_NAME);
      /* The readers in this file only attach a source copy as part of the
         source properties they record when SCM_RECORD_POSITIONS_P is set,
         so enabling `copy' forces `positions' on as well.  */
 132   if (SCM_COPY_SOURCE_P)
 133     SCM_RECORD_POSITIONS_P = 1;
 134   return ans;
 135 }
 136 #undef FUNC_NAME
 137
 138 /* An association list mapping extra hash characters to procedures.  This
     variable is a pointer to the SCM holding that list; nothing in the
     part of the file visible here assigns it.  */
 139 static SCM *scm_read_hash_procedures;
 140
 141
 142 \f
 143 /* Token readers.  */
 144
 145
 146 /* Size, in bytes, of the C stack buffer used to read symbols and numbers.
     Longer tokens spill into a heap buffer (see read_complete_token).  */
 147 #define READER_BUFFER_SIZE            128
 148
 149 /* Initial length, in characters, of the Scheme string that scm_read_string
     fills in; the string is extended by the same amount each time it is
     about to run full.  */
 150 #define READER_STRING_BUFFER_SIZE     512
 151
 152 /* Size, in bytes, of the buffer scm_read_character reads a character name
     into.  A name that fills the whole buffer is rejected.  */
 153 #define READER_CHAR_NAME_MAX_SIZE      50
 154
 155
 156 /* `isblank' is only in C99.  CHAR_IS_BLANK_ is true for space, tab,
     newline, form feed and carriage return.  Its argument is evaluated
     several times, so it must be free of side effects.  */
 157 #define CHAR_IS_BLANK_(_chr)                                    \
 158   (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n')      \
 159    || ((_chr) == '\f') || ((_chr) == '\r'))
 160
     /* On MSDOS, character code 26 counts as blank as well.  The macro has
        to test its own parameter `_chr'; referring to `chr' would silently
        pick up whatever variable of that name exists where the macro is
        expanded, or fail to compile where none does.  */
 161 #ifdef MSDOS
 162 # define CHAR_IS_BLANK(_chr)                    \
 163   ((CHAR_IS_BLANK_ (_chr)) || ((_chr) == 26))
 164 #else
 165 # define CHAR_IS_BLANK CHAR_IS_BLANK_
 166 #endif
 167
 168
 169 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
 170    structure'').  Square brackets count as delimiters only while
     SCM_SQUARE_BRACKETS_P is set.  The argument C is parenthesized at
     every use so that any expression may be passed, but it is evaluated
     several times and therefore must be free of side effects.  */
 171 #define CHAR_IS_R5RS_DELIMITER(c)                               \
 172   (CHAR_IS_BLANK (c)                                            \
 173    || ((c) == ')') || ((c) == '(') || ((c) == ';') || ((c) == '"')  \
 174    || (SCM_SQUARE_BRACKETS_P && (((c) == '[') || ((c) == ']'))))
 175
 176 #define CHAR_IS_DELIMITER  CHAR_IS_R5RS_DELIMITER
 177
 178 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
 179    Structure''.  Only the lower-case letters are tested.  The macro is
     not used in the part of the file visible here.  */
 180 #define CHAR_IS_EXPONENT_MARKER(_chr)                           \
 181   (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f')        \
 182    || ((_chr) == 'd') || ((_chr) == 'l'))
 183
 184 /* Forward declarations of static helpers whose definitions are not in the
     part of the file visible here.  flush_ws below calls all of them except
     scm_read_scsh_block_comment, which reads an SCSH block comment.  */
 185 static inline SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
 186 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
 187 static SCM scm_read_commented_expression (scm_t_wchar, SCM);
 188 static SCM scm_read_shebang (scm_t_wchar, SCM);
 189 static SCM scm_get_hash_procedure (int);
 190
 191 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 192    result in the pre-allocated buffer BUF.  Return zero if the whole token has
 193    fewer than BUF_SIZE bytes, non-zero otherwise.  READ will be set to the
 194    number of bytes actually read.
     The delimiter itself is pushed back onto PORT and is not stored.  End of
     file also ends the token.  BUF is not NUL-terminated.  A non-zero
     return means BUF is full and the token may continue on PORT.  */
 195 static inline int
 196 read_token (SCM port, char *buf, const size_t buf_size, size_t *read)
 197  {
 198    *read = 0;
 199
 200    while (*read < buf_size)
 201      {
 202        int chr;
 203
 204        chr = scm_get_byte_or_eof (port);
 205
 206        if (chr == EOF)
 207         return 0;
 208       else if (CHAR_IS_DELIMITER (chr))
 209         {
              /* Leave the delimiter for whoever reads from PORT next.  */
 210           scm_unget_byte (chr, port);
 211           return 0;
 212         }
 213       else
 214         {
 215           *buf = (char) chr;
 216           buf++, (*read)++;
 217         }
 218      }
 219
       /* BUF_SIZE bytes were stored without reaching a delimiter or EOF.  */
 220    return 1;
 221  }
 222
 223 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 224    result in the pre-allocated buffer BUFFER, if the whole token has fewer than
 225    BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
 226    caller.  Return zero if the token fits in BUFFER, non-zero otherwise.  READ
 227    will be set to the number of bytes actually read.
     *OVERFLOW_BUFFER is NULL when zero is returned.  When non-zero is
     returned, the complete token is in *OVERFLOW_BUFFER and BUFFER only
     holds the chunk read last.  Neither buffer is NUL-terminated.  */
 228 static int
 229 read_complete_token (SCM port, char *buffer, const size_t buffer_size,
 230                            char **overflow_buffer, size_t *read)
 231 {
 232   int overflow = 0;
 233   size_t bytes_read, overflow_size;
 234
 235   *overflow_buffer = NULL;
 236   overflow_size = 0;
 237
      /* Read the token one BUFFER-sized chunk at a time.  Once a chunk has
         filled BUFFER completely, every chunk, including that first one, is
         appended to the heap buffer.  */
 238   do
 239     {
 240       overflow = read_token (port, buffer, buffer_size, &bytes_read);
          /* Nothing more to read: the token ended exactly at the previous
             chunk boundary, or it is empty.  */
 241       if (bytes_read == 0)
 242         break;
 243       if (overflow || overflow_size != 0)
 244         {
 245           if (overflow_size == 0)
 246             {
 247               *overflow_buffer = scm_malloc (bytes_read);
 248               memcpy (*overflow_buffer, buffer, bytes_read);
 249               overflow_size = bytes_read;
 250             }
 251           else
 252             {
 253               *overflow_buffer = scm_realloc (*overflow_buffer, overflow_size + bytes_read);
 254               memcpy (*overflow_buffer + overflow_size, buffer, bytes_read);
 255               overflow_size += bytes_read;
 256             }
 257         }
 258     }
 259   while (overflow);
 260
 261   if (overflow_size)
 262     *read = overflow_size;
 263   else
 264     *read = bytes_read;
 265
 266   return (overflow_size != 0);
 267 }
 268
 269 /* Skip whitespace and comments from PORT and return the first character
 270    that belongs to neither.
     A `;' comment is skipped up to the end of its line.  After `#', the
     sequences `#!' and `#;' are handed to scm_read_shebang and
     scm_read_commented_expression, and `#|' to scm_read_r6rs_block_comment
     unless a hash procedure is registered for `|'.  For any other
     character following `#', that character is pushed back and '#' is
     returned.
     On end of file, an error naming EOFERR is raised if EOFERR is non-NULL,
     otherwise EOF is returned.  End of file directly after `#' always
     raises an error, reported under the name "read_sharp".  */
 271 static int
 272 flush_ws (SCM port, const char *eoferr)
 273 {
 274   register scm_t_wchar c;
 275   while (1)
 276     switch (c = scm_getc (port))
 277       {
 278       case EOF:
 279       goteof:
 280         if (eoferr)
 281           {
 282             scm_i_input_error (eoferr,
 283                                port,
 284                                "end of file",
 285                                SCM_EOL);
 286           }
 287         return c;
 288
 289       case ';':
            /* Line comment: consume characters up to a line incrementor.  */
 290       lp:
 291         switch (c = scm_getc (port))
 292           {
 293           case EOF:
 294             goto goteof;
 295           default:
 296             goto lp;
 297           case SCM_LINE_INCREMENTORS:
 298             break;
 299           }
 300         break;
 301
 302       case '#':
 303         switch (c = scm_getc (port))
 304           {
 305           case EOF:
 306             eoferr = "read_sharp";
 307             goto goteof;
 308           case '!':
 309             scm_read_shebang (c, port);
 310             break;
 311           case ';':
 312             scm_read_commented_expression (c, port);
 313             break;
 314           case '|':
                /* Treat `#|' as a block comment only when no hash procedure
                   is registered for `|'.  */
 315             if (scm_is_false (scm_get_hash_procedure (c)))
 316               {
 317                 scm_read_r6rs_block_comment (c, port);
 318                 break;
 319               }
 320             /* fall through */
 321           default:
                /* Not a comment: hand the `#' datum back to the caller.  */
 322             scm_ungetc (c, port);
 323             return '#';
 324           }
 325         break;
 326
 327       case SCM_LINE_INCREMENTORS:
 328       case SCM_SINGLE_SPACES:
 329       case '\t':
 330         break;
 331
 332       default:
 333         return c;
 334       }
 335
      /* Not reached: the loop above is only left through `return'.  */
 336   return 0;
 337 }
 338
 339
 340 \f
 341 /* Token readers.  */
 342
     /* Forward declarations; the definitions are not in the part of the file
        visible here.  scm_read_expression is called recursively by the
        readers below.  */
 343 static SCM scm_read_expression (SCM port);
 344 static SCM scm_read_sharp (int chr, SCM port);
 345 static SCM recsexpr (SCM obj, long line, int column, SCM filename);
 346
 347
 /* Read the elements of a list from PORT up to the closing delimiter and
    return the list.  CHR is the opening character: `[' selects `]' as the
    terminator, anything else selects `)'.  The opening character is
    presumably already consumed by the caller -- TODO confirm.
    A `.' symbol introduces a dotted tail, which must be followed by the
    terminator.  An empty list returns SCM_EOL, and a dot as the very first
    element returns the expression that follows it; neither of these
    results gets source properties.
    Otherwise, when SCM_RECORD_POSITIONS_P is set, source properties (start
    line, start column, file name and, under SCM_COPY_SOURCE_P, the copy
    ANS2) are stored for the result in scm_source_whash.  */
 348 static SCM
 349 scm_read_sexp (scm_t_wchar chr, SCM port)
 350 #define FUNC_NAME "scm_i_lreadparen"
 351 {
 352   register int c;
 353   register SCM tmp;
 354   register SCM tl, ans = SCM_EOL;
      /* NOTE(review): COPY is never assigned after this initialisation, so
         every element that is a pair shows up as #f in ANS2.  */
 355   SCM tl2 = SCM_EOL, ans2 = SCM_EOL, copy = SCM_BOOL_F;
 356   const int terminating_char = ((chr == '[') ? ']' : ')');
 357
 358   /* Need to capture line and column numbers here. */
 359   long line = SCM_LINUM (port);
 360   int column = SCM_COL (port) - 1;
 361
 362
 363   c = flush_ws (port, FUNC_NAME);
 364   if (terminating_char == c)
 365     return SCM_EOL;
 366
 367   scm_ungetc (c, port);
 368   if (scm_is_eq (scm_sym_dot,
 369                  (tmp = scm_read_expression (port))))
 370     {
          /* `( . x)': the result is just X.  */
 371       ans = scm_read_expression (port);
 372       if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
 373         scm_i_input_error (FUNC_NAME, port, "missing close paren",
 374                            SCM_EOL);
 375       return ans;
 376     }
 377
 378   /* Build the head of the list structure. */
 379   ans = tl = scm_cons (tmp, SCM_EOL);
 380
 381   if (SCM_COPY_SOURCE_P)
 382     ans2 = tl2 = scm_cons (scm_is_pair (tmp)
 383                            ? copy
 384                            : tmp,
 385                            SCM_EOL);
 386
 387   while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
 388     {
 389       SCM new_tail;
 390
          /* C differs from the expected terminator here, so a closing
             delimiter at this point is the wrong kind of bracket.  */
 391       if (c == ')' || (SCM_SQUARE_BRACKETS_P && c == ']'))
 392         scm_i_input_error (FUNC_NAME, port,
 393                            "in pair: mismatched close paren: ~A",
 394                            scm_list_1 (SCM_MAKE_CHAR (c)));
 395
 396       scm_ungetc (c, port);
 397       tmp = scm_read_expression (port);
 398
 399       if (scm_is_eq (scm_sym_dot, tmp))
 400         {
              /* Dotted tail: the next expression becomes the final cdr.  */
 401           SCM_SETCDR (tl, tmp = scm_read_expression (port));
 402
              /* NOTE(review): the copy receives a one-element list as its
                 tail here, not a dotted cdr like ANS.  */
 403           if (SCM_COPY_SOURCE_P)
 404             SCM_SETCDR (tl2, scm_cons (scm_is_pair (tmp) ? copy : tmp,
 405                                        SCM_EOL));
 406
 407           c = flush_ws (port, FUNC_NAME);
 408           if (terminating_char != c)
 409             scm_i_input_error (FUNC_NAME, port,
 410                                "in pair: missing close paren", SCM_EOL);
 411           goto exit;
 412         }
 413
          /* Append the element to the result list.  */
 414       new_tail = scm_cons (tmp, SCM_EOL);
 415       SCM_SETCDR (tl, new_tail);
 416       tl = new_tail;
 417
 418       if (SCM_COPY_SOURCE_P)
 419         {
 420           SCM new_tail2 = scm_cons (scm_is_pair (tmp)
 421                                     ? copy
 422                                     : tmp, SCM_EOL);
 423           SCM_SETCDR (tl2, new_tail2);
 424           tl2 = new_tail2;
 425         }
 426     }
 427
 428  exit:
 429   if (SCM_RECORD_POSITIONS_P)
 430     scm_whash_insert (scm_source_whash,
 431                       ans,
 432                       scm_make_srcprops (line, column,
 433                                          SCM_FILENAME (port),
 434                                          SCM_COPY_SOURCE_P
 435                                          ? ans2
 436                                          : SCM_UNDEFINED,
 437                                          SCM_EOL));
 438   return ans;
 439 }
 440 #undef FUNC_NAME
 441
 442
 443 /* Read a hexadecimal number of at most NDIGITS digits from `port' and put
 444    its value into the variable C.  If TERMINATOR is non-null, stop early
 445    when the TERMINATOR character is found after at least one digit.
     The macro relies on its expansion site for the variables `c' and
     `port' and for the labels `str_eof' and `bad_escaped'.  It jumps to
     `str_eof' on end of file, and to `bad_escaped' on a character that is
     not a hexadecimal digit, after storing that character in C.  When the
     loop stops because NDIGITS digits were read, a TERMINATOR that may
     follow is left unread.  The value is not range-checked here.  */
 446 #define SCM_READ_HEX_ESCAPE(ndigits, terminator)                   \
 447   do                                                               \
 448     {                                                              \
 449       scm_t_wchar a;                                               \
 450       size_t i = 0;                                                \
 451       c = 0;                                                       \
 452       while (i < ndigits)                                          \
 453         {                                                          \
 454           a = scm_getc (port);                                     \
 455           if (a == EOF)                                            \
 456             goto str_eof;                                          \
 457           if (terminator                                           \
 458               && (a == (scm_t_wchar) terminator)                   \
 459               && (i > 0))                                          \
 460             break;                                                 \
 461           if ('0' <= a && a <= '9')                                \
 462             a -= '0';                                              \
 463           else if ('A' <= a && a <= 'F')                           \
 464             a = a - 'A' + 10;                                      \
 465           else if ('a' <= a && a <= 'f')                           \
 466             a = a - 'a' + 10;                                      \
 467           else                                                     \
 468             {                                                      \
 469               c = a;                                               \
 470               goto bad_escaped;                                    \
 471             }                                                      \
 472           c = c * 16 + a;                                          \
 473           i ++;                                                    \
 474         }                                                          \
 475     } while (0)
 476
 /* Read characters from PORT up to the next unescaped `"' and return them
    as a string.  CHR is not used.  Backslash escapes are translated; a
    backslash directly followed by a newline is dropped together with the
    newline.  End of file inside the string and unknown escape characters
    are reported through scm_i_input_error.  An empty string yields
    scm_nullstr.  */
 477 static SCM
 478 scm_read_string (int chr, SCM port)
 479 #define FUNC_NAME "scm_lreadr"
 480 {
 481   /* STR starts out READER_STRING_BUFFER_SIZE characters long and is
 482      extended by the same amount whenever it is about to run full.  The
         value returned is a copy of the part that was filled in.  */
 483
 484   SCM str = SCM_BOOL_F;
 485   unsigned c_str_len = 0;
 486   scm_t_wchar c;
 487
 488   str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
 489   while ('"' != (c = scm_getc (port)))
 490     {
 491       if (c == EOF)
 492         {
            /* Also the jump target of SCM_READ_HEX_ESCAPE and of the EOF
               case below.  */
 493         str_eof:
 494           scm_i_input_error (FUNC_NAME, port,
 495                              "end of file in string constant", SCM_EOL);
 496         }
 497
          /* Grow STR before it runs out of room.  */
 498       if (c_str_len + 1 >= scm_i_string_length (str))
 499         {
 500           SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
 501
 502           str = scm_string_append (scm_list_2 (str, addy));
 503         }
 504
 505       if (c == '\\')
 506         {
 507           switch (c = scm_getc (port))
 508             {
 509             case EOF:
 510               goto str_eof;
 511             case '"':
 512             case '\\':
 513               break;
 514             case '\n':
                  /* Escaped newline: store nothing.  */
 515               continue;
 516             case '0':
 517               c = '\0';
 518               break;
 519             case 'f':
 520               c = '\f';
 521               break;
 522             case 'n':
 523               c = '\n';
 524               break;
 525             case 'r':
 526               c = '\r';
 527               break;
 528             case 't':
 529               c = '\t';
 530               break;
 531             case 'a':
 532               c = '\007';
 533               break;
 534             case 'v':
 535               c = '\v';
 536               break;
 537             case 'b':
 538               c = '\010';
 539               break;
 540             case 'x':
                  /* With R6RS escapes: up to 10 digits, ended by `;'.
                     Otherwise: exactly two digits.  */
 541               if (SCM_R6RS_ESCAPES_P)
 542                 SCM_READ_HEX_ESCAPE (10, ';');
 543               else
 544                 SCM_READ_HEX_ESCAPE (2, '\0');
 545               break;
 546             case 'u':
 547               if (!SCM_R6RS_ESCAPES_P)
 548                 {
 549                   SCM_READ_HEX_ESCAPE (4, '\0');
 550                   break;
 551                 }
                  /* With R6RS escapes enabled, fall through to the error
                     in the default case.  */
 552             case 'U':
 553               if (!SCM_R6RS_ESCAPES_P)
 554                 {
 555                   SCM_READ_HEX_ESCAPE (6, '\0');
 556                   break;
 557                 }
                  /* With R6RS escapes enabled, fall through to the error
                     below.  */
 558             default:
 559             bad_escaped:
 560               scm_i_input_error (FUNC_NAME, port,
 561                                  "illegal character in escape sequence: ~S",
 562                                  scm_list_1 (SCM_MAKE_CHAR (c)));
 563             }
 564         }
 565       str = scm_i_string_start_writing (str);
 566       scm_i_string_set_x (str, c_str_len++, c);
 567       scm_i_string_stop_writing ();
 568     }
 569
 570   if (c_str_len > 0)
 571     {
 572       return scm_i_substring_copy (str, 0, c_str_len);
 573     }
 574
 575   return scm_nullstr;
 576 }
 577 #undef FUNC_NAME
 578
 579
 /* Read a token that may be a number.  CHR, the character already read, is
    pushed back onto PORT and the whole token up to the next delimiter is
    read.  The token is decoded with the port's encoding and conversion
    handler and given to scm_string_to_number.  If that yields false, the
    token is returned as a symbol instead, downcased first when
    SCM_CASE_INSENSITIVE_P is set.  The port's column is advanced by the
    length of the decoded string.  */
 580 static SCM
 581 scm_read_number (scm_t_wchar chr, SCM port)
 582 {
 583   SCM result, str = SCM_EOL;
 584   char buffer[READER_BUFFER_SIZE];
 585   char *overflow_buffer = NULL;
 586   size_t bytes_read;
 587   int overflow;
 588   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 589
 590   scm_ungetc (chr, port);
 591   overflow = read_complete_token (port, buffer, sizeof (buffer),
 592                                   &overflow_buffer, &bytes_read);
 593
      /* The token is in BUFFER, or in the heap buffer when it did not fit.  */
 594   if (!overflow)
 595     str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
 596   else
 597     str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
 598                             pt->ilseq_handler);
 599
 600   result = scm_string_to_number (str, SCM_UNDEFINED);
 601   if (!scm_is_true (result))
 602     {
 603       /* Return a symbol instead of a number */
 604       if (SCM_CASE_INSENSITIVE_P)
 605         str = scm_string_downcase_x (str);
 606       result = scm_string_to_symbol (str);
 607     }
 608
      /* read_complete_token leaves freeing the overflow buffer to us.  */
 609   if (overflow)
 610     free (overflow_buffer);
 611   SCM_COL (port) += scm_i_string_length (str);
 612   return result;
 613 }
 614
 /* Read a symbol.  CHR, the character already read, is pushed back onto
    PORT and the whole token up to the next delimiter is read and decoded
    with the port's encoding.  When the keyword style is `postfix' and the
    token is longer than one byte and ends in `:', the result is a keyword
    named by the token without its final colon; otherwise it is a symbol.
    The name is downcased when SCM_CASE_INSENSITIVE_P is set.  The port's
    column is advanced by the length of the decoded name, which excludes
    the colon in the keyword case.  */
 615 static SCM
 616 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
 617 {
 618   SCM result;
 619   int ends_with_colon = 0;
 620   size_t bytes_read;
 621   int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
 622   int overflow;
 623   char buffer[READER_BUFFER_SIZE], *overflow_buffer;
 624   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 625   SCM str;
 626
 627   scm_ungetc (chr, port);
 628   overflow = read_complete_token (port, buffer, READER_BUFFER_SIZE,
 629                                   &overflow_buffer, &bytes_read);
      /* Look at the last byte of whichever buffer holds the token.  */
 630   if (bytes_read > 0)
 631     {
 632       if (!overflow)
 633         ends_with_colon = buffer[bytes_read - 1] == ':';
 634       else
 635         ends_with_colon = overflow_buffer[bytes_read - 1] == ':';
 636     }
 637
 638   if (postfix && ends_with_colon && (bytes_read > 1))
 639     {
          /* Postfix keyword: leave out the trailing colon.  */
 640       if (!overflow)
 641         str = scm_from_stringn (buffer, bytes_read - 1, pt->encoding, pt->ilseq_handler);
 642       else
 643         str = scm_from_stringn (overflow_buffer, bytes_read - 1, pt->encoding,
 644                                 pt->ilseq_handler);
 645
 646       if (SCM_CASE_INSENSITIVE_P)
 647         str = scm_string_downcase_x (str);
 648       result = scm_symbol_to_keyword (scm_string_to_symbol (str));
 649     }
 650   else
 651     {
 652       if (!overflow)
 653         str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
 654       else
 655         str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
 656                                 pt->ilseq_handler);
 657
 658       if (SCM_CASE_INSENSITIVE_P)
 659         str = scm_string_downcase_x (str);
 660       result = scm_string_to_symbol (str);
 661     }
 662
      /* read_complete_token leaves freeing the overflow buffer to us.  */
 663   if (overflow)
 664     free (overflow_buffer);
 665   SCM_COL (port) += scm_i_string_length (str);
 666   return result;
 667 }
 668
 /* Read a number introduced by `#'.  CHR is the character that was read
    after the `#'.  The letters b, o, d and x, in either case, select radix
    2, 8, 10 and 16, and the token that follows is parsed in that radix.
    For any other CHR, both CHR and `#' are pushed back onto PORT, so that
    the token handed to scm_string_to_number starts with `#', and radix 10
    is used.  If the token is not a number, an "unknown # object" error is
    reported.  The port's column is advanced by the length of the decoded
    token.  */
 669 static SCM
 670 scm_read_number_and_radix (scm_t_wchar chr, SCM port)
 671 #define FUNC_NAME "scm_lreadr"
 672 {
 673   SCM result;
 674   size_t read;
 675   char buffer[READER_BUFFER_SIZE], *overflow_buffer;
 676   int overflow;
 677   unsigned int radix;
 678   SCM str;
 679   scm_t_port *pt;
 680
 681   switch (chr)
 682     {
 683     case 'B':
 684     case 'b':
 685       radix = 2;
 686       break;
 687
 688     case 'o':
 689     case 'O':
 690       radix = 8;
 691       break;
 692
 693     case 'd':
 694     case 'D':
 695       radix = 10;
 696       break;
 697
 698     case 'x':
 699     case 'X':
 700       radix = 16;
 701       break;
 702
 703     default:
          /* Not a radix letter: put `#' and CHR back, in that order on
             the port, so they are read again as part of the token.  */
 704       scm_ungetc (chr, port);
 705       scm_ungetc ('#', port);
 706       radix = 10;
 707     }
 708
 709   overflow = read_complete_token (port, buffer, sizeof (buffer),
 710                                   &overflow_buffer, &read);
 711
 712   pt = SCM_PTAB_ENTRY (port);
 713   if (!overflow)
 714     str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
 715   else
 716     str = scm_from_stringn (overflow_buffer, read, pt->encoding,
 717                             pt->ilseq_handler);
 718
 719   result = scm_string_to_number (str, scm_from_uint (radix));
 720
      /* read_complete_token leaves freeing the overflow buffer to us.  */
 721   if (overflow)
 722     free (overflow_buffer);
 723
 724   SCM_COL (port) += scm_i_string_length (str);
 725
 726   if (scm_is_true (result))
 727     return result;
 728
 729   scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
 730
 731   return SCM_BOOL_F;
 732 }
 733 #undef FUNC_NAME
 734
 /* Read the expression following a quote character CHR and return the
    two-element list (SYMBOL EXPRESSION).  SYMBOL is scm_sym_quasiquote for
    a backquote, scm_sym_quote for `'', and for `,' either
    scm_sym_uq_splicing, when the next character is `@', or
    scm_sym_unquote.  Any other CHR prints a message on stderr and aborts
    the process.  When SCM_RECORD_POSITIONS_P is set, source properties are
    recorded for the result, including a fresh two-element copy of it under
    SCM_COPY_SOURCE_P.  */
 735 static SCM
 736 scm_read_quote (int chr, SCM port)
 737 {
 738   SCM p;
      /* Position of the quote character; the column is taken one back from
         the port's current column.  */
 739   long line = SCM_LINUM (port);
 740   int column = SCM_COL (port) - 1;
 741
 742   switch (chr)
 743     {
 744     case '`':
 745       p = scm_sym_quasiquote;
 746       break;
 747
 748     case '\'':
 749       p = scm_sym_quote;
 750       break;
 751
 752     case ',':
 753       {
 754         scm_t_wchar c;
 755
            /* Peek at the next character to tell `,@' from `,'.  */
 756         c = scm_getc (port);
 757         if ('@' == c)
 758           p = scm_sym_uq_splicing;
 759         else
 760           {
 761             scm_ungetc (c, port);
 762             p = scm_sym_unquote;
 763           }
 764         break;
 765       }
 766
 767     default:
 768       fprintf (stderr, "%s: unhandled quote character (%i)\n",
 769                "scm_read_quote", chr);
 770       abort ();
 771     }
 772
 773   p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
 774   if (SCM_RECORD_POSITIONS_P)
 775     scm_whash_insert (scm_source_whash, p,
 776                       scm_make_srcprops (line, column,
 777                                          SCM_FILENAME (port),
 778                                          SCM_COPY_SOURCE_P
 779                                          ? (scm_cons2 (SCM_CAR (p),
 780                                                        SCM_CAR (SCM_CDR (p)),
 781                                                        SCM_EOL))
 782                                          : SCM_UNDEFINED,
 783                                          SCM_EOL));
 784
 785
 786   return p;
 787 }
 788
 /* Symbols that scm_read_syntax puts at the head of the lists it
    returns.  */
 789 SCM_SYMBOL (sym_syntax, "syntax");
 790 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
 791 SCM_SYMBOL (sym_unsyntax, "unsyntax");
 792 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
 793
     /* Counterpart of scm_read_quote for syntax objects.  Read the
        expression following CHR and return the two-element list
        (SYMBOL EXPRESSION).  SYMBOL is `quasisyntax' for a backquote,
        `syntax' for `'', and for `,' either `unsyntax-splicing', when the
        next character is `@', or `unsyntax'.  Any other CHR prints a
        message on stderr and aborts the process.  Source properties are
        recorded as in scm_read_quote.  */
 794 static SCM
 795 scm_read_syntax (int chr, SCM port)
 796 {
 797   SCM p;
 798   long line = SCM_LINUM (port);
 799   int column = SCM_COL (port) - 1;
 800
 801   switch (chr)
 802     {
 803     case '`':
 804       p = sym_quasisyntax;
 805       break;
 806
 807     case '\'':
 808       p = sym_syntax;
 809       break;
 810
 811     case ',':
 812       {
 813         int c;
 814
            /* Peek at the next character to tell `,@' from `,'.  */
 815         c = scm_getc (port);
 816         if ('@' == c)
 817           p = sym_unsyntax_splicing;
 818         else
 819           {
 820             scm_ungetc (c, port);
 821             p = sym_unsyntax;
 822           }
 823         break;
 824       }
 825
 826     default:
 827       fprintf (stderr, "%s: unhandled syntax character (%i)\n",
 828                "scm_read_syntax", chr);
 829       abort ();
 830     }
 831
 832   p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
 833   if (SCM_RECORD_POSITIONS_P)
 834     scm_whash_insert (scm_source_whash, p,
 835                       scm_make_srcprops (line, column,
 836                                          SCM_FILENAME (port),
 837                                          SCM_COPY_SOURCE_P
 838                                          ? (scm_cons2 (SCM_CAR (p),
 839                                                        SCM_CAR (SCM_CDR (p)),
 840                                                        SCM_EOL))
 841                                          : SCM_UNDEFINED,
 842                                          SCM_EOL));
 843
 844
 845   return p;
 846 }
 847
 /* Read the token that starts with CHR as a symbol and check that it is
    `nil'.  Return SCM_ELISP_NIL if so; report an input error that shows
    the symbol actually read otherwise.  */
 848 static inline SCM
 849 scm_read_nil (int chr, SCM port)
 850 {
 851   SCM symbol = scm_read_mixed_case_symbol (chr, port);
 852
 853   if (scm_is_eq (symbol, sym_nil))
 854     return SCM_ELISP_NIL;
 855
 856   scm_i_input_error ("scm_read_nil", port,
 857                      "unexpected input while reading #nil: ~a",
 858                      scm_list_1 (symbol));
 859   return SCM_ELISP_NIL;
 860 }
 860
 /* Discard the rest of a `;' comment: consume bytes from PORT up to and
    including the next newline, or up to end of file.  CHR is not used.
    Always return SCM_UNSPECIFIED.  */
 861 static inline SCM
 862 scm_read_semicolon_comment (int chr, SCM port)
 863 {
 864   int byte;
 865
 866   /* Raw bytes are good enough here: comment text never needs decoding.
 867      This presumes that a newline is encoded as itself whatever the
 868      encoding of the port is.  */
 869   do
 870     byte = scm_get_byte_or_eof (port);
 871   while (byte != EOF && byte != '\n');
 872
 873   return SCM_UNSPECIFIED;
 874 }
 875
 876 \f
 877 /* Sharp readers, i.e. readers called after a `#' sign has been read.  */
 878
 /* Map the character CHR read after `#' to a boolean: `t' or `T' gives
    SCM_BOOL_T, `f' or `F' gives SCM_BOOL_F, and anything else gives
    SCM_UNSPECIFIED.  PORT is not used.  */
 879 static SCM
 880 scm_read_boolean (int chr, SCM port)
 881 {
 882   if (chr == 't' || chr == 'T')
 883     return SCM_BOOL_T;
 884
 885   if (chr == 'f' || chr == 'F')
 886     return SCM_BOOL_F;
 887
 888   return SCM_UNSPECIFIED;
 889 }
 895
 /* Read a character object from PORT.  The value of CHR on entry is not
    used; the name is read from PORT as a token.  The cases are tried in
    this order:
    - a token that fills the whole name buffer is rejected as too long;
    - an empty token means the next character is a delimiter or end of
      file: that character is read and returned, end of file is an error;
    - a single byte that is ASCII, or any single byte when the port has no
      encoding set, is returned directly;
    - otherwise the token is decoded: a single decoded character is
      returned as is, and a dotted circle followed by one character yields
      that character;
    - a name starting with an octal digit is tried as an octal code point,
      and `x' followed by more characters as a hexadecimal one;
    - finally the name is looked up with scm_i_charname_to_char.
    Anything else is reported as an unknown character name.  */
 896 static SCM
 897 scm_read_character (scm_t_wchar chr, SCM port)
 898 #define FUNC_NAME "scm_lreadr"
 899 {
 900   char buffer[READER_CHAR_NAME_MAX_SIZE];
 901   SCM charname;
 902   size_t charname_len, bytes_read;
 903   scm_t_wchar cp;
 904   int overflow;
 905   scm_t_port *pt;
 906
 907   overflow = read_token (port, buffer, READER_CHAR_NAME_MAX_SIZE, &bytes_read);
      /* CHARNAME has not been built yet at this point, so the error is
         reported here rather than through the "unknown character name"
         message at the end, which formats CHARNAME.  */
 908   if (overflow)
 909     scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
 910
 911   if (bytes_read == 0)
 912     {
 913       chr = scm_getc (port);
 914       if (chr == EOF)
 915         scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
 916                            "while reading character", SCM_EOL);
 917
 918       /* CHR must be a token delimiter, like a whitespace.  */
 919       return (SCM_MAKE_CHAR (chr));
 920     }
 921
 922   pt = SCM_PTAB_ENTRY (port);
 923
 924   /* Simple ASCII characters can be processed immediately.  Also, simple
 925      ISO-8859-1 characters can be processed immediately if the encoding for this
 926      port is ISO-8859-1.  */
 927   if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
 928     {
 929       SCM_COL (port) += 1;
 930       return SCM_MAKE_CHAR (buffer[0]);
 931     }
 932
 933   /* Otherwise, convert the buffer into a proper scheme string for
 934      processing.  */
 935   charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
 936                                pt->ilseq_handler);
 937   charname_len = scm_i_string_length (charname);
 938   SCM_COL (port) += charname_len;
 939   cp = scm_i_string_ref (charname, 0);
 940   if (charname_len == 1)
 941     return SCM_MAKE_CHAR (cp);
 942
 943   /* Ignore dotted circles, which may be used to keep combining characters from
 944      combining with the backslash in #\charname.  */
 945   if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
 946     return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
 947
 948   if (cp >= '0' && cp < '8')
 949     {
 950       /* Dirk:FIXME::  This type of character syntax is not R5RS
 951        * compliant.  Further, it should be verified that the constant
 952        * does only consist of octal digits.  */
 953       SCM p = scm_string_to_number (charname, scm_from_uint (8));
 954       if (SCM_I_INUMP (p))
 955         {
 956           scm_t_wchar c = SCM_I_INUM (p);
 957           if (SCM_IS_UNICODE_CHAR (c))
 958             return SCM_MAKE_CHAR (c);
 959           else
 960             scm_i_input_error (FUNC_NAME, port,
 961                                "out-of-range octal character escape: ~a",
 962                                scm_list_1 (charname));
 963         }
 964     }
 965
 966   if (cp == 'x' && (charname_len > 1))
 967     {
 968       SCM p;
 969
 970       /* Convert from hex, skipping the initial 'x' character in CHARNAME */
 971       p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
 972                                 scm_from_uint (16));
 973       if (SCM_I_INUMP (p))
 974         {
 975           scm_t_wchar c = SCM_I_INUM (p);
 976           if (SCM_IS_UNICODE_CHAR (c))
 977             return SCM_MAKE_CHAR (c);
 978           else
 979             scm_i_input_error (FUNC_NAME, port,
 980                                "out-of-range hex character escape: ~a",
 981                                scm_list_1 (charname));
 982         }
 983     }
 984
 985   /* The names of characters should never have non-Latin1
 986      characters.  */
 987   if (scm_i_is_narrow_string (charname)
 988       || scm_i_try_narrow_string (charname))
 989     { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
 990                                        charname_len);
 991       if (scm_is_true (ch))
 992         return ch;
 993     }
 994
      /* Every path that gets here has decoded CHARNAME above.  */
 996   scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
 997                      scm_list_1 (charname));
 998
 999   return SCM_UNSPECIFIED;
1000 }
1001 #undef FUNC_NAME
1002
1003 static inline SCM
1004 scm_read_keyword (int chr, SCM port)
1005 {
1006   SCM symbol;
1007
1008   /* Read the symbol that comprises the keyword.  Doing this instead of
1009      invoking a specific symbol reader function allows `scm_read_keyword ()'
1010      to adapt to the delimiters currently valid of symbols.
1011
1012      XXX: This implementation allows sloppy syntaxes like `#:  key'.  */
1013   symbol = scm_read_expression (port);
1014   if (!scm_is_symbol (symbol))
1015     scm_i_input_error ("scm_read_keyword", port,
1016                        "keyword prefix `~a' not followed by a symbol: ~s",
1017                        scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1018
1019   return (scm_symbol_to_keyword (symbol));
1020 }
1021
1022 static inline SCM
1023 scm_read_vector (int chr, SCM port)
1024 {
1025   /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1026      guarantee that it's going to do what we want.  After all, this is an
1027      implementation detail of `scm_read_vector ()', not a desirable
1028      property.  */
1029   return (scm_vector (scm_read_sexp (chr, port)));
1030 }
1031
1032 static inline SCM
1033 scm_read_srfi4_vector (int chr, SCM port)
1034 {
1035   return scm_i_read_array (port, chr);
1036 }
1037
1038 static SCM
1039 scm_read_bytevector (scm_t_wchar chr, SCM port)
1040 {
1041   chr = scm_getc (port);
1042   if (chr != 'u')
1043     goto syntax;
1044
1045   chr = scm_getc (port);
1046   if (chr != '8')
1047     goto syntax;
1048
1049   chr = scm_getc (port);
1050   if (chr != '(')
1051     goto syntax;
1052
1053   return scm_u8_list_to_bytevector (scm_read_sexp (chr, port));
1054
1055  syntax:
1056   scm_i_input_error ("read_bytevector", port,
1057                      "invalid bytevector prefix",
1058                      SCM_MAKE_CHAR (chr));
1059   return SCM_UNSPECIFIED;
1060 }
1061
1062 static SCM
1063 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port)
1064 {
1065   /* Read the `#*10101'-style read syntax for bit vectors in Guile.  This is
1066      terribly inefficient but who cares?  */
1067   SCM s_bits = SCM_EOL;
1068
1069   for (chr = scm_getc (port);
1070        (chr != EOF) && ((chr == '0') || (chr == '1'));
1071        chr = scm_getc (port))
1072     {
1073       s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1074     }
1075
1076   if (chr != EOF)
1077     scm_ungetc (chr, port);
1078
1079   return scm_bitvector (scm_reverse_x (s_bits, SCM_EOL));
1080 }
1081
1082 static inline SCM
1083 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1084 {
1085   int bang_seen = 0;
1086
1087   /* We can use the get_byte here because there is no need to get the
1088      locale correct when reading comments. This presumes that
1089      hash and exclamation points always represent themselves no
1090      matter what the source encoding is.*/
1091   for (;;)
1092     {
1093       int c = scm_get_byte_or_eof (port);
1094
1095       if (c == EOF)
1096         scm_i_input_error ("skip_block_comment", port,
1097                            "unterminated `#! ... !#' comment", SCM_EOL);
1098
1099       if (c == '!')
1100         bang_seen = 1;
1101       else if (c == '#' && bang_seen)
1102         break;
1103       else
1104         bang_seen = 0;
1105     }
1106
1107   return SCM_UNSPECIFIED;
1108 }
1109
1110 static inline SCM
1111 scm_read_shebang (scm_t_wchar chr, SCM port)
1112 {
1113   int c = 0;
1114   if ((c = scm_get_byte_or_eof (port)) != 'r')
1115     {
1116       scm_ungetc (c, port);
1117       return scm_read_scsh_block_comment (chr, port);
1118     }
1119   if ((c = scm_get_byte_or_eof (port)) != '6')
1120     {
1121       scm_ungetc (c, port);
1122       scm_ungetc ('r', port);
1123       return scm_read_scsh_block_comment (chr, port);
1124     }
1125   if ((c = scm_get_byte_or_eof (port)) != 'r')
1126     {
1127       scm_ungetc (c, port);
1128       scm_ungetc ('6', port);
1129       scm_ungetc ('r', port);
1130       return scm_read_scsh_block_comment (chr, port);
1131     }
1132   if ((c = scm_get_byte_or_eof (port)) != 's')
1133     {
1134       scm_ungetc (c, port);
1135       scm_ungetc ('r', port);
1136       scm_ungetc ('6', port);
1137       scm_ungetc ('r', port);
1138       return scm_read_scsh_block_comment (chr, port);
1139     }
1140
1141   return SCM_UNSPECIFIED;
1142 }
1143
1144 static SCM
1145 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1146 {
1147   /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1148      nested.  So care must be taken.  */
1149   int nesting_level = 1;
1150   int opening_seen = 0, closing_seen = 0;
1151
1152   while (nesting_level > 0)
1153     {
1154       int c = scm_getc (port);
1155
1156       if (c == EOF)
1157         scm_i_input_error ("scm_read_r6rs_block_comment", port,
1158                            "unterminated `#| ... |#' comment", SCM_EOL);
1159
1160       if (opening_seen)
1161         {
1162           if (c == '|')
1163             nesting_level++;
1164           opening_seen = 0;
1165         }
1166       else if (closing_seen)
1167         {
1168           if (c == '#')
1169             nesting_level--;
1170           closing_seen = 0;
1171         }
1172       else if (c == '|')
1173         closing_seen = 1;
1174       else if (c == '#')
1175         opening_seen = 1;
1176       else
1177         opening_seen = closing_seen = 0;
1178     }
1179
1180   return SCM_UNSPECIFIED;
1181 }
1182
1183 static SCM
1184 scm_read_commented_expression (scm_t_wchar chr, SCM port)
1185 {
1186   scm_t_wchar c;
1187
1188   c = flush_ws (port, (char *) NULL);
1189   if (EOF == c)
1190     scm_i_input_error ("read_commented_expression", port,
1191                        "no expression after #; comment", SCM_EOL);
1192   scm_ungetc (c, port);
1193   scm_read_expression (port);
1194   return SCM_UNSPECIFIED;
1195 }
1196
1197 static SCM
1198 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1199 {
1200   /* Guile's extended symbol read syntax looks like this:
1201
1202        #{This is all a symbol name}#
1203
1204      So here, CHR is expected to be `{'.  */
1205   int saw_brace = 0, finished = 0;
1206   size_t len = 0;
1207   SCM buf = scm_i_make_string (1024, NULL);
1208
1209   buf = scm_i_string_start_writing (buf);
1210
1211   while ((chr = scm_getc (port)) != EOF)
1212     {
1213       if (saw_brace)
1214         {
1215           if (chr == '#')
1216             {
1217               finished = 1;
1218               break;
1219             }
1220           else
1221             {
1222               saw_brace = 0;
1223               scm_i_string_set_x (buf, len++, '}');
1224               scm_i_string_set_x (buf, len++, chr);
1225             }
1226         }
1227       else if (chr == '}')
1228         saw_brace = 1;
1229       else
1230         scm_i_string_set_x (buf, len++, chr);
1231
1232       if (len >= scm_i_string_length (buf) - 2)
1233         {
1234           SCM addy;
1235
1236           scm_i_string_stop_writing ();
1237           addy = scm_i_make_string (1024, NULL);
1238           buf = scm_string_append (scm_list_2 (buf, addy));
1239           len = 0;
1240           buf = scm_i_string_start_writing (buf);
1241         }
1242
1243       if (finished)
1244         break;
1245     }
1246   scm_i_string_stop_writing ();
1247
1248   return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1249 }
1250
1251
1252 \f
1253 /* Top-level token readers, i.e., dispatchers.  */
1254
1255 static SCM
1256 scm_read_sharp_extension (int chr, SCM port)
1257 {
1258   SCM proc;
1259
1260   proc = scm_get_hash_procedure (chr);
1261   if (scm_is_true (scm_procedure_p (proc)))
1262     {
1263       long line = SCM_LINUM (port);
1264       int column = SCM_COL (port) - 2;
1265       SCM got;
1266
1267       got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1268       if (!scm_is_eq (got, SCM_UNSPECIFIED))
1269         {
1270           if (SCM_RECORD_POSITIONS_P)
1271             return (recsexpr (got, line, column,
1272                               SCM_FILENAME (port)));
1273           else
1274             return got;
1275         }
1276     }
1277
1278   return SCM_UNSPECIFIED;
1279 }
1280
1281 /* The reader for the sharp `#' character.  It basically dispatches reads
1282    among the above token readers.   */
1283 static SCM
1284 scm_read_sharp (scm_t_wchar chr, SCM port)
1285 #define FUNC_NAME "scm_lreadr"
1286 {
1287   SCM result;
1288
1289   chr = scm_getc (port);
1290
1291   result = scm_read_sharp_extension (chr, port);
1292   if (!scm_is_eq (result, SCM_UNSPECIFIED))
1293     return result;
1294
1295   switch (chr)
1296     {
1297     case '\\':
1298       return (scm_read_character (chr, port));
1299     case '(':
1300       return (scm_read_vector (chr, port));
1301     case 's':
1302     case 'u':
1303     case 'f':
1304       /* This one may return either a boolean or an SRFI-4 vector.  */
1305       return (scm_read_srfi4_vector (chr, port));
1306     case 'v':
1307       return (scm_read_bytevector (chr, port));
1308     case '*':
1309       return (scm_read_guile_bit_vector (chr, port));
1310     case 't':
1311     case 'T':
1312     case 'F':
1313       /* This one may return either a boolean or an SRFI-4 vector.  */
1314       return (scm_read_boolean (chr, port));
1315     case ':':
1316       return (scm_read_keyword (chr, port));
1317     case '0': case '1': case '2': case '3': case '4':
1318     case '5': case '6': case '7': case '8': case '9':
1319     case '@':
1320 #if SCM_ENABLE_DEPRECATED
1321       /* See below for 'i' and 'e'. */
1322     case 'a':
1323     case 'c':
1324     case 'y':
1325     case 'h':
1326     case 'l':
1327 #endif
1328       return (scm_i_read_array (port, chr));
1329
1330     case 'i':
1331     case 'e':
1332 #if SCM_ENABLE_DEPRECATED
1333       {
1334         /* When next char is '(', it really is an old-style
1335            uniform array. */
1336         scm_t_wchar next_c = scm_getc (port);
1337         if (next_c != EOF)
1338           scm_ungetc (next_c, port);
1339         if (next_c == '(')
1340           return scm_i_read_array (port, chr);
1341         /* Fall through. */
1342       }
1343 #endif
1344     case 'b':
1345     case 'B':
1346     case 'o':
1347     case 'O':
1348     case 'd':
1349     case 'D':
1350     case 'x':
1351     case 'X':
1352     case 'I':
1353     case 'E':
1354       return (scm_read_number_and_radix (chr, port));
1355     case '{':
1356       return (scm_read_extended_symbol (chr, port));
1357     case '!':
1358       return (scm_read_shebang (chr, port));
1359     case ';':
1360       return (scm_read_commented_expression (chr, port));
1361     case '`':
1362     case '\'':
1363     case ',':
1364       return (scm_read_syntax (chr, port));
1365     case 'n':
1366       return (scm_read_nil (chr, port));
1367     default:
1368       result = scm_read_sharp_extension (chr, port);
1369       if (scm_is_eq (result, SCM_UNSPECIFIED))
1370         {
1371           /* To remain compatible with 1.8 and earlier, the following
1372              characters have lower precedence than `read-hash-extend'
1373              characters.  */
1374           switch (chr)
1375             {
1376             case '|':
1377               return scm_read_r6rs_block_comment (chr, port);
1378             default:
1379               scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1380                                  scm_list_1 (SCM_MAKE_CHAR (chr)));
1381             }
1382         }
1383       else
1384         return result;
1385     }
1386
1387   return SCM_UNSPECIFIED;
1388 }
1389 #undef FUNC_NAME
1390
1391 static SCM
1392 scm_read_expression (SCM port)
1393 #define FUNC_NAME "scm_read_expression"
1394 {
1395   while (1)
1396     {
1397       register scm_t_wchar chr;
1398
1399       chr = scm_getc (port);
1400
1401       switch (chr)
1402         {
1403         case SCM_WHITE_SPACES:
1404         case SCM_LINE_INCREMENTORS:
1405           break;
1406         case ';':
1407           (void) scm_read_semicolon_comment (chr, port);
1408           break;
1409         case '[':
1410           if (!SCM_SQUARE_BRACKETS_P)
1411             return (scm_read_mixed_case_symbol (chr, port));
1412           /* otherwise fall through */
1413         case '(':
1414           return (scm_read_sexp (chr, port));
1415         case '"':
1416           return (scm_read_string (chr, port));
1417         case '\'':
1418         case '`':
1419         case ',':
1420           return (scm_read_quote (chr, port));
1421         case '#':
1422           {
1423             SCM result;
1424             result = scm_read_sharp (chr, port);
1425             if (scm_is_eq (result, SCM_UNSPECIFIED))
1426               /* We read a comment or some such.  */
1427               break;
1428             else
1429               return result;
1430           }
1431         case ')':
1432           scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1433           break;
1434         case EOF:
1435           return SCM_EOF_VAL;
1436         case ':':
1437           if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1438             return scm_symbol_to_keyword (scm_read_expression (port));
1439           /* Fall through.  */
1440
1441         default:
1442           {
1443             if (((chr >= '0') && (chr <= '9'))
1444                 || (strchr ("+-.", chr)))
1445               return (scm_read_number (chr, port));
1446             else
1447               return (scm_read_mixed_case_symbol (chr, port));
1448           }
1449         }
1450     }
1451 }
1452 #undef FUNC_NAME
1453
1454 \f
1455 /* Actual reader.  */
1456
1457 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1458             (SCM port),
1459             "Read an s-expression from the input port @var{port}, or from\n"
1460             "the current input port if @var{port} is not specified.\n"
1461             "Any whitespace before the next token is discarded.")
1462 #define FUNC_NAME s_scm_read
1463 {
1464   int c;
1465
1466   if (SCM_UNBNDP (port))
1467     port = scm_current_input_port ();
1468   SCM_VALIDATE_OPINPORT (1, port);
1469
1470   c = flush_ws (port, (char *) NULL);
1471   if (EOF == c)
1472     return SCM_EOF_VAL;
1473   scm_ungetc (c, port);
1474
1475   return (scm_read_expression (port));
1476 }
1477 #undef FUNC_NAME
1478
1479
1480 \f
1481
1482 /* Used when recording expressions constructed by `scm_read_sharp ()'.  */
1483 static SCM
1484 recsexpr (SCM obj, long line, int column, SCM filename)
1485 {
1486   if (!scm_is_pair(obj)) {
1487     return obj;
1488   } else {
1489     SCM tmp = obj, copy;
1490     /* If this sexpr is visible in the read:sharp source, we want to
1491        keep that information, so only record non-constant cons cells
1492        which haven't previously been read by the reader. */
1493     if (scm_is_false (scm_whash_lookup (scm_source_whash, obj)))
1494       {
1495         if (SCM_COPY_SOURCE_P)
1496           {
1497             copy = scm_cons (recsexpr (SCM_CAR (obj), line, column, filename),
1498                              SCM_UNDEFINED);
1499             while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1500               {
1501                 SCM_SETCDR (copy, scm_cons (recsexpr (SCM_CAR (tmp),
1502                                                       line,
1503                                                       column,
1504                                                       filename),
1505                                             SCM_UNDEFINED));
1506                 copy = SCM_CDR (copy);
1507               }
1508             SCM_SETCDR (copy, tmp);
1509           }
1510         else
1511           {
1512             recsexpr (SCM_CAR (obj), line, column, filename);
1513             while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1514               recsexpr (SCM_CAR (tmp), line, column, filename);
1515             copy = SCM_UNDEFINED;
1516           }
1517         scm_whash_insert (scm_source_whash,
1518                           obj,
1519                           scm_make_srcprops (line,
1520                                              column,
1521                                              filename,
1522                                              copy,
1523                                              SCM_EOL));
1524       }
1525     return obj;
1526   }
1527 }
1528
1529 /* Manipulate the read-hash-procedures alist.  This could be written in
1530    Scheme, but maybe it will also be used by C code during initialisation.  */
1531 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1532             (SCM chr, SCM proc),
1533             "Install the procedure @var{proc} for reading expressions\n"
1534             "starting with the character sequence @code{#} and @var{chr}.\n"
1535             "@var{proc} will be called with two arguments:  the character\n"
1536             "@var{chr} and the port to read further data from. The object\n"
1537             "returned will be the return value of @code{read}. \n"
1538             "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1539             )
1540 #define FUNC_NAME s_scm_read_hash_extend
1541 {
1542   SCM this;
1543   SCM prev;
1544
1545   SCM_VALIDATE_CHAR (1, chr);
1546   SCM_ASSERT (scm_is_false (proc)
1547               || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1548               proc, SCM_ARG2, FUNC_NAME);
1549
1550   /* Check if chr is already in the alist.  */
1551   this = *scm_read_hash_procedures;
1552   prev = SCM_BOOL_F;
1553   while (1)
1554     {
1555       if (scm_is_null (this))
1556         {
1557           /* not found, so add it to the beginning.  */
1558           if (scm_is_true (proc))
1559             {
1560               *scm_read_hash_procedures =
1561                 scm_cons (scm_cons (chr, proc), *scm_read_hash_procedures);
1562             }
1563           break;
1564         }
1565       if (scm_is_eq (chr, SCM_CAAR (this)))
1566         {
1567           /* already in the alist.  */
1568           if (scm_is_false (proc))
1569             {
1570               /* remove it.  */
1571               if (scm_is_false (prev))
1572                 {
1573                   *scm_read_hash_procedures =
1574                     SCM_CDR (*scm_read_hash_procedures);
1575                 }
1576               else
1577                 scm_set_cdr_x (prev, SCM_CDR (this));
1578             }
1579           else
1580             {
1581               /* replace it.  */
1582               scm_set_cdr_x (SCM_CAR (this), proc);
1583             }
1584           break;
1585         }
1586       prev = this;
1587       this = SCM_CDR (this);
1588     }
1589
1590   return SCM_UNSPECIFIED;
1591 }
1592 #undef FUNC_NAME
1593
1594 /* Recover the read-hash procedure corresponding to char c.  */
1595 static SCM
1596 scm_get_hash_procedure (int c)
1597 {
1598   SCM rest = *scm_read_hash_procedures;
1599
1600   while (1)
1601     {
1602       if (scm_is_null (rest))
1603         return SCM_BOOL_F;
1604
1605       if (SCM_CHAR (SCM_CAAR (rest)) == c)
1606         return SCM_CDAR (rest);
1607
1608       rest = SCM_CDR (rest);
1609     }
1610 }
1611
1612 #define SCM_ENCODING_SEARCH_SIZE (500)
1613
1614 /* Search the first few hundred characters of a file for an Emacs-like coding
1615    declaration.  Returns either NULL or a string whose storage has been
1616    allocated with `scm_gc_malloc ()'.  */
1617 char *
1618 scm_i_scan_for_encoding (SCM port)
1619 {
1620   char header[SCM_ENCODING_SEARCH_SIZE+1];
1621   size_t bytes_read, encoding_length, i;
1622   char *encoding = NULL;
1623   int utf8_bom = 0;
1624   char *pos, *encoding_start;
1625   int in_comment;
1626
1627   if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1628     /* PORT is a non-seekable file port (e.g., as created by Bash when using
1629        "guile <(echo '(display "hello")')") so bail out.  */
1630     return NULL;
1631
1632   bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1633
1634   scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1635
1636   if (bytes_read > 3
1637       && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1638     utf8_bom = 1;
1639
1640   /* search past "coding[:=]" */
1641   pos = header;
1642   while (1)
1643     {
1644       if ((pos = strstr(pos, "coding")) == NULL)
1645         return NULL;
1646
1647       pos += strlen("coding");
1648       if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1649           (*pos == ':' || *pos == '='))
1650         {
1651           pos ++;
1652           break;
1653         }
1654     }
1655
1656   /* skip spaces */
1657   while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1658          (*pos == ' ' || *pos == '\t'))
1659     pos ++;
1660
1661   /* grab the next token */
1662   encoding_start = pos;
1663   i = 0;
1664   while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
1665          && encoding_start + i - header < bytes_read
1666          && (isalnum ((int) encoding_start[i])
1667              || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
1668     i++;
1669
1670   encoding_length = i;
1671   if (encoding_length == 0)
1672     return NULL;
1673
1674   encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
1675   for (i = 0; i < encoding_length; i++)
1676     encoding[i] = toupper ((int) encoding[i]);
1677
1678   /* push backwards to make sure we were in a comment */
1679   in_comment = 0;
1680   pos = encoding_start;
1681   while (pos >= header)
1682     {
1683       if (*pos == '\n')
1684         {
1685           /* This wasn't in a semicolon comment. Check for a
1686            hash-bang comment. */
1687           char *beg = strstr (header, "#!");
1688           char *end = strstr (header, "!#");
1689           if (beg < encoding_start && encoding_start + encoding_length < end)
1690             in_comment = 1;
1691           break;
1692         }
1693       if (*pos == ';')
1694         {
1695           in_comment = 1;
1696           break;
1697         }
1698       pos --;
1699     }
1700   if (!in_comment)
1701     /* This wasn't in a comment */
1702     return NULL;
1703
1704   if (utf8_bom && strcmp(encoding, "UTF-8"))
1705     scm_misc_error (NULL,
1706                     "the port input declares the encoding ~s but is encoded as UTF-8",
1707                     scm_list_1 (scm_from_locale_string (encoding)));
1708
1709   return encoding;
1710 }
1711
1712 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1713             (SCM port),
1714             "Scans the port for an Emacs-like character coding declaration\n"
1715             "near the top of the contents of a port with random-acessible contents.\n"
1716             "The coding declaration is of the form\n"
1717             "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1718             "\n"
1719             "Returns a string containing the character encoding of the file\n"
1720             "if a declaration was found, or @code{#f} otherwise.\n")
1721 #define FUNC_NAME s_scm_file_encoding
1722 {
1723   char *enc;
1724   SCM s_enc;
1725
1726   enc = scm_i_scan_for_encoding (port);
1727   if (enc == NULL)
1728     return SCM_BOOL_F;
1729   else
1730     {
1731       s_enc = scm_from_locale_string (enc);
1732       return s_enc;
1733     }
1734
1735   return SCM_BOOL_F;
1736 }
1737 #undef FUNC_NAME
1738
1739 void
1740 scm_init_read ()
1741 {
1742   scm_read_hash_procedures =
1743     SCM_VARIABLE_LOC (scm_c_define ("read-hash-procedures", SCM_EOL));
1744
1745   scm_init_opts (scm_read_options, scm_read_opts);
1746 #include "libguile/read.x"
1747 }
1748
1749 /*
1750   Local Variables:
1751   c-file-style: "gnu"
1752   End:
1753 */