libguile/read.c

   1 /* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009, 2010 Free Software
   2  * Foundation, Inc.
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public License
   6  * as published by the Free Software Foundation; either version 3 of
   7  * the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful, but
  10  * WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  17  * 02110-1301 USA
  18  */
  19
  20
  21 \f
  22
  23 #ifdef HAVE_CONFIG_H
  24 # include <config.h>
  25 #endif
  26
  27 #include <stdio.h>
  28 #include <ctype.h>
  29 #include <string.h>
  30 #include <unistd.h>
  31 #include <unicase.h>
  32
  33 #include "libguile/_scm.h"
  34 #include "libguile/bytevectors.h"
  35 #include "libguile/chars.h"
  36 #include "libguile/eval.h"
  37 #include "libguile/arrays.h"
  38 #include "libguile/bitvectors.h"
  39 #include "libguile/keywords.h"
  40 #include "libguile/alist.h"
  41 #include "libguile/srcprop.h"
  42 #include "libguile/hashtab.h"
  43 #include "libguile/hash.h"
  44 #include "libguile/ports.h"
  45 #include "libguile/fports.h"
  46 #include "libguile/root.h"
  47 #include "libguile/strings.h"
  48 #include "libguile/strports.h"
  49 #include "libguile/vectors.h"
  50 #include "libguile/validate.h"
  51 #include "libguile/srfi-4.h"
  52 #include "libguile/srfi-13.h"
  53
  54 #include "libguile/read.h"
  55 #include "libguile/private-options.h"
  56
  57
  58 \f
  59
  60 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
  61 SCM_SYMBOL (scm_keyword_prefix, "prefix");
  62 SCM_SYMBOL (scm_keyword_postfix, "postfix");
  63
  64 scm_t_option scm_read_opts[] = {
  65   { SCM_OPTION_BOOLEAN, "copy", 0,
  66     "Copy source code expressions." },
  67   { SCM_OPTION_BOOLEAN, "positions", 0,
  68     "Record positions of source code expressions." },
  69   { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
  70     "Convert symbols to lower case."},
  71   { SCM_OPTION_SCM, "keywords", (unsigned long) SCM_BOOL_F,
  72     "Style of keyword recognition: #f, 'prefix or 'postfix."},
  73   { SCM_OPTION_BOOLEAN, "elisp-vectors", 0,
  74     "Support Elisp vector syntax, namely `[...]'."},
  75   { SCM_OPTION_BOOLEAN, "elisp-strings", 0,
  76     "Support `\\(' and `\\)' in strings."},
  77   { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
  78     "Use R6RS variable-length character and string hex escapes."},
  79   { SCM_OPTION_BOOLEAN, "square-brackets", 1,
  80     "Treat `[' and `]' as parentheses, for R6RS compatibility."},
  81   { 0, },
  82 };
  83
  84 /*
  85   Give meaningful error messages for errors
  86
  87   We use the format
  88
  89   FILE:LINE:COL: MESSAGE
  90   This happened in ....
  91
  92   This is not standard GNU format, but the test-suite likes the real
  93   message to be in front.
  94
  95  */
  96
  97
  98 void
  99 scm_i_input_error (char const *function,
 100                    SCM port, const char *message, SCM arg)
 101 {
 102   SCM fn = (scm_is_string (SCM_FILENAME(port))
 103             ? SCM_FILENAME(port)
 104             : scm_from_locale_string ("#<unknown port>"));
 105
 106   SCM string_port = scm_open_output_string ();
 107   SCM string = SCM_EOL;
 108   scm_simple_format (string_port,
 109                      scm_from_locale_string ("~A:~S:~S: ~A"),
 110                      scm_list_4 (fn,
 111                                  scm_from_long (SCM_LINUM (port) + 1),
 112                                  scm_from_int (SCM_COL (port) + 1),
 113                                  scm_from_locale_string (message)));
 114
 115   string = scm_get_output_string (string_port);
 116   scm_close_output_port (string_port);
 117   scm_error_scm (scm_from_locale_symbol ("read-error"),
 118                  function? scm_from_locale_string (function) : SCM_BOOL_F,
 119                  string,
 120                  arg,
 121                  SCM_BOOL_F);
 122 }
 123
 124
 125 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
 126             (SCM setting),
 127             "Option interface for the read options. Instead of using\n"
 128             "this procedure directly, use the procedures @code{read-enable},\n"
 129             "@code{read-disable}, @code{read-set!} and @code{read-options}.")
 130 #define FUNC_NAME s_scm_read_options
 131 {
 132   SCM ans = scm_options (setting,
 133                          scm_read_opts,
 134                          FUNC_NAME);
 135   if (SCM_COPY_SOURCE_P)
 136     SCM_RECORD_POSITIONS_P = 1;
 137   return ans;
 138 }
 139 #undef FUNC_NAME
 140
 141 /* An association list mapping extra hash characters to procedures.  */
 142 static SCM *scm_read_hash_procedures;
 143
 144
 145 \f
 146 /* Token readers.  */
 147
 148
 149 /* Size of the C buffer used to read symbols and numbers.  */
 150 #define READER_BUFFER_SIZE            128
 151
 152 /* Size of the C buffer used to read strings.  */
 153 #define READER_STRING_BUFFER_SIZE     512
 154
 155 /* The maximum size of Scheme character names.  */
 156 #define READER_CHAR_NAME_MAX_SIZE      50
 157
 158
 159 /* `isblank' is only in C99.  */
 160 #define CHAR_IS_BLANK_(_chr)                                    \
 161   (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n')      \
 162    || ((_chr) == '\f') || ((_chr) == '\r'))
 163
 164 #ifdef MSDOS
 165 # define CHAR_IS_BLANK(_chr)                    \
 166   ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
 167 #else
 168 # define CHAR_IS_BLANK CHAR_IS_BLANK_
 169 #endif
 170
 171
 172 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
 173    structure'').  */
 174 #define CHAR_IS_R5RS_DELIMITER(c)                               \
 175   (CHAR_IS_BLANK (c)                                            \
 176    || (c == ')') || (c == '(') || (c == ';') || (c == '"')      \
 177    || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
 178
 179 #define CHAR_IS_DELIMITER  CHAR_IS_R5RS_DELIMITER
 180
 181 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
 182    Structure''.  */
 183 #define CHAR_IS_EXPONENT_MARKER(_chr)                           \
 184   (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f')        \
 185    || ((_chr) == 'd') || ((_chr) == 'l'))
 186
 187 /* Read an SCSH block comment.  */
 188 static inline SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
 189 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
 190 static SCM scm_read_commented_expression (scm_t_wchar, SCM);
 191 static SCM scm_get_hash_procedure (int);
 192
 193 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 194    result in the pre-allocated buffer BUF.  Return zero if the whole token has
 195    fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
 196    bytes actually read.  */
 197 static inline int
 198 read_token (SCM port, char *buf, const size_t buf_size, size_t *read)
 199  {
 200    *read = 0;
 201
 202    while (*read < buf_size)
 203      {
 204        int chr;
 205
 206        chr = scm_get_byte_or_eof (port);
 207
 208        if (chr == EOF)
 209         return 0;
 210       else if (CHAR_IS_DELIMITER (chr))
 211         {
 212           scm_unget_byte (chr, port);
 213           return 0;
 214         }
 215       else
 216         {
 217           *buf = (char) chr;
 218           buf++, (*read)++;
 219         }
 220      }
 221
 222    return 1;
 223  }
 224
 225 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 226    result in the pre-allocated buffer BUFFER, if the whole token has fewer than
 227    BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
 228    caller.  Return zero if the token fits in BUFFER, non-zero otherwise. READ
 229    will be set the number of bytes actually read.  */
 230 static int
 231 read_complete_token (SCM port, char *buffer, const size_t buffer_size,
 232                            char **overflow_buffer, size_t *read)
 233 {
 234   int overflow = 0;
 235   size_t bytes_read, overflow_size;
 236
 237   *overflow_buffer = NULL;
 238   overflow_size = 0;
 239
 240   do
 241     {
 242       overflow = read_token (port, buffer, buffer_size, &bytes_read);
 243       if (bytes_read == 0)
 244         break;
 245       if (overflow || overflow_size != 0)
 246         {
 247           if (overflow_size == 0)
 248             {
 249               *overflow_buffer = scm_malloc (bytes_read);
 250               memcpy (*overflow_buffer, buffer, bytes_read);
 251               overflow_size = bytes_read;
 252             }
 253           else
 254             {
 255               *overflow_buffer = scm_realloc (*overflow_buffer, overflow_size + bytes_read);
 256               memcpy (*overflow_buffer + overflow_size, buffer, bytes_read);
 257               overflow_size += bytes_read;
 258             }
 259         }
 260     }
 261   while (overflow);
 262
 263   if (overflow_size)
 264     *read = overflow_size;
 265   else
 266     *read = bytes_read;
 267
 268   return (overflow_size != 0);
 269 }
 270
 271 /* Skip whitespace from PORT and return the first non-whitespace character
 272    read.  Raise an error on end-of-file.  */
 273 static int
 274 flush_ws (SCM port, const char *eoferr)
 275 {
 276   register scm_t_wchar c;
 277   while (1)
 278     switch (c = scm_getc (port))
 279       {
 280       case EOF:
 281       goteof:
 282         if (eoferr)
 283           {
 284             scm_i_input_error (eoferr,
 285                                port,
 286                                "end of file",
 287                                SCM_EOL);
 288           }
 289         return c;
 290
 291       case ';':
 292       lp:
 293         switch (c = scm_getc (port))
 294           {
 295           case EOF:
 296             goto goteof;
 297           default:
 298             goto lp;
 299           case SCM_LINE_INCREMENTORS:
 300             break;
 301           }
 302         break;
 303
 304       case '#':
 305         switch (c = scm_getc (port))
 306           {
 307           case EOF:
 308             eoferr = "read_sharp";
 309             goto goteof;
 310           case '!':
 311             scm_read_scsh_block_comment (c, port);
 312             break;
 313           case ';':
 314             scm_read_commented_expression (c, port);
 315             break;
 316           case '|':
 317             if (scm_is_false (scm_get_hash_procedure (c)))
 318               {
 319                 scm_read_r6rs_block_comment (c, port);
 320                 break;
 321               }
 322             /* fall through */
 323           default:
 324             scm_ungetc (c, port);
 325             return '#';
 326           }
 327         break;
 328
 329       case SCM_LINE_INCREMENTORS:
 330       case SCM_SINGLE_SPACES:
 331       case '\t':
 332         break;
 333
 334       default:
 335         return c;
 336       }
 337
 338   return 0;
 339 }
 340
 341
 342 \f
 343 /* Token readers.  */
 344
 345 static SCM scm_read_expression (SCM port);
 346 static SCM scm_read_sharp (int chr, SCM port);
 347 static SCM recsexpr (SCM obj, long line, int column, SCM filename);
 348
 349
 350 static SCM
 351 scm_read_sexp (scm_t_wchar chr, SCM port)
 352 #define FUNC_NAME "scm_i_lreadparen"
 353 {
 354   register int c;
 355   register SCM tmp;
 356   register SCM tl, ans = SCM_EOL;
 357   SCM tl2 = SCM_EOL, ans2 = SCM_EOL, copy = SCM_BOOL_F;
 358   const int terminating_char = ((chr == '[') ? ']' : ')');
 359
 360   /* Need to capture line and column numbers here. */
 361   long line = SCM_LINUM (port);
 362   int column = SCM_COL (port) - 1;
 363
 364
 365   c = flush_ws (port, FUNC_NAME);
 366   if (terminating_char == c)
 367     return SCM_EOL;
 368
 369   scm_ungetc (c, port);
 370   if (scm_is_eq (scm_sym_dot,
 371                  (tmp = scm_read_expression (port))))
 372     {
 373       ans = scm_read_expression (port);
 374       if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
 375         scm_i_input_error (FUNC_NAME, port, "missing close paren",
 376                            SCM_EOL);
 377       return ans;
 378     }
 379
 380   /* Build the head of the list structure. */
 381   ans = tl = scm_cons (tmp, SCM_EOL);
 382
 383   if (SCM_COPY_SOURCE_P)
 384     ans2 = tl2 = scm_cons (scm_is_pair (tmp)
 385                            ? copy
 386                            : tmp,
 387                            SCM_EOL);
 388
 389   while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
 390     {
 391       SCM new_tail;
 392
 393       scm_ungetc (c, port);
 394       if (scm_is_eq (scm_sym_dot,
 395                      (tmp = scm_read_expression (port))))
 396         {
 397           SCM_SETCDR (tl, tmp = scm_read_expression (port));
 398
 399           if (SCM_COPY_SOURCE_P)
 400             SCM_SETCDR (tl2, scm_cons (scm_is_pair (tmp) ? copy : tmp,
 401                                        SCM_EOL));
 402
 403           c = flush_ws (port, FUNC_NAME);
 404           if (terminating_char != c)
 405             scm_i_input_error (FUNC_NAME, port,
 406                                "in pair: missing close paren", SCM_EOL);
 407           goto exit;
 408         }
 409
 410       new_tail = scm_cons (tmp, SCM_EOL);
 411       SCM_SETCDR (tl, new_tail);
 412       tl = new_tail;
 413
 414       if (SCM_COPY_SOURCE_P)
 415         {
 416           SCM new_tail2 = scm_cons (scm_is_pair (tmp)
 417                                     ? copy
 418                                     : tmp, SCM_EOL);
 419           SCM_SETCDR (tl2, new_tail2);
 420           tl2 = new_tail2;
 421         }
 422     }
 423
 424  exit:
 425   if (SCM_RECORD_POSITIONS_P)
 426     scm_whash_insert (scm_source_whash,
 427                       ans,
 428                       scm_make_srcprops (line, column,
 429                                          SCM_FILENAME (port),
 430                                          SCM_COPY_SOURCE_P
 431                                          ? ans2
 432                                          : SCM_UNDEFINED,
 433                                          SCM_EOL));
 434   return ans;
 435 }
 436 #undef FUNC_NAME
 437
 438
 439 /* Read a hexadecimal number NDIGITS in length.  Put its value into the variable
 440    C.  If TERMINATOR is non-null, terminate early if the TERMINATOR character is
 441    found.  */
 442 #define SCM_READ_HEX_ESCAPE(ndigits, terminator)                   \
 443   do                                                               \
 444     {                                                              \
 445       scm_t_wchar a;                                               \
 446       size_t i = 0;                                                \
 447       c = 0;                                                       \
 448       while (i < ndigits)                                          \
 449         {                                                          \
 450           a = scm_getc (port);                                     \
 451           if (a == EOF)                                            \
 452             goto str_eof;                                          \
 453           if (terminator                                           \
 454               && (a == (scm_t_wchar) terminator)                   \
 455               && (i > 0))                                          \
 456             break;                                                 \
 457           if ('0' <= a && a <= '9')                                \
 458             a -= '0';                                              \
 459           else if ('A' <= a && a <= 'F')                           \
 460             a = a - 'A' + 10;                                      \
 461           else if ('a' <= a && a <= 'f')                           \
 462             a = a - 'a' + 10;                                      \
 463           else                                                     \
 464             {                                                      \
 465               c = a;                                               \
 466               goto bad_escaped;                                    \
 467             }                                                      \
 468           c = c * 16 + a;                                          \
 469           i ++;                                                    \
 470         }                                                          \
 471     } while (0)
 472
 473 static SCM
 474 scm_read_string (int chr, SCM port)
 475 #define FUNC_NAME "scm_lreadr"
 476 {
 477   /* For strings smaller than C_STR, this function creates only one Scheme
 478      object (the string returned).  */
 479
 480   SCM str = SCM_BOOL_F;
 481   unsigned c_str_len = 0;
 482   scm_t_wchar c;
 483
 484   str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
 485   while ('"' != (c = scm_getc (port)))
 486     {
 487       if (c == EOF)
 488         {
 489         str_eof:
 490           scm_i_input_error (FUNC_NAME, port,
 491                              "end of file in string constant", SCM_EOL);
 492         }
 493
 494       if (c_str_len + 1 >= scm_i_string_length (str))
 495         {
 496           SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
 497
 498           str = scm_string_append (scm_list_2 (str, addy));
 499         }
 500
 501       if (c == '\\')
 502         {
 503           switch (c = scm_getc (port))
 504             {
 505             case EOF:
 506               goto str_eof;
 507             case '"':
 508             case '\\':
 509               break;
 510             case '(':
 511             case ')':
 512               if (SCM_ESCAPED_PARENS_P)
 513                 break;
 514               goto bad_escaped;
 515             case '\n':
 516               continue;
 517             case '0':
 518               c = '\0';
 519               break;
 520             case 'f':
 521               c = '\f';
 522               break;
 523             case 'n':
 524               c = '\n';
 525               break;
 526             case 'r':
 527               c = '\r';
 528               break;
 529             case 't':
 530               c = '\t';
 531               break;
 532             case 'a':
 533               c = '\007';
 534               break;
 535             case 'v':
 536               c = '\v';
 537               break;
 538             case 'b':
 539               c = '\010';
 540               break;
 541             case 'x':
 542               if (SCM_R6RS_ESCAPES_P)
 543                 SCM_READ_HEX_ESCAPE (10, ';');
 544               else
 545                 SCM_READ_HEX_ESCAPE (2, '\0');
 546               break;
 547             case 'u':
 548               if (!SCM_R6RS_ESCAPES_P)
 549                 {
 550                   SCM_READ_HEX_ESCAPE (4, '\0');
 551                   break;
 552                 }
 553             case 'U':
 554               if (!SCM_R6RS_ESCAPES_P)
 555                 {
 556                   SCM_READ_HEX_ESCAPE (6, '\0');
 557                   break;
 558                 }
 559             default:
 560             bad_escaped:
 561               scm_i_input_error (FUNC_NAME, port,
 562                                  "illegal character in escape sequence: ~S",
 563                                  scm_list_1 (SCM_MAKE_CHAR (c)));
 564             }
 565         }
 566       str = scm_i_string_start_writing (str);
 567       scm_i_string_set_x (str, c_str_len++, c);
 568       scm_i_string_stop_writing ();
 569     }
 570
 571   if (c_str_len > 0)
 572     {
 573       return scm_i_substring_copy (str, 0, c_str_len);
 574     }
 575
 576   return scm_nullstr;
 577 }
 578 #undef FUNC_NAME
 579
 580
 581 static SCM
 582 scm_read_number (scm_t_wchar chr, SCM port)
 583 {
 584   SCM result, str = SCM_EOL;
 585   char buffer[READER_BUFFER_SIZE];
 586   char *overflow_buffer = NULL;
 587   size_t bytes_read;
 588   int overflow;
 589   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 590
 591   scm_ungetc (chr, port);
 592   overflow = read_complete_token (port, buffer, sizeof (buffer),
 593                                   &overflow_buffer, &bytes_read);
 594
 595   if (!overflow)
 596     str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
 597   else
 598     str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
 599                             pt->ilseq_handler);
 600
 601   result = scm_string_to_number (str, SCM_UNDEFINED);
 602   if (!scm_is_true (result))
 603     {
 604       /* Return a symbol instead of a number */
 605       if (SCM_CASE_INSENSITIVE_P)
 606         str = scm_string_downcase_x (str);
 607       result = scm_string_to_symbol (str);
 608     }
 609
 610   if (overflow)
 611     free (overflow_buffer);
 612   SCM_COL (port) += scm_i_string_length (str);
 613   return result;
 614 }
 615
 616 static SCM
 617 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
 618 {
 619   SCM result;
 620   int ends_with_colon = 0;
 621   size_t bytes_read;
 622   int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
 623   int overflow;
 624   char buffer[READER_BUFFER_SIZE], *overflow_buffer;
 625   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 626   SCM str;
 627
 628   scm_ungetc (chr, port);
 629   overflow = read_complete_token (port, buffer, READER_BUFFER_SIZE,
 630                                   &overflow_buffer, &bytes_read);
 631   if (bytes_read > 0)
 632     {
 633       if (!overflow)
 634         ends_with_colon = buffer[bytes_read - 1] == ':';
 635       else
 636         ends_with_colon = overflow_buffer[bytes_read - 1] == ':';
 637     }
 638
 639   if (postfix && ends_with_colon && (bytes_read > 1))
 640     {
 641       if (!overflow)
 642         str = scm_from_stringn (buffer, bytes_read - 1, pt->encoding, pt->ilseq_handler);
 643       else
 644         str = scm_from_stringn (overflow_buffer, bytes_read - 1, pt->encoding,
 645                                 pt->ilseq_handler);
 646
 647       if (SCM_CASE_INSENSITIVE_P)
 648         str = scm_string_downcase_x (str);
 649       result = scm_symbol_to_keyword (scm_string_to_symbol (str));
 650     }
 651   else
 652     {
 653       if (!overflow)
 654         str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
 655       else
 656         str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
 657                                 pt->ilseq_handler);
 658
 659       if (SCM_CASE_INSENSITIVE_P)
 660         str = scm_string_downcase_x (str);
 661       result = scm_string_to_symbol (str);
 662     }
 663
 664   if (overflow)
 665     free (overflow_buffer);
 666   SCM_COL (port) += scm_i_string_length (str);
 667   return result;
 668 }
 669
 670 static SCM
 671 scm_read_number_and_radix (scm_t_wchar chr, SCM port)
 672 #define FUNC_NAME "scm_lreadr"
 673 {
 674   SCM result;
 675   size_t read;
 676   char buffer[READER_BUFFER_SIZE], *overflow_buffer;
 677   int overflow;
 678   unsigned int radix;
 679   SCM str;
 680   scm_t_port *pt;
 681
 682   switch (chr)
 683     {
 684     case 'B':
 685     case 'b':
 686       radix = 2;
 687       break;
 688
 689     case 'o':
 690     case 'O':
 691       radix = 8;
 692       break;
 693
 694     case 'd':
 695     case 'D':
 696       radix = 10;
 697       break;
 698
 699     case 'x':
 700     case 'X':
 701       radix = 16;
 702       break;
 703
 704     default:
 705       scm_ungetc (chr, port);
 706       scm_ungetc ('#', port);
 707       radix = 10;
 708     }
 709
 710   overflow = read_complete_token (port, buffer, sizeof (buffer),
 711                                   &overflow_buffer, &read);
 712
 713   pt = SCM_PTAB_ENTRY (port);
 714   if (!overflow)
 715     str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
 716   else
 717     str = scm_from_stringn (overflow_buffer, read, pt->encoding,
 718                             pt->ilseq_handler);
 719
 720   result = scm_string_to_number (str, scm_from_uint (radix));
 721
 722   if (overflow)
 723     free (overflow_buffer);
 724
 725   SCM_COL (port) += scm_i_string_length (str);
 726
 727   if (scm_is_true (result))
 728     return result;
 729
 730   scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
 731
 732   return SCM_BOOL_F;
 733 }
 734 #undef FUNC_NAME
 735
 736 static SCM
 737 scm_read_quote (int chr, SCM port)
 738 {
 739   SCM p;
 740   long line = SCM_LINUM (port);
 741   int column = SCM_COL (port) - 1;
 742
 743   switch (chr)
 744     {
 745     case '`':
 746       p = scm_sym_quasiquote;
 747       break;
 748
 749     case '\'':
 750       p = scm_sym_quote;
 751       break;
 752
 753     case ',':
 754       {
 755         scm_t_wchar c;
 756
 757         c = scm_getc (port);
 758         if ('@' == c)
 759           p = scm_sym_uq_splicing;
 760         else
 761           {
 762             scm_ungetc (c, port);
 763             p = scm_sym_unquote;
 764           }
 765         break;
 766       }
 767
 768     default:
 769       fprintf (stderr, "%s: unhandled quote character (%i)\n",
 770                "scm_read_quote", chr);
 771       abort ();
 772     }
 773
 774   p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
 775   if (SCM_RECORD_POSITIONS_P)
 776     scm_whash_insert (scm_source_whash, p,
 777                       scm_make_srcprops (line, column,
 778                                          SCM_FILENAME (port),
 779                                          SCM_COPY_SOURCE_P
 780                                          ? (scm_cons2 (SCM_CAR (p),
 781                                                        SCM_CAR (SCM_CDR (p)),
 782                                                        SCM_EOL))
 783                                          : SCM_UNDEFINED,
 784                                          SCM_EOL));
 785
 786
 787   return p;
 788 }
 789
 790 SCM_SYMBOL (sym_syntax, "syntax");
 791 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
 792 SCM_SYMBOL (sym_unsyntax, "unsyntax");
 793 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
 794
 795 static SCM
 796 scm_read_syntax (int chr, SCM port)
 797 {
 798   SCM p;
 799   long line = SCM_LINUM (port);
 800   int column = SCM_COL (port) - 1;
 801
 802   switch (chr)
 803     {
 804     case '`':
 805       p = sym_quasisyntax;
 806       break;
 807
 808     case '\'':
 809       p = sym_syntax;
 810       break;
 811
 812     case ',':
 813       {
 814         int c;
 815
 816         c = scm_getc (port);
 817         if ('@' == c)
 818           p = sym_unsyntax_splicing;
 819         else
 820           {
 821             scm_ungetc (c, port);
 822             p = sym_unsyntax;
 823           }
 824         break;
 825       }
 826
 827     default:
 828       fprintf (stderr, "%s: unhandled syntax character (%i)\n",
 829                "scm_read_syntax", chr);
 830       abort ();
 831     }
 832
 833   p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
 834   if (SCM_RECORD_POSITIONS_P)
 835     scm_whash_insert (scm_source_whash, p,
 836                       scm_make_srcprops (line, column,
 837                                          SCM_FILENAME (port),
 838                                          SCM_COPY_SOURCE_P
 839                                          ? (scm_cons2 (SCM_CAR (p),
 840                                                        SCM_CAR (SCM_CDR (p)),
 841                                                        SCM_EOL))
 842                                          : SCM_UNDEFINED,
 843                                          SCM_EOL));
 844
 845
 846   return p;
 847 }
 848
 849 static inline SCM
 850 scm_read_semicolon_comment (int chr, SCM port)
 851 {
 852   int c;
 853
 854   /* We use the get_byte here because there is no need to get the
 855      locale correct with comment input. This presumes that newline
 856      always represents itself no matter what the encoding is.  */
 857   for (c = scm_get_byte_or_eof (port);
 858        (c != EOF) && (c != '\n');
 859        c = scm_get_byte_or_eof (port));
 860
 861   return SCM_UNSPECIFIED;
 862 }
 863
 864 \f
 865 /* Sharp readers, i.e. readers called after a `#' sign has been read.  */
 866
 867 static SCM
 868 scm_read_boolean (int chr, SCM port)
 869 {
 870   switch (chr)
 871     {
 872     case 't':
 873     case 'T':
 874       return SCM_BOOL_T;
 875
 876     case 'f':
 877     case 'F':
 878       return SCM_BOOL_F;
 879     }
 880
 881   return SCM_UNSPECIFIED;
 882 }
 883
 884 static SCM
 885 scm_read_character (scm_t_wchar chr, SCM port)
 886 #define FUNC_NAME "scm_lreadr"
 887 {
 888   char buffer[READER_CHAR_NAME_MAX_SIZE];
 889   SCM charname;
 890   size_t charname_len, bytes_read;
 891   scm_t_wchar cp;
 892   int overflow;
 893   scm_t_port *pt;
 894
 895   overflow = read_token (port, buffer, READER_CHAR_NAME_MAX_SIZE, &bytes_read);
 896   if (overflow)
 897     goto char_error;
 898
 899   if (bytes_read == 0)
 900     {
 901       chr = scm_getc (port);
 902       if (chr == EOF)
 903         scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
 904                            "while reading character", SCM_EOL);
 905
 906       /* CHR must be a token delimiter, like a whitespace.  */
 907       return (SCM_MAKE_CHAR (chr));
 908     }
 909
 910   pt = SCM_PTAB_ENTRY (port);
 911
 912   /* Simple ASCII characters can be processed immediately.  Also, simple
 913      ISO-8859-1 characters can be processed immediately if the encoding for this
 914      port is ISO-8859-1.  */
 915   if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
 916     {
 917       SCM_COL (port) += 1;
 918       return SCM_MAKE_CHAR (buffer[0]);
 919     }
 920
 921   /* Otherwise, convert the buffer into a proper scheme string for
 922      processing.  */
 923   charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
 924                                pt->ilseq_handler);
 925   charname_len = scm_i_string_length (charname);
 926   SCM_COL (port) += charname_len;
 927   cp = scm_i_string_ref (charname, 0);
 928   if (charname_len == 1)
 929     return SCM_MAKE_CHAR (cp);
 930
 931   /* Ignore dotted circles, which may be used to keep combining characters from
 932      combining with the backslash in #\charname.  */
 933   if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
 934     return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
 935
 936   if (cp >= '0' && cp < '8')
 937     {
 938       /* Dirk:FIXME::  This type of character syntax is not R5RS
 939        * compliant.  Further, it should be verified that the constant
 940        * does only consist of octal digits.  */
 941       SCM p = scm_string_to_number (charname, scm_from_uint (8));
 942       if (SCM_I_INUMP (p))
 943         {
 944           scm_t_wchar c = SCM_I_INUM (p);
 945           if (SCM_IS_UNICODE_CHAR (c))
 946             return SCM_MAKE_CHAR (c);
 947           else
 948             scm_i_input_error (FUNC_NAME, port,
 949                                "out-of-range octal character escape: ~a",
 950                                scm_list_1 (charname));
 951         }
 952     }
 953
 954   if (cp == 'x' && (charname_len > 1) && SCM_R6RS_ESCAPES_P)
 955     {
 956       SCM p;
 957
 958       /* Convert from hex, skipping the initial 'x' character in CHARNAME */
 959       p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
 960                                 scm_from_uint (16));
 961       if (SCM_I_INUMP (p))
 962         {
 963           scm_t_wchar c = SCM_I_INUM (p);
 964           if (SCM_IS_UNICODE_CHAR (c))
 965             return SCM_MAKE_CHAR (c);
 966           else
 967             scm_i_input_error (FUNC_NAME, port,
 968                                "out-of-range hex character escape: ~a",
 969                                scm_list_1 (charname));
 970         }
 971     }
 972
 973   /* The names of characters should never have non-Latin1
 974      characters.  */
 975   if (scm_i_is_narrow_string (charname)
 976       || scm_i_try_narrow_string (charname))
 977     { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
 978                                        charname_len);
 979       if (scm_is_true (ch))
 980         return ch;
 981     }
 982
 983  char_error:
 984   scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
 985                      scm_list_1 (charname));
 986
 987   return SCM_UNSPECIFIED;
 988 }
 989 #undef FUNC_NAME
 990
 991 static inline SCM
 992 scm_read_keyword (int chr, SCM port)
 993 {
 994   SCM symbol;
 995
 996   /* Read the symbol that comprises the keyword.  Doing this instead of
 997      invoking a specific symbol reader function allows `scm_read_keyword ()'
 998      to adapt to the delimiters currently valid of symbols.
 999
1000      XXX: This implementation allows sloppy syntaxes like `#:  key'.  */
1001   symbol = scm_read_expression (port);
1002   if (!scm_is_symbol (symbol))
1003     scm_i_input_error ("scm_read_keyword", port,
1004                        "keyword prefix `~a' not followed by a symbol: ~s",
1005                        scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1006
1007   return (scm_symbol_to_keyword (symbol));
1008 }
1009
1010 static inline SCM
1011 scm_read_vector (int chr, SCM port)
1012 {
1013   /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1014      guarantee that it's going to do what we want.  After all, this is an
1015      implementation detail of `scm_read_vector ()', not a desirable
1016      property.  */
1017   return (scm_vector (scm_read_sexp (chr, port)));
1018 }
1019
1020 static inline SCM
1021 scm_read_srfi4_vector (int chr, SCM port)
1022 {
1023   return scm_i_read_array (port, chr);
1024 }
1025
1026 static SCM
1027 scm_read_bytevector (scm_t_wchar chr, SCM port)
1028 {
1029   chr = scm_getc (port);
1030   if (chr != 'u')
1031     goto syntax;
1032
1033   chr = scm_getc (port);
1034   if (chr != '8')
1035     goto syntax;
1036
1037   chr = scm_getc (port);
1038   if (chr != '(')
1039     goto syntax;
1040
1041   return scm_u8_list_to_bytevector (scm_read_sexp (chr, port));
1042
1043  syntax:
1044   scm_i_input_error ("read_bytevector", port,
1045                      "invalid bytevector prefix",
1046                      SCM_MAKE_CHAR (chr));
1047   return SCM_UNSPECIFIED;
1048 }
1049
1050 static SCM
1051 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port)
1052 {
1053   /* Read the `#*10101'-style read syntax for bit vectors in Guile.  This is
1054      terribly inefficient but who cares?  */
1055   SCM s_bits = SCM_EOL;
1056
1057   for (chr = scm_getc (port);
1058        (chr != EOF) && ((chr == '0') || (chr == '1'));
1059        chr = scm_getc (port))
1060     {
1061       s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1062     }
1063
1064   if (chr != EOF)
1065     scm_ungetc (chr, port);
1066
1067   return scm_bitvector (scm_reverse_x (s_bits, SCM_EOL));
1068 }
1069
1070 static inline SCM
1071 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1072 {
1073   int bang_seen = 0;
1074
1075   /* We can use the get_byte here because there is no need to get the
1076      locale correct when reading comments. This presumes that
1077      hash and exclamation points always represent themselves no
1078      matter what the source encoding is.*/
1079   for (;;)
1080     {
1081       int c = scm_get_byte_or_eof (port);
1082
1083       if (c == EOF)
1084         scm_i_input_error ("skip_block_comment", port,
1085                            "unterminated `#! ... !#' comment", SCM_EOL);
1086
1087       if (c == '!')
1088         bang_seen = 1;
1089       else if (c == '#' && bang_seen)
1090         break;
1091       else
1092         bang_seen = 0;
1093     }
1094
1095   return SCM_UNSPECIFIED;
1096 }
1097
1098 static SCM
1099 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1100 {
1101   /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1102      nested.  So care must be taken.  */
1103   int nesting_level = 1;
1104   int opening_seen = 0, closing_seen = 0;
1105
1106   while (nesting_level > 0)
1107     {
1108       int c = scm_getc (port);
1109
1110       if (c == EOF)
1111         scm_i_input_error ("scm_read_r6rs_block_comment", port,
1112                            "unterminated `#| ... |#' comment", SCM_EOL);
1113
1114       if (opening_seen)
1115         {
1116           if (c == '|')
1117             nesting_level++;
1118           opening_seen = 0;
1119         }
1120       else if (closing_seen)
1121         {
1122           if (c == '#')
1123             nesting_level--;
1124           closing_seen = 0;
1125         }
1126       else if (c == '|')
1127         closing_seen = 1;
1128       else if (c == '#')
1129         opening_seen = 1;
1130       else
1131         opening_seen = closing_seen = 0;
1132     }
1133
1134   return SCM_UNSPECIFIED;
1135 }
1136
1137 static SCM
1138 scm_read_commented_expression (scm_t_wchar chr, SCM port)
1139 {
1140   scm_t_wchar c;
1141
1142   c = flush_ws (port, (char *) NULL);
1143   if (EOF == c)
1144     scm_i_input_error ("read_commented_expression", port,
1145                        "no expression after #; comment", SCM_EOL);
1146   scm_ungetc (c, port);
1147   scm_read_expression (port);
1148   return SCM_UNSPECIFIED;
1149 }
1150
1151 static SCM
1152 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1153 {
1154   /* Guile's extended symbol read syntax looks like this:
1155
1156        #{This is all a symbol name}#
1157
1158      So here, CHR is expected to be `{'.  */
1159   int saw_brace = 0, finished = 0;
1160   size_t len = 0;
1161   SCM buf = scm_i_make_string (1024, NULL);
1162
1163   buf = scm_i_string_start_writing (buf);
1164
1165   while ((chr = scm_getc (port)) != EOF)
1166     {
1167       if (saw_brace)
1168         {
1169           if (chr == '#')
1170             {
1171               finished = 1;
1172               break;
1173             }
1174           else
1175             {
1176               saw_brace = 0;
1177               scm_i_string_set_x (buf, len++, '}');
1178               scm_i_string_set_x (buf, len++, chr);
1179             }
1180         }
1181       else if (chr == '}')
1182         saw_brace = 1;
1183       else
1184         scm_i_string_set_x (buf, len++, chr);
1185
1186       if (len >= scm_i_string_length (buf) - 2)
1187         {
1188           SCM addy;
1189
1190           scm_i_string_stop_writing ();
1191           addy = scm_i_make_string (1024, NULL);
1192           buf = scm_string_append (scm_list_2 (buf, addy));
1193           len = 0;
1194           buf = scm_i_string_start_writing (buf);
1195         }
1196
1197       if (finished)
1198         break;
1199     }
1200   scm_i_string_stop_writing ();
1201
1202   return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1203 }
1204
1205
1206 \f
1207 /* Top-level token readers, i.e., dispatchers.  */
1208
1209 static SCM
1210 scm_read_sharp_extension (int chr, SCM port)
1211 {
1212   SCM proc;
1213
1214   proc = scm_get_hash_procedure (chr);
1215   if (scm_is_true (scm_procedure_p (proc)))
1216     {
1217       long line = SCM_LINUM (port);
1218       int column = SCM_COL (port) - 2;
1219       SCM got;
1220
1221       got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1222       if (!scm_is_eq (got, SCM_UNSPECIFIED))
1223         {
1224           if (SCM_RECORD_POSITIONS_P)
1225             return (recsexpr (got, line, column,
1226                               SCM_FILENAME (port)));
1227           else
1228             return got;
1229         }
1230     }
1231
1232   return SCM_UNSPECIFIED;
1233 }
1234
1235 /* The reader for the sharp `#' character.  It basically dispatches reads
1236    among the above token readers.   */
1237 static SCM
1238 scm_read_sharp (scm_t_wchar chr, SCM port)
1239 #define FUNC_NAME "scm_lreadr"
1240 {
1241   SCM result;
1242
1243   chr = scm_getc (port);
1244
1245   result = scm_read_sharp_extension (chr, port);
1246   if (!scm_is_eq (result, SCM_UNSPECIFIED))
1247     return result;
1248
1249   switch (chr)
1250     {
1251     case '\\':
1252       return (scm_read_character (chr, port));
1253     case '(':
1254       return (scm_read_vector (chr, port));
1255     case 's':
1256     case 'u':
1257     case 'f':
1258       /* This one may return either a boolean or an SRFI-4 vector.  */
1259       return (scm_read_srfi4_vector (chr, port));
1260     case 'v':
1261       return (scm_read_bytevector (chr, port));
1262     case '*':
1263       return (scm_read_guile_bit_vector (chr, port));
1264     case 't':
1265     case 'T':
1266     case 'F':
1267       /* This one may return either a boolean or an SRFI-4 vector.  */
1268       return (scm_read_boolean (chr, port));
1269     case ':':
1270       return (scm_read_keyword (chr, port));
1271     case '0': case '1': case '2': case '3': case '4':
1272     case '5': case '6': case '7': case '8': case '9':
1273     case '@':
1274 #if SCM_ENABLE_DEPRECATED
1275       /* See below for 'i' and 'e'. */
1276     case 'a':
1277     case 'c':
1278     case 'y':
1279     case 'h':
1280     case 'l':
1281 #endif
1282       return (scm_i_read_array (port, chr));
1283
1284     case 'i':
1285     case 'e':
1286 #if SCM_ENABLE_DEPRECATED
1287       {
1288         /* When next char is '(', it really is an old-style
1289            uniform array. */
1290         scm_t_wchar next_c = scm_getc (port);
1291         if (next_c != EOF)
1292           scm_ungetc (next_c, port);
1293         if (next_c == '(')
1294           return scm_i_read_array (port, chr);
1295         /* Fall through. */
1296       }
1297 #endif
1298     case 'b':
1299     case 'B':
1300     case 'o':
1301     case 'O':
1302     case 'd':
1303     case 'D':
1304     case 'x':
1305     case 'X':
1306     case 'I':
1307     case 'E':
1308       return (scm_read_number_and_radix (chr, port));
1309     case '{':
1310       return (scm_read_extended_symbol (chr, port));
1311     case '!':
1312       return (scm_read_scsh_block_comment (chr, port));
1313     case ';':
1314       return (scm_read_commented_expression (chr, port));
1315     case '`':
1316     case '\'':
1317     case ',':
1318       return (scm_read_syntax (chr, port));
1319     default:
1320       result = scm_read_sharp_extension (chr, port);
1321       if (scm_is_eq (result, SCM_UNSPECIFIED))
1322         {
1323           /* To remain compatible with 1.8 and earlier, the following
1324              characters have lower precedence than `read-hash-extend'
1325              characters.  */
1326           switch (chr)
1327             {
1328             case '|':
1329               return scm_read_r6rs_block_comment (chr, port);
1330             default:
1331               scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1332                                  scm_list_1 (SCM_MAKE_CHAR (chr)));
1333             }
1334         }
1335       else
1336         return result;
1337     }
1338
1339   return SCM_UNSPECIFIED;
1340 }
1341 #undef FUNC_NAME
1342
1343 static SCM
1344 scm_read_expression (SCM port)
1345 #define FUNC_NAME "scm_read_expression"
1346 {
1347   while (1)
1348     {
1349       register scm_t_wchar chr;
1350
1351       chr = scm_getc (port);
1352
1353       switch (chr)
1354         {
1355         case SCM_WHITE_SPACES:
1356         case SCM_LINE_INCREMENTORS:
1357           break;
1358         case ';':
1359           (void) scm_read_semicolon_comment (chr, port);
1360           break;
1361         case '[':
1362           if (!SCM_SQUARE_BRACKETS_P)
1363             return (scm_read_mixed_case_symbol (chr, port));
1364           /* otherwise fall through */
1365         case '(':
1366           return (scm_read_sexp (chr, port));
1367         case '"':
1368           return (scm_read_string (chr, port));
1369         case '\'':
1370         case '`':
1371         case ',':
1372           return (scm_read_quote (chr, port));
1373         case '#':
1374           {
1375             SCM result;
1376             result = scm_read_sharp (chr, port);
1377             if (scm_is_eq (result, SCM_UNSPECIFIED))
1378               /* We read a comment or some such.  */
1379               break;
1380             else
1381               return result;
1382           }
1383         case ')':
1384           scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1385           break;
1386         case EOF:
1387           return SCM_EOF_VAL;
1388         case ':':
1389           if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1390             return scm_symbol_to_keyword (scm_read_expression (port));
1391           /* Fall through.  */
1392
1393         default:
1394           {
1395             if (((chr >= '0') && (chr <= '9'))
1396                 || (strchr ("+-.", chr)))
1397               return (scm_read_number (chr, port));
1398             else
1399               return (scm_read_mixed_case_symbol (chr, port));
1400           }
1401         }
1402     }
1403 }
1404 #undef FUNC_NAME
1405
1406 \f
1407 /* Actual reader.  */
1408
1409 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1410             (SCM port),
1411             "Read an s-expression from the input port @var{port}, or from\n"
1412             "the current input port if @var{port} is not specified.\n"
1413             "Any whitespace before the next token is discarded.")
1414 #define FUNC_NAME s_scm_read
1415 {
1416   int c;
1417
1418   if (SCM_UNBNDP (port))
1419     port = scm_current_input_port ();
1420   SCM_VALIDATE_OPINPORT (1, port);
1421
1422   c = flush_ws (port, (char *) NULL);
1423   if (EOF == c)
1424     return SCM_EOF_VAL;
1425   scm_ungetc (c, port);
1426
1427   return (scm_read_expression (port));
1428 }
1429 #undef FUNC_NAME
1430
1431
1432 \f
1433
1434 /* Used when recording expressions constructed by `scm_read_sharp ()'.  */
1435 static SCM
1436 recsexpr (SCM obj, long line, int column, SCM filename)
1437 {
1438   if (!scm_is_pair(obj)) {
1439     return obj;
1440   } else {
1441     SCM tmp = obj, copy;
1442     /* If this sexpr is visible in the read:sharp source, we want to
1443        keep that information, so only record non-constant cons cells
1444        which haven't previously been read by the reader. */
1445     if (scm_is_false (scm_whash_lookup (scm_source_whash, obj)))
1446       {
1447         if (SCM_COPY_SOURCE_P)
1448           {
1449             copy = scm_cons (recsexpr (SCM_CAR (obj), line, column, filename),
1450                              SCM_UNDEFINED);
1451             while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1452               {
1453                 SCM_SETCDR (copy, scm_cons (recsexpr (SCM_CAR (tmp),
1454                                                       line,
1455                                                       column,
1456                                                       filename),
1457                                             SCM_UNDEFINED));
1458                 copy = SCM_CDR (copy);
1459               }
1460             SCM_SETCDR (copy, tmp);
1461           }
1462         else
1463           {
1464             recsexpr (SCM_CAR (obj), line, column, filename);
1465             while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1466               recsexpr (SCM_CAR (tmp), line, column, filename);
1467             copy = SCM_UNDEFINED;
1468           }
1469         scm_whash_insert (scm_source_whash,
1470                           obj,
1471                           scm_make_srcprops (line,
1472                                              column,
1473                                              filename,
1474                                              copy,
1475                                              SCM_EOL));
1476       }
1477     return obj;
1478   }
1479 }
1480
1481 /* Manipulate the read-hash-procedures alist.  This could be written in
1482    Scheme, but maybe it will also be used by C code during initialisation.  */
1483 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1484             (SCM chr, SCM proc),
1485             "Install the procedure @var{proc} for reading expressions\n"
1486             "starting with the character sequence @code{#} and @var{chr}.\n"
1487             "@var{proc} will be called with two arguments:  the character\n"
1488             "@var{chr} and the port to read further data from. The object\n"
1489             "returned will be the return value of @code{read}. \n"
1490             "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1491             )
1492 #define FUNC_NAME s_scm_read_hash_extend
1493 {
1494   SCM this;
1495   SCM prev;
1496
1497   SCM_VALIDATE_CHAR (1, chr);
1498   SCM_ASSERT (scm_is_false (proc)
1499               || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1500               proc, SCM_ARG2, FUNC_NAME);
1501
1502   /* Check if chr is already in the alist.  */
1503   this = *scm_read_hash_procedures;
1504   prev = SCM_BOOL_F;
1505   while (1)
1506     {
1507       if (scm_is_null (this))
1508         {
1509           /* not found, so add it to the beginning.  */
1510           if (scm_is_true (proc))
1511             {
1512               *scm_read_hash_procedures =
1513                 scm_cons (scm_cons (chr, proc), *scm_read_hash_procedures);
1514             }
1515           break;
1516         }
1517       if (scm_is_eq (chr, SCM_CAAR (this)))
1518         {
1519           /* already in the alist.  */
1520           if (scm_is_false (proc))
1521             {
1522               /* remove it.  */
1523               if (scm_is_false (prev))
1524                 {
1525                   *scm_read_hash_procedures =
1526                     SCM_CDR (*scm_read_hash_procedures);
1527                 }
1528               else
1529                 scm_set_cdr_x (prev, SCM_CDR (this));
1530             }
1531           else
1532             {
1533               /* replace it.  */
1534               scm_set_cdr_x (SCM_CAR (this), proc);
1535             }
1536           break;
1537         }
1538       prev = this;
1539       this = SCM_CDR (this);
1540     }
1541
1542   return SCM_UNSPECIFIED;
1543 }
1544 #undef FUNC_NAME
1545
1546 /* Recover the read-hash procedure corresponding to char c.  */
1547 static SCM
1548 scm_get_hash_procedure (int c)
1549 {
1550   SCM rest = *scm_read_hash_procedures;
1551
1552   while (1)
1553     {
1554       if (scm_is_null (rest))
1555         return SCM_BOOL_F;
1556
1557       if (SCM_CHAR (SCM_CAAR (rest)) == c)
1558         return SCM_CDAR (rest);
1559
1560       rest = SCM_CDR (rest);
1561     }
1562 }
1563
1564 #define SCM_ENCODING_SEARCH_SIZE (500)
1565
1566 /* Search the first few hundred characters of a file for an Emacs-like coding
1567    declaration.  Returns either NULL or a string whose storage has been
1568    allocated with `scm_gc_malloc ()'.  */
1569 char *
1570 scm_i_scan_for_encoding (SCM port)
1571 {
1572   char header[SCM_ENCODING_SEARCH_SIZE+1];
1573   size_t bytes_read;
1574   char *encoding = NULL;
1575   int utf8_bom = 0;
1576   char *pos;
1577   int i;
1578   int in_comment;
1579
1580   if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1581     /* PORT is a non-seekable file port (e.g., as created by Bash when using
1582        "guile <(echo '(display "hello")')") so bail out.  */
1583     return NULL;
1584
1585   bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1586
1587   scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1588
1589   if (bytes_read > 3
1590       && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1591     utf8_bom = 1;
1592
1593   /* search past "coding[:=]" */
1594   pos = header;
1595   while (1)
1596     {
1597       if ((pos = strstr(pos, "coding")) == NULL)
1598         return NULL;
1599
1600       pos += strlen("coding");
1601       if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1602           (*pos == ':' || *pos == '='))
1603         {
1604           pos ++;
1605           break;
1606         }
1607     }
1608
1609   /* skip spaces */
1610   while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1611          (*pos == ' ' || *pos == '\t'))
1612     pos ++;
1613
1614   /* grab the next token */
1615   i = 0;
1616   while (pos + i - header <= SCM_ENCODING_SEARCH_SIZE
1617          && pos + i - header < bytes_read
1618          && (isalnum ((int) pos[i]) || strchr ("_-.:/,+=()", pos[i]) != NULL))
1619     i++;
1620
1621   if (i == 0)
1622     return NULL;
1623
1624   encoding = scm_gc_strndup (pos, i, "encoding");
1625   for (i = 0; i < strlen (encoding); i++)
1626     encoding[i] = toupper ((int) encoding[i]);
1627
1628   /* push backwards to make sure we were in a comment */
1629   in_comment = 0;
1630   while (pos - i - header > 0)
1631     {
1632       if (*(pos - i) == '\n')
1633         {
1634           /* This wasn't in a semicolon comment. Check for a
1635            hash-bang comment. */
1636           char *beg = strstr (header, "#!");
1637           char *end = strstr (header, "!#");
1638           if (beg < pos && pos < end)
1639             in_comment = 1;
1640           break;
1641         }
1642       if (*(pos - i) == ';')
1643         {
1644           in_comment = 1;
1645           break;
1646         }
1647       i ++;
1648     }
1649   if (!in_comment)
1650     /* This wasn't in a comment */
1651     return NULL;
1652
1653   if (utf8_bom && strcmp(encoding, "UTF-8"))
1654     scm_misc_error (NULL,
1655                     "the port input declares the encoding ~s but is encoded as UTF-8",
1656                     scm_list_1 (scm_from_locale_string (encoding)));
1657
1658   return encoding;
1659 }
1660
1661 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1662             (SCM port),
1663             "Scans the port for an Emacs-like character coding declaration\n"
1664             "near the top of the contents of a port with random-acessible contents.\n"
1665             "The coding declaration is of the form\n"
1666             "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1667             "\n"
1668             "Returns a string containing the character encoding of the file\n"
1669             "if a declaration was found, or @code{#f} otherwise.\n")
1670 #define FUNC_NAME s_scm_file_encoding
1671 {
1672   char *enc;
1673   SCM s_enc;
1674
1675   enc = scm_i_scan_for_encoding (port);
1676   if (enc == NULL)
1677     return SCM_BOOL_F;
1678   else
1679     {
1680       s_enc = scm_from_locale_string (enc);
1681       return s_enc;
1682     }
1683
1684   return SCM_BOOL_F;
1685 }
1686 #undef FUNC_NAME
1687
1688 void
1689 scm_init_read ()
1690 {
1691   scm_read_hash_procedures =
1692     SCM_VARIABLE_LOC (scm_c_define ("read-hash-procedures", SCM_EOL));
1693
1694   scm_init_opts (scm_read_options, scm_read_opts);
1695 #include "libguile/read.x"
1696 }
1697
1698 /*
1699   Local Variables:
1700   c-file-style: "gnu"
1701   End:
1702 */