libguile/read.c

   1 /* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009, 2010 Free Software
   2  * Foundation, Inc.
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public License
   6  * as published by the Free Software Foundation; either version 3 of
   7  * the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful, but
  10  * WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  17  * 02110-1301 USA
  18  */
  19
  20
  21 \f
  22
  23 #ifdef HAVE_CONFIG_H
  24 # include <config.h>
  25 #endif
  26
  27 #include <stdio.h>
  28 #include <ctype.h>
  29 #include <string.h>
  30 #include <unistd.h>
  31 #include <unicase.h>
  32
  33 #include "libguile/_scm.h"
  34 #include "libguile/bytevectors.h"
  35 #include "libguile/chars.h"
  36 #include "libguile/eval.h"
  37 #include "libguile/arrays.h"
  38 #include "libguile/bitvectors.h"
  39 #include "libguile/keywords.h"
  40 #include "libguile/alist.h"
  41 #include "libguile/srcprop.h"
  42 #include "libguile/hashtab.h"
  43 #include "libguile/hash.h"
  44 #include "libguile/ports.h"
  45 #include "libguile/fports.h"
  46 #include "libguile/root.h"
  47 #include "libguile/strings.h"
  48 #include "libguile/strports.h"
  49 #include "libguile/vectors.h"
  50 #include "libguile/validate.h"
  51 #include "libguile/srfi-4.h"
  52 #include "libguile/srfi-13.h"
  53
  54 #include "libguile/read.h"
  55 #include "libguile/private-options.h"
  56
  57
  58 \f
  59
  60 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
  61 SCM_SYMBOL (scm_keyword_prefix, "prefix");
  62 SCM_SYMBOL (scm_keyword_postfix, "postfix");
  63 SCM_SYMBOL (sym_nil, "nil");
  64
  65 scm_t_option scm_read_opts[] = {
  66   { SCM_OPTION_BOOLEAN, "copy", 0,
  67     "Copy source code expressions." },
  68   { SCM_OPTION_BOOLEAN, "positions", 0,
  69     "Record positions of source code expressions." },
  70   { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
  71     "Convert symbols to lower case."},
  72   { SCM_OPTION_SCM, "keywords", (unsigned long) SCM_BOOL_F,
  73     "Style of keyword recognition: #f, 'prefix or 'postfix."},
  74   { SCM_OPTION_BOOLEAN, "elisp-vectors", 0,
  75     "Support Elisp vector syntax, namely `[...]'."},
  76   { SCM_OPTION_BOOLEAN, "elisp-strings", 0,
  77     "Support `\\(' and `\\)' in strings."},
  78   { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
  79     "Use R6RS variable-length character and string hex escapes."},
  80   { SCM_OPTION_BOOLEAN, "square-brackets", 1,
  81     "Treat `[' and `]' as parentheses, for R6RS compatibility."},
  82   { 0, },
  83 };
  84
  85 /*
  86   Give meaningful error messages for errors
  87
  88   We use the format
  89
  90   FILE:LINE:COL: MESSAGE
  91   This happened in ....
  92
  93   This is not standard GNU format, but the test-suite likes the real
  94   message to be in front.
  95
  96  */
  97
  98
  99 void
 100 scm_i_input_error (char const *function,
 101                    SCM port, const char *message, SCM arg)
 102 {
 103   SCM fn = (scm_is_string (SCM_FILENAME(port))
 104             ? SCM_FILENAME(port)
 105             : scm_from_locale_string ("#<unknown port>"));
 106
 107   SCM string_port = scm_open_output_string ();
 108   SCM string = SCM_EOL;
 109   scm_simple_format (string_port,
 110                      scm_from_locale_string ("~A:~S:~S: ~A"),
 111                      scm_list_4 (fn,
 112                                  scm_from_long (SCM_LINUM (port) + 1),
 113                                  scm_from_int (SCM_COL (port) + 1),
 114                                  scm_from_locale_string (message)));
 115
 116   string = scm_get_output_string (string_port);
 117   scm_close_output_port (string_port);
 118   scm_error_scm (scm_from_locale_symbol ("read-error"),
 119                  function? scm_from_locale_string (function) : SCM_BOOL_F,
 120                  string,
 121                  arg,
 122                  SCM_BOOL_F);
 123 }
 124
 125
 126 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
 127             (SCM setting),
 128             "Option interface for the read options. Instead of using\n"
 129             "this procedure directly, use the procedures @code{read-enable},\n"
 130             "@code{read-disable}, @code{read-set!} and @code{read-options}.")
 131 #define FUNC_NAME s_scm_read_options
 132 {
 133   SCM ans = scm_options (setting,
 134                          scm_read_opts,
 135                          FUNC_NAME);
 136   if (SCM_COPY_SOURCE_P)
 137     SCM_RECORD_POSITIONS_P = 1;
 138   return ans;
 139 }
 140 #undef FUNC_NAME
 141
 142 /* An association list mapping extra hash characters to procedures.  */
 143 static SCM *scm_read_hash_procedures;
 144
 145
 146 \f
 147 /* Token readers.  */
 148
 149
 150 /* Size of the C buffer used to read symbols and numbers.  */
 151 #define READER_BUFFER_SIZE            128
 152
 153 /* Size of the C buffer used to read strings.  */
 154 #define READER_STRING_BUFFER_SIZE     512
 155
 156 /* The maximum size of Scheme character names.  */
 157 #define READER_CHAR_NAME_MAX_SIZE      50
 158
 159
 160 /* `isblank' is only in C99.  */
 161 #define CHAR_IS_BLANK_(_chr)                                    \
 162   (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n')      \
 163    || ((_chr) == '\f') || ((_chr) == '\r'))
 164
 165 #ifdef MSDOS
 166 # define CHAR_IS_BLANK(_chr)                    \
 167   ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
 168 #else
 169 # define CHAR_IS_BLANK CHAR_IS_BLANK_
 170 #endif
 171
 172
 173 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
 174    structure'').  */
 175 #define CHAR_IS_R5RS_DELIMITER(c)                               \
 176   (CHAR_IS_BLANK (c)                                            \
 177    || (c == ')') || (c == '(') || (c == ';') || (c == '"')      \
 178    || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
 179
 180 #define CHAR_IS_DELIMITER  CHAR_IS_R5RS_DELIMITER
 181
 182 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
 183    Structure''.  */
 184 #define CHAR_IS_EXPONENT_MARKER(_chr)                           \
 185   (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f')        \
 186    || ((_chr) == 'd') || ((_chr) == 'l'))
 187
 188 /* Read an SCSH block comment.  */
 189 static inline SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
 190 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
 191 static SCM scm_read_commented_expression (scm_t_wchar, SCM);
 192 static SCM scm_read_shebang (scm_t_wchar, SCM);
 193 static SCM scm_get_hash_procedure (int);
 194
 195 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 196    result in the pre-allocated buffer BUF.  Return zero if the whole token has
 197    fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
 198    bytes actually read.  */
 199 static inline int
 200 read_token (SCM port, char *buf, const size_t buf_size, size_t *read)
 201  {
 202    *read = 0;
 203
 204    while (*read < buf_size)
 205      {
 206        int chr;
 207
 208        chr = scm_get_byte_or_eof (port);
 209
 210        if (chr == EOF)
 211         return 0;
 212       else if (CHAR_IS_DELIMITER (chr))
 213         {
 214           scm_unget_byte (chr, port);
 215           return 0;
 216         }
 217       else
 218         {
 219           *buf = (char) chr;
 220           buf++, (*read)++;
 221         }
 222      }
 223
 224    return 1;
 225  }
 226
 227 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 228    result in the pre-allocated buffer BUFFER, if the whole token has fewer than
 229    BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
 230    caller.  Return zero if the token fits in BUFFER, non-zero otherwise. READ
 231    will be set the number of bytes actually read.  */
 232 static int
 233 read_complete_token (SCM port, char *buffer, const size_t buffer_size,
 234                            char **overflow_buffer, size_t *read)
 235 {
 236   int overflow = 0;
 237   size_t bytes_read, overflow_size;
 238
 239   *overflow_buffer = NULL;
 240   overflow_size = 0;
 241
 242   do
 243     {
 244       overflow = read_token (port, buffer, buffer_size, &bytes_read);
 245       if (bytes_read == 0)
 246         break;
 247       if (overflow || overflow_size != 0)
 248         {
 249           if (overflow_size == 0)
 250             {
 251               *overflow_buffer = scm_malloc (bytes_read);
 252               memcpy (*overflow_buffer, buffer, bytes_read);
 253               overflow_size = bytes_read;
 254             }
 255           else
 256             {
 257               *overflow_buffer = scm_realloc (*overflow_buffer, overflow_size + bytes_read);
 258               memcpy (*overflow_buffer + overflow_size, buffer, bytes_read);
 259               overflow_size += bytes_read;
 260             }
 261         }
 262     }
 263   while (overflow);
 264
 265   if (overflow_size)
 266     *read = overflow_size;
 267   else
 268     *read = bytes_read;
 269
 270   return (overflow_size != 0);
 271 }
 272
 273 /* Skip whitespace from PORT and return the first non-whitespace character
 274    read.  Raise an error on end-of-file.  */
 275 static int
 276 flush_ws (SCM port, const char *eoferr)
 277 {
 278   register scm_t_wchar c;
 279   while (1)
 280     switch (c = scm_getc (port))
 281       {
 282       case EOF:
 283       goteof:
 284         if (eoferr)
 285           {
 286             scm_i_input_error (eoferr,
 287                                port,
 288                                "end of file",
 289                                SCM_EOL);
 290           }
 291         return c;
 292
 293       case ';':
 294       lp:
 295         switch (c = scm_getc (port))
 296           {
 297           case EOF:
 298             goto goteof;
 299           default:
 300             goto lp;
 301           case SCM_LINE_INCREMENTORS:
 302             break;
 303           }
 304         break;
 305
 306       case '#':
 307         switch (c = scm_getc (port))
 308           {
 309           case EOF:
 310             eoferr = "read_sharp";
 311             goto goteof;
 312           case '!':
 313             scm_read_shebang (c, port);
 314             break;
 315           case ';':
 316             scm_read_commented_expression (c, port);
 317             break;
 318           case '|':
 319             if (scm_is_false (scm_get_hash_procedure (c)))
 320               {
 321                 scm_read_r6rs_block_comment (c, port);
 322                 break;
 323               }
 324             /* fall through */
 325           default:
 326             scm_ungetc (c, port);
 327             return '#';
 328           }
 329         break;
 330
 331       case SCM_LINE_INCREMENTORS:
 332       case SCM_SINGLE_SPACES:
 333       case '\t':
 334         break;
 335
 336       default:
 337         return c;
 338       }
 339
 340   return 0;
 341 }
 342
 343
 344 \f
 345 /* Token readers.  */
 346
 347 static SCM scm_read_expression (SCM port);
 348 static SCM scm_read_sharp (int chr, SCM port);
 349 static SCM recsexpr (SCM obj, long line, int column, SCM filename);
 350
 351
 352 static SCM
 353 scm_read_sexp (scm_t_wchar chr, SCM port)
 354 #define FUNC_NAME "scm_i_lreadparen"
 355 {
 356   register int c;
 357   register SCM tmp;
 358   register SCM tl, ans = SCM_EOL;
 359   SCM tl2 = SCM_EOL, ans2 = SCM_EOL, copy = SCM_BOOL_F;
 360   const int terminating_char = ((chr == '[') ? ']' : ')');
 361
 362   /* Need to capture line and column numbers here. */
 363   long line = SCM_LINUM (port);
 364   int column = SCM_COL (port) - 1;
 365
 366
 367   c = flush_ws (port, FUNC_NAME);
 368   if (terminating_char == c)
 369     return SCM_EOL;
 370
 371   scm_ungetc (c, port);
 372   if (scm_is_eq (scm_sym_dot,
 373                  (tmp = scm_read_expression (port))))
 374     {
 375       ans = scm_read_expression (port);
 376       if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
 377         scm_i_input_error (FUNC_NAME, port, "missing close paren",
 378                            SCM_EOL);
 379       return ans;
 380     }
 381
 382   /* Build the head of the list structure. */
 383   ans = tl = scm_cons (tmp, SCM_EOL);
 384
 385   if (SCM_COPY_SOURCE_P)
 386     ans2 = tl2 = scm_cons (scm_is_pair (tmp)
 387                            ? copy
 388                            : tmp,
 389                            SCM_EOL);
 390
 391   while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
 392     {
 393       SCM new_tail;
 394
 395       scm_ungetc (c, port);
 396       if (scm_is_eq (scm_sym_dot,
 397                      (tmp = scm_read_expression (port))))
 398         {
 399           SCM_SETCDR (tl, tmp = scm_read_expression (port));
 400
 401           if (SCM_COPY_SOURCE_P)
 402             SCM_SETCDR (tl2, scm_cons (scm_is_pair (tmp) ? copy : tmp,
 403                                        SCM_EOL));
 404
 405           c = flush_ws (port, FUNC_NAME);
 406           if (terminating_char != c)
 407             scm_i_input_error (FUNC_NAME, port,
 408                                "in pair: missing close paren", SCM_EOL);
 409           goto exit;
 410         }
 411
 412       new_tail = scm_cons (tmp, SCM_EOL);
 413       SCM_SETCDR (tl, new_tail);
 414       tl = new_tail;
 415
 416       if (SCM_COPY_SOURCE_P)
 417         {
 418           SCM new_tail2 = scm_cons (scm_is_pair (tmp)
 419                                     ? copy
 420                                     : tmp, SCM_EOL);
 421           SCM_SETCDR (tl2, new_tail2);
 422           tl2 = new_tail2;
 423         }
 424     }
 425
 426  exit:
 427   if (SCM_RECORD_POSITIONS_P)
 428     scm_whash_insert (scm_source_whash,
 429                       ans,
 430                       scm_make_srcprops (line, column,
 431                                          SCM_FILENAME (port),
 432                                          SCM_COPY_SOURCE_P
 433                                          ? ans2
 434                                          : SCM_UNDEFINED,
 435                                          SCM_EOL));
 436   return ans;
 437 }
 438 #undef FUNC_NAME
 439
 440
 441 /* Read a hexadecimal number NDIGITS in length.  Put its value into the variable
 442    C.  If TERMINATOR is non-null, terminate early if the TERMINATOR character is
 443    found.  */
 444 #define SCM_READ_HEX_ESCAPE(ndigits, terminator)                   \
 445   do                                                               \
 446     {                                                              \
 447       scm_t_wchar a;                                               \
 448       size_t i = 0;                                                \
 449       c = 0;                                                       \
 450       while (i < ndigits)                                          \
 451         {                                                          \
 452           a = scm_getc (port);                                     \
 453           if (a == EOF)                                            \
 454             goto str_eof;                                          \
 455           if (terminator                                           \
 456               && (a == (scm_t_wchar) terminator)                   \
 457               && (i > 0))                                          \
 458             break;                                                 \
 459           if ('0' <= a && a <= '9')                                \
 460             a -= '0';                                              \
 461           else if ('A' <= a && a <= 'F')                           \
 462             a = a - 'A' + 10;                                      \
 463           else if ('a' <= a && a <= 'f')                           \
 464             a = a - 'a' + 10;                                      \
 465           else                                                     \
 466             {                                                      \
 467               c = a;                                               \
 468               goto bad_escaped;                                    \
 469             }                                                      \
 470           c = c * 16 + a;                                          \
 471           i ++;                                                    \
 472         }                                                          \
 473     } while (0)
 474
 475 static SCM
 476 scm_read_string (int chr, SCM port)
 477 #define FUNC_NAME "scm_lreadr"
 478 {
 479   /* For strings smaller than C_STR, this function creates only one Scheme
 480      object (the string returned).  */
 481
 482   SCM str = SCM_BOOL_F;
 483   unsigned c_str_len = 0;
 484   scm_t_wchar c;
 485
 486   str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
 487   while ('"' != (c = scm_getc (port)))
 488     {
 489       if (c == EOF)
 490         {
 491         str_eof:
 492           scm_i_input_error (FUNC_NAME, port,
 493                              "end of file in string constant", SCM_EOL);
 494         }
 495
 496       if (c_str_len + 1 >= scm_i_string_length (str))
 497         {
 498           SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
 499
 500           str = scm_string_append (scm_list_2 (str, addy));
 501         }
 502
 503       if (c == '\\')
 504         {
 505           switch (c = scm_getc (port))
 506             {
 507             case EOF:
 508               goto str_eof;
 509             case '"':
 510             case '\\':
 511               break;
 512             case '(':
 513             case ')':
 514               if (SCM_ESCAPED_PARENS_P)
 515                 break;
 516               goto bad_escaped;
 517             case '\n':
 518               continue;
 519             case '0':
 520               c = '\0';
 521               break;
 522             case 'f':
 523               c = '\f';
 524               break;
 525             case 'n':
 526               c = '\n';
 527               break;
 528             case 'r':
 529               c = '\r';
 530               break;
 531             case 't':
 532               c = '\t';
 533               break;
 534             case 'a':
 535               c = '\007';
 536               break;
 537             case 'v':
 538               c = '\v';
 539               break;
 540             case 'b':
 541               c = '\010';
 542               break;
 543             case 'x':
 544               if (SCM_R6RS_ESCAPES_P)
 545                 SCM_READ_HEX_ESCAPE (10, ';');
 546               else
 547                 SCM_READ_HEX_ESCAPE (2, '\0');
 548               break;
 549             case 'u':
 550               if (!SCM_R6RS_ESCAPES_P)
 551                 {
 552                   SCM_READ_HEX_ESCAPE (4, '\0');
 553                   break;
 554                 }
 555             case 'U':
 556               if (!SCM_R6RS_ESCAPES_P)
 557                 {
 558                   SCM_READ_HEX_ESCAPE (6, '\0');
 559                   break;
 560                 }
 561             default:
 562             bad_escaped:
 563               scm_i_input_error (FUNC_NAME, port,
 564                                  "illegal character in escape sequence: ~S",
 565                                  scm_list_1 (SCM_MAKE_CHAR (c)));
 566             }
 567         }
 568       str = scm_i_string_start_writing (str);
 569       scm_i_string_set_x (str, c_str_len++, c);
 570       scm_i_string_stop_writing ();
 571     }
 572
 573   if (c_str_len > 0)
 574     {
 575       return scm_i_substring_copy (str, 0, c_str_len);
 576     }
 577
 578   return scm_nullstr;
 579 }
 580 #undef FUNC_NAME
 581
 582
 583 static SCM
 584 scm_read_number (scm_t_wchar chr, SCM port)
 585 {
 586   SCM result, str = SCM_EOL;
 587   char buffer[READER_BUFFER_SIZE];
 588   char *overflow_buffer = NULL;
 589   size_t bytes_read;
 590   int overflow;
 591   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 592
 593   scm_ungetc (chr, port);
 594   overflow = read_complete_token (port, buffer, sizeof (buffer),
 595                                   &overflow_buffer, &bytes_read);
 596
 597   if (!overflow)
 598     str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
 599   else
 600     str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
 601                             pt->ilseq_handler);
 602
 603   result = scm_string_to_number (str, SCM_UNDEFINED);
 604   if (!scm_is_true (result))
 605     {
 606       /* Return a symbol instead of a number */
 607       if (SCM_CASE_INSENSITIVE_P)
 608         str = scm_string_downcase_x (str);
 609       result = scm_string_to_symbol (str);
 610     }
 611
 612   if (overflow)
 613     free (overflow_buffer);
 614   SCM_COL (port) += scm_i_string_length (str);
 615   return result;
 616 }
 617
 618 static SCM
 619 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
 620 {
 621   SCM result;
 622   int ends_with_colon = 0;
 623   size_t bytes_read;
 624   int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
 625   int overflow;
 626   char buffer[READER_BUFFER_SIZE], *overflow_buffer;
 627   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 628   SCM str;
 629
 630   scm_ungetc (chr, port);
 631   overflow = read_complete_token (port, buffer, READER_BUFFER_SIZE,
 632                                   &overflow_buffer, &bytes_read);
 633   if (bytes_read > 0)
 634     {
 635       if (!overflow)
 636         ends_with_colon = buffer[bytes_read - 1] == ':';
 637       else
 638         ends_with_colon = overflow_buffer[bytes_read - 1] == ':';
 639     }
 640
 641   if (postfix && ends_with_colon && (bytes_read > 1))
 642     {
 643       if (!overflow)
 644         str = scm_from_stringn (buffer, bytes_read - 1, pt->encoding, pt->ilseq_handler);
 645       else
 646         str = scm_from_stringn (overflow_buffer, bytes_read - 1, pt->encoding,
 647                                 pt->ilseq_handler);
 648
 649       if (SCM_CASE_INSENSITIVE_P)
 650         str = scm_string_downcase_x (str);
 651       result = scm_symbol_to_keyword (scm_string_to_symbol (str));
 652     }
 653   else
 654     {
 655       if (!overflow)
 656         str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
 657       else
 658         str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
 659                                 pt->ilseq_handler);
 660
 661       if (SCM_CASE_INSENSITIVE_P)
 662         str = scm_string_downcase_x (str);
 663       result = scm_string_to_symbol (str);
 664     }
 665
 666   if (overflow)
 667     free (overflow_buffer);
 668   SCM_COL (port) += scm_i_string_length (str);
 669   return result;
 670 }
 671
 672 static SCM
 673 scm_read_number_and_radix (scm_t_wchar chr, SCM port)
 674 #define FUNC_NAME "scm_lreadr"
 675 {
 676   SCM result;
 677   size_t read;
 678   char buffer[READER_BUFFER_SIZE], *overflow_buffer;
 679   int overflow;
 680   unsigned int radix;
 681   SCM str;
 682   scm_t_port *pt;
 683
 684   switch (chr)
 685     {
 686     case 'B':
 687     case 'b':
 688       radix = 2;
 689       break;
 690
 691     case 'o':
 692     case 'O':
 693       radix = 8;
 694       break;
 695
 696     case 'd':
 697     case 'D':
 698       radix = 10;
 699       break;
 700
 701     case 'x':
 702     case 'X':
 703       radix = 16;
 704       break;
 705
 706     default:
 707       scm_ungetc (chr, port);
 708       scm_ungetc ('#', port);
 709       radix = 10;
 710     }
 711
 712   overflow = read_complete_token (port, buffer, sizeof (buffer),
 713                                   &overflow_buffer, &read);
 714
 715   pt = SCM_PTAB_ENTRY (port);
 716   if (!overflow)
 717     str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
 718   else
 719     str = scm_from_stringn (overflow_buffer, read, pt->encoding,
 720                             pt->ilseq_handler);
 721
 722   result = scm_string_to_number (str, scm_from_uint (radix));
 723
 724   if (overflow)
 725     free (overflow_buffer);
 726
 727   SCM_COL (port) += scm_i_string_length (str);
 728
 729   if (scm_is_true (result))
 730     return result;
 731
 732   scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
 733
 734   return SCM_BOOL_F;
 735 }
 736 #undef FUNC_NAME
 737
 738 static SCM
 739 scm_read_quote (int chr, SCM port)
 740 {
 741   SCM p;
 742   long line = SCM_LINUM (port);
 743   int column = SCM_COL (port) - 1;
 744
 745   switch (chr)
 746     {
 747     case '`':
 748       p = scm_sym_quasiquote;
 749       break;
 750
 751     case '\'':
 752       p = scm_sym_quote;
 753       break;
 754
 755     case ',':
 756       {
 757         scm_t_wchar c;
 758
 759         c = scm_getc (port);
 760         if ('@' == c)
 761           p = scm_sym_uq_splicing;
 762         else
 763           {
 764             scm_ungetc (c, port);
 765             p = scm_sym_unquote;
 766           }
 767         break;
 768       }
 769
 770     default:
 771       fprintf (stderr, "%s: unhandled quote character (%i)\n",
 772                "scm_read_quote", chr);
 773       abort ();
 774     }
 775
 776   p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
 777   if (SCM_RECORD_POSITIONS_P)
 778     scm_whash_insert (scm_source_whash, p,
 779                       scm_make_srcprops (line, column,
 780                                          SCM_FILENAME (port),
 781                                          SCM_COPY_SOURCE_P
 782                                          ? (scm_cons2 (SCM_CAR (p),
 783                                                        SCM_CAR (SCM_CDR (p)),
 784                                                        SCM_EOL))
 785                                          : SCM_UNDEFINED,
 786                                          SCM_EOL));
 787
 788
 789   return p;
 790 }
 791
 792 SCM_SYMBOL (sym_syntax, "syntax");
 793 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
 794 SCM_SYMBOL (sym_unsyntax, "unsyntax");
 795 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
 796
 797 static SCM
 798 scm_read_syntax (int chr, SCM port)
 799 {
 800   SCM p;
 801   long line = SCM_LINUM (port);
 802   int column = SCM_COL (port) - 1;
 803
 804   switch (chr)
 805     {
 806     case '`':
 807       p = sym_quasisyntax;
 808       break;
 809
 810     case '\'':
 811       p = sym_syntax;
 812       break;
 813
 814     case ',':
 815       {
 816         int c;
 817
 818         c = scm_getc (port);
 819         if ('@' == c)
 820           p = sym_unsyntax_splicing;
 821         else
 822           {
 823             scm_ungetc (c, port);
 824             p = sym_unsyntax;
 825           }
 826         break;
 827       }
 828
 829     default:
 830       fprintf (stderr, "%s: unhandled syntax character (%i)\n",
 831                "scm_read_syntax", chr);
 832       abort ();
 833     }
 834
 835   p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
 836   if (SCM_RECORD_POSITIONS_P)
 837     scm_whash_insert (scm_source_whash, p,
 838                       scm_make_srcprops (line, column,
 839                                          SCM_FILENAME (port),
 840                                          SCM_COPY_SOURCE_P
 841                                          ? (scm_cons2 (SCM_CAR (p),
 842                                                        SCM_CAR (SCM_CDR (p)),
 843                                                        SCM_EOL))
 844                                          : SCM_UNDEFINED,
 845                                          SCM_EOL));
 846
 847
 848   return p;
 849 }
 850
 851 static inline SCM
 852 scm_read_nil (int chr, SCM port)
 853 {
 854   SCM id = scm_read_mixed_case_symbol (chr, port);
 855
 856   if (!scm_is_eq (id, sym_nil))
 857     scm_i_input_error ("scm_read_nil", port,
 858                        "unexpected input while reading #nil: ~a",
 859                        scm_list_1 (id));
 860
 861   return SCM_ELISP_NIL;
 862 }
 863
 864 static inline SCM
 865 scm_read_semicolon_comment (int chr, SCM port)
 866 {
 867   int c;
 868
 869   /* We use the get_byte here because there is no need to get the
 870      locale correct with comment input. This presumes that newline
 871      always represents itself no matter what the encoding is.  */
 872   for (c = scm_get_byte_or_eof (port);
 873        (c != EOF) && (c != '\n');
 874        c = scm_get_byte_or_eof (port));
 875
 876   return SCM_UNSPECIFIED;
 877 }
 878
 879 \f
 880 /* Sharp readers, i.e. readers called after a `#' sign has been read.  */
 881
 882 static SCM
 883 scm_read_boolean (int chr, SCM port)
 884 {
 885   switch (chr)
 886     {
 887     case 't':
 888     case 'T':
 889       return SCM_BOOL_T;
 890
 891     case 'f':
 892     case 'F':
 893       return SCM_BOOL_F;
 894     }
 895
 896   return SCM_UNSPECIFIED;
 897 }
 898
 899 static SCM
 900 scm_read_character (scm_t_wchar chr, SCM port)
 901 #define FUNC_NAME "scm_lreadr"
 902 {
 903   char buffer[READER_CHAR_NAME_MAX_SIZE];
 904   SCM charname;
 905   size_t charname_len, bytes_read;
 906   scm_t_wchar cp;
 907   int overflow;
 908   scm_t_port *pt;
 909
 910   overflow = read_token (port, buffer, READER_CHAR_NAME_MAX_SIZE, &bytes_read);
 911   if (overflow)
 912     goto char_error;
 913
 914   if (bytes_read == 0)
 915     {
 916       chr = scm_getc (port);
 917       if (chr == EOF)
 918         scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
 919                            "while reading character", SCM_EOL);
 920
 921       /* CHR must be a token delimiter, like a whitespace.  */
 922       return (SCM_MAKE_CHAR (chr));
 923     }
 924
 925   pt = SCM_PTAB_ENTRY (port);
 926
 927   /* Simple ASCII characters can be processed immediately.  Also, simple
 928      ISO-8859-1 characters can be processed immediately if the encoding for this
 929      port is ISO-8859-1.  */
 930   if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
 931     {
 932       SCM_COL (port) += 1;
 933       return SCM_MAKE_CHAR (buffer[0]);
 934     }
 935
 936   /* Otherwise, convert the buffer into a proper scheme string for
 937      processing.  */
 938   charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
 939                                pt->ilseq_handler);
 940   charname_len = scm_i_string_length (charname);
 941   SCM_COL (port) += charname_len;
 942   cp = scm_i_string_ref (charname, 0);
 943   if (charname_len == 1)
 944     return SCM_MAKE_CHAR (cp);
 945
 946   /* Ignore dotted circles, which may be used to keep combining characters from
 947      combining with the backslash in #\charname.  */
 948   if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
 949     return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
 950
 951   if (cp >= '0' && cp < '8')
 952     {
 953       /* Dirk:FIXME::  This type of character syntax is not R5RS
 954        * compliant.  Further, it should be verified that the constant
 955        * does only consist of octal digits.  */
 956       SCM p = scm_string_to_number (charname, scm_from_uint (8));
 957       if (SCM_I_INUMP (p))
 958         {
 959           scm_t_wchar c = SCM_I_INUM (p);
 960           if (SCM_IS_UNICODE_CHAR (c))
 961             return SCM_MAKE_CHAR (c);
 962           else
 963             scm_i_input_error (FUNC_NAME, port,
 964                                "out-of-range octal character escape: ~a",
 965                                scm_list_1 (charname));
 966         }
 967     }
 968
 969   if (cp == 'x' && (charname_len > 1) && SCM_R6RS_ESCAPES_P)
 970     {
 971       SCM p;
 972
 973       /* Convert from hex, skipping the initial 'x' character in CHARNAME */
 974       p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
 975                                 scm_from_uint (16));
 976       if (SCM_I_INUMP (p))
 977         {
 978           scm_t_wchar c = SCM_I_INUM (p);
 979           if (SCM_IS_UNICODE_CHAR (c))
 980             return SCM_MAKE_CHAR (c);
 981           else
 982             scm_i_input_error (FUNC_NAME, port,
 983                                "out-of-range hex character escape: ~a",
 984                                scm_list_1 (charname));
 985         }
 986     }
 987
 988   /* The names of characters should never have non-Latin1
 989      characters.  */
 990   if (scm_i_is_narrow_string (charname)
 991       || scm_i_try_narrow_string (charname))
 992     { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
 993                                        charname_len);
 994       if (scm_is_true (ch))
 995         return ch;
 996     }
 997
 998  char_error:
 999   scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
1000                      scm_list_1 (charname));
1001
1002   return SCM_UNSPECIFIED;
1003 }
1004 #undef FUNC_NAME
1005
1006 static inline SCM
1007 scm_read_keyword (int chr, SCM port)
1008 {
1009   SCM symbol;
1010
1011   /* Read the symbol that comprises the keyword.  Doing this instead of
1012      invoking a specific symbol reader function allows `scm_read_keyword ()'
1013      to adapt to the delimiters currently valid of symbols.
1014
1015      XXX: This implementation allows sloppy syntaxes like `#:  key'.  */
1016   symbol = scm_read_expression (port);
1017   if (!scm_is_symbol (symbol))
1018     scm_i_input_error ("scm_read_keyword", port,
1019                        "keyword prefix `~a' not followed by a symbol: ~s",
1020                        scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1021
1022   return (scm_symbol_to_keyword (symbol));
1023 }
1024
1025 static inline SCM
1026 scm_read_vector (int chr, SCM port)
1027 {
1028   /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1029      guarantee that it's going to do what we want.  After all, this is an
1030      implementation detail of `scm_read_vector ()', not a desirable
1031      property.  */
1032   return (scm_vector (scm_read_sexp (chr, port)));
1033 }
1034
1035 static inline SCM
1036 scm_read_srfi4_vector (int chr, SCM port)
1037 {
1038   return scm_i_read_array (port, chr);
1039 }
1040
1041 static SCM
1042 scm_read_bytevector (scm_t_wchar chr, SCM port)
1043 {
1044   chr = scm_getc (port);
1045   if (chr != 'u')
1046     goto syntax;
1047
1048   chr = scm_getc (port);
1049   if (chr != '8')
1050     goto syntax;
1051
1052   chr = scm_getc (port);
1053   if (chr != '(')
1054     goto syntax;
1055
1056   return scm_u8_list_to_bytevector (scm_read_sexp (chr, port));
1057
1058  syntax:
1059   scm_i_input_error ("read_bytevector", port,
1060                      "invalid bytevector prefix",
1061                      SCM_MAKE_CHAR (chr));
1062   return SCM_UNSPECIFIED;
1063 }
1064
1065 static SCM
1066 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port)
1067 {
1068   /* Read the `#*10101'-style read syntax for bit vectors in Guile.  This is
1069      terribly inefficient but who cares?  */
1070   SCM s_bits = SCM_EOL;
1071
1072   for (chr = scm_getc (port);
1073        (chr != EOF) && ((chr == '0') || (chr == '1'));
1074        chr = scm_getc (port))
1075     {
1076       s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1077     }
1078
1079   if (chr != EOF)
1080     scm_ungetc (chr, port);
1081
1082   return scm_bitvector (scm_reverse_x (s_bits, SCM_EOL));
1083 }
1084
1085 static inline SCM
1086 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1087 {
1088   int bang_seen = 0;
1089
1090   /* We can use the get_byte here because there is no need to get the
1091      locale correct when reading comments. This presumes that
1092      hash and exclamation points always represent themselves no
1093      matter what the source encoding is.*/
1094   for (;;)
1095     {
1096       int c = scm_get_byte_or_eof (port);
1097
1098       if (c == EOF)
1099         scm_i_input_error ("skip_block_comment", port,
1100                            "unterminated `#! ... !#' comment", SCM_EOL);
1101
1102       if (c == '!')
1103         bang_seen = 1;
1104       else if (c == '#' && bang_seen)
1105         break;
1106       else
1107         bang_seen = 0;
1108     }
1109
1110   return SCM_UNSPECIFIED;
1111 }
1112
1113 static inline SCM
1114 scm_read_shebang (scm_t_wchar chr, SCM port)
1115 {
1116   int c = 0;
1117   if ((c = scm_get_byte_or_eof (port)) != 'r')
1118     {
1119       scm_ungetc (c, port);
1120       return scm_read_scsh_block_comment (chr, port);
1121     }
1122   if ((c = scm_get_byte_or_eof (port)) != '6')
1123     {
1124       scm_ungetc (c, port);
1125       scm_ungetc ('r', port);
1126       return scm_read_scsh_block_comment (chr, port);
1127     }
1128   if ((c = scm_get_byte_or_eof (port)) != 'r')
1129     {
1130       scm_ungetc (c, port);
1131       scm_ungetc ('6', port);
1132       scm_ungetc ('r', port);
1133       return scm_read_scsh_block_comment (chr, port);
1134     }
1135   if ((c = scm_get_byte_or_eof (port)) != 's')
1136     {
1137       scm_ungetc (c, port);
1138       scm_ungetc ('r', port);
1139       scm_ungetc ('6', port);
1140       scm_ungetc ('r', port);
1141       return scm_read_scsh_block_comment (chr, port);
1142     }
1143
1144   return SCM_UNSPECIFIED;
1145 }
1146
1147 static SCM
1148 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1149 {
1150   /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1151      nested.  So care must be taken.  */
1152   int nesting_level = 1;
1153   int opening_seen = 0, closing_seen = 0;
1154
1155   while (nesting_level > 0)
1156     {
1157       int c = scm_getc (port);
1158
1159       if (c == EOF)
1160         scm_i_input_error ("scm_read_r6rs_block_comment", port,
1161                            "unterminated `#| ... |#' comment", SCM_EOL);
1162
1163       if (opening_seen)
1164         {
1165           if (c == '|')
1166             nesting_level++;
1167           opening_seen = 0;
1168         }
1169       else if (closing_seen)
1170         {
1171           if (c == '#')
1172             nesting_level--;
1173           closing_seen = 0;
1174         }
1175       else if (c == '|')
1176         closing_seen = 1;
1177       else if (c == '#')
1178         opening_seen = 1;
1179       else
1180         opening_seen = closing_seen = 0;
1181     }
1182
1183   return SCM_UNSPECIFIED;
1184 }
1185
1186 static SCM
1187 scm_read_commented_expression (scm_t_wchar chr, SCM port)
1188 {
1189   scm_t_wchar c;
1190
1191   c = flush_ws (port, (char *) NULL);
1192   if (EOF == c)
1193     scm_i_input_error ("read_commented_expression", port,
1194                        "no expression after #; comment", SCM_EOL);
1195   scm_ungetc (c, port);
1196   scm_read_expression (port);
1197   return SCM_UNSPECIFIED;
1198 }
1199
1200 static SCM
1201 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1202 {
1203   /* Guile's extended symbol read syntax looks like this:
1204
1205        #{This is all a symbol name}#
1206
1207      So here, CHR is expected to be `{'.  */
1208   int saw_brace = 0, finished = 0;
1209   size_t len = 0;
1210   SCM buf = scm_i_make_string (1024, NULL);
1211
1212   buf = scm_i_string_start_writing (buf);
1213
1214   while ((chr = scm_getc (port)) != EOF)
1215     {
1216       if (saw_brace)
1217         {
1218           if (chr == '#')
1219             {
1220               finished = 1;
1221               break;
1222             }
1223           else
1224             {
1225               saw_brace = 0;
1226               scm_i_string_set_x (buf, len++, '}');
1227               scm_i_string_set_x (buf, len++, chr);
1228             }
1229         }
1230       else if (chr == '}')
1231         saw_brace = 1;
1232       else
1233         scm_i_string_set_x (buf, len++, chr);
1234
1235       if (len >= scm_i_string_length (buf) - 2)
1236         {
1237           SCM addy;
1238
1239           scm_i_string_stop_writing ();
1240           addy = scm_i_make_string (1024, NULL);
1241           buf = scm_string_append (scm_list_2 (buf, addy));
1242           len = 0;
1243           buf = scm_i_string_start_writing (buf);
1244         }
1245
1246       if (finished)
1247         break;
1248     }
1249   scm_i_string_stop_writing ();
1250
1251   return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1252 }
1253
1254
1255 \f
1256 /* Top-level token readers, i.e., dispatchers.  */
1257
1258 static SCM
1259 scm_read_sharp_extension (int chr, SCM port)
1260 {
1261   SCM proc;
1262
1263   proc = scm_get_hash_procedure (chr);
1264   if (scm_is_true (scm_procedure_p (proc)))
1265     {
1266       long line = SCM_LINUM (port);
1267       int column = SCM_COL (port) - 2;
1268       SCM got;
1269
1270       got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1271       if (!scm_is_eq (got, SCM_UNSPECIFIED))
1272         {
1273           if (SCM_RECORD_POSITIONS_P)
1274             return (recsexpr (got, line, column,
1275                               SCM_FILENAME (port)));
1276           else
1277             return got;
1278         }
1279     }
1280
1281   return SCM_UNSPECIFIED;
1282 }
1283
1284 /* The reader for the sharp `#' character.  It basically dispatches reads
1285    among the above token readers.   */
1286 static SCM
1287 scm_read_sharp (scm_t_wchar chr, SCM port)
1288 #define FUNC_NAME "scm_lreadr"
1289 {
1290   SCM result;
1291
1292   chr = scm_getc (port);
1293
1294   result = scm_read_sharp_extension (chr, port);
1295   if (!scm_is_eq (result, SCM_UNSPECIFIED))
1296     return result;
1297
1298   switch (chr)
1299     {
1300     case '\\':
1301       return (scm_read_character (chr, port));
1302     case '(':
1303       return (scm_read_vector (chr, port));
1304     case 's':
1305     case 'u':
1306     case 'f':
1307       /* This one may return either a boolean or an SRFI-4 vector.  */
1308       return (scm_read_srfi4_vector (chr, port));
1309     case 'v':
1310       return (scm_read_bytevector (chr, port));
1311     case '*':
1312       return (scm_read_guile_bit_vector (chr, port));
1313     case 't':
1314     case 'T':
1315     case 'F':
1316       /* This one may return either a boolean or an SRFI-4 vector.  */
1317       return (scm_read_boolean (chr, port));
1318     case ':':
1319       return (scm_read_keyword (chr, port));
1320     case '0': case '1': case '2': case '3': case '4':
1321     case '5': case '6': case '7': case '8': case '9':
1322     case '@':
1323 #if SCM_ENABLE_DEPRECATED
1324       /* See below for 'i' and 'e'. */
1325     case 'a':
1326     case 'c':
1327     case 'y':
1328     case 'h':
1329     case 'l':
1330 #endif
1331       return (scm_i_read_array (port, chr));
1332
1333     case 'i':
1334     case 'e':
1335 #if SCM_ENABLE_DEPRECATED
1336       {
1337         /* When next char is '(', it really is an old-style
1338            uniform array. */
1339         scm_t_wchar next_c = scm_getc (port);
1340         if (next_c != EOF)
1341           scm_ungetc (next_c, port);
1342         if (next_c == '(')
1343           return scm_i_read_array (port, chr);
1344         /* Fall through. */
1345       }
1346 #endif
1347     case 'b':
1348     case 'B':
1349     case 'o':
1350     case 'O':
1351     case 'd':
1352     case 'D':
1353     case 'x':
1354     case 'X':
1355     case 'I':
1356     case 'E':
1357       return (scm_read_number_and_radix (chr, port));
1358     case '{':
1359       return (scm_read_extended_symbol (chr, port));
1360     case '!':
1361       return (scm_read_shebang (chr, port));
1362     case ';':
1363       return (scm_read_commented_expression (chr, port));
1364     case '`':
1365     case '\'':
1366     case ',':
1367       return (scm_read_syntax (chr, port));
1368     case 'n':
1369       return (scm_read_nil (chr, port));
1370     default:
1371       result = scm_read_sharp_extension (chr, port);
1372       if (scm_is_eq (result, SCM_UNSPECIFIED))
1373         {
1374           /* To remain compatible with 1.8 and earlier, the following
1375              characters have lower precedence than `read-hash-extend'
1376              characters.  */
1377           switch (chr)
1378             {
1379             case '|':
1380               return scm_read_r6rs_block_comment (chr, port);
1381             default:
1382               scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1383                                  scm_list_1 (SCM_MAKE_CHAR (chr)));
1384             }
1385         }
1386       else
1387         return result;
1388     }
1389
1390   return SCM_UNSPECIFIED;
1391 }
1392 #undef FUNC_NAME
1393
1394 static SCM
1395 scm_read_expression (SCM port)
1396 #define FUNC_NAME "scm_read_expression"
1397 {
1398   while (1)
1399     {
1400       register scm_t_wchar chr;
1401
1402       chr = scm_getc (port);
1403
1404       switch (chr)
1405         {
1406         case SCM_WHITE_SPACES:
1407         case SCM_LINE_INCREMENTORS:
1408           break;
1409         case ';':
1410           (void) scm_read_semicolon_comment (chr, port);
1411           break;
1412         case '[':
1413           if (!SCM_SQUARE_BRACKETS_P)
1414             return (scm_read_mixed_case_symbol (chr, port));
1415           /* otherwise fall through */
1416         case '(':
1417           return (scm_read_sexp (chr, port));
1418         case '"':
1419           return (scm_read_string (chr, port));
1420         case '\'':
1421         case '`':
1422         case ',':
1423           return (scm_read_quote (chr, port));
1424         case '#':
1425           {
1426             SCM result;
1427             result = scm_read_sharp (chr, port);
1428             if (scm_is_eq (result, SCM_UNSPECIFIED))
1429               /* We read a comment or some such.  */
1430               break;
1431             else
1432               return result;
1433           }
1434         case ')':
1435           scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1436           break;
1437         case EOF:
1438           return SCM_EOF_VAL;
1439         case ':':
1440           if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1441             return scm_symbol_to_keyword (scm_read_expression (port));
1442           /* Fall through.  */
1443
1444         default:
1445           {
1446             if (((chr >= '0') && (chr <= '9'))
1447                 || (strchr ("+-.", chr)))
1448               return (scm_read_number (chr, port));
1449             else
1450               return (scm_read_mixed_case_symbol (chr, port));
1451           }
1452         }
1453     }
1454 }
1455 #undef FUNC_NAME
1456
1457 \f
1458 /* Actual reader.  */
1459
1460 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1461             (SCM port),
1462             "Read an s-expression from the input port @var{port}, or from\n"
1463             "the current input port if @var{port} is not specified.\n"
1464             "Any whitespace before the next token is discarded.")
1465 #define FUNC_NAME s_scm_read
1466 {
1467   int c;
1468
1469   if (SCM_UNBNDP (port))
1470     port = scm_current_input_port ();
1471   SCM_VALIDATE_OPINPORT (1, port);
1472
1473   c = flush_ws (port, (char *) NULL);
1474   if (EOF == c)
1475     return SCM_EOF_VAL;
1476   scm_ungetc (c, port);
1477
1478   return (scm_read_expression (port));
1479 }
1480 #undef FUNC_NAME
1481
1482
1483 \f
1484
1485 /* Used when recording expressions constructed by `scm_read_sharp ()'.  */
1486 static SCM
1487 recsexpr (SCM obj, long line, int column, SCM filename)
1488 {
1489   if (!scm_is_pair(obj)) {
1490     return obj;
1491   } else {
1492     SCM tmp = obj, copy;
1493     /* If this sexpr is visible in the read:sharp source, we want to
1494        keep that information, so only record non-constant cons cells
1495        which haven't previously been read by the reader. */
1496     if (scm_is_false (scm_whash_lookup (scm_source_whash, obj)))
1497       {
1498         if (SCM_COPY_SOURCE_P)
1499           {
1500             copy = scm_cons (recsexpr (SCM_CAR (obj), line, column, filename),
1501                              SCM_UNDEFINED);
1502             while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1503               {
1504                 SCM_SETCDR (copy, scm_cons (recsexpr (SCM_CAR (tmp),
1505                                                       line,
1506                                                       column,
1507                                                       filename),
1508                                             SCM_UNDEFINED));
1509                 copy = SCM_CDR (copy);
1510               }
1511             SCM_SETCDR (copy, tmp);
1512           }
1513         else
1514           {
1515             recsexpr (SCM_CAR (obj), line, column, filename);
1516             while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1517               recsexpr (SCM_CAR (tmp), line, column, filename);
1518             copy = SCM_UNDEFINED;
1519           }
1520         scm_whash_insert (scm_source_whash,
1521                           obj,
1522                           scm_make_srcprops (line,
1523                                              column,
1524                                              filename,
1525                                              copy,
1526                                              SCM_EOL));
1527       }
1528     return obj;
1529   }
1530 }
1531
1532 /* Manipulate the read-hash-procedures alist.  This could be written in
1533    Scheme, but maybe it will also be used by C code during initialisation.  */
1534 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1535             (SCM chr, SCM proc),
1536             "Install the procedure @var{proc} for reading expressions\n"
1537             "starting with the character sequence @code{#} and @var{chr}.\n"
1538             "@var{proc} will be called with two arguments:  the character\n"
1539             "@var{chr} and the port to read further data from. The object\n"
1540             "returned will be the return value of @code{read}. \n"
1541             "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1542             )
1543 #define FUNC_NAME s_scm_read_hash_extend
1544 {
1545   SCM this;
1546   SCM prev;
1547
1548   SCM_VALIDATE_CHAR (1, chr);
1549   SCM_ASSERT (scm_is_false (proc)
1550               || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1551               proc, SCM_ARG2, FUNC_NAME);
1552
1553   /* Check if chr is already in the alist.  */
1554   this = *scm_read_hash_procedures;
1555   prev = SCM_BOOL_F;
1556   while (1)
1557     {
1558       if (scm_is_null (this))
1559         {
1560           /* not found, so add it to the beginning.  */
1561           if (scm_is_true (proc))
1562             {
1563               *scm_read_hash_procedures =
1564                 scm_cons (scm_cons (chr, proc), *scm_read_hash_procedures);
1565             }
1566           break;
1567         }
1568       if (scm_is_eq (chr, SCM_CAAR (this)))
1569         {
1570           /* already in the alist.  */
1571           if (scm_is_false (proc))
1572             {
1573               /* remove it.  */
1574               if (scm_is_false (prev))
1575                 {
1576                   *scm_read_hash_procedures =
1577                     SCM_CDR (*scm_read_hash_procedures);
1578                 }
1579               else
1580                 scm_set_cdr_x (prev, SCM_CDR (this));
1581             }
1582           else
1583             {
1584               /* replace it.  */
1585               scm_set_cdr_x (SCM_CAR (this), proc);
1586             }
1587           break;
1588         }
1589       prev = this;
1590       this = SCM_CDR (this);
1591     }
1592
1593   return SCM_UNSPECIFIED;
1594 }
1595 #undef FUNC_NAME
1596
1597 /* Recover the read-hash procedure corresponding to char c.  */
1598 static SCM
1599 scm_get_hash_procedure (int c)
1600 {
1601   SCM rest = *scm_read_hash_procedures;
1602
1603   while (1)
1604     {
1605       if (scm_is_null (rest))
1606         return SCM_BOOL_F;
1607
1608       if (SCM_CHAR (SCM_CAAR (rest)) == c)
1609         return SCM_CDAR (rest);
1610
1611       rest = SCM_CDR (rest);
1612     }
1613 }
1614
1615 #define SCM_ENCODING_SEARCH_SIZE (500)
1616
1617 /* Search the first few hundred characters of a file for an Emacs-like coding
1618    declaration.  Returns either NULL or a string whose storage has been
1619    allocated with `scm_gc_malloc ()'.  */
1620 char *
1621 scm_i_scan_for_encoding (SCM port)
1622 {
1623   char header[SCM_ENCODING_SEARCH_SIZE+1];
1624   size_t bytes_read;
1625   char *encoding = NULL;
1626   int utf8_bom = 0;
1627   char *pos;
1628   int i;
1629   int in_comment;
1630
1631   if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1632     /* PORT is a non-seekable file port (e.g., as created by Bash when using
1633        "guile <(echo '(display "hello")')") so bail out.  */
1634     return NULL;
1635
1636   bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1637
1638   scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1639
1640   if (bytes_read > 3
1641       && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1642     utf8_bom = 1;
1643
1644   /* search past "coding[:=]" */
1645   pos = header;
1646   while (1)
1647     {
1648       if ((pos = strstr(pos, "coding")) == NULL)
1649         return NULL;
1650
1651       pos += strlen("coding");
1652       if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1653           (*pos == ':' || *pos == '='))
1654         {
1655           pos ++;
1656           break;
1657         }
1658     }
1659
1660   /* skip spaces */
1661   while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1662          (*pos == ' ' || *pos == '\t'))
1663     pos ++;
1664
1665   /* grab the next token */
1666   i = 0;
1667   while (pos + i - header <= SCM_ENCODING_SEARCH_SIZE
1668          && pos + i - header < bytes_read
1669          && (isalnum ((int) pos[i]) || strchr ("_-.:/,+=()", pos[i]) != NULL))
1670     i++;
1671
1672   if (i == 0)
1673     return NULL;
1674
1675   encoding = scm_gc_strndup (pos, i, "encoding");
1676   for (i = 0; i < strlen (encoding); i++)
1677     encoding[i] = toupper ((int) encoding[i]);
1678
1679   /* push backwards to make sure we were in a comment */
1680   in_comment = 0;
1681   while (pos - i - header > 0)
1682     {
1683       if (*(pos - i) == '\n')
1684         {
1685           /* This wasn't in a semicolon comment. Check for a
1686            hash-bang comment. */
1687           char *beg = strstr (header, "#!");
1688           char *end = strstr (header, "!#");
1689           if (beg < pos && pos < end)
1690             in_comment = 1;
1691           break;
1692         }
1693       if (*(pos - i) == ';')
1694         {
1695           in_comment = 1;
1696           break;
1697         }
1698       i ++;
1699     }
1700   if (!in_comment)
1701     /* This wasn't in a comment */
1702     return NULL;
1703
1704   if (utf8_bom && strcmp(encoding, "UTF-8"))
1705     scm_misc_error (NULL,
1706                     "the port input declares the encoding ~s but is encoded as UTF-8",
1707                     scm_list_1 (scm_from_locale_string (encoding)));
1708
1709   return encoding;
1710 }
1711
1712 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1713             (SCM port),
1714             "Scans the port for an Emacs-like character coding declaration\n"
1715             "near the top of the contents of a port with random-acessible contents.\n"
1716             "The coding declaration is of the form\n"
1717             "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1718             "\n"
1719             "Returns a string containing the character encoding of the file\n"
1720             "if a declaration was found, or @code{#f} otherwise.\n")
1721 #define FUNC_NAME s_scm_file_encoding
1722 {
1723   char *enc;
1724   SCM s_enc;
1725
1726   enc = scm_i_scan_for_encoding (port);
1727   if (enc == NULL)
1728     return SCM_BOOL_F;
1729   else
1730     {
1731       s_enc = scm_from_locale_string (enc);
1732       return s_enc;
1733     }
1734
1735   return SCM_BOOL_F;
1736 }
1737 #undef FUNC_NAME
1738
1739 void
1740 scm_init_read ()
1741 {
1742   scm_read_hash_procedures =
1743     SCM_VARIABLE_LOC (scm_c_define ("read-hash-procedures", SCM_EOL));
1744
1745   scm_init_opts (scm_read_options, scm_read_opts);
1746 #include "libguile/read.x"
1747 }
1748
1749 /*
1750   Local Variables:
1751   c-file-style: "gnu"
1752   End:
1753 */