libguile/read.c

   1 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
   2  *   2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public License
   6  * as published by the Free Software Foundation; either version 3 of
   7  * the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful, but
  10  * WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  17  * 02110-1301 USA
  18  */
  19
  20
  21 \f
  22
  23 #ifdef HAVE_CONFIG_H
  24 # include <config.h>
  25 #endif
  26
  27 #include <stdio.h>
  28 #include <ctype.h>
  29 #include <string.h>
  30 #include <unistd.h>
  31 #include <unicase.h>
  32 #include <unictype.h>
  33
  34 #include "libguile/_scm.h"
  35 #include "libguile/bytevectors.h"
  36 #include "libguile/chars.h"
  37 #include "libguile/eval.h"
  38 #include "libguile/arrays.h"
  39 #include "libguile/bitvectors.h"
  40 #include "libguile/keywords.h"
  41 #include "libguile/alist.h"
  42 #include "libguile/srcprop.h"
  43 #include "libguile/hashtab.h"
  44 #include "libguile/hash.h"
  45 #include "libguile/ports.h"
  46 #include "libguile/fports.h"
  47 #include "libguile/root.h"
  48 #include "libguile/strings.h"
  49 #include "libguile/strports.h"
  50 #include "libguile/vectors.h"
  51 #include "libguile/validate.h"
  52 #include "libguile/srfi-4.h"
  53 #include "libguile/srfi-13.h"
  54
  55 #include "libguile/read.h"
  56 #include "libguile/private-options.h"
  57
  58
  59 \f
  60
  61 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
  62 SCM_SYMBOL (scm_keyword_prefix, "prefix");
  63 SCM_SYMBOL (scm_keyword_postfix, "postfix");
  64 SCM_SYMBOL (sym_nil, "nil");
  65
  66 scm_t_option scm_read_opts[] = {
  67   { SCM_OPTION_BOOLEAN, "copy", 0,
  68     "Copy source code expressions." },
  69   { SCM_OPTION_BOOLEAN, "positions", 1,
  70     "Record positions of source code expressions." },
  71   { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
  72     "Convert symbols to lower case."},
  73   { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
  74     "Style of keyword recognition: #f, 'prefix or 'postfix."},
  75   { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
  76     "Use R6RS variable-length character and string hex escapes."},
  77   { SCM_OPTION_BOOLEAN, "square-brackets", 1,
  78     "Treat `[' and `]' as parentheses, for R6RS compatibility."},
  79   { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
  80     "In strings, consume leading whitespace after an escaped end-of-line."},
  81   { 0, },
  82 };
  83
  84 /*
  85   Give meaningful error messages for errors
  86
  87   We use the format
  88
  89   FILE:LINE:COL: MESSAGE
  90   This happened in ....
  91
  92   This is not standard GNU format, but the test-suite likes the real
  93   message to be in front.
  94
  95  */
  96
  97
  98 void
  99 scm_i_input_error (char const *function,
 100                    SCM port, const char *message, SCM arg)
 101 {
 102   SCM fn = (scm_is_string (SCM_FILENAME(port))
 103             ? SCM_FILENAME(port)
 104             : scm_from_locale_string ("#<unknown port>"));
 105
 106   SCM string_port = scm_open_output_string ();
 107   SCM string = SCM_EOL;
 108   scm_simple_format (string_port,
 109                      scm_from_locale_string ("~A:~S:~S: ~A"),
 110                      scm_list_4 (fn,
 111                                  scm_from_long (SCM_LINUM (port) + 1),
 112                                  scm_from_int (SCM_COL (port) + 1),
 113                                  scm_from_locale_string (message)));
 114
 115   string = scm_get_output_string (string_port);
 116   scm_close_output_port (string_port);
 117   scm_error_scm (scm_from_latin1_symbol ("read-error"),
 118                  function? scm_from_locale_string (function) : SCM_BOOL_F,
 119                  string,
 120                  arg,
 121                  SCM_BOOL_F);
 122 }
 123
 124
 125 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
 126             (SCM setting),
 127             "Option interface for the read options. Instead of using\n"
 128             "this procedure directly, use the procedures @code{read-enable},\n"
 129             "@code{read-disable}, @code{read-set!} and @code{read-options}.")
 130 #define FUNC_NAME s_scm_read_options
 131 {
 132   SCM ans = scm_options (setting,
 133                          scm_read_opts,
 134                          FUNC_NAME);
 135   if (SCM_COPY_SOURCE_P)
 136     SCM_RECORD_POSITIONS_P = 1;
 137   return ans;
 138 }
 139 #undef FUNC_NAME
 140
 141 /* A fluid referring to an association list mapping extra hash
 142    characters to procedures.  */
 143 static SCM *scm_i_read_hash_procedures;
 144
 145 static SCM
 146 scm_i_read_hash_procedures_ref (void)
 147 {
 148   return scm_fluid_ref (*scm_i_read_hash_procedures);
 149 }
 150
 151 static void
 152 scm_i_read_hash_procedures_set_x (SCM value)
 153 {
 154   scm_fluid_set_x (*scm_i_read_hash_procedures, value);
 155 }
 156
 157 \f
 158 /* Token readers.  */
 159
 160
 161 /* Size of the C buffer used to read symbols and numbers.  */
 162 #define READER_BUFFER_SIZE            128
 163
 164 /* Size of the C buffer used to read strings.  */
 165 #define READER_STRING_BUFFER_SIZE     512
 166
 167 /* The maximum size of Scheme character names.  */
 168 #define READER_CHAR_NAME_MAX_SIZE      50
 169
 170
 171 /* `isblank' is only in C99.  */
 172 #define CHAR_IS_BLANK_(_chr)                                    \
 173   (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n')      \
 174    || ((_chr) == '\f') || ((_chr) == '\r'))
 175
 176 #ifdef MSDOS
 177 # define CHAR_IS_BLANK(_chr)                    \
 178   ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
 179 #else
 180 # define CHAR_IS_BLANK CHAR_IS_BLANK_
 181 #endif
 182
 183
 184 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
 185    structure'').  */
 186 #define CHAR_IS_R5RS_DELIMITER(c)                               \
 187   (CHAR_IS_BLANK (c)                                            \
 188    || (c == ')') || (c == '(') || (c == ';') || (c == '"')      \
 189    || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
 190
 191 #define CHAR_IS_DELIMITER  CHAR_IS_R5RS_DELIMITER
 192
 193 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
 194    Structure''.  */
 195 #define CHAR_IS_EXPONENT_MARKER(_chr)                           \
 196   (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f')        \
 197    || ((_chr) == 'd') || ((_chr) == 'l'))
 198
 199 /* Read an SCSH block comment.  */
 200 static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
 201 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
 202 static SCM scm_read_commented_expression (scm_t_wchar, SCM);
 203 static SCM scm_read_shebang (scm_t_wchar, SCM);
 204 static SCM scm_get_hash_procedure (int);
 205
 206 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 207    result in the pre-allocated buffer BUF.  Return zero if the whole token has
 208    fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
 209    bytes actually read.  */
 210 static int
 211 read_token (SCM port, char *buf, size_t buf_size, size_t *read)
 212 {
 213    *read = 0;
 214
 215    while (*read < buf_size)
 216      {
 217        int chr;
 218
 219        chr = scm_get_byte_or_eof (port);
 220
 221        if (chr == EOF)
 222         return 0;
 223       else if (CHAR_IS_DELIMITER (chr))
 224         {
 225           scm_unget_byte (chr, port);
 226           return 0;
 227         }
 228       else
 229         {
 230           *buf = (char) chr;
 231           buf++, (*read)++;
 232         }
 233      }
 234
 235    return 1;
 236  }
 237
 238 /* Like `read_token', but return either BUFFER, or a GC-allocated buffer
 239    if the token doesn't fit in BUFFER_SIZE bytes.  */
 240 static char *
 241 read_complete_token (SCM port, char *buffer, size_t buffer_size,
 242                      size_t *read)
 243 {
 244   int overflow = 0;
 245   size_t bytes_read, overflow_size = 0;
 246   char *overflow_buffer = NULL;
 247
 248   do
 249     {
 250       overflow = read_token (port, buffer, buffer_size, &bytes_read);
 251       if (bytes_read == 0)
 252         break;
 253       if (overflow || overflow_size != 0)
 254         {
 255           if (overflow_size == 0)
 256             {
 257               overflow_buffer = scm_gc_malloc_pointerless (bytes_read, "read");
 258               memcpy (overflow_buffer, buffer, bytes_read);
 259               overflow_size = bytes_read;
 260             }
 261           else
 262             {
 263               char *new_buf =
 264                 scm_gc_malloc_pointerless (overflow_size + bytes_read, "read");
 265
 266               memcpy (new_buf, overflow_buffer, overflow_size);
 267               memcpy (new_buf + overflow_size, buffer, bytes_read);
 268
 269               overflow_buffer = new_buf;
 270               overflow_size += bytes_read;
 271             }
 272         }
 273     }
 274   while (overflow);
 275
 276   if (overflow_size)
 277     *read = overflow_size;
 278   else
 279     *read = bytes_read;
 280
 281   return (overflow_size > 0 ? overflow_buffer : buffer);
 282 }
 283
 284 /* Skip whitespace from PORT and return the first non-whitespace character
 285    read.  Raise an error on end-of-file.  */
 286 static int
 287 flush_ws (SCM port, const char *eoferr)
 288 {
 289   scm_t_wchar c;
 290   while (1)
 291     switch (c = scm_getc (port))
 292       {
 293       case EOF:
 294       goteof:
 295         if (eoferr)
 296           {
 297             scm_i_input_error (eoferr,
 298                                port,
 299                                "end of file",
 300                                SCM_EOL);
 301           }
 302         return c;
 303
 304       case ';':
 305       lp:
 306         switch (c = scm_getc (port))
 307           {
 308           case EOF:
 309             goto goteof;
 310           default:
 311             goto lp;
 312           case SCM_LINE_INCREMENTORS:
 313             break;
 314           }
 315         break;
 316
 317       case '#':
 318         switch (c = scm_getc (port))
 319           {
 320           case EOF:
 321             eoferr = "read_sharp";
 322             goto goteof;
 323           case '!':
 324             scm_read_shebang (c, port);
 325             break;
 326           case ';':
 327             scm_read_commented_expression (c, port);
 328             break;
 329           case '|':
 330             if (scm_is_false (scm_get_hash_procedure (c)))
 331               {
 332                 scm_read_r6rs_block_comment (c, port);
 333                 break;
 334               }
 335             /* fall through */
 336           default:
 337             scm_ungetc (c, port);
 338             return '#';
 339           }
 340         break;
 341
 342       case SCM_LINE_INCREMENTORS:
 343       case SCM_SINGLE_SPACES:
 344       case '\t':
 345         break;
 346
 347       default:
 348         return c;
 349       }
 350
 351   return 0;
 352 }
 353
 354
 355 \f
 356 /* Token readers.  */
 357
 358 static SCM scm_read_expression (SCM port);
 359 static SCM scm_read_sharp (int chr, SCM port, long line, int column);
 360
 361
 362 static SCM
 363 maybe_annotate_source (SCM x, SCM port, long line, int column)
 364 {
 365   if (SCM_RECORD_POSITIONS_P)
 366     scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
 367   return x;
 368 }
 369
 370 static SCM
 371 scm_read_sexp (scm_t_wchar chr, SCM port)
 372 #define FUNC_NAME "scm_i_lreadparen"
 373 {
 374   int c;
 375   SCM tmp, tl, ans = SCM_EOL;
 376   const int terminating_char = ((chr == '[') ? ']' : ')');
 377
 378   /* Need to capture line and column numbers here. */
 379   long line = SCM_LINUM (port);
 380   int column = SCM_COL (port) - 1;
 381
 382   c = flush_ws (port, FUNC_NAME);
 383   if (terminating_char == c)
 384     return SCM_EOL;
 385
 386   scm_ungetc (c, port);
 387   tmp = scm_read_expression (port);
 388
 389   /* Note that it is possible for scm_read_expression to return
 390      scm_sym_dot, but not as part of a dotted pair: as in #{.}#.  So
 391      check that it's a real dot by checking `c'.  */
 392   if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
 393     {
 394       ans = scm_read_expression (port);
 395       if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
 396         scm_i_input_error (FUNC_NAME, port, "missing close paren",
 397                            SCM_EOL);
 398       return ans;
 399     }
 400
 401   /* Build the head of the list structure. */
 402   ans = tl = scm_cons (tmp, SCM_EOL);
 403
 404   while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
 405     {
 406       SCM new_tail;
 407
 408       if (c == ')' || (SCM_SQUARE_BRACKETS_P && c == ']'))
 409         scm_i_input_error (FUNC_NAME, port,
 410                            "in pair: mismatched close paren: ~A",
 411                            scm_list_1 (SCM_MAKE_CHAR (c)));
 412
 413       scm_ungetc (c, port);
 414       tmp = scm_read_expression (port);
 415
 416       /* See above note about scm_sym_dot.  */
 417       if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
 418         {
 419           SCM_SETCDR (tl, scm_read_expression (port));
 420
 421           c = flush_ws (port, FUNC_NAME);
 422           if (terminating_char != c)
 423             scm_i_input_error (FUNC_NAME, port,
 424                                "in pair: missing close paren", SCM_EOL);
 425           goto exit;
 426         }
 427
 428       new_tail = scm_cons (tmp, SCM_EOL);
 429       SCM_SETCDR (tl, new_tail);
 430       tl = new_tail;
 431     }
 432
 433  exit:
 434   return maybe_annotate_source (ans, port, line, column);
 435 }
 436 #undef FUNC_NAME
 437
 438
 439 /* Read a hexadecimal number NDIGITS in length.  Put its value into the variable
 440    C.  If TERMINATOR is non-null, terminate early if the TERMINATOR character is
 441    found.  */
 442 #define SCM_READ_HEX_ESCAPE(ndigits, terminator)                   \
 443   do                                                               \
 444     {                                                              \
 445       scm_t_wchar a;                                               \
 446       size_t i = 0;                                                \
 447       c = 0;                                                       \
 448       while (i < ndigits)                                          \
 449         {                                                          \
 450           a = scm_getc (port);                                     \
 451           if (a == EOF)                                            \
 452             goto str_eof;                                          \
 453           if (terminator                                           \
 454               && (a == (scm_t_wchar) terminator)                   \
 455               && (i > 0))                                          \
 456             break;                                                 \
 457           if ('0' <= a && a <= '9')                                \
 458             a -= '0';                                              \
 459           else if ('A' <= a && a <= 'F')                           \
 460             a = a - 'A' + 10;                                      \
 461           else if ('a' <= a && a <= 'f')                           \
 462             a = a - 'a' + 10;                                      \
 463           else                                                     \
 464             {                                                      \
 465               c = a;                                               \
 466               goto bad_escaped;                                    \
 467             }                                                      \
 468           c = c * 16 + a;                                          \
 469           i ++;                                                    \
 470         }                                                          \
 471     } while (0)
 472
 473 static void
 474 skip_intraline_whitespace (SCM port)
 475 {
 476   scm_t_wchar c;
 477
 478   do
 479     {
 480       c = scm_getc (port);
 481       if (c == EOF)
 482         return;
 483     }
 484   while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
 485
 486   scm_ungetc (c, port);
 487 }
 488
 489 static SCM
 490 scm_read_string (int chr, SCM port)
 491 #define FUNC_NAME "scm_lreadr"
 492 {
 493   /* For strings smaller than C_STR, this function creates only one Scheme
 494      object (the string returned).  */
 495
 496   SCM str = SCM_BOOL_F;
 497   unsigned c_str_len = 0;
 498   scm_t_wchar c;
 499
 500   /* Need to capture line and column numbers here. */
 501   long line = SCM_LINUM (port);
 502   int column = SCM_COL (port) - 1;
 503
 504   str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
 505   while ('"' != (c = scm_getc (port)))
 506     {
 507       if (c == EOF)
 508         {
 509         str_eof:
 510           scm_i_input_error (FUNC_NAME, port,
 511                              "end of file in string constant", SCM_EOL);
 512         }
 513
 514       if (c_str_len + 1 >= scm_i_string_length (str))
 515         {
 516           SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
 517
 518           str = scm_string_append (scm_list_2 (str, addy));
 519         }
 520
 521       if (c == '\\')
 522         {
 523           switch (c = scm_getc (port))
 524             {
 525             case EOF:
 526               goto str_eof;
 527             case '"':
 528             case '\\':
 529               break;
 530             case '\n':
 531               if (SCM_HUNGRY_EOL_ESCAPES_P)
 532                 skip_intraline_whitespace (port);
 533               continue;
 534             case '0':
 535               c = '\0';
 536               break;
 537             case 'f':
 538               c = '\f';
 539               break;
 540             case 'n':
 541               c = '\n';
 542               break;
 543             case 'r':
 544               c = '\r';
 545               break;
 546             case 't':
 547               c = '\t';
 548               break;
 549             case 'a':
 550               c = '\007';
 551               break;
 552             case 'v':
 553               c = '\v';
 554               break;
 555             case 'b':
 556               c = '\010';
 557               break;
 558             case 'x':
 559               if (SCM_R6RS_ESCAPES_P)
 560                 SCM_READ_HEX_ESCAPE (10, ';');
 561               else
 562                 SCM_READ_HEX_ESCAPE (2, '\0');
 563               break;
 564             case 'u':
 565               if (!SCM_R6RS_ESCAPES_P)
 566                 {
 567                   SCM_READ_HEX_ESCAPE (4, '\0');
 568                   break;
 569                 }
 570             case 'U':
 571               if (!SCM_R6RS_ESCAPES_P)
 572                 {
 573                   SCM_READ_HEX_ESCAPE (6, '\0');
 574                   break;
 575                 }
 576             default:
 577             bad_escaped:
 578               scm_i_input_error (FUNC_NAME, port,
 579                                  "illegal character in escape sequence: ~S",
 580                                  scm_list_1 (SCM_MAKE_CHAR (c)));
 581             }
 582         }
 583       str = scm_i_string_start_writing (str);
 584       scm_i_string_set_x (str, c_str_len++, c);
 585       scm_i_string_stop_writing ();
 586     }
 587   return maybe_annotate_source (scm_i_substring_copy (str, 0, c_str_len),
 588                                 port, line, column);
 589 }
 590 #undef FUNC_NAME
 591
 592
 593 static SCM
 594 scm_read_number (scm_t_wchar chr, SCM port)
 595 {
 596   SCM result, str = SCM_EOL;
 597   char local_buffer[READER_BUFFER_SIZE], *buffer;
 598   size_t bytes_read;
 599   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 600
 601   /* Need to capture line and column numbers here. */
 602   long line = SCM_LINUM (port);
 603   int column = SCM_COL (port) - 1;
 604
 605   scm_ungetc (chr, port);
 606   buffer = read_complete_token (port, local_buffer, sizeof local_buffer,
 607                                 &bytes_read);
 608
 609   str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
 610
 611   result = scm_string_to_number (str, SCM_UNDEFINED);
 612   if (scm_is_false (result))
 613     {
 614       /* Return a symbol instead of a number */
 615       if (SCM_CASE_INSENSITIVE_P)
 616         str = scm_string_downcase_x (str);
 617       result = scm_string_to_symbol (str);
 618     }
 619   else if (SCM_NIMP (result))
 620     result = maybe_annotate_source (result, port, line, column);
 621
 622   SCM_COL (port) += scm_i_string_length (str);
 623   return result;
 624 }
 625
 626 static SCM
 627 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
 628 {
 629   SCM result;
 630   int ends_with_colon = 0;
 631   size_t bytes_read;
 632   int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
 633   char local_buffer[READER_BUFFER_SIZE], *buffer;
 634   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 635   SCM str;
 636
 637   scm_ungetc (chr, port);
 638   buffer = read_complete_token (port, local_buffer, sizeof local_buffer,
 639                                 &bytes_read);
 640   if (bytes_read > 0)
 641     ends_with_colon = buffer[bytes_read - 1] == ':';
 642
 643   if (postfix && ends_with_colon && (bytes_read > 1))
 644     {
 645       str = scm_from_stringn (buffer, bytes_read - 1,
 646                               pt->encoding, pt->ilseq_handler);
 647
 648       if (SCM_CASE_INSENSITIVE_P)
 649         str = scm_string_downcase_x (str);
 650       result = scm_symbol_to_keyword (scm_string_to_symbol (str));
 651     }
 652   else
 653     {
 654       str = scm_from_stringn (buffer, bytes_read,
 655                               pt->encoding, pt->ilseq_handler);
 656
 657       if (SCM_CASE_INSENSITIVE_P)
 658         str = scm_string_downcase_x (str);
 659       result = scm_string_to_symbol (str);
 660     }
 661
 662   SCM_COL (port) += scm_i_string_length (str);
 663   return result;
 664 }
 665
 666 static SCM
 667 scm_read_number_and_radix (scm_t_wchar chr, SCM port)
 668 #define FUNC_NAME "scm_lreadr"
 669 {
 670   SCM result;
 671   size_t read;
 672   char local_buffer[READER_BUFFER_SIZE], *buffer;
 673   unsigned int radix;
 674   SCM str;
 675   scm_t_port *pt;
 676
 677   switch (chr)
 678     {
 679     case 'B':
 680     case 'b':
 681       radix = 2;
 682       break;
 683
 684     case 'o':
 685     case 'O':
 686       radix = 8;
 687       break;
 688
 689     case 'd':
 690     case 'D':
 691       radix = 10;
 692       break;
 693
 694     case 'x':
 695     case 'X':
 696       radix = 16;
 697       break;
 698
 699     default:
 700       scm_ungetc (chr, port);
 701       scm_ungetc ('#', port);
 702       radix = 10;
 703     }
 704
 705   buffer = read_complete_token (port, local_buffer, sizeof local_buffer,
 706                                 &read);
 707
 708   pt = SCM_PTAB_ENTRY (port);
 709   str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
 710
 711   result = scm_string_to_number (str, scm_from_uint (radix));
 712
 713   SCM_COL (port) += scm_i_string_length (str);
 714
 715   if (scm_is_true (result))
 716     return result;
 717
 718   scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
 719
 720   return SCM_BOOL_F;
 721 }
 722 #undef FUNC_NAME
 723
 724 static SCM
 725 scm_read_quote (int chr, SCM port)
 726 {
 727   SCM p;
 728   long line = SCM_LINUM (port);
 729   int column = SCM_COL (port) - 1;
 730
 731   switch (chr)
 732     {
 733     case '`':
 734       p = scm_sym_quasiquote;
 735       break;
 736
 737     case '\'':
 738       p = scm_sym_quote;
 739       break;
 740
 741     case ',':
 742       {
 743         scm_t_wchar c;
 744
 745         c = scm_getc (port);
 746         if ('@' == c)
 747           p = scm_sym_uq_splicing;
 748         else
 749           {
 750             scm_ungetc (c, port);
 751             p = scm_sym_unquote;
 752           }
 753         break;
 754       }
 755
 756     default:
 757       fprintf (stderr, "%s: unhandled quote character (%i)\n",
 758                "scm_read_quote", chr);
 759       abort ();
 760     }
 761
 762   p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
 763   return maybe_annotate_source (p, port, line, column);
 764 }
 765
 766 SCM_SYMBOL (sym_syntax, "syntax");
 767 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
 768 SCM_SYMBOL (sym_unsyntax, "unsyntax");
 769 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
 770
 771 static SCM
 772 scm_read_syntax (int chr, SCM port)
 773 {
 774   SCM p;
 775   long line = SCM_LINUM (port);
 776   int column = SCM_COL (port) - 1;
 777
 778   switch (chr)
 779     {
 780     case '`':
 781       p = sym_quasisyntax;
 782       break;
 783
 784     case '\'':
 785       p = sym_syntax;
 786       break;
 787
 788     case ',':
 789       {
 790         int c;
 791
 792         c = scm_getc (port);
 793         if ('@' == c)
 794           p = sym_unsyntax_splicing;
 795         else
 796           {
 797             scm_ungetc (c, port);
 798             p = sym_unsyntax;
 799           }
 800         break;
 801       }
 802
 803     default:
 804       fprintf (stderr, "%s: unhandled syntax character (%i)\n",
 805                "scm_read_syntax", chr);
 806       abort ();
 807     }
 808
 809   p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
 810   return maybe_annotate_source (p, port, line, column);
 811 }
 812
 813 static SCM
 814 scm_read_nil (int chr, SCM port)
 815 {
 816   SCM id = scm_read_mixed_case_symbol (chr, port);
 817
 818   if (!scm_is_eq (id, sym_nil))
 819     scm_i_input_error ("scm_read_nil", port,
 820                        "unexpected input while reading #nil: ~a",
 821                        scm_list_1 (id));
 822
 823   return SCM_ELISP_NIL;
 824 }
 825
 826 static SCM
 827 scm_read_semicolon_comment (int chr, SCM port)
 828 {
 829   int c;
 830
 831   /* We use the get_byte here because there is no need to get the
 832      locale correct with comment input. This presumes that newline
 833      always represents itself no matter what the encoding is.  */
 834   for (c = scm_get_byte_or_eof (port);
 835        (c != EOF) && (c != '\n');
 836        c = scm_get_byte_or_eof (port));
 837
 838   return SCM_UNSPECIFIED;
 839 }
 840
 841 \f
 842 /* Sharp readers, i.e. readers called after a `#' sign has been read.  */
 843
 844 static SCM
 845 scm_read_boolean (int chr, SCM port)
 846 {
 847   switch (chr)
 848     {
 849     case 't':
 850     case 'T':
 851       return SCM_BOOL_T;
 852
 853     case 'f':
 854     case 'F':
 855       return SCM_BOOL_F;
 856     }
 857
 858   return SCM_UNSPECIFIED;
 859 }
 860
 861 static SCM
 862 scm_read_character (scm_t_wchar chr, SCM port)
 863 #define FUNC_NAME "scm_lreadr"
 864 {
 865   char buffer[READER_CHAR_NAME_MAX_SIZE];
 866   SCM charname;
 867   size_t charname_len, bytes_read;
 868   scm_t_wchar cp;
 869   int overflow;
 870   scm_t_port *pt;
 871
 872   overflow = read_token (port, buffer, READER_CHAR_NAME_MAX_SIZE, &bytes_read);
 873   if (overflow)
 874     scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
 875
 876   if (bytes_read == 0)
 877     {
 878       chr = scm_getc (port);
 879       if (chr == EOF)
 880         scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
 881                            "while reading character", SCM_EOL);
 882
 883       /* CHR must be a token delimiter, like a whitespace.  */
 884       return (SCM_MAKE_CHAR (chr));
 885     }
 886
 887   pt = SCM_PTAB_ENTRY (port);
 888
 889   /* Simple ASCII characters can be processed immediately.  Also, simple
 890      ISO-8859-1 characters can be processed immediately if the encoding for this
 891      port is ISO-8859-1.  */
 892   if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
 893     {
 894       SCM_COL (port) += 1;
 895       return SCM_MAKE_CHAR (buffer[0]);
 896     }
 897
 898   /* Otherwise, convert the buffer into a proper scheme string for
 899      processing.  */
 900   charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
 901                                pt->ilseq_handler);
 902   charname_len = scm_i_string_length (charname);
 903   SCM_COL (port) += charname_len;
 904   cp = scm_i_string_ref (charname, 0);
 905   if (charname_len == 1)
 906     return SCM_MAKE_CHAR (cp);
 907
 908   /* Ignore dotted circles, which may be used to keep combining characters from
 909      combining with the backslash in #\charname.  */
 910   if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
 911     return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
 912
 913   if (cp >= '0' && cp < '8')
 914     {
 915       /* Dirk:FIXME::  This type of character syntax is not R5RS
 916        * compliant.  Further, it should be verified that the constant
 917        * does only consist of octal digits.  */
 918       SCM p = scm_string_to_number (charname, scm_from_uint (8));
 919       if (SCM_I_INUMP (p))
 920         {
 921           scm_t_wchar c = scm_to_uint32 (p);
 922           if (SCM_IS_UNICODE_CHAR (c))
 923             return SCM_MAKE_CHAR (c);
 924           else
 925             scm_i_input_error (FUNC_NAME, port,
 926                                "out-of-range octal character escape: ~a",
 927                                scm_list_1 (charname));
 928         }
 929     }
 930
 931   if (cp == 'x' && (charname_len > 1))
 932     {
 933       SCM p;
 934
 935       /* Convert from hex, skipping the initial 'x' character in CHARNAME */
 936       p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
 937                                 scm_from_uint (16));
 938       if (SCM_I_INUMP (p))
 939         {
 940           scm_t_wchar c = scm_to_uint32 (p);
 941           if (SCM_IS_UNICODE_CHAR (c))
 942             return SCM_MAKE_CHAR (c);
 943           else
 944             scm_i_input_error (FUNC_NAME, port,
 945                                "out-of-range hex character escape: ~a",
 946                                scm_list_1 (charname));
 947         }
 948     }
 949
 950   /* The names of characters should never have non-Latin1
 951      characters.  */
 952   if (scm_i_is_narrow_string (charname)
 953       || scm_i_try_narrow_string (charname))
 954     { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
 955                                        charname_len);
 956       if (scm_is_true (ch))
 957         return ch;
 958     }
 959
 960   scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
 961                      scm_list_1 (charname));
 962
 963   return SCM_UNSPECIFIED;
 964 }
 965 #undef FUNC_NAME
 966
 967 static SCM
 968 scm_read_keyword (int chr, SCM port)
 969 {
 970   SCM symbol;
 971
 972   /* Read the symbol that comprises the keyword.  Doing this instead of
 973      invoking a specific symbol reader function allows `scm_read_keyword ()'
 974      to adapt to the delimiters currently valid of symbols.
 975
 976      XXX: This implementation allows sloppy syntaxes like `#:  key'.  */
 977   symbol = scm_read_expression (port);
 978   if (!scm_is_symbol (symbol))
 979     scm_i_input_error ("scm_read_keyword", port,
 980                        "keyword prefix `~a' not followed by a symbol: ~s",
 981                        scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
 982
 983   return (scm_symbol_to_keyword (symbol));
 984 }
 985
 986 static SCM
 987 scm_read_vector (int chr, SCM port, long line, int column)
 988 {
 989   /* Note: We call `scm_read_sexp ()' rather than READER here in order to
 990      guarantee that it's going to do what we want.  After all, this is an
 991      implementation detail of `scm_read_vector ()', not a desirable
 992      property.  */
 993   return maybe_annotate_source (scm_vector (scm_read_sexp (chr, port)),
 994                                 port, line, column);
 995 }
 996
 997 static SCM
 998 scm_read_array (int chr, SCM port, long line, int column)
 999 {
1000   SCM result = scm_i_read_array (port, chr);
1001   if (scm_is_false (result))
1002     return result;
1003   else
1004     return maybe_annotate_source (result, port, line, column);
1005 }
1006
1007 static SCM
1008 scm_read_srfi4_vector (int chr, SCM port, long line, int column)
1009 {
1010   return scm_read_array (chr, port, line, column);
1011 }
1012
1013 static SCM
1014 scm_read_bytevector (scm_t_wchar chr, SCM port, long line, int column)
1015 {
1016   chr = scm_getc (port);
1017   if (chr != 'u')
1018     goto syntax;
1019
1020   chr = scm_getc (port);
1021   if (chr != '8')
1022     goto syntax;
1023
1024   chr = scm_getc (port);
1025   if (chr != '(')
1026     goto syntax;
1027
1028   return maybe_annotate_source
1029     (scm_u8_list_to_bytevector (scm_read_sexp (chr, port)),
1030      port, line, column);
1031
1032  syntax:
1033   scm_i_input_error ("read_bytevector", port,
1034                      "invalid bytevector prefix",
1035                      SCM_MAKE_CHAR (chr));
1036   return SCM_UNSPECIFIED;
1037 }
1038
1039 static SCM
1040 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port, long line, int column)
1041 {
1042   /* Read the `#*10101'-style read syntax for bit vectors in Guile.  This is
1043      terribly inefficient but who cares?  */
1044   SCM s_bits = SCM_EOL;
1045
1046   for (chr = scm_getc (port);
1047        (chr != EOF) && ((chr == '0') || (chr == '1'));
1048        chr = scm_getc (port))
1049     {
1050       s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1051     }
1052
1053   if (chr != EOF)
1054     scm_ungetc (chr, port);
1055
1056   return maybe_annotate_source
1057     (scm_bitvector (scm_reverse_x (s_bits, SCM_EOL)),
1058      port, line, column);
1059 }
1060
1061 static SCM
1062 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1063 {
1064   int bang_seen = 0;
1065
1066   for (;;)
1067     {
1068       int c = scm_getc (port);
1069
1070       if (c == EOF)
1071         scm_i_input_error ("skip_block_comment", port,
1072                            "unterminated `#! ... !#' comment", SCM_EOL);
1073
1074       if (c == '!')
1075         bang_seen = 1;
1076       else if (c == '#' && bang_seen)
1077         break;
1078       else
1079         bang_seen = 0;
1080     }
1081
1082   return SCM_UNSPECIFIED;
1083 }
1084
1085 static SCM
1086 scm_read_shebang (scm_t_wchar chr, SCM port)
1087 {
1088   int c = 0;
1089   if ((c = scm_get_byte_or_eof (port)) != 'r')
1090     {
1091       scm_ungetc (c, port);
1092       return scm_read_scsh_block_comment (chr, port);
1093     }
1094   if ((c = scm_get_byte_or_eof (port)) != '6')
1095     {
1096       scm_ungetc (c, port);
1097       scm_ungetc ('r', port);
1098       return scm_read_scsh_block_comment (chr, port);
1099     }
1100   if ((c = scm_get_byte_or_eof (port)) != 'r')
1101     {
1102       scm_ungetc (c, port);
1103       scm_ungetc ('6', port);
1104       scm_ungetc ('r', port);
1105       return scm_read_scsh_block_comment (chr, port);
1106     }
1107   if ((c = scm_get_byte_or_eof (port)) != 's')
1108     {
1109       scm_ungetc (c, port);
1110       scm_ungetc ('r', port);
1111       scm_ungetc ('6', port);
1112       scm_ungetc ('r', port);
1113       return scm_read_scsh_block_comment (chr, port);
1114     }
1115
1116   return SCM_UNSPECIFIED;
1117 }
1118
1119 static SCM
1120 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1121 {
1122   /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1123      nested.  So care must be taken.  */
1124   int nesting_level = 1;
1125
1126   int a = scm_getc (port);
1127
1128   if (a == EOF)
1129     scm_i_input_error ("scm_read_r6rs_block_comment", port,
1130                        "unterminated `#| ... |#' comment", SCM_EOL);
1131
1132   while (nesting_level > 0)
1133     {
1134       int b = scm_getc (port);
1135
1136       if (b == EOF)
1137         scm_i_input_error ("scm_read_r6rs_block_comment", port,
1138                            "unterminated `#| ... |#' comment", SCM_EOL);
1139
1140       if (a == '|' && b == '#')
1141         {
1142           nesting_level--;
1143           b = EOF;
1144         }
1145       else if (a == '#' && b == '|')
1146         {
1147           nesting_level++;
1148           b = EOF;
1149         }
1150
1151       a = b;
1152     }
1153
1154   return SCM_UNSPECIFIED;
1155 }
1156
1157 static SCM
1158 scm_read_commented_expression (scm_t_wchar chr, SCM port)
1159 {
1160   scm_t_wchar c;
1161
1162   c = flush_ws (port, (char *) NULL);
1163   if (EOF == c)
1164     scm_i_input_error ("read_commented_expression", port,
1165                        "no expression after #; comment", SCM_EOL);
1166   scm_ungetc (c, port);
1167   scm_read_expression (port);
1168   return SCM_UNSPECIFIED;
1169 }
1170
1171 static SCM
1172 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1173 {
1174   /* Guile's extended symbol read syntax looks like this:
1175
1176        #{This is all a symbol name}#
1177
1178      So here, CHR is expected to be `{'.  */
1179   int saw_brace = 0;
1180   size_t len = 0;
1181   SCM buf = scm_i_make_string (1024, NULL, 0);
1182
1183   buf = scm_i_string_start_writing (buf);
1184
1185   while ((chr = scm_getc (port)) != EOF)
1186     {
1187       if (saw_brace)
1188         {
1189           if (chr == '#')
1190             {
1191               break;
1192             }
1193           else
1194             {
1195               saw_brace = 0;
1196               scm_i_string_set_x (buf, len++, '}');
1197             }
1198         }
1199
1200       if (chr == '}')
1201         saw_brace = 1;
1202       else if (chr == '\\')
1203         {
1204           /* It used to be that print.c would print extended-read-syntax
1205              symbols with backslashes before "non-standard" chars, but
1206              this routine wouldn't do anything with those escapes.
1207              Bummer.  What we've done is to change print.c to output
1208              R6RS hex escapes for those characters, relying on the fact
1209              that the extended read syntax would never put a `\' before
1210              an `x'.  For now, we just ignore other instances of
1211              backslash in the string.  */
1212           switch ((chr = scm_getc (port)))
1213             {
1214             case EOF:
1215               goto done;
1216             case 'x':
1217               {
1218                 scm_t_wchar c;
1219
1220                 SCM_READ_HEX_ESCAPE (10, ';');
1221                 scm_i_string_set_x (buf, len++, c);
1222                 break;
1223
1224               str_eof:
1225                 chr = EOF;
1226                 goto done;
1227
1228               bad_escaped:
1229                 scm_i_string_stop_writing ();
1230                 scm_i_input_error ("scm_read_extended_symbol", port,
1231                                    "illegal character in escape sequence: ~S",
1232                                    scm_list_1 (SCM_MAKE_CHAR (c)));
1233                 break;
1234               }
1235             default:
1236               scm_i_string_set_x (buf, len++, chr);
1237               break;
1238             }
1239         }
1240       else
1241         scm_i_string_set_x (buf, len++, chr);
1242
1243       if (len >= scm_i_string_length (buf) - 2)
1244         {
1245           SCM addy;
1246
1247           scm_i_string_stop_writing ();
1248           addy = scm_i_make_string (1024, NULL, 0);
1249           buf = scm_string_append (scm_list_2 (buf, addy));
1250           len = 0;
1251           buf = scm_i_string_start_writing (buf);
1252         }
1253     }
1254
1255  done:
1256   scm_i_string_stop_writing ();
1257   if (chr == EOF)
1258     scm_i_input_error ("scm_read_extended_symbol", port,
1259                        "end of file while reading symbol", SCM_EOL);
1260
1261   return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1262 }
1263
1264
1265 \f
1266 /* Top-level token readers, i.e., dispatchers.  */
1267
1268 static SCM
1269 scm_read_sharp_extension (int chr, SCM port)
1270 {
1271   SCM proc;
1272
1273   proc = scm_get_hash_procedure (chr);
1274   if (scm_is_true (scm_procedure_p (proc)))
1275     {
1276       long line = SCM_LINUM (port);
1277       int column = SCM_COL (port) - 2;
1278       SCM got;
1279
1280       got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1281
1282       if (scm_is_pair (got) && !scm_i_has_source_properties (got))
1283         scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1284
1285       return got;
1286     }
1287
1288   return SCM_UNSPECIFIED;
1289 }
1290
1291 /* The reader for the sharp `#' character.  It basically dispatches reads
1292    among the above token readers.   */
1293 static SCM
1294 scm_read_sharp (scm_t_wchar chr, SCM port, long line, int column)
1295 #define FUNC_NAME "scm_lreadr"
1296 {
1297   SCM result;
1298
1299   chr = scm_getc (port);
1300
1301   result = scm_read_sharp_extension (chr, port);
1302   if (!scm_is_eq (result, SCM_UNSPECIFIED))
1303     return result;
1304
1305   switch (chr)
1306     {
1307     case '\\':
1308       return (scm_read_character (chr, port));
1309     case '(':
1310       return (scm_read_vector (chr, port, line, column));
1311     case 's':
1312     case 'u':
1313     case 'f':
1314     case 'c':
1315       /* This one may return either a boolean or an SRFI-4 vector.  */
1316       return (scm_read_srfi4_vector (chr, port, line, column));
1317     case 'v':
1318       return (scm_read_bytevector (chr, port, line, column));
1319     case '*':
1320       return (scm_read_guile_bit_vector (chr, port, line, column));
1321     case 't':
1322     case 'T':
1323     case 'F':
1324       return (scm_read_boolean (chr, port));
1325     case ':':
1326       return (scm_read_keyword (chr, port));
1327     case '0': case '1': case '2': case '3': case '4':
1328     case '5': case '6': case '7': case '8': case '9':
1329     case '@':
1330 #if SCM_ENABLE_DEPRECATED
1331       /* See below for 'i' and 'e'. */
1332     case 'a':
1333     case 'y':
1334     case 'h':
1335     case 'l':
1336 #endif
1337       return (scm_read_array (chr, port, line, column));
1338
1339     case 'i':
1340     case 'e':
1341 #if SCM_ENABLE_DEPRECATED
1342       {
1343         /* When next char is '(', it really is an old-style
1344            uniform array. */
1345         scm_t_wchar next_c = scm_getc (port);
1346         if (next_c != EOF)
1347           scm_ungetc (next_c, port);
1348         if (next_c == '(')
1349           return scm_read_array (chr, port, line, column);
1350         /* Fall through. */
1351       }
1352 #endif
1353     case 'b':
1354     case 'B':
1355     case 'o':
1356     case 'O':
1357     case 'd':
1358     case 'D':
1359     case 'x':
1360     case 'X':
1361     case 'I':
1362     case 'E':
1363       return (scm_read_number_and_radix (chr, port));
1364     case '{':
1365       return (scm_read_extended_symbol (chr, port));
1366     case '!':
1367       return (scm_read_shebang (chr, port));
1368     case ';':
1369       return (scm_read_commented_expression (chr, port));
1370     case '`':
1371     case '\'':
1372     case ',':
1373       return (scm_read_syntax (chr, port));
1374     case 'n':
1375       return (scm_read_nil (chr, port));
1376     default:
1377       result = scm_read_sharp_extension (chr, port);
1378       if (scm_is_eq (result, SCM_UNSPECIFIED))
1379         {
1380           /* To remain compatible with 1.8 and earlier, the following
1381              characters have lower precedence than `read-hash-extend'
1382              characters.  */
1383           switch (chr)
1384             {
1385             case '|':
1386               return scm_read_r6rs_block_comment (chr, port);
1387             default:
1388               scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1389                                  scm_list_1 (SCM_MAKE_CHAR (chr)));
1390             }
1391         }
1392       else
1393         return result;
1394     }
1395
1396   return SCM_UNSPECIFIED;
1397 }
1398 #undef FUNC_NAME
1399
1400 static SCM
1401 scm_read_expression (SCM port)
1402 #define FUNC_NAME "scm_read_expression"
1403 {
1404   while (1)
1405     {
1406       scm_t_wchar chr;
1407
1408       chr = scm_getc (port);
1409
1410       switch (chr)
1411         {
1412         case SCM_WHITE_SPACES:
1413         case SCM_LINE_INCREMENTORS:
1414           break;
1415         case ';':
1416           (void) scm_read_semicolon_comment (chr, port);
1417           break;
1418         case '[':
1419           if (!SCM_SQUARE_BRACKETS_P)
1420             return (scm_read_mixed_case_symbol (chr, port));
1421           /* otherwise fall through */
1422         case '(':
1423           return (scm_read_sexp (chr, port));
1424         case '"':
1425           return (scm_read_string (chr, port));
1426         case '\'':
1427         case '`':
1428         case ',':
1429           return (scm_read_quote (chr, port));
1430         case '#':
1431           {
1432             long line  = SCM_LINUM (port);
1433             int column = SCM_COL (port) - 1;
1434             SCM result = scm_read_sharp (chr, port, line, column);
1435             if (scm_is_eq (result, SCM_UNSPECIFIED))
1436               /* We read a comment or some such.  */
1437               break;
1438             else
1439               return result;
1440           }
1441         case ')':
1442           scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1443           break;
1444         case ']':
1445           if (SCM_SQUARE_BRACKETS_P)
1446             scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1447           /* otherwise fall through */
1448         case EOF:
1449           return SCM_EOF_VAL;
1450         case ':':
1451           if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1452             return scm_symbol_to_keyword (scm_read_expression (port));
1453           /* Fall through.  */
1454
1455         default:
1456           {
1457             if (((chr >= '0') && (chr <= '9'))
1458                 || (strchr ("+-.", chr)))
1459               return (scm_read_number (chr, port));
1460             else
1461               return (scm_read_mixed_case_symbol (chr, port));
1462           }
1463         }
1464     }
1465 }
1466 #undef FUNC_NAME
1467
1468 \f
1469 /* Actual reader.  */
1470
1471 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1472             (SCM port),
1473             "Read an s-expression from the input port @var{port}, or from\n"
1474             "the current input port if @var{port} is not specified.\n"
1475             "Any whitespace before the next token is discarded.")
1476 #define FUNC_NAME s_scm_read
1477 {
1478   int c;
1479
1480   if (SCM_UNBNDP (port))
1481     port = scm_current_input_port ();
1482   SCM_VALIDATE_OPINPORT (1, port);
1483
1484   c = flush_ws (port, (char *) NULL);
1485   if (EOF == c)
1486     return SCM_EOF_VAL;
1487   scm_ungetc (c, port);
1488
1489   return (scm_read_expression (port));
1490 }
1491 #undef FUNC_NAME
1492
1493
1494 \f
1495
1496 /* Manipulate the read-hash-procedures alist.  This could be written in
1497    Scheme, but maybe it will also be used by C code during initialisation.  */
1498 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1499             (SCM chr, SCM proc),
1500             "Install the procedure @var{proc} for reading expressions\n"
1501             "starting with the character sequence @code{#} and @var{chr}.\n"
1502             "@var{proc} will be called with two arguments:  the character\n"
1503             "@var{chr} and the port to read further data from. The object\n"
1504             "returned will be the return value of @code{read}. \n"
1505             "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1506             )
1507 #define FUNC_NAME s_scm_read_hash_extend
1508 {
1509   SCM this;
1510   SCM prev;
1511
1512   SCM_VALIDATE_CHAR (1, chr);
1513   SCM_ASSERT (scm_is_false (proc)
1514               || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1515               proc, SCM_ARG2, FUNC_NAME);
1516
1517   /* Check if chr is already in the alist.  */
1518   this = scm_i_read_hash_procedures_ref ();
1519   prev = SCM_BOOL_F;
1520   while (1)
1521     {
1522       if (scm_is_null (this))
1523         {
1524           /* not found, so add it to the beginning.  */
1525           if (scm_is_true (proc))
1526             {
1527               SCM new = scm_cons (scm_cons (chr, proc),
1528                                   scm_i_read_hash_procedures_ref ());
1529               scm_i_read_hash_procedures_set_x (new);
1530             }
1531           break;
1532         }
1533       if (scm_is_eq (chr, SCM_CAAR (this)))
1534         {
1535           /* already in the alist.  */
1536           if (scm_is_false (proc))
1537             {
1538               /* remove it.  */
1539               if (scm_is_false (prev))
1540                 {
1541                   SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1542                   scm_i_read_hash_procedures_set_x (rest);
1543                 }
1544               else
1545                 scm_set_cdr_x (prev, SCM_CDR (this));
1546             }
1547           else
1548             {
1549               /* replace it.  */
1550               scm_set_cdr_x (SCM_CAR (this), proc);
1551             }
1552           break;
1553         }
1554       prev = this;
1555       this = SCM_CDR (this);
1556     }
1557
1558   return SCM_UNSPECIFIED;
1559 }
1560 #undef FUNC_NAME
1561
1562 /* Recover the read-hash procedure corresponding to char c.  */
1563 static SCM
1564 scm_get_hash_procedure (int c)
1565 {
1566   SCM rest = scm_i_read_hash_procedures_ref ();
1567
1568   while (1)
1569     {
1570       if (scm_is_null (rest))
1571         return SCM_BOOL_F;
1572
1573       if (SCM_CHAR (SCM_CAAR (rest)) == c)
1574         return SCM_CDAR (rest);
1575
1576       rest = SCM_CDR (rest);
1577     }
1578 }
1579
1580 #define SCM_ENCODING_SEARCH_SIZE (500)
1581
1582 /* Search the first few hundred characters of a file for an Emacs-like coding
1583    declaration.  Returns either NULL or a string whose storage has been
1584    allocated with `scm_gc_malloc ()'.  */
1585 char *
1586 scm_i_scan_for_encoding (SCM port)
1587 {
1588   scm_t_port *pt;
1589   char header[SCM_ENCODING_SEARCH_SIZE+1];
1590   size_t bytes_read, encoding_length, i;
1591   char *encoding = NULL;
1592   int utf8_bom = 0;
1593   char *pos, *encoding_start;
1594   int in_comment;
1595
1596   pt = SCM_PTAB_ENTRY (port);
1597
1598   if (pt->rw_active == SCM_PORT_WRITE)
1599     scm_flush (port);
1600
1601   if (pt->rw_random)
1602     pt->rw_active = SCM_PORT_READ;
1603
1604   if (pt->read_pos == pt->read_end)
1605     {
1606       /* We can use the read buffer, and thus avoid a seek. */
1607       if (scm_fill_input (port) == EOF)
1608         return NULL;
1609
1610       bytes_read = pt->read_end - pt->read_pos;
1611       if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
1612         bytes_read = SCM_ENCODING_SEARCH_SIZE;
1613
1614       if (bytes_read <= 1)
1615         /* An unbuffered port -- don't scan.  */
1616         return NULL;
1617
1618       memcpy (header, pt->read_pos, bytes_read);
1619       header[bytes_read] = '\0';
1620     }
1621   else
1622     {
1623       /* Try to read some bytes and then seek back.  Not all ports
1624          support seeking back; and indeed some file ports (like
1625          /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
1626          check performed by SCM_FPORT_FDES---but fail to seek
1627          backwards.  Hence this block comes second.  We prefer to use
1628          the read buffer in-place.  */
1629       if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1630         return NULL;
1631
1632       bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1633       header[bytes_read] = '\0';
1634       scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1635     }
1636
1637   if (bytes_read > 3
1638       && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1639     utf8_bom = 1;
1640
1641   /* search past "coding[:=]" */
1642   pos = header;
1643   while (1)
1644     {
1645       if ((pos = strstr(pos, "coding")) == NULL)
1646         return NULL;
1647
1648       pos += strlen("coding");
1649       if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1650           (*pos == ':' || *pos == '='))
1651         {
1652           pos ++;
1653           break;
1654         }
1655     }
1656
1657   /* skip spaces */
1658   while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1659          (*pos == ' ' || *pos == '\t'))
1660     pos ++;
1661
1662   /* grab the next token */
1663   encoding_start = pos;
1664   i = 0;
1665   while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
1666          && encoding_start + i - header < bytes_read
1667          && (isalnum ((int) encoding_start[i])
1668              || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
1669     i++;
1670
1671   encoding_length = i;
1672   if (encoding_length == 0)
1673     return NULL;
1674
1675   encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
1676   for (i = 0; i < encoding_length; i++)
1677     encoding[i] = toupper ((int) encoding[i]);
1678
1679   /* push backwards to make sure we were in a comment */
1680   in_comment = 0;
1681   pos = encoding_start;
1682   while (pos >= header)
1683     {
1684       if (*pos == ';')
1685         {
1686           in_comment = 1;
1687           break;
1688         }
1689       else if (*pos == '\n' || pos == header)
1690         {
1691           /* This wasn't in a semicolon comment. Check for a
1692            hash-bang comment. */
1693           char *beg = strstr (header, "#!");
1694           char *end = strstr (header, "!#");
1695           if (beg < encoding_start && encoding_start + encoding_length <= end)
1696             in_comment = 1;
1697           break;
1698         }
1699       else
1700         {
1701           pos --;
1702           continue;
1703         }
1704     }
1705   if (!in_comment)
1706     /* This wasn't in a comment */
1707     return NULL;
1708
1709   if (utf8_bom && strcmp(encoding, "UTF-8"))
1710     scm_misc_error (NULL,
1711                     "the port input declares the encoding ~s but is encoded as UTF-8",
1712                     scm_list_1 (scm_from_locale_string (encoding)));
1713
1714   return encoding;
1715 }
1716
1717 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1718             (SCM port),
1719             "Scans the port for an Emacs-like character coding declaration\n"
1720             "near the top of the contents of a port with random-accessible contents.\n"
1721             "The coding declaration is of the form\n"
1722             "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1723             "\n"
1724             "Returns a string containing the character encoding of the file\n"
1725             "if a declaration was found, or @code{#f} otherwise.\n")
1726 #define FUNC_NAME s_scm_file_encoding
1727 {
1728   char *enc;
1729   SCM s_enc;
1730
1731   SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
1732
1733   enc = scm_i_scan_for_encoding (port);
1734   if (enc == NULL)
1735     return SCM_BOOL_F;
1736   else
1737     {
1738       s_enc = scm_from_locale_string (enc);
1739       return s_enc;
1740     }
1741
1742   return SCM_BOOL_F;
1743 }
1744 #undef FUNC_NAME
1745
1746 void
1747 scm_init_read ()
1748 {
1749   SCM read_hash_procs;
1750
1751   read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
1752
1753   scm_i_read_hash_procedures =
1754     SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
1755
1756   scm_init_opts (scm_read_options, scm_read_opts);
1757 #include "libguile/read.x"
1758 }
1759
1760 /*
1761   Local Variables:
1762   c-file-style: "gnu"
1763   End:
1764 */