libguile/read.c

   1 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
   2  *   2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public License
   6  * as published by the Free Software Foundation; either version 3 of
   7  * the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful, but
  10  * WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  17  * 02110-1301 USA
  18  */
  19
  20
  21 \f
  22
  23 #ifdef HAVE_CONFIG_H
  24 # include <config.h>
  25 #endif
  26
  27 #include <stdio.h>
  28 #include <ctype.h>
  29 #include <string.h>
  30 #include <unistd.h>
  31 #include <unicase.h>
  32 #include <unictype.h>
  33
  34 #include "libguile/_scm.h"
  35 #include "libguile/bytevectors.h"
  36 #include "libguile/chars.h"
  37 #include "libguile/eval.h"
  38 #include "libguile/arrays.h"
  39 #include "libguile/bitvectors.h"
  40 #include "libguile/keywords.h"
  41 #include "libguile/alist.h"
  42 #include "libguile/srcprop.h"
  43 #include "libguile/hashtab.h"
  44 #include "libguile/hash.h"
  45 #include "libguile/ports.h"
  46 #include "libguile/fports.h"
  47 #include "libguile/root.h"
  48 #include "libguile/strings.h"
  49 #include "libguile/strports.h"
  50 #include "libguile/vectors.h"
  51 #include "libguile/validate.h"
  52 #include "libguile/srfi-4.h"
  53 #include "libguile/srfi-13.h"
  54
  55 #include "libguile/read.h"
  56 #include "libguile/private-options.h"
  57
  58
  59 \f
  60
  61 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
  62 SCM_SYMBOL (scm_keyword_prefix, "prefix");
  63 SCM_SYMBOL (scm_keyword_postfix, "postfix");
  64 SCM_SYMBOL (sym_nil, "nil");
  65
  66 scm_t_option scm_read_opts[] = {
  67   { SCM_OPTION_BOOLEAN, "copy", 0,
  68     "Copy source code expressions." },
  69   { SCM_OPTION_BOOLEAN, "positions", 1,
  70     "Record positions of source code expressions." },
  71   { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
  72     "Convert symbols to lower case."},
  73   { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
  74     "Style of keyword recognition: #f, 'prefix or 'postfix."},
  75   { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
  76     "Use R6RS variable-length character and string hex escapes."},
  77   { SCM_OPTION_BOOLEAN, "square-brackets", 1,
  78     "Treat `[' and `]' as parentheses, for R6RS compatibility."},
  79   { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
  80     "In strings, consume leading whitespace after an escaped end-of-line."},
  81   { 0, },
  82 };
  83
  84 /*
  85   Give meaningful error messages for errors
  86
  87   We use the format
  88
  89   FILE:LINE:COL: MESSAGE
  90   This happened in ....
  91
  92   This is not standard GNU format, but the test-suite likes the real
  93   message to be in front.
  94
  95  */
  96
  97
  98 void
  99 scm_i_input_error (char const *function,
 100                    SCM port, const char *message, SCM arg)
 101 {
 102   SCM fn = (scm_is_string (SCM_FILENAME(port))
 103             ? SCM_FILENAME(port)
 104             : scm_from_locale_string ("#<unknown port>"));
 105
 106   SCM string_port = scm_open_output_string ();
 107   SCM string = SCM_EOL;
 108   scm_simple_format (string_port,
 109                      scm_from_locale_string ("~A:~S:~S: ~A"),
 110                      scm_list_4 (fn,
 111                                  scm_from_long (SCM_LINUM (port) + 1),
 112                                  scm_from_int (SCM_COL (port) + 1),
 113                                  scm_from_locale_string (message)));
 114
 115   string = scm_get_output_string (string_port);
 116   scm_close_output_port (string_port);
 117   scm_error_scm (scm_from_latin1_symbol ("read-error"),
 118                  function? scm_from_locale_string (function) : SCM_BOOL_F,
 119                  string,
 120                  arg,
 121                  SCM_BOOL_F);
 122 }
 123
 124
 125 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
 126             (SCM setting),
 127             "Option interface for the read options. Instead of using\n"
 128             "this procedure directly, use the procedures @code{read-enable},\n"
 129             "@code{read-disable}, @code{read-set!} and @code{read-options}.")
 130 #define FUNC_NAME s_scm_read_options
 131 {
 132   SCM ans = scm_options (setting,
 133                          scm_read_opts,
 134                          FUNC_NAME);
 135   if (SCM_COPY_SOURCE_P)
 136     SCM_RECORD_POSITIONS_P = 1;
 137   return ans;
 138 }
 139 #undef FUNC_NAME
 140
 141 /* A fluid referring to an association list mapping extra hash
 142    characters to procedures.  */
 143 static SCM *scm_i_read_hash_procedures;
 144
 145 static SCM
 146 scm_i_read_hash_procedures_ref (void)
 147 {
 148   return scm_fluid_ref (*scm_i_read_hash_procedures);
 149 }
 150
 151 static void
 152 scm_i_read_hash_procedures_set_x (SCM value)
 153 {
 154   scm_fluid_set_x (*scm_i_read_hash_procedures, value);
 155 }
 156
 157 \f
 158 /* Token readers.  */
 159
 160
 161 /* Size of the C buffer used to read symbols and numbers.  */
 162 #define READER_BUFFER_SIZE            128
 163
 164 /* Size of the C buffer used to read strings.  */
 165 #define READER_STRING_BUFFER_SIZE     512
 166
 167 /* The maximum size of Scheme character names.  */
 168 #define READER_CHAR_NAME_MAX_SIZE      50
 169
 170
 171 /* `isblank' is only in C99.  */
 172 #define CHAR_IS_BLANK_(_chr)                                    \
 173   (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n')      \
 174    || ((_chr) == '\f') || ((_chr) == '\r'))
 175
 176 #ifdef MSDOS
 177 # define CHAR_IS_BLANK(_chr)                    \
 178   ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
 179 #else
 180 # define CHAR_IS_BLANK CHAR_IS_BLANK_
 181 #endif
 182
 183
 184 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
 185    structure'').  */
 186 #define CHAR_IS_R5RS_DELIMITER(c)                               \
 187   (CHAR_IS_BLANK (c)                                            \
 188    || (c == ')') || (c == '(') || (c == ';') || (c == '"')      \
 189    || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
 190
 191 #define CHAR_IS_DELIMITER  CHAR_IS_R5RS_DELIMITER
 192
 193 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
 194    Structure''.  */
 195 #define CHAR_IS_EXPONENT_MARKER(_chr)                           \
 196   (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f')        \
 197    || ((_chr) == 'd') || ((_chr) == 'l'))
 198
 199 /* Read an SCSH block comment.  */
 200 static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
 201 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
 202 static SCM scm_read_commented_expression (scm_t_wchar, SCM);
 203 static SCM scm_read_shebang (scm_t_wchar, SCM);
 204 static SCM scm_get_hash_procedure (int);
 205
 206 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 207    result in the pre-allocated buffer BUF.  Return zero if the whole token has
 208    fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
 209    bytes actually read.  */
 210 static int
 211 read_token (SCM port, char *buf, const size_t buf_size, size_t *read)
 212  {
 213    *read = 0;
 214
 215    while (*read < buf_size)
 216      {
 217        int chr;
 218
 219        chr = scm_get_byte_or_eof (port);
 220
 221        if (chr == EOF)
 222         return 0;
 223       else if (CHAR_IS_DELIMITER (chr))
 224         {
 225           scm_unget_byte (chr, port);
 226           return 0;
 227         }
 228       else
 229         {
 230           *buf = (char) chr;
 231           buf++, (*read)++;
 232         }
 233      }
 234
 235    return 1;
 236  }
 237
 238 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 239    result in the pre-allocated buffer BUFFER, if the whole token has fewer than
 240    BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
 241    caller.  Return zero if the token fits in BUFFER, non-zero otherwise. READ
 242    will be set the number of bytes actually read.  */
 243 static int
 244 read_complete_token (SCM port, char *buffer, const size_t buffer_size,
 245                            char **overflow_buffer, size_t *read)
 246 {
 247   int overflow = 0;
 248   size_t bytes_read, overflow_size;
 249
 250   *overflow_buffer = NULL;
 251   overflow_size = 0;
 252
 253   do
 254     {
 255       overflow = read_token (port, buffer, buffer_size, &bytes_read);
 256       if (bytes_read == 0)
 257         break;
 258       if (overflow || overflow_size != 0)
 259         {
 260           if (overflow_size == 0)
 261             {
 262               *overflow_buffer = scm_malloc (bytes_read);
 263               memcpy (*overflow_buffer, buffer, bytes_read);
 264               overflow_size = bytes_read;
 265             }
 266           else
 267             {
 268               *overflow_buffer = scm_realloc (*overflow_buffer, overflow_size + bytes_read);
 269               memcpy (*overflow_buffer + overflow_size, buffer, bytes_read);
 270               overflow_size += bytes_read;
 271             }
 272         }
 273     }
 274   while (overflow);
 275
 276   if (overflow_size)
 277     *read = overflow_size;
 278   else
 279     *read = bytes_read;
 280
 281   return (overflow_size != 0);
 282 }
 283
 284 /* Skip whitespace from PORT and return the first non-whitespace character
 285    read.  Raise an error on end-of-file.  */
 286 static int
 287 flush_ws (SCM port, const char *eoferr)
 288 {
 289   scm_t_wchar c;
 290   while (1)
 291     switch (c = scm_getc (port))
 292       {
 293       case EOF:
 294       goteof:
 295         if (eoferr)
 296           {
 297             scm_i_input_error (eoferr,
 298                                port,
 299                                "end of file",
 300                                SCM_EOL);
 301           }
 302         return c;
 303
 304       case ';':
 305       lp:
 306         switch (c = scm_getc (port))
 307           {
 308           case EOF:
 309             goto goteof;
 310           default:
 311             goto lp;
 312           case SCM_LINE_INCREMENTORS:
 313             break;
 314           }
 315         break;
 316
 317       case '#':
 318         switch (c = scm_getc (port))
 319           {
 320           case EOF:
 321             eoferr = "read_sharp";
 322             goto goteof;
 323           case '!':
 324             scm_read_shebang (c, port);
 325             break;
 326           case ';':
 327             scm_read_commented_expression (c, port);
 328             break;
 329           case '|':
 330             if (scm_is_false (scm_get_hash_procedure (c)))
 331               {
 332                 scm_read_r6rs_block_comment (c, port);
 333                 break;
 334               }
 335             /* fall through */
 336           default:
 337             scm_ungetc (c, port);
 338             return '#';
 339           }
 340         break;
 341
 342       case SCM_LINE_INCREMENTORS:
 343       case SCM_SINGLE_SPACES:
 344       case '\t':
 345         break;
 346
 347       default:
 348         return c;
 349       }
 350
 351   return 0;
 352 }
 353
 354
 355 \f
 356 /* Token readers.  */
 357
 358 static SCM scm_read_expression (SCM port);
 359 static SCM scm_read_sharp (int chr, SCM port, long line, int column);
 360
 361
 362 static SCM
 363 maybe_annotate_source (SCM x, SCM port, long line, int column)
 364 {
 365   if (SCM_RECORD_POSITIONS_P)
 366     scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
 367   return x;
 368 }
 369
 370 static SCM
 371 scm_read_sexp (scm_t_wchar chr, SCM port)
 372 #define FUNC_NAME "scm_i_lreadparen"
 373 {
 374   int c;
 375   SCM tmp, tl, ans = SCM_EOL;
 376   const int terminating_char = ((chr == '[') ? ']' : ')');
 377
 378   /* Need to capture line and column numbers here. */
 379   long line = SCM_LINUM (port);
 380   int column = SCM_COL (port) - 1;
 381
 382   c = flush_ws (port, FUNC_NAME);
 383   if (terminating_char == c)
 384     return SCM_EOL;
 385
 386   scm_ungetc (c, port);
 387   tmp = scm_read_expression (port);
 388
 389   /* Note that it is possible for scm_read_expression to return
 390      scm_sym_dot, but not as part of a dotted pair: as in #{.}#.  So
 391      check that it's a real dot by checking `c'.  */
 392   if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
 393     {
 394       ans = scm_read_expression (port);
 395       if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
 396         scm_i_input_error (FUNC_NAME, port, "missing close paren",
 397                            SCM_EOL);
 398       return ans;
 399     }
 400
 401   /* Build the head of the list structure. */
 402   ans = tl = scm_cons (tmp, SCM_EOL);
 403
 404   while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
 405     {
 406       SCM new_tail;
 407
 408       if (c == ')' || (SCM_SQUARE_BRACKETS_P && c == ']'))
 409         scm_i_input_error (FUNC_NAME, port,
 410                            "in pair: mismatched close paren: ~A",
 411                            scm_list_1 (SCM_MAKE_CHAR (c)));
 412
 413       scm_ungetc (c, port);
 414       tmp = scm_read_expression (port);
 415
 416       /* See above note about scm_sym_dot.  */
 417       if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
 418         {
 419           SCM_SETCDR (tl, scm_read_expression (port));
 420
 421           c = flush_ws (port, FUNC_NAME);
 422           if (terminating_char != c)
 423             scm_i_input_error (FUNC_NAME, port,
 424                                "in pair: missing close paren", SCM_EOL);
 425           goto exit;
 426         }
 427
 428       new_tail = scm_cons (tmp, SCM_EOL);
 429       SCM_SETCDR (tl, new_tail);
 430       tl = new_tail;
 431     }
 432
 433  exit:
 434   return maybe_annotate_source (ans, port, line, column);
 435 }
 436 #undef FUNC_NAME
 437
 438
 439 /* Read a hexadecimal number NDIGITS in length.  Put its value into the variable
 440    C.  If TERMINATOR is non-null, terminate early if the TERMINATOR character is
 441    found.  */
 442 #define SCM_READ_HEX_ESCAPE(ndigits, terminator)                   \
 443   do                                                               \
 444     {                                                              \
 445       scm_t_wchar a;                                               \
 446       size_t i = 0;                                                \
 447       c = 0;                                                       \
 448       while (i < ndigits)                                          \
 449         {                                                          \
 450           a = scm_getc (port);                                     \
 451           if (a == EOF)                                            \
 452             goto str_eof;                                          \
 453           if (terminator                                           \
 454               && (a == (scm_t_wchar) terminator)                   \
 455               && (i > 0))                                          \
 456             break;                                                 \
 457           if ('0' <= a && a <= '9')                                \
 458             a -= '0';                                              \
 459           else if ('A' <= a && a <= 'F')                           \
 460             a = a - 'A' + 10;                                      \
 461           else if ('a' <= a && a <= 'f')                           \
 462             a = a - 'a' + 10;                                      \
 463           else                                                     \
 464             {                                                      \
 465               c = a;                                               \
 466               goto bad_escaped;                                    \
 467             }                                                      \
 468           c = c * 16 + a;                                          \
 469           i ++;                                                    \
 470         }                                                          \
 471     } while (0)
 472
 473 static void
 474 skip_intraline_whitespace (SCM port)
 475 {
 476   scm_t_wchar c;
 477
 478   do
 479     {
 480       c = scm_getc (port);
 481       if (c == EOF)
 482         return;
 483     }
 484   while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
 485
 486   scm_ungetc (c, port);
 487 }
 488
 489 static SCM
 490 scm_read_string (int chr, SCM port)
 491 #define FUNC_NAME "scm_lreadr"
 492 {
 493   /* For strings smaller than C_STR, this function creates only one Scheme
 494      object (the string returned).  */
 495
 496   SCM str = SCM_BOOL_F;
 497   unsigned c_str_len = 0;
 498   scm_t_wchar c;
 499
 500   /* Need to capture line and column numbers here. */
 501   long line = SCM_LINUM (port);
 502   int column = SCM_COL (port) - 1;
 503
 504   str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
 505   while ('"' != (c = scm_getc (port)))
 506     {
 507       if (c == EOF)
 508         {
 509         str_eof:
 510           scm_i_input_error (FUNC_NAME, port,
 511                              "end of file in string constant", SCM_EOL);
 512         }
 513
 514       if (c_str_len + 1 >= scm_i_string_length (str))
 515         {
 516           SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
 517
 518           str = scm_string_append (scm_list_2 (str, addy));
 519         }
 520
 521       if (c == '\\')
 522         {
 523           switch (c = scm_getc (port))
 524             {
 525             case EOF:
 526               goto str_eof;
 527             case '"':
 528             case '\\':
 529               break;
 530             case '\n':
 531               if (SCM_HUNGRY_EOL_ESCAPES_P)
 532                 skip_intraline_whitespace (port);
 533               continue;
 534             case '0':
 535               c = '\0';
 536               break;
 537             case 'f':
 538               c = '\f';
 539               break;
 540             case 'n':
 541               c = '\n';
 542               break;
 543             case 'r':
 544               c = '\r';
 545               break;
 546             case 't':
 547               c = '\t';
 548               break;
 549             case 'a':
 550               c = '\007';
 551               break;
 552             case 'v':
 553               c = '\v';
 554               break;
 555             case 'b':
 556               c = '\010';
 557               break;
 558             case 'x':
 559               if (SCM_R6RS_ESCAPES_P)
 560                 SCM_READ_HEX_ESCAPE (10, ';');
 561               else
 562                 SCM_READ_HEX_ESCAPE (2, '\0');
 563               break;
 564             case 'u':
 565               if (!SCM_R6RS_ESCAPES_P)
 566                 {
 567                   SCM_READ_HEX_ESCAPE (4, '\0');
 568                   break;
 569                 }
 570             case 'U':
 571               if (!SCM_R6RS_ESCAPES_P)
 572                 {
 573                   SCM_READ_HEX_ESCAPE (6, '\0');
 574                   break;
 575                 }
 576             default:
 577             bad_escaped:
 578               scm_i_input_error (FUNC_NAME, port,
 579                                  "illegal character in escape sequence: ~S",
 580                                  scm_list_1 (SCM_MAKE_CHAR (c)));
 581             }
 582         }
 583       str = scm_i_string_start_writing (str);
 584       scm_i_string_set_x (str, c_str_len++, c);
 585       scm_i_string_stop_writing ();
 586     }
 587   return maybe_annotate_source (scm_i_substring_copy (str, 0, c_str_len),
 588                                 port, line, column);
 589 }
 590 #undef FUNC_NAME
 591
 592
 593 static SCM
 594 scm_read_number (scm_t_wchar chr, SCM port)
 595 {
 596   SCM result, str = SCM_EOL;
 597   char buffer[READER_BUFFER_SIZE];
 598   char *overflow_buffer = NULL;
 599   size_t bytes_read;
 600   int overflow;
 601   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 602
 603   /* Need to capture line and column numbers here. */
 604   long line = SCM_LINUM (port);
 605   int column = SCM_COL (port) - 1;
 606
 607   scm_ungetc (chr, port);
 608   overflow = read_complete_token (port, buffer, sizeof (buffer),
 609                                   &overflow_buffer, &bytes_read);
 610
 611   if (!overflow)
 612     str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
 613   else
 614     str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
 615                             pt->ilseq_handler);
 616
 617   result = scm_string_to_number (str, SCM_UNDEFINED);
 618   if (scm_is_false (result))
 619     {
 620       /* Return a symbol instead of a number */
 621       if (SCM_CASE_INSENSITIVE_P)
 622         str = scm_string_downcase_x (str);
 623       result = scm_string_to_symbol (str);
 624     }
 625   else if (SCM_NIMP (result))
 626     result = maybe_annotate_source (result, port, line, column);
 627
 628   if (overflow)
 629     free (overflow_buffer);
 630   SCM_COL (port) += scm_i_string_length (str);
 631   return result;
 632 }
 633
 634 static SCM
 635 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
 636 {
 637   SCM result;
 638   int ends_with_colon = 0;
 639   size_t bytes_read;
 640   int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
 641   int overflow;
 642   char buffer[READER_BUFFER_SIZE], *overflow_buffer;
 643   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 644   SCM str;
 645
 646   scm_ungetc (chr, port);
 647   overflow = read_complete_token (port, buffer, READER_BUFFER_SIZE,
 648                                   &overflow_buffer, &bytes_read);
 649   if (bytes_read > 0)
 650     {
 651       if (!overflow)
 652         ends_with_colon = buffer[bytes_read - 1] == ':';
 653       else
 654         ends_with_colon = overflow_buffer[bytes_read - 1] == ':';
 655     }
 656
 657   if (postfix && ends_with_colon && (bytes_read > 1))
 658     {
 659       if (!overflow)
 660         str = scm_from_stringn (buffer, bytes_read - 1, pt->encoding, pt->ilseq_handler);
 661       else
 662         str = scm_from_stringn (overflow_buffer, bytes_read - 1, pt->encoding,
 663                                 pt->ilseq_handler);
 664
 665       if (SCM_CASE_INSENSITIVE_P)
 666         str = scm_string_downcase_x (str);
 667       result = scm_symbol_to_keyword (scm_string_to_symbol (str));
 668     }
 669   else
 670     {
 671       if (!overflow)
 672         str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
 673       else
 674         str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
 675                                 pt->ilseq_handler);
 676
 677       if (SCM_CASE_INSENSITIVE_P)
 678         str = scm_string_downcase_x (str);
 679       result = scm_string_to_symbol (str);
 680     }
 681
 682   if (overflow)
 683     free (overflow_buffer);
 684   SCM_COL (port) += scm_i_string_length (str);
 685   return result;
 686 }
 687
 688 static SCM
 689 scm_read_number_and_radix (scm_t_wchar chr, SCM port)
 690 #define FUNC_NAME "scm_lreadr"
 691 {
 692   SCM result;
 693   size_t read;
 694   char buffer[READER_BUFFER_SIZE], *overflow_buffer;
 695   int overflow;
 696   unsigned int radix;
 697   SCM str;
 698   scm_t_port *pt;
 699
 700   switch (chr)
 701     {
 702     case 'B':
 703     case 'b':
 704       radix = 2;
 705       break;
 706
 707     case 'o':
 708     case 'O':
 709       radix = 8;
 710       break;
 711
 712     case 'd':
 713     case 'D':
 714       radix = 10;
 715       break;
 716
 717     case 'x':
 718     case 'X':
 719       radix = 16;
 720       break;
 721
 722     default:
 723       scm_ungetc (chr, port);
 724       scm_ungetc ('#', port);
 725       radix = 10;
 726     }
 727
 728   overflow = read_complete_token (port, buffer, sizeof (buffer),
 729                                   &overflow_buffer, &read);
 730
 731   pt = SCM_PTAB_ENTRY (port);
 732   if (!overflow)
 733     str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
 734   else
 735     str = scm_from_stringn (overflow_buffer, read, pt->encoding,
 736                             pt->ilseq_handler);
 737
 738   result = scm_string_to_number (str, scm_from_uint (radix));
 739
 740   if (overflow)
 741     free (overflow_buffer);
 742
 743   SCM_COL (port) += scm_i_string_length (str);
 744
 745   if (scm_is_true (result))
 746     return result;
 747
 748   scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
 749
 750   return SCM_BOOL_F;
 751 }
 752 #undef FUNC_NAME
 753
 754 static SCM
 755 scm_read_quote (int chr, SCM port)
 756 {
 757   SCM p;
 758   long line = SCM_LINUM (port);
 759   int column = SCM_COL (port) - 1;
 760
 761   switch (chr)
 762     {
 763     case '`':
 764       p = scm_sym_quasiquote;
 765       break;
 766
 767     case '\'':
 768       p = scm_sym_quote;
 769       break;
 770
 771     case ',':
 772       {
 773         scm_t_wchar c;
 774
 775         c = scm_getc (port);
 776         if ('@' == c)
 777           p = scm_sym_uq_splicing;
 778         else
 779           {
 780             scm_ungetc (c, port);
 781             p = scm_sym_unquote;
 782           }
 783         break;
 784       }
 785
 786     default:
 787       fprintf (stderr, "%s: unhandled quote character (%i)\n",
 788                "scm_read_quote", chr);
 789       abort ();
 790     }
 791
 792   p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
 793   return maybe_annotate_source (p, port, line, column);
 794 }
 795
 796 SCM_SYMBOL (sym_syntax, "syntax");
 797 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
 798 SCM_SYMBOL (sym_unsyntax, "unsyntax");
 799 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
 800
 801 static SCM
 802 scm_read_syntax (int chr, SCM port)
 803 {
 804   SCM p;
 805   long line = SCM_LINUM (port);
 806   int column = SCM_COL (port) - 1;
 807
 808   switch (chr)
 809     {
 810     case '`':
 811       p = sym_quasisyntax;
 812       break;
 813
 814     case '\'':
 815       p = sym_syntax;
 816       break;
 817
 818     case ',':
 819       {
 820         int c;
 821
 822         c = scm_getc (port);
 823         if ('@' == c)
 824           p = sym_unsyntax_splicing;
 825         else
 826           {
 827             scm_ungetc (c, port);
 828             p = sym_unsyntax;
 829           }
 830         break;
 831       }
 832
 833     default:
 834       fprintf (stderr, "%s: unhandled syntax character (%i)\n",
 835                "scm_read_syntax", chr);
 836       abort ();
 837     }
 838
 839   p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
 840   return maybe_annotate_source (p, port, line, column);
 841 }
 842
 843 static SCM
 844 scm_read_nil (int chr, SCM port)
 845 {
 846   SCM id = scm_read_mixed_case_symbol (chr, port);
 847
 848   if (!scm_is_eq (id, sym_nil))
 849     scm_i_input_error ("scm_read_nil", port,
 850                        "unexpected input while reading #nil: ~a",
 851                        scm_list_1 (id));
 852
 853   return SCM_ELISP_NIL;
 854 }
 855
 856 static SCM
 857 scm_read_semicolon_comment (int chr, SCM port)
 858 {
 859   int c;
 860
 861   /* We use the get_byte here because there is no need to get the
 862      locale correct with comment input. This presumes that newline
 863      always represents itself no matter what the encoding is.  */
 864   for (c = scm_get_byte_or_eof (port);
 865        (c != EOF) && (c != '\n');
 866        c = scm_get_byte_or_eof (port));
 867
 868   return SCM_UNSPECIFIED;
 869 }
 870
 871 \f
 872 /* Sharp readers, i.e. readers called after a `#' sign has been read.  */
 873
 874 static SCM
 875 scm_read_boolean (int chr, SCM port)
 876 {
 877   switch (chr)
 878     {
 879     case 't':
 880     case 'T':
 881       return SCM_BOOL_T;
 882
 883     case 'f':
 884     case 'F':
 885       return SCM_BOOL_F;
 886     }
 887
 888   return SCM_UNSPECIFIED;
 889 }
 890
 891 static SCM
 892 scm_read_character (scm_t_wchar chr, SCM port)
 893 #define FUNC_NAME "scm_lreadr"
 894 {
 895   char buffer[READER_CHAR_NAME_MAX_SIZE];
 896   SCM charname;
 897   size_t charname_len, bytes_read;
 898   scm_t_wchar cp;
 899   int overflow;
 900   scm_t_port *pt;
 901
 902   overflow = read_token (port, buffer, READER_CHAR_NAME_MAX_SIZE, &bytes_read);
 903   if (overflow)
 904     scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
 905
 906   if (bytes_read == 0)
 907     {
 908       chr = scm_getc (port);
 909       if (chr == EOF)
 910         scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
 911                            "while reading character", SCM_EOL);
 912
 913       /* CHR must be a token delimiter, like a whitespace.  */
 914       return (SCM_MAKE_CHAR (chr));
 915     }
 916
 917   pt = SCM_PTAB_ENTRY (port);
 918
 919   /* Simple ASCII characters can be processed immediately.  Also, simple
 920      ISO-8859-1 characters can be processed immediately if the encoding for this
 921      port is ISO-8859-1.  */
 922   if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
 923     {
 924       SCM_COL (port) += 1;
 925       return SCM_MAKE_CHAR (buffer[0]);
 926     }
 927
 928   /* Otherwise, convert the buffer into a proper scheme string for
 929      processing.  */
 930   charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
 931                                pt->ilseq_handler);
 932   charname_len = scm_i_string_length (charname);
 933   SCM_COL (port) += charname_len;
 934   cp = scm_i_string_ref (charname, 0);
 935   if (charname_len == 1)
 936     return SCM_MAKE_CHAR (cp);
 937
 938   /* Ignore dotted circles, which may be used to keep combining characters from
 939      combining with the backslash in #\charname.  */
 940   if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
 941     return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
 942
 943   if (cp >= '0' && cp < '8')
 944     {
 945       /* Dirk:FIXME::  This type of character syntax is not R5RS
 946        * compliant.  Further, it should be verified that the constant
 947        * does only consist of octal digits.  */
 948       SCM p = scm_string_to_number (charname, scm_from_uint (8));
 949       if (SCM_I_INUMP (p))
 950         {
 951           scm_t_wchar c = scm_to_uint32 (p);
 952           if (SCM_IS_UNICODE_CHAR (c))
 953             return SCM_MAKE_CHAR (c);
 954           else
 955             scm_i_input_error (FUNC_NAME, port,
 956                                "out-of-range octal character escape: ~a",
 957                                scm_list_1 (charname));
 958         }
 959     }
 960
 961   if (cp == 'x' && (charname_len > 1))
 962     {
 963       SCM p;
 964
 965       /* Convert from hex, skipping the initial 'x' character in CHARNAME */
 966       p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
 967                                 scm_from_uint (16));
 968       if (SCM_I_INUMP (p))
 969         {
 970           scm_t_wchar c = scm_to_uint32 (p);
 971           if (SCM_IS_UNICODE_CHAR (c))
 972             return SCM_MAKE_CHAR (c);
 973           else
 974             scm_i_input_error (FUNC_NAME, port,
 975                                "out-of-range hex character escape: ~a",
 976                                scm_list_1 (charname));
 977         }
 978     }
 979
 980   /* The names of characters should never have non-Latin1
 981      characters.  */
 982   if (scm_i_is_narrow_string (charname)
 983       || scm_i_try_narrow_string (charname))
 984     { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
 985                                        charname_len);
 986       if (scm_is_true (ch))
 987         return ch;
 988     }
 989
 990   scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
 991                      scm_list_1 (charname));
 992
 993   return SCM_UNSPECIFIED;
 994 }
 995 #undef FUNC_NAME
 996
 997 static SCM
 998 scm_read_keyword (int chr, SCM port)
 999 {
1000   SCM symbol;
1001
1002   /* Read the symbol that comprises the keyword.  Doing this instead of
1003      invoking a specific symbol reader function allows `scm_read_keyword ()'
1004      to adapt to the delimiters currently valid of symbols.
1005
1006      XXX: This implementation allows sloppy syntaxes like `#:  key'.  */
1007   symbol = scm_read_expression (port);
1008   if (!scm_is_symbol (symbol))
1009     scm_i_input_error ("scm_read_keyword", port,
1010                        "keyword prefix `~a' not followed by a symbol: ~s",
1011                        scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1012
1013   return (scm_symbol_to_keyword (symbol));
1014 }
1015
1016 static SCM
1017 scm_read_vector (int chr, SCM port, long line, int column)
1018 {
1019   /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1020      guarantee that it's going to do what we want.  After all, this is an
1021      implementation detail of `scm_read_vector ()', not a desirable
1022      property.  */
1023   return maybe_annotate_source (scm_vector (scm_read_sexp (chr, port)),
1024                                 port, line, column);
1025 }
1026
1027 static SCM
1028 scm_read_array (int chr, SCM port, long line, int column)
1029 {
1030   SCM result = scm_i_read_array (port, chr);
1031   if (scm_is_false (result))
1032     return result;
1033   else
1034     return maybe_annotate_source (result, port, line, column);
1035 }
1036
1037 static SCM
1038 scm_read_srfi4_vector (int chr, SCM port, long line, int column)
1039 {
1040   return scm_read_array (chr, port, line, column);
1041 }
1042
1043 static SCM
1044 scm_read_bytevector (scm_t_wchar chr, SCM port, long line, int column)
1045 {
1046   chr = scm_getc (port);
1047   if (chr != 'u')
1048     goto syntax;
1049
1050   chr = scm_getc (port);
1051   if (chr != '8')
1052     goto syntax;
1053
1054   chr = scm_getc (port);
1055   if (chr != '(')
1056     goto syntax;
1057
1058   return maybe_annotate_source
1059     (scm_u8_list_to_bytevector (scm_read_sexp (chr, port)),
1060      port, line, column);
1061
1062  syntax:
1063   scm_i_input_error ("read_bytevector", port,
1064                      "invalid bytevector prefix",
1065                      SCM_MAKE_CHAR (chr));
1066   return SCM_UNSPECIFIED;
1067 }
1068
1069 static SCM
1070 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port, long line, int column)
1071 {
1072   /* Read the `#*10101'-style read syntax for bit vectors in Guile.  This is
1073      terribly inefficient but who cares?  */
1074   SCM s_bits = SCM_EOL;
1075
1076   for (chr = scm_getc (port);
1077        (chr != EOF) && ((chr == '0') || (chr == '1'));
1078        chr = scm_getc (port))
1079     {
1080       s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1081     }
1082
1083   if (chr != EOF)
1084     scm_ungetc (chr, port);
1085
1086   return maybe_annotate_source
1087     (scm_bitvector (scm_reverse_x (s_bits, SCM_EOL)),
1088      port, line, column);
1089 }
1090
1091 static SCM
1092 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1093 {
1094   int bang_seen = 0;
1095
1096   for (;;)
1097     {
1098       int c = scm_getc (port);
1099
1100       if (c == EOF)
1101         scm_i_input_error ("skip_block_comment", port,
1102                            "unterminated `#! ... !#' comment", SCM_EOL);
1103
1104       if (c == '!')
1105         bang_seen = 1;
1106       else if (c == '#' && bang_seen)
1107         break;
1108       else
1109         bang_seen = 0;
1110     }
1111
1112   return SCM_UNSPECIFIED;
1113 }
1114
1115 static SCM
1116 scm_read_shebang (scm_t_wchar chr, SCM port)
1117 {
1118   int c = 0;
1119   if ((c = scm_get_byte_or_eof (port)) != 'r')
1120     {
1121       scm_ungetc (c, port);
1122       return scm_read_scsh_block_comment (chr, port);
1123     }
1124   if ((c = scm_get_byte_or_eof (port)) != '6')
1125     {
1126       scm_ungetc (c, port);
1127       scm_ungetc ('r', port);
1128       return scm_read_scsh_block_comment (chr, port);
1129     }
1130   if ((c = scm_get_byte_or_eof (port)) != 'r')
1131     {
1132       scm_ungetc (c, port);
1133       scm_ungetc ('6', port);
1134       scm_ungetc ('r', port);
1135       return scm_read_scsh_block_comment (chr, port);
1136     }
1137   if ((c = scm_get_byte_or_eof (port)) != 's')
1138     {
1139       scm_ungetc (c, port);
1140       scm_ungetc ('r', port);
1141       scm_ungetc ('6', port);
1142       scm_ungetc ('r', port);
1143       return scm_read_scsh_block_comment (chr, port);
1144     }
1145
1146   return SCM_UNSPECIFIED;
1147 }
1148
1149 static SCM
1150 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1151 {
1152   /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1153      nested.  So care must be taken.  */
1154   int nesting_level = 1;
1155
1156   int a = scm_getc (port);
1157
1158   if (a == EOF)
1159     scm_i_input_error ("scm_read_r6rs_block_comment", port,
1160                        "unterminated `#| ... |#' comment", SCM_EOL);
1161
1162   while (nesting_level > 0)
1163     {
1164       int b = scm_getc (port);
1165
1166       if (b == EOF)
1167         scm_i_input_error ("scm_read_r6rs_block_comment", port,
1168                            "unterminated `#| ... |#' comment", SCM_EOL);
1169
1170       if (a == '|' && b == '#')
1171         {
1172           nesting_level--;
1173           b = EOF;
1174         }
1175       else if (a == '#' && b == '|')
1176         {
1177           nesting_level++;
1178           b = EOF;
1179         }
1180
1181       a = b;
1182     }
1183
1184   return SCM_UNSPECIFIED;
1185 }
1186
1187 static SCM
1188 scm_read_commented_expression (scm_t_wchar chr, SCM port)
1189 {
1190   scm_t_wchar c;
1191
1192   c = flush_ws (port, (char *) NULL);
1193   if (EOF == c)
1194     scm_i_input_error ("read_commented_expression", port,
1195                        "no expression after #; comment", SCM_EOL);
1196   scm_ungetc (c, port);
1197   scm_read_expression (port);
1198   return SCM_UNSPECIFIED;
1199 }
1200
1201 static SCM
1202 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1203 {
1204   /* Guile's extended symbol read syntax looks like this:
1205
1206        #{This is all a symbol name}#
1207
1208      So here, CHR is expected to be `{'.  */
1209   int saw_brace = 0;
1210   size_t len = 0;
1211   SCM buf = scm_i_make_string (1024, NULL, 0);
1212
1213   buf = scm_i_string_start_writing (buf);
1214
1215   while ((chr = scm_getc (port)) != EOF)
1216     {
1217       if (saw_brace)
1218         {
1219           if (chr == '#')
1220             {
1221               break;
1222             }
1223           else
1224             {
1225               saw_brace = 0;
1226               scm_i_string_set_x (buf, len++, '}');
1227             }
1228         }
1229
1230       if (chr == '}')
1231         saw_brace = 1;
1232       else if (chr == '\\')
1233         {
1234           /* It used to be that print.c would print extended-read-syntax
1235              symbols with backslashes before "non-standard" chars, but
1236              this routine wouldn't do anything with those escapes.
1237              Bummer.  What we've done is to change print.c to output
1238              R6RS hex escapes for those characters, relying on the fact
1239              that the extended read syntax would never put a `\' before
1240              an `x'.  For now, we just ignore other instances of
1241              backslash in the string.  */
1242           switch ((chr = scm_getc (port)))
1243             {
1244             case EOF:
1245               goto done;
1246             case 'x':
1247               {
1248                 scm_t_wchar c;
1249
1250                 SCM_READ_HEX_ESCAPE (10, ';');
1251                 scm_i_string_set_x (buf, len++, c);
1252                 break;
1253
1254               str_eof:
1255                 chr = EOF;
1256                 goto done;
1257
1258               bad_escaped:
1259                 scm_i_string_stop_writing ();
1260                 scm_i_input_error ("scm_read_extended_symbol", port,
1261                                    "illegal character in escape sequence: ~S",
1262                                    scm_list_1 (SCM_MAKE_CHAR (c)));
1263                 break;
1264               }
1265             default:
1266               scm_i_string_set_x (buf, len++, chr);
1267               break;
1268             }
1269         }
1270       else
1271         scm_i_string_set_x (buf, len++, chr);
1272
1273       if (len >= scm_i_string_length (buf) - 2)
1274         {
1275           SCM addy;
1276
1277           scm_i_string_stop_writing ();
1278           addy = scm_i_make_string (1024, NULL, 0);
1279           buf = scm_string_append (scm_list_2 (buf, addy));
1280           len = 0;
1281           buf = scm_i_string_start_writing (buf);
1282         }
1283     }
1284
1285  done:
1286   scm_i_string_stop_writing ();
1287   if (chr == EOF)
1288     scm_i_input_error ("scm_read_extended_symbol", port,
1289                        "end of file while reading symbol", SCM_EOL);
1290
1291   return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1292 }
1293
1294
1295 \f
1296 /* Top-level token readers, i.e., dispatchers.  */
1297
1298 static SCM
1299 scm_read_sharp_extension (int chr, SCM port)
1300 {
1301   SCM proc;
1302
1303   proc = scm_get_hash_procedure (chr);
1304   if (scm_is_true (scm_procedure_p (proc)))
1305     {
1306       long line = SCM_LINUM (port);
1307       int column = SCM_COL (port) - 2;
1308       SCM got;
1309
1310       got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1311
1312       if (scm_is_pair (got) && !scm_i_has_source_properties (got))
1313         scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1314
1315       return got;
1316     }
1317
1318   return SCM_UNSPECIFIED;
1319 }
1320
1321 /* The reader for the sharp `#' character.  It basically dispatches reads
1322    among the above token readers.   */
1323 static SCM
1324 scm_read_sharp (scm_t_wchar chr, SCM port, long line, int column)
1325 #define FUNC_NAME "scm_lreadr"
1326 {
1327   SCM result;
1328
1329   chr = scm_getc (port);
1330
1331   result = scm_read_sharp_extension (chr, port);
1332   if (!scm_is_eq (result, SCM_UNSPECIFIED))
1333     return result;
1334
1335   switch (chr)
1336     {
1337     case '\\':
1338       return (scm_read_character (chr, port));
1339     case '(':
1340       return (scm_read_vector (chr, port, line, column));
1341     case 's':
1342     case 'u':
1343     case 'f':
1344     case 'c':
1345       /* This one may return either a boolean or an SRFI-4 vector.  */
1346       return (scm_read_srfi4_vector (chr, port, line, column));
1347     case 'v':
1348       return (scm_read_bytevector (chr, port, line, column));
1349     case '*':
1350       return (scm_read_guile_bit_vector (chr, port, line, column));
1351     case 't':
1352     case 'T':
1353     case 'F':
1354       return (scm_read_boolean (chr, port));
1355     case ':':
1356       return (scm_read_keyword (chr, port));
1357     case '0': case '1': case '2': case '3': case '4':
1358     case '5': case '6': case '7': case '8': case '9':
1359     case '@':
1360 #if SCM_ENABLE_DEPRECATED
1361       /* See below for 'i' and 'e'. */
1362     case 'a':
1363     case 'y':
1364     case 'h':
1365     case 'l':
1366 #endif
1367       return (scm_read_array (chr, port, line, column));
1368
1369     case 'i':
1370     case 'e':
1371 #if SCM_ENABLE_DEPRECATED
1372       {
1373         /* When next char is '(', it really is an old-style
1374            uniform array. */
1375         scm_t_wchar next_c = scm_getc (port);
1376         if (next_c != EOF)
1377           scm_ungetc (next_c, port);
1378         if (next_c == '(')
1379           return scm_read_array (chr, port, line, column);
1380         /* Fall through. */
1381       }
1382 #endif
1383     case 'b':
1384     case 'B':
1385     case 'o':
1386     case 'O':
1387     case 'd':
1388     case 'D':
1389     case 'x':
1390     case 'X':
1391     case 'I':
1392     case 'E':
1393       return (scm_read_number_and_radix (chr, port));
1394     case '{':
1395       return (scm_read_extended_symbol (chr, port));
1396     case '!':
1397       return (scm_read_shebang (chr, port));
1398     case ';':
1399       return (scm_read_commented_expression (chr, port));
1400     case '`':
1401     case '\'':
1402     case ',':
1403       return (scm_read_syntax (chr, port));
1404     case 'n':
1405       return (scm_read_nil (chr, port));
1406     default:
1407       result = scm_read_sharp_extension (chr, port);
1408       if (scm_is_eq (result, SCM_UNSPECIFIED))
1409         {
1410           /* To remain compatible with 1.8 and earlier, the following
1411              characters have lower precedence than `read-hash-extend'
1412              characters.  */
1413           switch (chr)
1414             {
1415             case '|':
1416               return scm_read_r6rs_block_comment (chr, port);
1417             default:
1418               scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1419                                  scm_list_1 (SCM_MAKE_CHAR (chr)));
1420             }
1421         }
1422       else
1423         return result;
1424     }
1425
1426   return SCM_UNSPECIFIED;
1427 }
1428 #undef FUNC_NAME
1429
1430 static SCM
1431 scm_read_expression (SCM port)
1432 #define FUNC_NAME "scm_read_expression"
1433 {
1434   while (1)
1435     {
1436       scm_t_wchar chr;
1437
1438       chr = scm_getc (port);
1439
1440       switch (chr)
1441         {
1442         case SCM_WHITE_SPACES:
1443         case SCM_LINE_INCREMENTORS:
1444           break;
1445         case ';':
1446           (void) scm_read_semicolon_comment (chr, port);
1447           break;
1448         case '[':
1449           if (!SCM_SQUARE_BRACKETS_P)
1450             return (scm_read_mixed_case_symbol (chr, port));
1451           /* otherwise fall through */
1452         case '(':
1453           return (scm_read_sexp (chr, port));
1454         case '"':
1455           return (scm_read_string (chr, port));
1456         case '\'':
1457         case '`':
1458         case ',':
1459           return (scm_read_quote (chr, port));
1460         case '#':
1461           {
1462             long line  = SCM_LINUM (port);
1463             int column = SCM_COL (port) - 1;
1464             SCM result = scm_read_sharp (chr, port, line, column);
1465             if (scm_is_eq (result, SCM_UNSPECIFIED))
1466               /* We read a comment or some such.  */
1467               break;
1468             else
1469               return result;
1470           }
1471         case ')':
1472           scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1473           break;
1474         case ']':
1475           if (SCM_SQUARE_BRACKETS_P)
1476             scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1477           /* otherwise fall through */
1478         case EOF:
1479           return SCM_EOF_VAL;
1480         case ':':
1481           if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1482             return scm_symbol_to_keyword (scm_read_expression (port));
1483           /* Fall through.  */
1484
1485         default:
1486           {
1487             if (((chr >= '0') && (chr <= '9'))
1488                 || (strchr ("+-.", chr)))
1489               return (scm_read_number (chr, port));
1490             else
1491               return (scm_read_mixed_case_symbol (chr, port));
1492           }
1493         }
1494     }
1495 }
1496 #undef FUNC_NAME
1497
1498 \f
1499 /* Actual reader.  */
1500
1501 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1502             (SCM port),
1503             "Read an s-expression from the input port @var{port}, or from\n"
1504             "the current input port if @var{port} is not specified.\n"
1505             "Any whitespace before the next token is discarded.")
1506 #define FUNC_NAME s_scm_read
1507 {
1508   int c;
1509
1510   if (SCM_UNBNDP (port))
1511     port = scm_current_input_port ();
1512   SCM_VALIDATE_OPINPORT (1, port);
1513
1514   c = flush_ws (port, (char *) NULL);
1515   if (EOF == c)
1516     return SCM_EOF_VAL;
1517   scm_ungetc (c, port);
1518
1519   return (scm_read_expression (port));
1520 }
1521 #undef FUNC_NAME
1522
1523
1524 \f
1525
1526 /* Manipulate the read-hash-procedures alist.  This could be written in
1527    Scheme, but maybe it will also be used by C code during initialisation.  */
1528 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1529             (SCM chr, SCM proc),
1530             "Install the procedure @var{proc} for reading expressions\n"
1531             "starting with the character sequence @code{#} and @var{chr}.\n"
1532             "@var{proc} will be called with two arguments:  the character\n"
1533             "@var{chr} and the port to read further data from. The object\n"
1534             "returned will be the return value of @code{read}. \n"
1535             "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1536             )
1537 #define FUNC_NAME s_scm_read_hash_extend
1538 {
1539   SCM this;
1540   SCM prev;
1541
1542   SCM_VALIDATE_CHAR (1, chr);
1543   SCM_ASSERT (scm_is_false (proc)
1544               || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1545               proc, SCM_ARG2, FUNC_NAME);
1546
1547   /* Check if chr is already in the alist.  */
1548   this = scm_i_read_hash_procedures_ref ();
1549   prev = SCM_BOOL_F;
1550   while (1)
1551     {
1552       if (scm_is_null (this))
1553         {
1554           /* not found, so add it to the beginning.  */
1555           if (scm_is_true (proc))
1556             {
1557               SCM new = scm_cons (scm_cons (chr, proc),
1558                                   scm_i_read_hash_procedures_ref ());
1559               scm_i_read_hash_procedures_set_x (new);
1560             }
1561           break;
1562         }
1563       if (scm_is_eq (chr, SCM_CAAR (this)))
1564         {
1565           /* already in the alist.  */
1566           if (scm_is_false (proc))
1567             {
1568               /* remove it.  */
1569               if (scm_is_false (prev))
1570                 {
1571                   SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1572                   scm_i_read_hash_procedures_set_x (rest);
1573                 }
1574               else
1575                 scm_set_cdr_x (prev, SCM_CDR (this));
1576             }
1577           else
1578             {
1579               /* replace it.  */
1580               scm_set_cdr_x (SCM_CAR (this), proc);
1581             }
1582           break;
1583         }
1584       prev = this;
1585       this = SCM_CDR (this);
1586     }
1587
1588   return SCM_UNSPECIFIED;
1589 }
1590 #undef FUNC_NAME
1591
1592 /* Recover the read-hash procedure corresponding to char c.  */
1593 static SCM
1594 scm_get_hash_procedure (int c)
1595 {
1596   SCM rest = scm_i_read_hash_procedures_ref ();
1597
1598   while (1)
1599     {
1600       if (scm_is_null (rest))
1601         return SCM_BOOL_F;
1602
1603       if (SCM_CHAR (SCM_CAAR (rest)) == c)
1604         return SCM_CDAR (rest);
1605
1606       rest = SCM_CDR (rest);
1607     }
1608 }
1609
1610 #define SCM_ENCODING_SEARCH_SIZE (500)
1611
1612 /* Search the first few hundred characters of a file for an Emacs-like coding
1613    declaration.  Returns either NULL or a string whose storage has been
1614    allocated with `scm_gc_malloc ()'.  */
1615 char *
1616 scm_i_scan_for_encoding (SCM port)
1617 {
1618   scm_t_port *pt;
1619   char header[SCM_ENCODING_SEARCH_SIZE+1];
1620   size_t bytes_read, encoding_length, i;
1621   char *encoding = NULL;
1622   int utf8_bom = 0;
1623   char *pos, *encoding_start;
1624   int in_comment;
1625
1626   pt = SCM_PTAB_ENTRY (port);
1627
1628   if (pt->rw_active == SCM_PORT_WRITE)
1629     scm_flush (port);
1630
1631   if (pt->rw_random)
1632     pt->rw_active = SCM_PORT_READ;
1633
1634   if (pt->read_pos == pt->read_end)
1635     {
1636       /* We can use the read buffer, and thus avoid a seek. */
1637       if (scm_fill_input (port) == EOF)
1638         return NULL;
1639
1640       bytes_read = pt->read_end - pt->read_pos;
1641       if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
1642         bytes_read = SCM_ENCODING_SEARCH_SIZE;
1643
1644       if (bytes_read <= 1)
1645         /* An unbuffered port -- don't scan.  */
1646         return NULL;
1647
1648       memcpy (header, pt->read_pos, bytes_read);
1649       header[bytes_read] = '\0';
1650     }
1651   else
1652     {
1653       /* Try to read some bytes and then seek back.  Not all ports
1654          support seeking back; and indeed some file ports (like
1655          /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
1656          check performed by SCM_FPORT_FDES---but fail to seek
1657          backwards.  Hence this block comes second.  We prefer to use
1658          the read buffer in-place.  */
1659       if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1660         return NULL;
1661
1662       bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1663       header[bytes_read] = '\0';
1664       scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1665     }
1666
1667   if (bytes_read > 3
1668       && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1669     utf8_bom = 1;
1670
1671   /* search past "coding[:=]" */
1672   pos = header;
1673   while (1)
1674     {
1675       if ((pos = strstr(pos, "coding")) == NULL)
1676         return NULL;
1677
1678       pos += strlen("coding");
1679       if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1680           (*pos == ':' || *pos == '='))
1681         {
1682           pos ++;
1683           break;
1684         }
1685     }
1686
1687   /* skip spaces */
1688   while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1689          (*pos == ' ' || *pos == '\t'))
1690     pos ++;
1691
1692   /* grab the next token */
1693   encoding_start = pos;
1694   i = 0;
1695   while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
1696          && encoding_start + i - header < bytes_read
1697          && (isalnum ((int) encoding_start[i])
1698              || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
1699     i++;
1700
1701   encoding_length = i;
1702   if (encoding_length == 0)
1703     return NULL;
1704
1705   encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
1706   for (i = 0; i < encoding_length; i++)
1707     encoding[i] = toupper ((int) encoding[i]);
1708
1709   /* push backwards to make sure we were in a comment */
1710   in_comment = 0;
1711   pos = encoding_start;
1712   while (pos >= header)
1713     {
1714       if (*pos == ';')
1715         {
1716           in_comment = 1;
1717           break;
1718         }
1719       else if (*pos == '\n' || pos == header)
1720         {
1721           /* This wasn't in a semicolon comment. Check for a
1722            hash-bang comment. */
1723           char *beg = strstr (header, "#!");
1724           char *end = strstr (header, "!#");
1725           if (beg < encoding_start && encoding_start + encoding_length <= end)
1726             in_comment = 1;
1727           break;
1728         }
1729       else
1730         {
1731           pos --;
1732           continue;
1733         }
1734     }
1735   if (!in_comment)
1736     /* This wasn't in a comment */
1737     return NULL;
1738
1739   if (utf8_bom && strcmp(encoding, "UTF-8"))
1740     scm_misc_error (NULL,
1741                     "the port input declares the encoding ~s but is encoded as UTF-8",
1742                     scm_list_1 (scm_from_locale_string (encoding)));
1743
1744   return encoding;
1745 }
1746
1747 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1748             (SCM port),
1749             "Scans the port for an Emacs-like character coding declaration\n"
1750             "near the top of the contents of a port with random-accessible contents.\n"
1751             "The coding declaration is of the form\n"
1752             "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1753             "\n"
1754             "Returns a string containing the character encoding of the file\n"
1755             "if a declaration was found, or @code{#f} otherwise.\n")
1756 #define FUNC_NAME s_scm_file_encoding
1757 {
1758   char *enc;
1759   SCM s_enc;
1760
1761   SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
1762
1763   enc = scm_i_scan_for_encoding (port);
1764   if (enc == NULL)
1765     return SCM_BOOL_F;
1766   else
1767     {
1768       s_enc = scm_from_locale_string (enc);
1769       return s_enc;
1770     }
1771
1772   return SCM_BOOL_F;
1773 }
1774 #undef FUNC_NAME
1775
1776 void
1777 scm_init_read ()
1778 {
1779   SCM read_hash_procs;
1780
1781   read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
1782
1783   scm_i_read_hash_procedures =
1784     SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
1785
1786   scm_init_opts (scm_read_options, scm_read_opts);
1787 #include "libguile/read.x"
1788 }
1789
1790 /*
1791   Local Variables:
1792   c-file-style: "gnu"
1793   End:
1794 */