libguile/read.c

   1 /* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
   2  * Foundation, Inc.
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public License
   6  * as published by the Free Software Foundation; either version 3 of
   7  * the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful, but
  10  * WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  17  * 02110-1301 USA
  18  */
  19
  20
  21 \f
  22
  23 #ifdef HAVE_CONFIG_H
  24 # include <config.h>
  25 #endif
  26
  27 #include <stdio.h>
  28 #include <ctype.h>
  29 #include <string.h>
  30 #include <unistd.h>
  31 #include <unicase.h>
  32 #include <unictype.h>
  33
  34 #include "libguile/_scm.h"
  35 #include "libguile/bytevectors.h"
  36 #include "libguile/chars.h"
  37 #include "libguile/eval.h"
  38 #include "libguile/arrays.h"
  39 #include "libguile/bitvectors.h"
  40 #include "libguile/keywords.h"
  41 #include "libguile/alist.h"
  42 #include "libguile/srcprop.h"
  43 #include "libguile/hashtab.h"
  44 #include "libguile/hash.h"
  45 #include "libguile/ports.h"
  46 #include "libguile/fports.h"
  47 #include "libguile/root.h"
  48 #include "libguile/strings.h"
  49 #include "libguile/strports.h"
  50 #include "libguile/vectors.h"
  51 #include "libguile/validate.h"
  52 #include "libguile/srfi-4.h"
  53 #include "libguile/srfi-13.h"
  54
  55 #include "libguile/read.h"
  56 #include "libguile/private-options.h"
  57
  58
  59 \f
  60
  61 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
  62 SCM_SYMBOL (scm_keyword_prefix, "prefix");
  63 SCM_SYMBOL (scm_keyword_postfix, "postfix");
  64 SCM_SYMBOL (sym_nil, "nil");
  65
  66 scm_t_option scm_read_opts[] = {
  67   { SCM_OPTION_BOOLEAN, "copy", 0,
  68     "Copy source code expressions." },
  69   { SCM_OPTION_BOOLEAN, "positions", 1,
  70     "Record positions of source code expressions." },
  71   { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
  72     "Convert symbols to lower case."},
  73   { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
  74     "Style of keyword recognition: #f, 'prefix or 'postfix."},
  75   { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
  76     "Use R6RS variable-length character and string hex escapes."},
  77   { SCM_OPTION_BOOLEAN, "square-brackets", 1,
  78     "Treat `[' and `]' as parentheses, for R6RS compatibility."},
  79   { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
  80     "In strings, consume leading whitespace after an escaped end-of-line."},
  81   { 0, },
  82 };
  83
  84 /*
  85   Give meaningful error messages for errors
  86
  87   We use the format
  88
  89   FILE:LINE:COL: MESSAGE
  90   This happened in ....
  91
  92   This is not standard GNU format, but the test-suite likes the real
  93   message to be in front.
  94
  95  */
  96
  97
  98 void
  99 scm_i_input_error (char const *function,
 100                    SCM port, const char *message, SCM arg)
 101 {
 102   SCM fn = (scm_is_string (SCM_FILENAME(port))
 103             ? SCM_FILENAME(port)
 104             : scm_from_locale_string ("#<unknown port>"));
 105
 106   SCM string_port = scm_open_output_string ();
 107   SCM string = SCM_EOL;
 108   scm_simple_format (string_port,
 109                      scm_from_locale_string ("~A:~S:~S: ~A"),
 110                      scm_list_4 (fn,
 111                                  scm_from_long (SCM_LINUM (port) + 1),
 112                                  scm_from_int (SCM_COL (port) + 1),
 113                                  scm_from_locale_string (message)));
 114
 115   string = scm_get_output_string (string_port);
 116   scm_close_output_port (string_port);
 117   scm_error_scm (scm_from_latin1_symbol ("read-error"),
 118                  function? scm_from_locale_string (function) : SCM_BOOL_F,
 119                  string,
 120                  arg,
 121                  SCM_BOOL_F);
 122 }
 123
 124
 125 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
 126             (SCM setting),
 127             "Option interface for the read options. Instead of using\n"
 128             "this procedure directly, use the procedures @code{read-enable},\n"
 129             "@code{read-disable}, @code{read-set!} and @code{read-options}.")
 130 #define FUNC_NAME s_scm_read_options
 131 {
 132   SCM ans = scm_options (setting,
 133                          scm_read_opts,
 134                          FUNC_NAME);
 135   if (SCM_COPY_SOURCE_P)
 136     SCM_RECORD_POSITIONS_P = 1;
 137   return ans;
 138 }
 139 #undef FUNC_NAME
 140
 141 /* A fluid referring to an association list mapping extra hash
 142    characters to procedures.  */
 143 static SCM *scm_i_read_hash_procedures;
 144
 145 static inline SCM
 146 scm_i_read_hash_procedures_ref (void)
 147 {
 148   return scm_fluid_ref (*scm_i_read_hash_procedures);
 149 }
 150
 151 static inline void
 152 scm_i_read_hash_procedures_set_x (SCM value)
 153 {
 154   scm_fluid_set_x (*scm_i_read_hash_procedures, value);
 155 }
 156
 157 \f
 158 /* Token readers.  */
 159
 160
 161 /* Size of the C buffer used to read symbols and numbers.  */
 162 #define READER_BUFFER_SIZE            128
 163
 164 /* Size of the C buffer used to read strings.  */
 165 #define READER_STRING_BUFFER_SIZE     512
 166
 167 /* The maximum size of Scheme character names.  */
 168 #define READER_CHAR_NAME_MAX_SIZE      50
 169
 170
 171 /* `isblank' is only in C99.  */
 172 #define CHAR_IS_BLANK_(_chr)                                    \
 173   (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n')      \
 174    || ((_chr) == '\f') || ((_chr) == '\r'))
 175
 176 #ifdef MSDOS
 177 # define CHAR_IS_BLANK(_chr)                    \
 178   ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
 179 #else
 180 # define CHAR_IS_BLANK CHAR_IS_BLANK_
 181 #endif
 182
 183
 184 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
 185    structure'').  */
 186 #define CHAR_IS_R5RS_DELIMITER(c)                               \
 187   (CHAR_IS_BLANK (c)                                            \
 188    || (c == ')') || (c == '(') || (c == ';') || (c == '"')      \
 189    || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
 190
 191 #define CHAR_IS_DELIMITER  CHAR_IS_R5RS_DELIMITER
 192
 193 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
 194    Structure''.  */
 195 #define CHAR_IS_EXPONENT_MARKER(_chr)                           \
 196   (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f')        \
 197    || ((_chr) == 'd') || ((_chr) == 'l'))
 198
 199 /* Read an SCSH block comment.  */
 200 static inline SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
 201 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
 202 static SCM scm_read_commented_expression (scm_t_wchar, SCM);
 203 static SCM scm_read_shebang (scm_t_wchar, SCM);
 204 static SCM scm_get_hash_procedure (int);
 205
 206 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 207    result in the pre-allocated buffer BUF.  Return zero if the whole token has
 208    fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
 209    bytes actually read.  */
 210 static inline int
 211 read_token (SCM port, char *buf, const size_t buf_size, size_t *read)
 212  {
 213    *read = 0;
 214
 215    while (*read < buf_size)
 216      {
 217        int chr;
 218
 219        chr = scm_get_byte_or_eof_unlocked (port);
 220
 221        if (chr == EOF)
 222         return 0;
 223       else if (CHAR_IS_DELIMITER (chr))
 224         {
 225           scm_unget_byte_unlocked (chr, port);
 226           return 0;
 227         }
 228       else
 229         {
 230           *buf = (char) chr;
 231           buf++, (*read)++;
 232         }
 233      }
 234
 235    return 1;
 236  }
 237
 238 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
 239    result in the pre-allocated buffer BUFFER, if the whole token has fewer than
 240    BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
 241    caller.  Return zero if the token fits in BUFFER, non-zero otherwise. READ
 242    will be set the number of bytes actually read.  */
 243 static int
 244 read_complete_token (SCM port, char *buffer, const size_t buffer_size,
 245                            char **overflow_buffer, size_t *read)
 246 {
 247   int overflow = 0;
 248   size_t bytes_read, overflow_size;
 249
 250   *overflow_buffer = NULL;
 251   overflow_size = 0;
 252
 253   do
 254     {
 255       overflow = read_token (port, buffer, buffer_size, &bytes_read);
 256       if (bytes_read == 0)
 257         break;
 258       if (overflow || overflow_size != 0)
 259         {
 260           if (overflow_size == 0)
 261             {
 262               *overflow_buffer = scm_malloc (bytes_read);
 263               memcpy (*overflow_buffer, buffer, bytes_read);
 264               overflow_size = bytes_read;
 265             }
 266           else
 267             {
 268               *overflow_buffer = scm_realloc (*overflow_buffer, overflow_size + bytes_read);
 269               memcpy (*overflow_buffer + overflow_size, buffer, bytes_read);
 270               overflow_size += bytes_read;
 271             }
 272         }
 273     }
 274   while (overflow);
 275
 276   if (overflow_size)
 277     *read = overflow_size;
 278   else
 279     *read = bytes_read;
 280
 281   return (overflow_size != 0);
 282 }
 283
 284 /* Skip whitespace from PORT and return the first non-whitespace character
 285    read.  Raise an error on end-of-file.  */
 286 static int
 287 flush_ws (SCM port, const char *eoferr)
 288 {
 289   register scm_t_wchar c;
 290   while (1)
 291     switch (c = scm_getc_unlocked (port))
 292       {
 293       case EOF:
 294       goteof:
 295         if (eoferr)
 296           {
 297             scm_i_input_error (eoferr,
 298                                port,
 299                                "end of file",
 300                                SCM_EOL);
 301           }
 302         return c;
 303
 304       case ';':
 305       lp:
 306         switch (c = scm_getc_unlocked (port))
 307           {
 308           case EOF:
 309             goto goteof;
 310           default:
 311             goto lp;
 312           case SCM_LINE_INCREMENTORS:
 313             break;
 314           }
 315         break;
 316
 317       case '#':
 318         switch (c = scm_getc_unlocked (port))
 319           {
 320           case EOF:
 321             eoferr = "read_sharp";
 322             goto goteof;
 323           case '!':
 324             scm_read_shebang (c, port);
 325             break;
 326           case ';':
 327             scm_read_commented_expression (c, port);
 328             break;
 329           case '|':
 330             if (scm_is_false (scm_get_hash_procedure (c)))
 331               {
 332                 scm_read_r6rs_block_comment (c, port);
 333                 break;
 334               }
 335             /* fall through */
 336           default:
 337             scm_ungetc_unlocked (c, port);
 338             return '#';
 339           }
 340         break;
 341
 342       case SCM_LINE_INCREMENTORS:
 343       case SCM_SINGLE_SPACES:
 344       case '\t':
 345         break;
 346
 347       default:
 348         return c;
 349       }
 350
 351   return 0;
 352 }
 353
 354
 355 \f
 356 /* Token readers.  */
 357
 358 static SCM scm_read_expression (SCM port);
 359 static SCM scm_read_sharp (int chr, SCM port);
 360
 361
 362 static SCM
 363 scm_read_sexp (scm_t_wchar chr, SCM port)
 364 #define FUNC_NAME "scm_i_lreadparen"
 365 {
 366   int c;
 367   SCM tmp, tl, ans = SCM_EOL;
 368   const int terminating_char = ((chr == '[') ? ']' : ')');
 369
 370   /* Need to capture line and column numbers here. */
 371   long line = SCM_LINUM (port);
 372   int column = SCM_COL (port) - 1;
 373
 374   c = flush_ws (port, FUNC_NAME);
 375   if (terminating_char == c)
 376     return SCM_EOL;
 377
 378   scm_ungetc_unlocked (c, port);
 379   tmp = scm_read_expression (port);
 380
 381   /* Note that it is possible for scm_read_expression to return
 382      scm_sym_dot, but not as part of a dotted pair: as in #{.}#.  So
 383      check that it's a real dot by checking `c'.  */
 384   if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
 385     {
 386       ans = scm_read_expression (port);
 387       if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
 388         scm_i_input_error (FUNC_NAME, port, "missing close paren",
 389                            SCM_EOL);
 390       return ans;
 391     }
 392
 393   /* Build the head of the list structure. */
 394   ans = tl = scm_cons (tmp, SCM_EOL);
 395
 396   while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
 397     {
 398       SCM new_tail;
 399
 400       if (c == ')' || (SCM_SQUARE_BRACKETS_P && c == ']'))
 401         scm_i_input_error (FUNC_NAME, port,
 402                            "in pair: mismatched close paren: ~A",
 403                            scm_list_1 (SCM_MAKE_CHAR (c)));
 404
 405       scm_ungetc_unlocked (c, port);
 406       tmp = scm_read_expression (port);
 407
 408       /* See above note about scm_sym_dot.  */
 409       if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
 410         {
 411           SCM_SETCDR (tl, scm_read_expression (port));
 412
 413           c = flush_ws (port, FUNC_NAME);
 414           if (terminating_char != c)
 415             scm_i_input_error (FUNC_NAME, port,
 416                                "in pair: missing close paren", SCM_EOL);
 417           goto exit;
 418         }
 419
 420       new_tail = scm_cons (tmp, SCM_EOL);
 421       SCM_SETCDR (tl, new_tail);
 422       tl = new_tail;
 423     }
 424
 425  exit:
 426   if (SCM_RECORD_POSITIONS_P)
 427     scm_i_set_source_properties_x (ans, line, column, SCM_FILENAME (port));
 428
 429   return ans;
 430 }
 431 #undef FUNC_NAME
 432
 433
 434 /* Read a hexadecimal number NDIGITS in length.  Put its value into the variable
 435    C.  If TERMINATOR is non-null, terminate early if the TERMINATOR character is
 436    found.  */
 437 #define SCM_READ_HEX_ESCAPE(ndigits, terminator)                   \
 438   do                                                               \
 439     {                                                              \
 440       scm_t_wchar a;                                               \
 441       size_t i = 0;                                                \
 442       c = 0;                                                       \
 443       while (i < ndigits)                                          \
 444         {                                                          \
 445           a = scm_getc_unlocked (port);                                     \
 446           if (a == EOF)                                            \
 447             goto str_eof;                                          \
 448           if (terminator                                           \
 449               && (a == (scm_t_wchar) terminator)                   \
 450               && (i > 0))                                          \
 451             break;                                                 \
 452           if ('0' <= a && a <= '9')                                \
 453             a -= '0';                                              \
 454           else if ('A' <= a && a <= 'F')                           \
 455             a = a - 'A' + 10;                                      \
 456           else if ('a' <= a && a <= 'f')                           \
 457             a = a - 'a' + 10;                                      \
 458           else                                                     \
 459             {                                                      \
 460               c = a;                                               \
 461               goto bad_escaped;                                    \
 462             }                                                      \
 463           c = c * 16 + a;                                          \
 464           i ++;                                                    \
 465         }                                                          \
 466     } while (0)
 467
 468 static void
 469 skip_intraline_whitespace (SCM port)
 470 {
 471   scm_t_wchar c;
 472
 473   do
 474     {
 475       c = scm_getc_unlocked (port);
 476       if (c == EOF)
 477         return;
 478     }
 479   while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
 480
 481   scm_ungetc_unlocked (c, port);
 482 }
 483
 484 static SCM
 485 scm_read_string (int chr, SCM port)
 486 #define FUNC_NAME "scm_lreadr"
 487 {
 488   /* For strings smaller than C_STR, this function creates only one Scheme
 489      object (the string returned).  */
 490
 491   SCM str = SCM_BOOL_F;
 492   unsigned c_str_len = 0;
 493   scm_t_wchar c;
 494
 495   str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
 496   while ('"' != (c = scm_getc_unlocked (port)))
 497     {
 498       if (c == EOF)
 499         {
 500         str_eof:
 501           scm_i_input_error (FUNC_NAME, port,
 502                              "end of file in string constant", SCM_EOL);
 503         }
 504
 505       if (c_str_len + 1 >= scm_i_string_length (str))
 506         {
 507           SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
 508
 509           str = scm_string_append (scm_list_2 (str, addy));
 510         }
 511
 512       if (c == '\\')
 513         {
 514           switch (c = scm_getc_unlocked (port))
 515             {
 516             case EOF:
 517               goto str_eof;
 518             case '"':
 519             case '\\':
 520               break;
 521             case '\n':
 522               if (SCM_HUNGRY_EOL_ESCAPES_P)
 523                 skip_intraline_whitespace (port);
 524               continue;
 525             case '0':
 526               c = '\0';
 527               break;
 528             case 'f':
 529               c = '\f';
 530               break;
 531             case 'n':
 532               c = '\n';
 533               break;
 534             case 'r':
 535               c = '\r';
 536               break;
 537             case 't':
 538               c = '\t';
 539               break;
 540             case 'a':
 541               c = '\007';
 542               break;
 543             case 'v':
 544               c = '\v';
 545               break;
 546             case 'b':
 547               c = '\010';
 548               break;
 549             case 'x':
 550               if (SCM_R6RS_ESCAPES_P)
 551                 SCM_READ_HEX_ESCAPE (10, ';');
 552               else
 553                 SCM_READ_HEX_ESCAPE (2, '\0');
 554               break;
 555             case 'u':
 556               if (!SCM_R6RS_ESCAPES_P)
 557                 {
 558                   SCM_READ_HEX_ESCAPE (4, '\0');
 559                   break;
 560                 }
 561             case 'U':
 562               if (!SCM_R6RS_ESCAPES_P)
 563                 {
 564                   SCM_READ_HEX_ESCAPE (6, '\0');
 565                   break;
 566                 }
 567             default:
 568             bad_escaped:
 569               scm_i_input_error (FUNC_NAME, port,
 570                                  "illegal character in escape sequence: ~S",
 571                                  scm_list_1 (SCM_MAKE_CHAR (c)));
 572             }
 573         }
 574       str = scm_i_string_start_writing (str);
 575       scm_i_string_set_x (str, c_str_len++, c);
 576       scm_i_string_stop_writing ();
 577     }
 578
 579   if (c_str_len > 0)
 580     {
 581       return scm_i_substring_copy (str, 0, c_str_len);
 582     }
 583
 584   return scm_nullstr;
 585 }
 586 #undef FUNC_NAME
 587
 588
 589 static SCM
 590 scm_read_number (scm_t_wchar chr, SCM port)
 591 {
 592   SCM result, str = SCM_EOL;
 593   char buffer[READER_BUFFER_SIZE];
 594   char *overflow_buffer = NULL;
 595   size_t bytes_read;
 596   int overflow;
 597   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 598
 599   scm_ungetc_unlocked (chr, port);
 600   overflow = read_complete_token (port, buffer, sizeof (buffer),
 601                                   &overflow_buffer, &bytes_read);
 602
 603   if (!overflow)
 604     str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
 605   else
 606     str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
 607                             pt->ilseq_handler);
 608
 609   result = scm_string_to_number (str, SCM_UNDEFINED);
 610   if (!scm_is_true (result))
 611     {
 612       /* Return a symbol instead of a number */
 613       if (SCM_CASE_INSENSITIVE_P)
 614         str = scm_string_downcase_x (str);
 615       result = scm_string_to_symbol (str);
 616     }
 617
 618   if (overflow)
 619     free (overflow_buffer);
 620   SCM_COL (port) += scm_i_string_length (str);
 621   return result;
 622 }
 623
 624 static SCM
 625 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
 626 {
 627   SCM result;
 628   int ends_with_colon = 0;
 629   size_t bytes_read;
 630   int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
 631   int overflow;
 632   char buffer[READER_BUFFER_SIZE], *overflow_buffer;
 633   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 634   SCM str;
 635
 636   scm_ungetc_unlocked (chr, port);
 637   overflow = read_complete_token (port, buffer, READER_BUFFER_SIZE,
 638                                   &overflow_buffer, &bytes_read);
 639   if (bytes_read > 0)
 640     {
 641       if (!overflow)
 642         ends_with_colon = buffer[bytes_read - 1] == ':';
 643       else
 644         ends_with_colon = overflow_buffer[bytes_read - 1] == ':';
 645     }
 646
 647   if (postfix && ends_with_colon && (bytes_read > 1))
 648     {
 649       if (!overflow)
 650         str = scm_from_stringn (buffer, bytes_read - 1, pt->encoding, pt->ilseq_handler);
 651       else
 652         str = scm_from_stringn (overflow_buffer, bytes_read - 1, pt->encoding,
 653                                 pt->ilseq_handler);
 654
 655       if (SCM_CASE_INSENSITIVE_P)
 656         str = scm_string_downcase_x (str);
 657       result = scm_symbol_to_keyword (scm_string_to_symbol (str));
 658     }
 659   else
 660     {
 661       if (!overflow)
 662         str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
 663       else
 664         str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
 665                                 pt->ilseq_handler);
 666
 667       if (SCM_CASE_INSENSITIVE_P)
 668         str = scm_string_downcase_x (str);
 669       result = scm_string_to_symbol (str);
 670     }
 671
 672   if (overflow)
 673     free (overflow_buffer);
 674   SCM_COL (port) += scm_i_string_length (str);
 675   return result;
 676 }
 677
 678 static SCM
 679 scm_read_number_and_radix (scm_t_wchar chr, SCM port)
 680 #define FUNC_NAME "scm_lreadr"
 681 {
 682   SCM result;
 683   size_t read;
 684   char buffer[READER_BUFFER_SIZE], *overflow_buffer;
 685   int overflow;
 686   unsigned int radix;
 687   SCM str;
 688   scm_t_port *pt;
 689
 690   switch (chr)
 691     {
 692     case 'B':
 693     case 'b':
 694       radix = 2;
 695       break;
 696
 697     case 'o':
 698     case 'O':
 699       radix = 8;
 700       break;
 701
 702     case 'd':
 703     case 'D':
 704       radix = 10;
 705       break;
 706
 707     case 'x':
 708     case 'X':
 709       radix = 16;
 710       break;
 711
 712     default:
 713       scm_ungetc_unlocked (chr, port);
 714       scm_ungetc_unlocked ('#', port);
 715       radix = 10;
 716     }
 717
 718   overflow = read_complete_token (port, buffer, sizeof (buffer),
 719                                   &overflow_buffer, &read);
 720
 721   pt = SCM_PTAB_ENTRY (port);
 722   if (!overflow)
 723     str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
 724   else
 725     str = scm_from_stringn (overflow_buffer, read, pt->encoding,
 726                             pt->ilseq_handler);
 727
 728   result = scm_string_to_number (str, scm_from_uint (radix));
 729
 730   if (overflow)
 731     free (overflow_buffer);
 732
 733   SCM_COL (port) += scm_i_string_length (str);
 734
 735   if (scm_is_true (result))
 736     return result;
 737
 738   scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
 739
 740   return SCM_BOOL_F;
 741 }
 742 #undef FUNC_NAME
 743
 744 static SCM
 745 scm_read_quote (int chr, SCM port)
 746 {
 747   SCM p;
 748   long line = SCM_LINUM (port);
 749   int column = SCM_COL (port) - 1;
 750
 751   switch (chr)
 752     {
 753     case '`':
 754       p = scm_sym_quasiquote;
 755       break;
 756
 757     case '\'':
 758       p = scm_sym_quote;
 759       break;
 760
 761     case ',':
 762       {
 763         scm_t_wchar c;
 764
 765         c = scm_getc_unlocked (port);
 766         if ('@' == c)
 767           p = scm_sym_uq_splicing;
 768         else
 769           {
 770             scm_ungetc_unlocked (c, port);
 771             p = scm_sym_unquote;
 772           }
 773         break;
 774       }
 775
 776     default:
 777       fprintf (stderr, "%s: unhandled quote character (%i)\n",
 778                "scm_read_quote", chr);
 779       abort ();
 780     }
 781
 782   p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
 783   if (SCM_RECORD_POSITIONS_P)
 784     scm_i_set_source_properties_x (p, line, column, SCM_FILENAME (port));
 785
 786   return p;
 787 }
 788
 789 SCM_SYMBOL (sym_syntax, "syntax");
 790 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
 791 SCM_SYMBOL (sym_unsyntax, "unsyntax");
 792 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
 793
 794 static SCM
 795 scm_read_syntax (int chr, SCM port)
 796 {
 797   SCM p;
 798   long line = SCM_LINUM (port);
 799   int column = SCM_COL (port) - 1;
 800
 801   switch (chr)
 802     {
 803     case '`':
 804       p = sym_quasisyntax;
 805       break;
 806
 807     case '\'':
 808       p = sym_syntax;
 809       break;
 810
 811     case ',':
 812       {
 813         int c;
 814
 815         c = scm_getc_unlocked (port);
 816         if ('@' == c)
 817           p = sym_unsyntax_splicing;
 818         else
 819           {
 820             scm_ungetc_unlocked (c, port);
 821             p = sym_unsyntax;
 822           }
 823         break;
 824       }
 825
 826     default:
 827       fprintf (stderr, "%s: unhandled syntax character (%i)\n",
 828                "scm_read_syntax", chr);
 829       abort ();
 830     }
 831
 832   p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
 833   if (SCM_RECORD_POSITIONS_P)
 834     scm_i_set_source_properties_x (p, line, column, SCM_FILENAME (port));
 835
 836   return p;
 837 }
 838
 839 static inline SCM
 840 scm_read_nil (int chr, SCM port)
 841 {
 842   SCM id = scm_read_mixed_case_symbol (chr, port);
 843
 844   if (!scm_is_eq (id, sym_nil))
 845     scm_i_input_error ("scm_read_nil", port,
 846                        "unexpected input while reading #nil: ~a",
 847                        scm_list_1 (id));
 848
 849   return SCM_ELISP_NIL;
 850 }
 851
 852 static inline SCM
 853 scm_read_semicolon_comment (int chr, SCM port)
 854 {
 855   int c;
 856
 857   /* We use the get_byte here because there is no need to get the
 858      locale correct with comment input. This presumes that newline
 859      always represents itself no matter what the encoding is.  */
 860   for (c = scm_get_byte_or_eof_unlocked (port);
 861        (c != EOF) && (c != '\n');
 862        c = scm_get_byte_or_eof_unlocked (port));
 863
 864   return SCM_UNSPECIFIED;
 865 }
 866
 867 \f
 868 /* Sharp readers, i.e. readers called after a `#' sign has been read.  */
 869
 870 static SCM
 871 scm_read_boolean (int chr, SCM port)
 872 {
 873   switch (chr)
 874     {
 875     case 't':
 876     case 'T':
 877       return SCM_BOOL_T;
 878
 879     case 'f':
 880     case 'F':
 881       return SCM_BOOL_F;
 882     }
 883
 884   return SCM_UNSPECIFIED;
 885 }
 886
 887 static SCM
 888 scm_read_character (scm_t_wchar chr, SCM port)
 889 #define FUNC_NAME "scm_lreadr"
 890 {
 891   char buffer[READER_CHAR_NAME_MAX_SIZE];
 892   SCM charname;
 893   size_t charname_len, bytes_read;
 894   scm_t_wchar cp;
 895   int overflow;
 896   scm_t_port *pt;
 897
 898   overflow = read_token (port, buffer, READER_CHAR_NAME_MAX_SIZE, &bytes_read);
 899   if (overflow)
 900     scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
 901
 902   if (bytes_read == 0)
 903     {
 904       chr = scm_getc_unlocked (port);
 905       if (chr == EOF)
 906         scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
 907                            "while reading character", SCM_EOL);
 908
 909       /* CHR must be a token delimiter, like a whitespace.  */
 910       return (SCM_MAKE_CHAR (chr));
 911     }
 912
 913   pt = SCM_PTAB_ENTRY (port);
 914
 915   /* Simple ASCII characters can be processed immediately.  Also, simple
 916      ISO-8859-1 characters can be processed immediately if the encoding for this
 917      port is ISO-8859-1.  */
 918   if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
 919     {
 920       SCM_COL (port) += 1;
 921       return SCM_MAKE_CHAR (buffer[0]);
 922     }
 923
 924   /* Otherwise, convert the buffer into a proper scheme string for
 925      processing.  */
 926   charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
 927                                pt->ilseq_handler);
 928   charname_len = scm_i_string_length (charname);
 929   SCM_COL (port) += charname_len;
 930   cp = scm_i_string_ref (charname, 0);
 931   if (charname_len == 1)
 932     return SCM_MAKE_CHAR (cp);
 933
 934   /* Ignore dotted circles, which may be used to keep combining characters from
 935      combining with the backslash in #\charname.  */
 936   if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
 937     return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
 938
 939   if (cp >= '0' && cp < '8')
 940     {
 941       /* Dirk:FIXME::  This type of character syntax is not R5RS
 942        * compliant.  Further, it should be verified that the constant
 943        * does only consist of octal digits.  */
 944       SCM p = scm_string_to_number (charname, scm_from_uint (8));
 945       if (SCM_I_INUMP (p))
 946         {
 947           scm_t_wchar c = scm_to_uint32 (p);
 948           if (SCM_IS_UNICODE_CHAR (c))
 949             return SCM_MAKE_CHAR (c);
 950           else
 951             scm_i_input_error (FUNC_NAME, port,
 952                                "out-of-range octal character escape: ~a",
 953                                scm_list_1 (charname));
 954         }
 955     }
 956
 957   if (cp == 'x' && (charname_len > 1))
 958     {
 959       SCM p;
 960
 961       /* Convert from hex, skipping the initial 'x' character in CHARNAME */
 962       p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
 963                                 scm_from_uint (16));
 964       if (SCM_I_INUMP (p))
 965         {
 966           scm_t_wchar c = scm_to_uint32 (p);
 967           if (SCM_IS_UNICODE_CHAR (c))
 968             return SCM_MAKE_CHAR (c);
 969           else
 970             scm_i_input_error (FUNC_NAME, port,
 971                                "out-of-range hex character escape: ~a",
 972                                scm_list_1 (charname));
 973         }
 974     }
 975
 976   /* The names of characters should never have non-Latin1
 977      characters.  */
 978   if (scm_i_is_narrow_string (charname)
 979       || scm_i_try_narrow_string (charname))
 980     { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
 981                                        charname_len);
 982       if (scm_is_true (ch))
 983         return ch;
 984     }
 985
 986   scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
 987                      scm_list_1 (charname));
 988
 989   return SCM_UNSPECIFIED;
 990 }
 991 #undef FUNC_NAME
 992
 993 static inline SCM
 994 scm_read_keyword (int chr, SCM port)
 995 {
 996   SCM symbol;
 997
 998   /* Read the symbol that comprises the keyword.  Doing this instead of
 999      invoking a specific symbol reader function allows `scm_read_keyword ()'
1000      to adapt to the delimiters currently valid of symbols.
1001
1002      XXX: This implementation allows sloppy syntaxes like `#:  key'.  */
1003   symbol = scm_read_expression (port);
1004   if (!scm_is_symbol (symbol))
1005     scm_i_input_error ("scm_read_keyword", port,
1006                        "keyword prefix `~a' not followed by a symbol: ~s",
1007                        scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1008
1009   return (scm_symbol_to_keyword (symbol));
1010 }
1011
1012 static inline SCM
1013 scm_read_vector (int chr, SCM port)
1014 {
1015   /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1016      guarantee that it's going to do what we want.  After all, this is an
1017      implementation detail of `scm_read_vector ()', not a desirable
1018      property.  */
1019   return (scm_vector (scm_read_sexp (chr, port)));
1020 }
1021
1022 static inline SCM
1023 scm_read_srfi4_vector (int chr, SCM port)
1024 {
1025   return scm_i_read_array (port, chr);
1026 }
1027
1028 static SCM
1029 scm_read_bytevector (scm_t_wchar chr, SCM port)
1030 {
1031   chr = scm_getc_unlocked (port);
1032   if (chr != 'u')
1033     goto syntax;
1034
1035   chr = scm_getc_unlocked (port);
1036   if (chr != '8')
1037     goto syntax;
1038
1039   chr = scm_getc_unlocked (port);
1040   if (chr != '(')
1041     goto syntax;
1042
1043   return scm_u8_list_to_bytevector (scm_read_sexp (chr, port));
1044
1045  syntax:
1046   scm_i_input_error ("read_bytevector", port,
1047                      "invalid bytevector prefix",
1048                      SCM_MAKE_CHAR (chr));
1049   return SCM_UNSPECIFIED;
1050 }
1051
1052 static SCM
1053 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port)
1054 {
1055   /* Read the `#*10101'-style read syntax for bit vectors in Guile.  This is
1056      terribly inefficient but who cares?  */
1057   SCM s_bits = SCM_EOL;
1058
1059   for (chr = scm_getc_unlocked (port);
1060        (chr != EOF) && ((chr == '0') || (chr == '1'));
1061        chr = scm_getc_unlocked (port))
1062     {
1063       s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1064     }
1065
1066   if (chr != EOF)
1067     scm_ungetc_unlocked (chr, port);
1068
1069   return scm_bitvector (scm_reverse_x (s_bits, SCM_EOL));
1070 }
1071
1072 static inline SCM
1073 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1074 {
1075   int bang_seen = 0;
1076
1077   for (;;)
1078     {
1079       int c = scm_getc_unlocked (port);
1080
1081       if (c == EOF)
1082         scm_i_input_error ("skip_block_comment", port,
1083                            "unterminated `#! ... !#' comment", SCM_EOL);
1084
1085       if (c == '!')
1086         bang_seen = 1;
1087       else if (c == '#' && bang_seen)
1088         break;
1089       else
1090         bang_seen = 0;
1091     }
1092
1093   return SCM_UNSPECIFIED;
1094 }
1095
1096 static SCM
1097 scm_read_shebang (scm_t_wchar chr, SCM port)
1098 {
1099   int c = 0;
1100   if ((c = scm_get_byte_or_eof_unlocked (port)) != 'r')
1101     {
1102       scm_ungetc_unlocked (c, port);
1103       return scm_read_scsh_block_comment (chr, port);
1104     }
1105   if ((c = scm_get_byte_or_eof_unlocked (port)) != '6')
1106     {
1107       scm_ungetc_unlocked (c, port);
1108       scm_ungetc_unlocked ('r', port);
1109       return scm_read_scsh_block_comment (chr, port);
1110     }
1111   if ((c = scm_get_byte_or_eof_unlocked (port)) != 'r')
1112     {
1113       scm_ungetc_unlocked (c, port);
1114       scm_ungetc_unlocked ('6', port);
1115       scm_ungetc_unlocked ('r', port);
1116       return scm_read_scsh_block_comment (chr, port);
1117     }
1118   if ((c = scm_get_byte_or_eof_unlocked (port)) != 's')
1119     {
1120       scm_ungetc_unlocked (c, port);
1121       scm_ungetc_unlocked ('r', port);
1122       scm_ungetc_unlocked ('6', port);
1123       scm_ungetc_unlocked ('r', port);
1124       return scm_read_scsh_block_comment (chr, port);
1125     }
1126
1127   return SCM_UNSPECIFIED;
1128 }
1129
1130 static SCM
1131 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1132 {
1133   /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1134      nested.  So care must be taken.  */
1135   int nesting_level = 1;
1136
1137   int a = scm_getc_unlocked (port);
1138
1139   if (a == EOF)
1140     scm_i_input_error ("scm_read_r6rs_block_comment", port,
1141                        "unterminated `#| ... |#' comment", SCM_EOL);
1142
1143   while (nesting_level > 0)
1144     {
1145       int b = scm_getc_unlocked (port);
1146
1147       if (b == EOF)
1148         scm_i_input_error ("scm_read_r6rs_block_comment", port,
1149                            "unterminated `#| ... |#' comment", SCM_EOL);
1150
1151       if (a == '|' && b == '#')
1152         {
1153           nesting_level--;
1154           b = EOF;
1155         }
1156       else if (a == '#' && b == '|')
1157         {
1158           nesting_level++;
1159           b = EOF;
1160         }
1161
1162       a = b;
1163     }
1164
1165   return SCM_UNSPECIFIED;
1166 }
1167
1168 static SCM
1169 scm_read_commented_expression (scm_t_wchar chr, SCM port)
1170 {
1171   scm_t_wchar c;
1172
1173   c = flush_ws (port, (char *) NULL);
1174   if (EOF == c)
1175     scm_i_input_error ("read_commented_expression", port,
1176                        "no expression after #; comment", SCM_EOL);
1177   scm_ungetc_unlocked (c, port);
1178   scm_read_expression (port);
1179   return SCM_UNSPECIFIED;
1180 }
1181
1182 static SCM
1183 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1184 {
1185   /* Guile's extended symbol read syntax looks like this:
1186
1187        #{This is all a symbol name}#
1188
1189      So here, CHR is expected to be `{'.  */
1190   int saw_brace = 0;
1191   size_t len = 0;
1192   SCM buf = scm_i_make_string (1024, NULL, 0);
1193
1194   buf = scm_i_string_start_writing (buf);
1195
1196   while ((chr = scm_getc_unlocked (port)) != EOF)
1197     {
1198       if (saw_brace)
1199         {
1200           if (chr == '#')
1201             {
1202               break;
1203             }
1204           else
1205             {
1206               saw_brace = 0;
1207               scm_i_string_set_x (buf, len++, '}');
1208             }
1209         }
1210
1211       if (chr == '}')
1212         saw_brace = 1;
1213       else if (chr == '\\')
1214         {
1215           /* It used to be that print.c would print extended-read-syntax
1216              symbols with backslashes before "non-standard" chars, but
1217              this routine wouldn't do anything with those escapes.
1218              Bummer.  What we've done is to change print.c to output
1219              R6RS hex escapes for those characters, relying on the fact
1220              that the extended read syntax would never put a `\' before
1221              an `x'.  For now, we just ignore other instances of
1222              backslash in the string.  */
1223           switch ((chr = scm_getc_unlocked (port)))
1224             {
1225             case EOF:
1226               goto done;
1227             case 'x':
1228               {
1229                 scm_t_wchar c;
1230
1231                 SCM_READ_HEX_ESCAPE (10, ';');
1232                 scm_i_string_set_x (buf, len++, c);
1233                 break;
1234
1235               str_eof:
1236                 chr = EOF;
1237                 goto done;
1238
1239               bad_escaped:
1240                 scm_i_string_stop_writing ();
1241                 scm_i_input_error ("scm_read_extended_symbol", port,
1242                                    "illegal character in escape sequence: ~S",
1243                                    scm_list_1 (SCM_MAKE_CHAR (c)));
1244                 break;
1245               }
1246             default:
1247               scm_i_string_set_x (buf, len++, chr);
1248               break;
1249             }
1250         }
1251       else
1252         scm_i_string_set_x (buf, len++, chr);
1253
1254       if (len >= scm_i_string_length (buf) - 2)
1255         {
1256           SCM addy;
1257
1258           scm_i_string_stop_writing ();
1259           addy = scm_i_make_string (1024, NULL, 0);
1260           buf = scm_string_append (scm_list_2 (buf, addy));
1261           len = 0;
1262           buf = scm_i_string_start_writing (buf);
1263         }
1264     }
1265
1266  done:
1267   scm_i_string_stop_writing ();
1268   if (chr == EOF)
1269     scm_i_input_error ("scm_read_extended_symbol", port,
1270                        "end of file while reading symbol", SCM_EOL);
1271
1272   return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1273 }
1274
1275
1276 \f
1277 /* Top-level token readers, i.e., dispatchers.  */
1278
1279 static SCM
1280 scm_read_sharp_extension (int chr, SCM port)
1281 {
1282   SCM proc;
1283
1284   proc = scm_get_hash_procedure (chr);
1285   if (scm_is_true (scm_procedure_p (proc)))
1286     {
1287       long line = SCM_LINUM (port);
1288       int column = SCM_COL (port) - 2;
1289       SCM got;
1290
1291       got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1292
1293       if (scm_is_pair (got) && !scm_i_has_source_properties (got))
1294         scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1295
1296       return got;
1297     }
1298
1299   return SCM_UNSPECIFIED;
1300 }
1301
1302 /* The reader for the sharp `#' character.  It basically dispatches reads
1303    among the above token readers.   */
1304 static SCM
1305 scm_read_sharp (scm_t_wchar chr, SCM port)
1306 #define FUNC_NAME "scm_lreadr"
1307 {
1308   SCM result;
1309
1310   chr = scm_getc_unlocked (port);
1311
1312   result = scm_read_sharp_extension (chr, port);
1313   if (!scm_is_eq (result, SCM_UNSPECIFIED))
1314     return result;
1315
1316   switch (chr)
1317     {
1318     case '\\':
1319       return (scm_read_character (chr, port));
1320     case '(':
1321       return (scm_read_vector (chr, port));
1322     case 's':
1323     case 'u':
1324     case 'f':
1325     case 'c':
1326       /* This one may return either a boolean or an SRFI-4 vector.  */
1327       return (scm_read_srfi4_vector (chr, port));
1328     case 'v':
1329       return (scm_read_bytevector (chr, port));
1330     case '*':
1331       return (scm_read_guile_bit_vector (chr, port));
1332     case 't':
1333     case 'T':
1334     case 'F':
1335       /* This one may return either a boolean or an SRFI-4 vector.  */
1336       return (scm_read_boolean (chr, port));
1337     case ':':
1338       return (scm_read_keyword (chr, port));
1339     case '0': case '1': case '2': case '3': case '4':
1340     case '5': case '6': case '7': case '8': case '9':
1341     case '@':
1342       return (scm_i_read_array (port, chr));
1343
1344     case 'i':
1345     case 'e':
1346     case 'b':
1347     case 'B':
1348     case 'o':
1349     case 'O':
1350     case 'd':
1351     case 'D':
1352     case 'x':
1353     case 'X':
1354     case 'I':
1355     case 'E':
1356       return (scm_read_number_and_radix (chr, port));
1357     case '{':
1358       return (scm_read_extended_symbol (chr, port));
1359     case '!':
1360       return (scm_read_shebang (chr, port));
1361     case ';':
1362       return (scm_read_commented_expression (chr, port));
1363     case '`':
1364     case '\'':
1365     case ',':
1366       return (scm_read_syntax (chr, port));
1367     case 'n':
1368       return (scm_read_nil (chr, port));
1369     default:
1370       result = scm_read_sharp_extension (chr, port);
1371       if (scm_is_eq (result, SCM_UNSPECIFIED))
1372         {
1373           /* To remain compatible with 1.8 and earlier, the following
1374              characters have lower precedence than `read-hash-extend'
1375              characters.  */
1376           switch (chr)
1377             {
1378             case '|':
1379               return scm_read_r6rs_block_comment (chr, port);
1380             default:
1381               scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1382                                  scm_list_1 (SCM_MAKE_CHAR (chr)));
1383             }
1384         }
1385       else
1386         return result;
1387     }
1388
1389   return SCM_UNSPECIFIED;
1390 }
1391 #undef FUNC_NAME
1392
1393 static SCM
1394 scm_read_expression (SCM port)
1395 #define FUNC_NAME "scm_read_expression"
1396 {
1397   while (1)
1398     {
1399       register scm_t_wchar chr;
1400
1401       chr = scm_getc_unlocked (port);
1402
1403       switch (chr)
1404         {
1405         case SCM_WHITE_SPACES:
1406         case SCM_LINE_INCREMENTORS:
1407           break;
1408         case ';':
1409           (void) scm_read_semicolon_comment (chr, port);
1410           break;
1411         case '[':
1412           if (!SCM_SQUARE_BRACKETS_P)
1413             return (scm_read_mixed_case_symbol (chr, port));
1414           /* otherwise fall through */
1415         case '(':
1416           return (scm_read_sexp (chr, port));
1417         case '"':
1418           return (scm_read_string (chr, port));
1419         case '\'':
1420         case '`':
1421         case ',':
1422           return (scm_read_quote (chr, port));
1423         case '#':
1424           {
1425             SCM result;
1426             result = scm_read_sharp (chr, port);
1427             if (scm_is_eq (result, SCM_UNSPECIFIED))
1428               /* We read a comment or some such.  */
1429               break;
1430             else
1431               return result;
1432           }
1433         case ')':
1434           scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1435           break;
1436         case ']':
1437           if (SCM_SQUARE_BRACKETS_P)
1438             scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1439           /* otherwise fall through */
1440         case EOF:
1441           return SCM_EOF_VAL;
1442         case ':':
1443           if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1444             return scm_symbol_to_keyword (scm_read_expression (port));
1445           /* Fall through.  */
1446
1447         default:
1448           {
1449             if (((chr >= '0') && (chr <= '9'))
1450                 || (strchr ("+-.", chr)))
1451               return (scm_read_number (chr, port));
1452             else
1453               return (scm_read_mixed_case_symbol (chr, port));
1454           }
1455         }
1456     }
1457 }
1458 #undef FUNC_NAME
1459
1460 \f
1461 /* Actual reader.  */
1462
1463 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1464             (SCM port),
1465             "Read an s-expression from the input port @var{port}, or from\n"
1466             "the current input port if @var{port} is not specified.\n"
1467             "Any whitespace before the next token is discarded.")
1468 #define FUNC_NAME s_scm_read
1469 {
1470   int c;
1471
1472   if (SCM_UNBNDP (port))
1473     port = scm_current_input_port ();
1474   SCM_VALIDATE_OPINPORT (1, port);
1475
1476   c = flush_ws (port, (char *) NULL);
1477   if (EOF == c)
1478     return SCM_EOF_VAL;
1479   scm_ungetc_unlocked (c, port);
1480
1481   return (scm_read_expression (port));
1482 }
1483 #undef FUNC_NAME
1484
1485
1486 \f
1487
1488 /* Manipulate the read-hash-procedures alist.  This could be written in
1489    Scheme, but maybe it will also be used by C code during initialisation.  */
1490 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1491             (SCM chr, SCM proc),
1492             "Install the procedure @var{proc} for reading expressions\n"
1493             "starting with the character sequence @code{#} and @var{chr}.\n"
1494             "@var{proc} will be called with two arguments:  the character\n"
1495             "@var{chr} and the port to read further data from. The object\n"
1496             "returned will be the return value of @code{read}. \n"
1497             "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1498             )
1499 #define FUNC_NAME s_scm_read_hash_extend
1500 {
1501   SCM this;
1502   SCM prev;
1503
1504   SCM_VALIDATE_CHAR (1, chr);
1505   SCM_ASSERT (scm_is_false (proc)
1506               || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1507               proc, SCM_ARG2, FUNC_NAME);
1508
1509   /* Check if chr is already in the alist.  */
1510   this = scm_i_read_hash_procedures_ref ();
1511   prev = SCM_BOOL_F;
1512   while (1)
1513     {
1514       if (scm_is_null (this))
1515         {
1516           /* not found, so add it to the beginning.  */
1517           if (scm_is_true (proc))
1518             {
1519               SCM new = scm_cons (scm_cons (chr, proc),
1520                                   scm_i_read_hash_procedures_ref ());
1521               scm_i_read_hash_procedures_set_x (new);
1522             }
1523           break;
1524         }
1525       if (scm_is_eq (chr, SCM_CAAR (this)))
1526         {
1527           /* already in the alist.  */
1528           if (scm_is_false (proc))
1529             {
1530               /* remove it.  */
1531               if (scm_is_false (prev))
1532                 {
1533                   SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1534                   scm_i_read_hash_procedures_set_x (rest);
1535                 }
1536               else
1537                 scm_set_cdr_x (prev, SCM_CDR (this));
1538             }
1539           else
1540             {
1541               /* replace it.  */
1542               scm_set_cdr_x (SCM_CAR (this), proc);
1543             }
1544           break;
1545         }
1546       prev = this;
1547       this = SCM_CDR (this);
1548     }
1549
1550   return SCM_UNSPECIFIED;
1551 }
1552 #undef FUNC_NAME
1553
1554 /* Recover the read-hash procedure corresponding to char c.  */
1555 static SCM
1556 scm_get_hash_procedure (int c)
1557 {
1558   SCM rest = scm_i_read_hash_procedures_ref ();
1559
1560   while (1)
1561     {
1562       if (scm_is_null (rest))
1563         return SCM_BOOL_F;
1564
1565       if (SCM_CHAR (SCM_CAAR (rest)) == c)
1566         return SCM_CDAR (rest);
1567
1568       rest = SCM_CDR (rest);
1569     }
1570 }
1571
1572 #define SCM_ENCODING_SEARCH_SIZE (500)
1573
1574 /* Search the first few hundred characters of a file for an Emacs-like coding
1575    declaration.  Returns either NULL or a string whose storage has been
1576    allocated with `scm_gc_malloc ()'.  */
1577 char *
1578 scm_i_scan_for_encoding (SCM port)
1579 {
1580   scm_t_port *pt;
1581   char header[SCM_ENCODING_SEARCH_SIZE+1];
1582   size_t bytes_read, encoding_length, i;
1583   char *encoding = NULL;
1584   int utf8_bom = 0;
1585   char *pos, *encoding_start;
1586   int in_comment;
1587
1588   pt = SCM_PTAB_ENTRY (port);
1589
1590   if (pt->rw_active == SCM_PORT_WRITE)
1591     scm_flush_unlocked (port);
1592
1593   if (pt->rw_random)
1594     pt->rw_active = SCM_PORT_READ;
1595
1596   if (pt->read_pos == pt->read_end)
1597     {
1598       /* We can use the read buffer, and thus avoid a seek. */
1599       if (scm_fill_input_unlocked (port) == EOF)
1600         return NULL;
1601
1602       bytes_read = pt->read_end - pt->read_pos;
1603       if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
1604         bytes_read = SCM_ENCODING_SEARCH_SIZE;
1605
1606       if (bytes_read <= 1)
1607         /* An unbuffered port -- don't scan.  */
1608         return NULL;
1609
1610       memcpy (header, pt->read_pos, bytes_read);
1611       header[bytes_read] = '\0';
1612     }
1613   else
1614     {
1615       /* Try to read some bytes and then seek back.  Not all ports
1616          support seeking back; and indeed some file ports (like
1617          /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
1618          check performed by SCM_FPORT_FDES---but fail to seek
1619          backwards.  Hence this block comes second.  We prefer to use
1620          the read buffer in-place.  */
1621       if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1622         return NULL;
1623
1624       bytes_read = scm_c_read_unlocked (port, header, SCM_ENCODING_SEARCH_SIZE);
1625       header[bytes_read] = '\0';
1626       scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1627     }
1628
1629   if (bytes_read > 3
1630       && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1631     utf8_bom = 1;
1632
1633   /* search past "coding[:=]" */
1634   pos = header;
1635   while (1)
1636     {
1637       if ((pos = strstr(pos, "coding")) == NULL)
1638         return NULL;
1639
1640       pos += strlen("coding");
1641       if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1642           (*pos == ':' || *pos == '='))
1643         {
1644           pos ++;
1645           break;
1646         }
1647     }
1648
1649   /* skip spaces */
1650   while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1651          (*pos == ' ' || *pos == '\t'))
1652     pos ++;
1653
1654   /* grab the next token */
1655   encoding_start = pos;
1656   i = 0;
1657   while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
1658          && encoding_start + i - header < bytes_read
1659          && (isalnum ((int) encoding_start[i])
1660              || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
1661     i++;
1662
1663   encoding_length = i;
1664   if (encoding_length == 0)
1665     return NULL;
1666
1667   encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
1668   for (i = 0; i < encoding_length; i++)
1669     encoding[i] = toupper ((int) encoding[i]);
1670
1671   /* push backwards to make sure we were in a comment */
1672   in_comment = 0;
1673   pos = encoding_start;
1674   while (pos >= header)
1675     {
1676       if (*pos == ';')
1677         {
1678           in_comment = 1;
1679           break;
1680         }
1681       else if (*pos == '\n' || pos == header)
1682         {
1683           /* This wasn't in a semicolon comment. Check for a
1684            hash-bang comment. */
1685           char *beg = strstr (header, "#!");
1686           char *end = strstr (header, "!#");
1687           if (beg < encoding_start && encoding_start + encoding_length <= end)
1688             in_comment = 1;
1689           break;
1690         }
1691       else
1692         {
1693           pos --;
1694           continue;
1695         }
1696     }
1697   if (!in_comment)
1698     /* This wasn't in a comment */
1699     return NULL;
1700
1701   if (utf8_bom && strcmp(encoding, "UTF-8"))
1702     scm_misc_error (NULL,
1703                     "the port input declares the encoding ~s but is encoded as UTF-8",
1704                     scm_list_1 (scm_from_locale_string (encoding)));
1705
1706   return encoding;
1707 }
1708
1709 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1710             (SCM port),
1711             "Scans the port for an Emacs-like character coding declaration\n"
1712             "near the top of the contents of a port with random-accessible contents.\n"
1713             "The coding declaration is of the form\n"
1714             "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1715             "\n"
1716             "Returns a string containing the character encoding of the file\n"
1717             "if a declaration was found, or @code{#f} otherwise.\n")
1718 #define FUNC_NAME s_scm_file_encoding
1719 {
1720   char *enc;
1721   SCM s_enc;
1722
1723   SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
1724
1725   enc = scm_i_scan_for_encoding (port);
1726   if (enc == NULL)
1727     return SCM_BOOL_F;
1728   else
1729     {
1730       s_enc = scm_from_locale_string (enc);
1731       return s_enc;
1732     }
1733
1734   return SCM_BOOL_F;
1735 }
1736 #undef FUNC_NAME
1737
1738 void
1739 scm_init_read ()
1740 {
1741   SCM read_hash_procs;
1742
1743   read_hash_procs = scm_make_fluid ();
1744   scm_fluid_set_x (read_hash_procs, SCM_EOL);
1745
1746   scm_i_read_hash_procedures =
1747     SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
1748
1749   scm_init_opts (scm_read_options, scm_read_opts);
1750 #include "libguile/read.x"
1751 }
1752
1753 /*
1754   Local Variables:
1755   c-file-style: "gnu"
1756   End:
1757 */