libguile/read.c

   1 /* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009 Free Software
   2  * Foundation, Inc.
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public License
   6  * as published by the Free Software Foundation; either version 3 of
   7  * the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful, but
  10  * WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  17  * 02110-1301 USA
  18  */
  19
  20
  21 \f
  22
  23 #ifdef HAVE_CONFIG_H
  24 # include <config.h>
  25 #endif
  26
  27 #include <stdio.h>
  28 #include <ctype.h>
  29 #include <string.h>
  30
  31 #include "libguile/_scm.h"
  32 #include "libguile/bytevectors.h"
  33 #include "libguile/chars.h"
  34 #include "libguile/eval.h"
  35 #include "libguile/unif.h"
  36 #include "libguile/keywords.h"
  37 #include "libguile/alist.h"
  38 #include "libguile/srcprop.h"
  39 #include "libguile/hashtab.h"
  40 #include "libguile/hash.h"
  41 #include "libguile/ports.h"
  42 #include "libguile/root.h"
  43 #include "libguile/strings.h"
  44 #include "libguile/strports.h"
  45 #include "libguile/vectors.h"
  46 #include "libguile/validate.h"
  47 #include "libguile/srfi-4.h"
  48 #include "libguile/srfi-13.h"
  49
  50 #include "libguile/read.h"
  51 #include "libguile/private-options.h"
  52
  53
  54 \f
  55
  56 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
  57 SCM_SYMBOL (scm_keyword_prefix, "prefix");
  58 SCM_SYMBOL (scm_keyword_postfix, "postfix");
  59
  60 scm_t_option scm_read_opts[] = {
  61   { SCM_OPTION_BOOLEAN, "copy", 0,
  62     "Copy source code expressions." },
  63   { SCM_OPTION_BOOLEAN, "positions", 0,
  64     "Record positions of source code expressions." },
  65   { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
  66     "Convert symbols to lower case."},
  67   { SCM_OPTION_SCM, "keywords", SCM_UNPACK (SCM_BOOL_F),
  68     "Style of keyword recognition: #f, 'prefix or 'postfix."},
  69 #if SCM_ENABLE_ELISP
  70   { SCM_OPTION_BOOLEAN, "elisp-vectors", 0,
  71     "Support Elisp vector syntax, namely `[...]'."},
  72   { SCM_OPTION_BOOLEAN, "elisp-strings", 0,
  73     "Support `\\(' and `\\)' in strings."},
  74 #endif
  75   { 0, },
  76 };
  77
  78 /*
  79   Give meaningful error messages for errors
  80
  81   We use the format
  82
  83   FILE:LINE:COL: MESSAGE
  84   This happened in ....
  85
  86   This is not standard GNU format, but the test-suite likes the real
  87   message to be in front.
  88
  89  */
  90
  91
  92 void
  93 scm_i_input_error (char const *function,
  94                    SCM port, const char *message, SCM arg)
  95 {
  96   SCM fn = (scm_is_string (SCM_FILENAME(port))
  97             ? SCM_FILENAME(port)
  98             : scm_from_locale_string ("#<unknown port>"));
  99
 100   SCM string_port = scm_open_output_string ();
 101   SCM string = SCM_EOL;
 102   scm_simple_format (string_port,
 103                      scm_from_locale_string ("~A:~S:~S: ~A"),
 104                      scm_list_4 (fn,
 105                                  scm_from_long (SCM_LINUM (port) + 1),
 106                                  scm_from_int (SCM_COL (port) + 1),
 107                                  scm_from_locale_string (message)));
 108
 109   string = scm_get_output_string (string_port);
 110   scm_close_output_port (string_port);
 111   scm_error_scm (scm_from_locale_symbol ("read-error"),
 112                  function? scm_from_locale_string (function) : SCM_BOOL_F,
 113                  string,
 114                  arg,
 115                  SCM_BOOL_F);
 116 }
 117
 118
 119 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
 120             (SCM setting),
 121             "Option interface for the read options. Instead of using\n"
 122             "this procedure directly, use the procedures @code{read-enable},\n"
 123             "@code{read-disable}, @code{read-set!} and @code{read-options}.")
 124 #define FUNC_NAME s_scm_read_options
 125 {
 126   SCM ans = scm_options (setting,
 127                          scm_read_opts,
 128                          FUNC_NAME);
 129   if (SCM_COPY_SOURCE_P)
 130     SCM_RECORD_POSITIONS_P = 1;
 131   return ans;
 132 }
 133 #undef FUNC_NAME
 134
 135 /* An association list mapping extra hash characters to procedures.  */
 136 static SCM *scm_read_hash_procedures;
 137
 138
 139 \f
 140 /* Token readers.  */
 141
 142
 143 /* Size of the C buffer used to read symbols and numbers.  */
 144 #define READER_BUFFER_SIZE            128
 145
 146 /* Size of the C buffer used to read strings.  */
 147 #define READER_STRING_BUFFER_SIZE     512
 148
 149 /* The maximum size of Scheme character names.  */
 150 #define READER_CHAR_NAME_MAX_SIZE      50
 151
 152
 153 /* `isblank' is only in C99.  */
 154 #define CHAR_IS_BLANK_(_chr)                                    \
 155   (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n')      \
 156    || ((_chr) == '\f') || ((_chr) == '\r'))
 157
 158 #ifdef MSDOS
 159 # define CHAR_IS_BLANK(_chr)                    \
 160   ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
 161 #else
 162 # define CHAR_IS_BLANK CHAR_IS_BLANK_
 163 #endif
 164
 165
 166 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
 167    structure'').  */
 168 #define CHAR_IS_R5RS_DELIMITER(c)                               \
 169   (CHAR_IS_BLANK (c)                                            \
 170    || (c == ')') || (c == '(') || (c == ';') || (c == '"'))
 171
 172 #define CHAR_IS_DELIMITER  CHAR_IS_R5RS_DELIMITER
 173
 174 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
 175    Structure''.  */
 176 #define CHAR_IS_EXPONENT_MARKER(_chr)                           \
 177   (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f')        \
 178    || ((_chr) == 'd') || ((_chr) == 'l'))
 179
 180 /* An inlinable version of `scm_c_downcase ()'.  */
 181 #define CHAR_DOWNCASE(_chr)                             \
 182   (((_chr) <= UCHAR_MAX) ? tolower (_chr) : (_chr))
 183
 184
 185 /* Read an SCSH block comment.  */
 186 static inline SCM scm_read_scsh_block_comment (int chr, SCM port);
 187 static SCM scm_read_commented_expression (int chr, SCM port);
 188
 189 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Return
 190    zero if the whole token fits in BUF, non-zero otherwise.  */
 191 static inline int
 192 read_token (SCM port, char *buf, size_t buf_size, size_t *read)
 193 {
 194   *read = 0;
 195
 196   while (*read < buf_size)
 197     {
 198       int chr;
 199
 200       chr = scm_getc (port);
 201       chr = (SCM_CASE_INSENSITIVE_P ? CHAR_DOWNCASE (chr) : chr);
 202
 203       if (chr == EOF)
 204         return 0;
 205       else if (CHAR_IS_DELIMITER (chr))
 206         {
 207           scm_ungetc (chr, port);
 208           return 0;
 209         }
 210       else
 211         {
 212           *buf = (char) chr;
 213           buf++, (*read)++;
 214         }
 215     }
 216
 217   return 1;
 218 }
 219
 220
 221 /* Skip whitespace from PORT and return the first non-whitespace character
 222    read.  Raise an error on end-of-file.  */
 223 static int
 224 flush_ws (SCM port, const char *eoferr)
 225 {
 226   register int c;
 227   while (1)
 228     switch (c = scm_getc (port))
 229       {
 230       case EOF:
 231       goteof:
 232         if (eoferr)
 233           {
 234             scm_i_input_error (eoferr,
 235                                port,
 236                                "end of file",
 237                                SCM_EOL);
 238           }
 239         return c;
 240
 241       case ';':
 242       lp:
 243         switch (c = scm_getc (port))
 244           {
 245           case EOF:
 246             goto goteof;
 247           default:
 248             goto lp;
 249           case SCM_LINE_INCREMENTORS:
 250             break;
 251           }
 252         break;
 253
 254       case '#':
 255         switch (c = scm_getc (port))
 256           {
 257           case EOF:
 258             eoferr = "read_sharp";
 259             goto goteof;
 260           case '!':
 261             scm_read_scsh_block_comment (c, port);
 262             break;
 263           case ';':
 264             scm_read_commented_expression (c, port);
 265             break;
 266           default:
 267             scm_ungetc (c, port);
 268             return '#';
 269           }
 270         break;
 271
 272       case SCM_LINE_INCREMENTORS:
 273       case SCM_SINGLE_SPACES:
 274       case '\t':
 275         break;
 276
 277       default:
 278         return c;
 279       }
 280
 281   return 0;
 282 }
 283
 284
 285 \f
 286 /* Token readers.  */
 287
 288 static SCM scm_read_expression (SCM port);
 289 static SCM scm_read_sharp (int chr, SCM port);
 290 static SCM scm_get_hash_procedure (int c);
 291 static SCM recsexpr (SCM obj, long line, int column, SCM filename);
 292
 293
 294 static SCM
 295 scm_read_sexp (int chr, SCM port)
 296 #define FUNC_NAME "scm_i_lreadparen"
 297 {
 298   register int c;
 299   register SCM tmp;
 300   register SCM tl, ans = SCM_EOL;
 301   SCM tl2 = SCM_EOL, ans2 = SCM_EOL, copy = SCM_BOOL_F;
 302   static const int terminating_char = ')';
 303
 304   /* Need to capture line and column numbers here. */
 305   long line = SCM_LINUM (port);
 306   int column = SCM_COL (port) - 1;
 307
 308
 309   c = flush_ws (port, FUNC_NAME);
 310   if (terminating_char == c)
 311     return SCM_EOL;
 312
 313   scm_ungetc (c, port);
 314   if (scm_is_eq (scm_sym_dot,
 315                  (tmp = scm_read_expression (port))))
 316     {
 317       ans = scm_read_expression (port);
 318       if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
 319         scm_i_input_error (FUNC_NAME, port, "missing close paren",
 320                            SCM_EOL);
 321       return ans;
 322     }
 323
 324   /* Build the head of the list structure. */
 325   ans = tl = scm_cons (tmp, SCM_EOL);
 326
 327   if (SCM_COPY_SOURCE_P)
 328     ans2 = tl2 = scm_cons (scm_is_pair (tmp)
 329                            ? copy
 330                            : tmp,
 331                            SCM_EOL);
 332
 333   while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
 334     {
 335       SCM new_tail;
 336
 337       scm_ungetc (c, port);
 338       if (scm_is_eq (scm_sym_dot,
 339                      (tmp = scm_read_expression (port))))
 340         {
 341           SCM_SETCDR (tl, tmp = scm_read_expression (port));
 342
 343           if (SCM_COPY_SOURCE_P)
 344             SCM_SETCDR (tl2, scm_cons (scm_is_pair (tmp) ? copy : tmp,
 345                                        SCM_EOL));
 346
 347           c = flush_ws (port, FUNC_NAME);
 348           if (terminating_char != c)
 349             scm_i_input_error (FUNC_NAME, port,
 350                                "in pair: missing close paren", SCM_EOL);
 351           goto exit;
 352         }
 353
 354       new_tail = scm_cons (tmp, SCM_EOL);
 355       SCM_SETCDR (tl, new_tail);
 356       tl = new_tail;
 357
 358       if (SCM_COPY_SOURCE_P)
 359         {
 360           SCM new_tail2 = scm_cons (scm_is_pair (tmp)
 361                                     ? copy
 362                                     : tmp, SCM_EOL);
 363           SCM_SETCDR (tl2, new_tail2);
 364           tl2 = new_tail2;
 365         }
 366     }
 367
 368  exit:
 369   if (SCM_RECORD_POSITIONS_P)
 370     scm_whash_insert (scm_source_whash,
 371                       ans,
 372                       scm_make_srcprops (line, column,
 373                                          SCM_FILENAME (port),
 374                                          SCM_COPY_SOURCE_P
 375                                          ? ans2
 376                                          : SCM_UNDEFINED,
 377                                          SCM_EOL));
 378   return ans;
 379 }
 380 #undef FUNC_NAME
 381
 382 static SCM
 383 scm_read_string (int chr, SCM port)
 384 #define FUNC_NAME "scm_lreadr"
 385 {
 386   /* For strings smaller than C_STR, this function creates only one Scheme
 387      object (the string returned).  */
 388
 389   SCM str = SCM_BOOL_F;
 390   unsigned c_str_len = 0;
 391   scm_t_wchar c;
 392
 393   str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
 394   while ('"' != (c = scm_getc (port)))
 395     {
 396       if (c == EOF)
 397         {
 398         str_eof:
 399           scm_i_input_error (FUNC_NAME, port,
 400                              "end of file in string constant", SCM_EOL);
 401         }
 402
 403       if (c_str_len + 1 >= scm_i_string_length (str))
 404         {
 405           SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
 406
 407           str = scm_string_append (scm_list_2 (str, addy));
 408         }
 409
 410       if (c == '\\')
 411         {
 412           switch (c = scm_getc (port))
 413             {
 414             case EOF:
 415               goto str_eof;
 416             case '"':
 417             case '\\':
 418               break;
 419 #if SCM_ENABLE_ELISP
 420             case '(':
 421             case ')':
 422               if (SCM_ESCAPED_PARENS_P)
 423                 break;
 424               goto bad_escaped;
 425 #endif
 426             case '\n':
 427               continue;
 428             case '0':
 429               c = '\0';
 430               break;
 431             case 'f':
 432               c = '\f';
 433               break;
 434             case 'n':
 435               c = '\n';
 436               break;
 437             case 'r':
 438               c = '\r';
 439               break;
 440             case 't':
 441               c = '\t';
 442               break;
 443             case 'a':
 444               c = '\007';
 445               break;
 446             case 'v':
 447               c = '\v';
 448               break;
 449             case 'x':
 450               {
 451                 scm_t_wchar a, b;
 452                 a = scm_getc (port);
 453                 if (a == EOF)
 454                   goto str_eof;
 455                 b = scm_getc (port);
 456                 if (b == EOF)
 457                   goto str_eof;
 458                 if ('0' <= a && a <= '9')
 459                   a -= '0';
 460                 else if ('A' <= a && a <= 'F')
 461                   a = a - 'A' + 10;
 462                 else if ('a' <= a && a <= 'f')
 463                   a = a - 'a' + 10;
 464                 else
 465                   {
 466                     c = a;
 467                     goto bad_escaped;
 468                   }
 469                 if ('0' <= b && b <= '9')
 470                   b -= '0';
 471                 else if ('A' <= b && b <= 'F')
 472                   b = b - 'A' + 10;
 473                 else if ('a' <= b && b <= 'f')
 474                   b = b - 'a' + 10;
 475                 else
 476                   {
 477                     c = b;
 478                     goto bad_escaped;
 479                   }
 480                 c = a * 16 + b;
 481                 break;
 482               }
 483             case 'u':
 484               {
 485                 scm_t_wchar a;
 486                 int i;
 487                 c = 0;
 488                 for (i = 0; i < 4; i++)
 489                   {
 490                     a = scm_getc (port);
 491                     if (a == EOF)
 492                       goto str_eof;
 493                     if ('0' <= a && a <= '9')
 494                       a -= '0';
 495                     else if ('A' <= a && a <= 'F')
 496                       a = a - 'A' + 10;
 497                     else if ('a' <= a && a <= 'f')
 498                       a = a - 'a' + 10;
 499                     else
 500                       {
 501                         c = a;
 502                         goto bad_escaped;
 503                       }
 504                     c = c * 16 + a;
 505                   }
 506                 break;
 507               }
 508             case 'U':
 509               {
 510                 scm_t_wchar a;
 511                 int i;
 512                 c = 0;
 513                 for (i = 0; i < 6; i++)
 514                   {
 515                     a = scm_getc (port);
 516                     if (a == EOF)
 517                       goto str_eof;
 518                     if ('0' <= a && a <= '9')
 519                       a -= '0';
 520                     else if ('A' <= a && a <= 'F')
 521                       a = a - 'A' + 10;
 522                     else if ('a' <= a && a <= 'f')
 523                       a = a - 'a' + 10;
 524                     else
 525                       {
 526                         c = a;
 527                         goto bad_escaped;
 528                       }
 529                     c = c * 16 + a;
 530                   }
 531                 break;
 532               }
 533             default:
 534             bad_escaped:
 535               scm_i_input_error (FUNC_NAME, port,
 536                                  "illegal character in escape sequence: ~S",
 537                                  scm_list_1 (SCM_MAKE_CHAR (c)));
 538             }
 539         }
 540       str = scm_i_string_start_writing (str);
 541       scm_i_string_set_x (str, c_str_len++, c);
 542       scm_i_string_stop_writing ();
 543     }
 544
 545   if (c_str_len > 0)
 546     {
 547       return scm_i_substring_copy (str, 0, c_str_len);
 548     }
 549
 550   return scm_nullstr;
 551 }
 552 #undef FUNC_NAME
 553
 554
 555 static SCM
 556 scm_read_number (int chr, SCM port)
 557 {
 558   SCM result, str = SCM_EOL;
 559   char buffer[READER_BUFFER_SIZE];
 560   size_t read;
 561   int overflow = 0;
 562
 563   scm_ungetc (chr, port);
 564   do
 565     {
 566       overflow = read_token (port, buffer, sizeof (buffer), &read);
 567
 568       if ((overflow) || (scm_is_pair (str)))
 569         str = scm_cons (scm_from_locale_stringn (buffer, read), str);
 570     }
 571   while (overflow);
 572
 573   if (scm_is_pair (str))
 574     {
 575       /* The slow path.  */
 576
 577       str = scm_string_concatenate (scm_reverse_x (str, SCM_EOL));
 578       result = scm_string_to_number (str, SCM_UNDEFINED);
 579       if (!scm_is_true (result))
 580         /* Return a symbol instead of a number.  */
 581         result = scm_string_to_symbol (str);
 582     }
 583   else
 584     {
 585       result = scm_c_locale_stringn_to_number (buffer, read, 10);
 586       if (!scm_is_true (result))
 587         /* Return a symbol instead of a number.  */
 588         result = scm_from_locale_symboln (buffer, read);
 589     }
 590
 591   return result;
 592 }
 593
 594 static SCM
 595 scm_read_mixed_case_symbol (int chr, SCM port)
 596 {
 597   SCM result, str = SCM_EOL;
 598   int overflow = 0, ends_with_colon = 0;
 599   char buffer[READER_BUFFER_SIZE];
 600   size_t read = 0;
 601   int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
 602
 603   scm_ungetc (chr, port);
 604   do
 605     {
 606       overflow = read_token (port, buffer, sizeof (buffer), &read);
 607
 608       if (read > 0)
 609         ends_with_colon = (buffer[read - 1] == ':');
 610
 611       if ((overflow) || (scm_is_pair (str)))
 612         str = scm_cons (scm_from_locale_stringn (buffer, read), str);
 613     }
 614   while (overflow);
 615
 616   if (scm_is_pair (str))
 617     {
 618       size_t len;
 619
 620       str = scm_string_concatenate (scm_reverse_x (str, SCM_EOL));
 621       len = scm_c_string_length (str);
 622
 623       /* Per SRFI-88, `:' alone is an identifier, not a keyword.  */
 624       if (postfix && ends_with_colon && (len > 1))
 625         {
 626           /* Strip off colon.  */
 627           str = scm_c_substring (str, 0, len-1);
 628           result = scm_string_to_symbol (str);
 629           result = scm_symbol_to_keyword (result);
 630         }
 631       else
 632         result = scm_string_to_symbol (str);
 633     }
 634   else
 635     {
 636       /* For symbols smaller than `sizeof (buffer)', we don't need to recur
 637          to Scheme strings.  Therefore, we only create one Scheme object (a
 638          symbol) per symbol read.  */
 639       if (postfix && ends_with_colon && (read > 1))
 640         result = scm_from_locale_keywordn (buffer, read - 1);
 641       else
 642         result = scm_from_locale_symboln (buffer, read);
 643     }
 644
 645   return result;
 646 }
 647
 648 static SCM
 649 scm_read_number_and_radix (int chr, SCM port)
 650 #define FUNC_NAME "scm_lreadr"
 651 {
 652   SCM result, str = SCM_EOL;
 653   size_t read;
 654   char buffer[READER_BUFFER_SIZE];
 655   unsigned int radix;
 656   int overflow = 0;
 657
 658   switch (chr)
 659     {
 660     case 'B':
 661     case 'b':
 662       radix = 2;
 663       break;
 664
 665     case 'o':
 666     case 'O':
 667       radix = 8;
 668       break;
 669
 670     case 'd':
 671     case 'D':
 672       radix = 10;
 673       break;
 674
 675     case 'x':
 676     case 'X':
 677       radix = 16;
 678       break;
 679
 680     default:
 681       scm_ungetc (chr, port);
 682       scm_ungetc ('#', port);
 683       radix = 10;
 684     }
 685
 686   do
 687     {
 688       overflow = read_token (port, buffer, sizeof (buffer), &read);
 689
 690       if ((overflow) || (scm_is_pair (str)))
 691         str = scm_cons (scm_from_locale_stringn (buffer, read), str);
 692     }
 693   while (overflow);
 694
 695   if (scm_is_pair (str))
 696     {
 697       str = scm_string_concatenate (scm_reverse_x (str, SCM_EOL));
 698       result = scm_string_to_number (str, scm_from_uint (radix));
 699     }
 700   else
 701     result = scm_c_locale_stringn_to_number (buffer, read, radix);
 702
 703   if (scm_is_true (result))
 704     return result;
 705
 706   scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
 707
 708   return SCM_BOOL_F;
 709 }
 710 #undef FUNC_NAME
 711
 712 static SCM
 713 scm_read_quote (int chr, SCM port)
 714 {
 715   SCM p;
 716   long line = SCM_LINUM (port);
 717   int column = SCM_COL (port) - 1;
 718
 719   switch (chr)
 720     {
 721     case '`':
 722       p = scm_sym_quasiquote;
 723       break;
 724
 725     case '\'':
 726       p = scm_sym_quote;
 727       break;
 728
 729     case ',':
 730       {
 731         int c;
 732
 733         c = scm_getc (port);
 734         if ('@' == c)
 735           p = scm_sym_uq_splicing;
 736         else
 737           {
 738             scm_ungetc (c, port);
 739             p = scm_sym_unquote;
 740           }
 741         break;
 742       }
 743
 744     default:
 745       fprintf (stderr, "%s: unhandled quote character (%i)\n",
 746                "scm_read_quote", chr);
 747       abort ();
 748     }
 749
 750   p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
 751   if (SCM_RECORD_POSITIONS_P)
 752     scm_whash_insert (scm_source_whash, p,
 753                       scm_make_srcprops (line, column,
 754                                          SCM_FILENAME (port),
 755                                          SCM_COPY_SOURCE_P
 756                                          ? (scm_cons2 (SCM_CAR (p),
 757                                                        SCM_CAR (SCM_CDR (p)),
 758                                                        SCM_EOL))
 759                                          : SCM_UNDEFINED,
 760                                          SCM_EOL));
 761
 762
 763   return p;
 764 }
 765
 766 SCM_SYMBOL (sym_syntax, "syntax");
 767 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
 768 SCM_SYMBOL (sym_unsyntax, "unsyntax");
 769 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
 770
 771 static SCM
 772 scm_read_syntax (int chr, SCM port)
 773 {
 774   SCM p;
 775   long line = SCM_LINUM (port);
 776   int column = SCM_COL (port) - 1;
 777
 778   switch (chr)
 779     {
 780     case '`':
 781       p = sym_quasisyntax;
 782       break;
 783
 784     case '\'':
 785       p = sym_syntax;
 786       break;
 787
 788     case ',':
 789       {
 790         int c;
 791
 792         c = scm_getc (port);
 793         if ('@' == c)
 794           p = sym_unsyntax_splicing;
 795         else
 796           {
 797             scm_ungetc (c, port);
 798             p = sym_unsyntax;
 799           }
 800         break;
 801       }
 802
 803     default:
 804       fprintf (stderr, "%s: unhandled syntax character (%i)\n",
 805                "scm_read_syntax", chr);
 806       abort ();
 807     }
 808
 809   p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
 810   if (SCM_RECORD_POSITIONS_P)
 811     scm_whash_insert (scm_source_whash, p,
 812                       scm_make_srcprops (line, column,
 813                                          SCM_FILENAME (port),
 814                                          SCM_COPY_SOURCE_P
 815                                          ? (scm_cons2 (SCM_CAR (p),
 816                                                        SCM_CAR (SCM_CDR (p)),
 817                                                        SCM_EOL))
 818                                          : SCM_UNDEFINED,
 819                                          SCM_EOL));
 820
 821
 822   return p;
 823 }
 824
 825 static inline SCM
 826 scm_read_semicolon_comment (int chr, SCM port)
 827 {
 828   int c;
 829
 830   for (c = scm_getc (port);
 831        (c != EOF) && (c != '\n');
 832        c = scm_getc (port));
 833
 834   return SCM_UNSPECIFIED;
 835 }
 836
 837 \f
 838 /* Sharp readers, i.e. readers called after a `#' sign has been read.  */
 839
 840 static SCM
 841 scm_read_boolean (int chr, SCM port)
 842 {
 843   switch (chr)
 844     {
 845     case 't':
 846     case 'T':
 847       return SCM_BOOL_T;
 848
 849     case 'f':
 850     case 'F':
 851       return SCM_BOOL_F;
 852     }
 853
 854   return SCM_UNSPECIFIED;
 855 }
 856
 857 static SCM
 858 scm_read_character (int chr, SCM port)
 859 #define FUNC_NAME "scm_lreadr"
 860 {
 861   SCM ch;
 862   char charname[READER_CHAR_NAME_MAX_SIZE];
 863   size_t charname_len;
 864
 865   if (read_token (port, charname, sizeof (charname), &charname_len))
 866     goto char_error;
 867
 868   if (charname_len == 0)
 869     {
 870       chr = scm_getc (port);
 871       if (chr == EOF)
 872         scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
 873                            "while reading character", SCM_EOL);
 874
 875       /* CHR must be a token delimiter, like a whitespace.  */
 876       return (SCM_MAKE_CHAR (chr));
 877     }
 878
 879   if (charname_len == 1)
 880     return SCM_MAKE_CHAR (charname[0]);
 881
 882   if (*charname >= '0' && *charname < '8')
 883     {
 884       /* Dirk:FIXME::  This type of character syntax is not R5RS
 885        * compliant.  Further, it should be verified that the constant
 886        * does only consist of octal digits.  Finally, it should be
 887        * checked whether the resulting fixnum is in the range of
 888        * characters.  */
 889       SCM p = scm_c_locale_stringn_to_number (charname, charname_len, 8);
 890       if (SCM_I_INUMP (p))
 891         return SCM_MAKE_CHAR (SCM_I_INUM (p));
 892     }
 893
 894   ch = scm_i_charname_to_char (charname, charname_len);
 895   if (scm_is_true (ch))
 896     return ch;
 897
 898  char_error:
 899   scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
 900                      scm_list_1 (scm_from_locale_stringn (charname,
 901                                                           charname_len)));
 902
 903   return SCM_UNSPECIFIED;
 904 }
 905 #undef FUNC_NAME
 906
 907 static inline SCM
 908 scm_read_keyword (int chr, SCM port)
 909 {
 910   SCM symbol;
 911
 912   /* Read the symbol that comprises the keyword.  Doing this instead of
 913      invoking a specific symbol reader function allows `scm_read_keyword ()'
 914      to adapt to the delimiters currently valid of symbols.
 915
 916      XXX: This implementation allows sloppy syntaxes like `#:  key'.  */
 917   symbol = scm_read_expression (port);
 918   if (!scm_is_symbol (symbol))
 919     scm_i_input_error ("scm_read_keyword", port,
 920                        "keyword prefix `~a' not followed by a symbol: ~s",
 921                        scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
 922
 923   return (scm_symbol_to_keyword (symbol));
 924 }
 925
 926 static inline SCM
 927 scm_read_vector (int chr, SCM port)
 928 {
 929   /* Note: We call `scm_read_sexp ()' rather than READER here in order to
 930      guarantee that it's going to do what we want.  After all, this is an
 931      implementation detail of `scm_read_vector ()', not a desirable
 932      property.  */
 933   return (scm_vector (scm_read_sexp (chr, port)));
 934 }
 935
 936 static inline SCM
 937 scm_read_srfi4_vector (int chr, SCM port)
 938 {
 939   return scm_i_read_array (port, chr);
 940 }
 941
 942 static SCM
 943 scm_read_bytevector (int chr, SCM port)
 944 {
 945   chr = scm_getc (port);
 946   if (chr != 'u')
 947     goto syntax;
 948
 949   chr = scm_getc (port);
 950   if (chr != '8')
 951     goto syntax;
 952
 953   chr = scm_getc (port);
 954   if (chr != '(')
 955     goto syntax;
 956
 957   return scm_u8_list_to_bytevector (scm_read_sexp (chr, port));
 958
 959  syntax:
 960   scm_i_input_error ("read_bytevector", port,
 961                      "invalid bytevector prefix",
 962                      SCM_MAKE_CHAR (chr));
 963   return SCM_UNSPECIFIED;
 964 }
 965
 966 static SCM
 967 scm_read_guile_bit_vector (int chr, SCM port)
 968 {
 969   /* Read the `#*10101'-style read syntax for bit vectors in Guile.  This is
 970      terribly inefficient but who cares?  */
 971   SCM s_bits = SCM_EOL;
 972
 973   for (chr = scm_getc (port);
 974        (chr != EOF) && ((chr == '0') || (chr == '1'));
 975        chr = scm_getc (port))
 976     {
 977       s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
 978     }
 979
 980   if (chr != EOF)
 981     scm_ungetc (chr, port);
 982
 983   return scm_bitvector (scm_reverse_x (s_bits, SCM_EOL));
 984 }
 985
 986 static inline SCM
 987 scm_read_scsh_block_comment (int chr, SCM port)
 988 {
 989   int bang_seen = 0;
 990
 991   for (;;)
 992     {
 993       int c = scm_getc (port);
 994
 995       if (c == EOF)
 996         scm_i_input_error ("skip_block_comment", port,
 997                            "unterminated `#! ... !#' comment", SCM_EOL);
 998
 999       if (c == '!')
1000         bang_seen = 1;
1001       else if (c == '#' && bang_seen)
1002         break;
1003       else
1004         bang_seen = 0;
1005     }
1006
1007   return SCM_UNSPECIFIED;
1008 }
1009
1010 static SCM
1011 scm_read_commented_expression (int chr, SCM port)
1012 {
1013   int c;
1014
1015   c = flush_ws (port, (char *) NULL);
1016   if (EOF == c)
1017     scm_i_input_error ("read_commented_expression", port,
1018                        "no expression after #; comment", SCM_EOL);
1019   scm_ungetc (c, port);
1020   scm_read_expression (port);
1021   return SCM_UNSPECIFIED;
1022 }
1023
1024 static SCM
1025 scm_read_extended_symbol (int chr, SCM port)
1026 {
1027   /* Guile's extended symbol read syntax looks like this:
1028
1029        #{This is all a symbol name}#
1030
1031      So here, CHR is expected to be `{'.  */
1032   SCM result;
1033   int saw_brace = 0, finished = 0;
1034   size_t len = 0;
1035   char buf[1024];
1036
1037   result = scm_c_make_string (0, SCM_MAKE_CHAR ('X'));
1038
1039   while ((chr = scm_getc (port)) != EOF)
1040     {
1041       if (saw_brace)
1042         {
1043           if (chr == '#')
1044             {
1045               finished = 1;
1046               break;
1047             }
1048           else
1049             {
1050               saw_brace = 0;
1051               buf[len++] = '}';
1052               buf[len++] = chr;
1053             }
1054         }
1055       else if (chr == '}')
1056         saw_brace = 1;
1057       else
1058         buf[len++] = chr;
1059
1060       if (len >= sizeof (buf) - 2)
1061         {
1062           scm_string_append (scm_list_2 (result,
1063                                          scm_from_locale_stringn (buf, len)));
1064           len = 0;
1065         }
1066
1067       if (finished)
1068         break;
1069     }
1070
1071   if (len)
1072     result = scm_string_append (scm_list_2
1073                                 (result,
1074                                  scm_from_locale_stringn (buf, len)));
1075
1076   return (scm_string_to_symbol (result));
1077 }
1078
1079
1080 \f
1081 /* Top-level token readers, i.e., dispatchers.  */
1082
1083 static SCM
1084 scm_read_sharp_extension (int chr, SCM port)
1085 {
1086   SCM proc;
1087
1088   proc = scm_get_hash_procedure (chr);
1089   if (scm_is_true (scm_procedure_p (proc)))
1090     {
1091       long line = SCM_LINUM (port);
1092       int column = SCM_COL (port) - 2;
1093       SCM got;
1094
1095       got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1096       if (!scm_is_eq (got, SCM_UNSPECIFIED))
1097         {
1098           if (SCM_RECORD_POSITIONS_P)
1099             return (recsexpr (got, line, column,
1100                               SCM_FILENAME (port)));
1101           else
1102             return got;
1103         }
1104     }
1105
1106   return SCM_UNSPECIFIED;
1107 }
1108
1109 /* The reader for the sharp `#' character.  It basically dispatches reads
1110    among the above token readers.   */
1111 static SCM
1112 scm_read_sharp (int chr, SCM port)
1113 #define FUNC_NAME "scm_lreadr"
1114 {
1115   SCM result;
1116
1117   chr = scm_getc (port);
1118
1119   result = scm_read_sharp_extension (chr, port);
1120   if (!scm_is_eq (result, SCM_UNSPECIFIED))
1121     return result;
1122
1123   switch (chr)
1124     {
1125     case '\\':
1126       return (scm_read_character (chr, port));
1127     case '(':
1128       return (scm_read_vector (chr, port));
1129     case 's':
1130     case 'u':
1131     case 'f':
1132       /* This one may return either a boolean or an SRFI-4 vector.  */
1133       return (scm_read_srfi4_vector (chr, port));
1134     case 'v':
1135       return (scm_read_bytevector (chr, port));
1136     case '*':
1137       return (scm_read_guile_bit_vector (chr, port));
1138     case 't':
1139     case 'T':
1140     case 'F':
1141       /* This one may return either a boolean or an SRFI-4 vector.  */
1142       return (scm_read_boolean (chr, port));
1143     case ':':
1144       return (scm_read_keyword (chr, port));
1145     case '0': case '1': case '2': case '3': case '4':
1146     case '5': case '6': case '7': case '8': case '9':
1147     case '@':
1148 #if SCM_ENABLE_DEPRECATED
1149       /* See below for 'i' and 'e'. */
1150     case 'a':
1151     case 'c':
1152     case 'y':
1153     case 'h':
1154     case 'l':
1155 #endif
1156       return (scm_i_read_array (port, chr));
1157
1158     case 'i':
1159     case 'e':
1160 #if SCM_ENABLE_DEPRECATED
1161       {
1162         /* When next char is '(', it really is an old-style
1163            uniform array. */
1164         int next_c = scm_getc (port);
1165         if (next_c != EOF)
1166           scm_ungetc (next_c, port);
1167         if (next_c == '(')
1168           return scm_i_read_array (port, chr);
1169         /* Fall through. */
1170       }
1171 #endif
1172     case 'b':
1173     case 'B':
1174     case 'o':
1175     case 'O':
1176     case 'd':
1177     case 'D':
1178     case 'x':
1179     case 'X':
1180     case 'I':
1181     case 'E':
1182       return (scm_read_number_and_radix (chr, port));
1183     case '{':
1184       return (scm_read_extended_symbol (chr, port));
1185     case '!':
1186       return (scm_read_scsh_block_comment (chr, port));
1187     case ';':
1188       return (scm_read_commented_expression (chr, port));
1189     case '`':
1190     case '\'':
1191     case ',':
1192       return (scm_read_syntax (chr, port));
1193     default:
1194       result = scm_read_sharp_extension (chr, port);
1195       if (scm_is_eq (result, SCM_UNSPECIFIED))
1196         scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1197                            scm_list_1 (SCM_MAKE_CHAR (chr)));
1198       else
1199         return result;
1200     }
1201
1202   return SCM_UNSPECIFIED;
1203 }
1204 #undef FUNC_NAME
1205
1206 static SCM
1207 scm_read_expression (SCM port)
1208 #define FUNC_NAME "scm_read_expression"
1209 {
1210   while (1)
1211     {
1212       register int chr;
1213
1214       chr = scm_getc (port);
1215
1216       switch (chr)
1217         {
1218         case SCM_WHITE_SPACES:
1219         case SCM_LINE_INCREMENTORS:
1220           break;
1221         case ';':
1222           (void) scm_read_semicolon_comment (chr, port);
1223           break;
1224         case '(':
1225           return (scm_read_sexp (chr, port));
1226         case '"':
1227           return (scm_read_string (chr, port));
1228         case '\'':
1229         case '`':
1230         case ',':
1231           return (scm_read_quote (chr, port));
1232         case '#':
1233           {
1234             SCM result;
1235             result = scm_read_sharp (chr, port);
1236             if (scm_is_eq (result, SCM_UNSPECIFIED))
1237               /* We read a comment or some such.  */
1238               break;
1239             else
1240               return result;
1241           }
1242         case ')':
1243           scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1244           break;
1245         case EOF:
1246           return SCM_EOF_VAL;
1247         case ':':
1248           if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1249             return scm_symbol_to_keyword (scm_read_expression (port));
1250           /* Fall through.  */
1251
1252         default:
1253           {
1254             if (((chr >= '0') && (chr <= '9'))
1255                 || (strchr ("+-.", chr)))
1256               return (scm_read_number (chr, port));
1257             else
1258               return (scm_read_mixed_case_symbol (chr, port));
1259           }
1260         }
1261     }
1262 }
1263 #undef FUNC_NAME
1264
1265 \f
1266 /* Actual reader.  */
1267
1268 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1269             (SCM port),
1270             "Read an s-expression from the input port @var{port}, or from\n"
1271             "the current input port if @var{port} is not specified.\n"
1272             "Any whitespace before the next token is discarded.")
1273 #define FUNC_NAME s_scm_read
1274 {
1275   int c;
1276
1277   if (SCM_UNBNDP (port))
1278     port = scm_current_input_port ();
1279   SCM_VALIDATE_OPINPORT (1, port);
1280
1281   c = flush_ws (port, (char *) NULL);
1282   if (EOF == c)
1283     return SCM_EOF_VAL;
1284   scm_ungetc (c, port);
1285
1286   return (scm_read_expression (port));
1287 }
1288 #undef FUNC_NAME
1289
1290
1291 \f
1292
1293 /* Used when recording expressions constructed by `scm_read_sharp ()'.  */
1294 static SCM
1295 recsexpr (SCM obj, long line, int column, SCM filename)
1296 {
1297   if (!scm_is_pair(obj)) {
1298     return obj;
1299   } else {
1300     SCM tmp = obj, copy;
1301     /* If this sexpr is visible in the read:sharp source, we want to
1302        keep that information, so only record non-constant cons cells
1303        which haven't previously been read by the reader. */
1304     if (scm_is_false (scm_whash_lookup (scm_source_whash, obj)))
1305       {
1306         if (SCM_COPY_SOURCE_P)
1307           {
1308             copy = scm_cons (recsexpr (SCM_CAR (obj), line, column, filename),
1309                              SCM_UNDEFINED);
1310             while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1311               {
1312                 SCM_SETCDR (copy, scm_cons (recsexpr (SCM_CAR (tmp),
1313                                                       line,
1314                                                       column,
1315                                                       filename),
1316                                             SCM_UNDEFINED));
1317                 copy = SCM_CDR (copy);
1318               }
1319             SCM_SETCDR (copy, tmp);
1320           }
1321         else
1322           {
1323             recsexpr (SCM_CAR (obj), line, column, filename);
1324             while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1325               recsexpr (SCM_CAR (tmp), line, column, filename);
1326             copy = SCM_UNDEFINED;
1327           }
1328         scm_whash_insert (scm_source_whash,
1329                           obj,
1330                           scm_make_srcprops (line,
1331                                              column,
1332                                              filename,
1333                                              copy,
1334                                              SCM_EOL));
1335       }
1336     return obj;
1337   }
1338 }
1339
1340 /* Manipulate the read-hash-procedures alist.  This could be written in
1341    Scheme, but maybe it will also be used by C code during initialisation.  */
1342 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1343             (SCM chr, SCM proc),
1344             "Install the procedure @var{proc} for reading expressions\n"
1345             "starting with the character sequence @code{#} and @var{chr}.\n"
1346             "@var{proc} will be called with two arguments:  the character\n"
1347             "@var{chr} and the port to read further data from. The object\n"
1348             "returned will be the return value of @code{read}. \n"
1349             "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1350             )
1351 #define FUNC_NAME s_scm_read_hash_extend
1352 {
1353   SCM this;
1354   SCM prev;
1355
1356   SCM_VALIDATE_CHAR (1, chr);
1357   SCM_ASSERT (scm_is_false (proc)
1358               || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1359               proc, SCM_ARG2, FUNC_NAME);
1360
1361   /* Check if chr is already in the alist.  */
1362   this = *scm_read_hash_procedures;
1363   prev = SCM_BOOL_F;
1364   while (1)
1365     {
1366       if (scm_is_null (this))
1367         {
1368           /* not found, so add it to the beginning.  */
1369           if (scm_is_true (proc))
1370             {
1371               *scm_read_hash_procedures =
1372                 scm_cons (scm_cons (chr, proc), *scm_read_hash_procedures);
1373             }
1374           break;
1375         }
1376       if (scm_is_eq (chr, SCM_CAAR (this)))
1377         {
1378           /* already in the alist.  */
1379           if (scm_is_false (proc))
1380             {
1381               /* remove it.  */
1382               if (scm_is_false (prev))
1383                 {
1384                   *scm_read_hash_procedures =
1385                     SCM_CDR (*scm_read_hash_procedures);
1386                 }
1387               else
1388                 scm_set_cdr_x (prev, SCM_CDR (this));
1389             }
1390           else
1391             {
1392               /* replace it.  */
1393               scm_set_cdr_x (SCM_CAR (this), proc);
1394             }
1395           break;
1396         }
1397       prev = this;
1398       this = SCM_CDR (this);
1399     }
1400
1401   return SCM_UNSPECIFIED;
1402 }
1403 #undef FUNC_NAME
1404
1405 /* Recover the read-hash procedure corresponding to char c.  */
1406 static SCM
1407 scm_get_hash_procedure (int c)
1408 {
1409   SCM rest = *scm_read_hash_procedures;
1410
1411   while (1)
1412     {
1413       if (scm_is_null (rest))
1414         return SCM_BOOL_F;
1415
1416       if (SCM_CHAR (SCM_CAAR (rest)) == c)
1417         return SCM_CDAR (rest);
1418
1419       rest = SCM_CDR (rest);
1420     }
1421 }
1422
1423 void
1424 scm_init_read ()
1425 {
1426   scm_read_hash_procedures =
1427     SCM_VARIABLE_LOC (scm_c_define ("read-hash-procedures", SCM_EOL));
1428
1429   scm_init_opts (scm_read_options, scm_read_opts);
1430 #include "libguile/read.x"
1431 }
1432
1433 /*
1434   Local Variables:
1435   c-file-style: "gnu"
1436   End:
1437 */