libguile/read.c

   1 /* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009 Free Software
   2  * Foundation, Inc.
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public License
   6  * as published by the Free Software Foundation; either version 3 of
   7  * the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful, but
  10  * WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  17  * 02110-1301 USA
  18  */
  19
  20
  21 \f
  22
  23 #ifdef HAVE_CONFIG_H
  24 # include <config.h>
  25 #endif
  26
  27 #include <stdio.h>
  28 #include <ctype.h>
  29 #include <string.h>
  30 #include <unistd.h>
  31 #include <unicase.h>
  32
  33 #include "libguile/_scm.h"
  34 #include "libguile/bytevectors.h"
  35 #include "libguile/chars.h"
  36 #include "libguile/eval.h"
  37 #include "libguile/arrays.h"
  38 #include "libguile/bitvectors.h"
  39 #include "libguile/keywords.h"
  40 #include "libguile/alist.h"
  41 #include "libguile/srcprop.h"
  42 #include "libguile/hashtab.h"
  43 #include "libguile/hash.h"
  44 #include "libguile/ports.h"
  45 #include "libguile/fports.h"
  46 #include "libguile/root.h"
  47 #include "libguile/strings.h"
  48 #include "libguile/strports.h"
  49 #include "libguile/vectors.h"
  50 #include "libguile/validate.h"
  51 #include "libguile/srfi-4.h"
  52 #include "libguile/srfi-13.h"
  53
  54 #include "libguile/read.h"
  55 #include "libguile/private-options.h"
  56
  57
  58 \f
  59
  60 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
  61 SCM_SYMBOL (scm_keyword_prefix, "prefix");
  62 SCM_SYMBOL (scm_keyword_postfix, "postfix");
  63
  64 scm_t_option scm_read_opts[] = {
  65   { SCM_OPTION_BOOLEAN, "copy", 0,
  66     "Copy source code expressions." },
  67   { SCM_OPTION_BOOLEAN, "positions", 0,
  68     "Record positions of source code expressions." },
  69   { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
  70     "Convert symbols to lower case."},
  71   { SCM_OPTION_SCM, "keywords", (unsigned long) SCM_BOOL_F,
  72     "Style of keyword recognition: #f, 'prefix or 'postfix."},
  73 #if SCM_ENABLE_ELISP
  74   { SCM_OPTION_BOOLEAN, "elisp-vectors", 0,
  75     "Support Elisp vector syntax, namely `[...]'."},
  76   { SCM_OPTION_BOOLEAN, "elisp-strings", 0,
  77     "Support `\\(' and `\\)' in strings."},
  78 #endif
  79   { 0, },
  80 };
  81
  82 /*
  83   Give meaningful error messages for errors
  84
  85   We use the format
  86
  87   FILE:LINE:COL: MESSAGE
  88   This happened in ....
  89
  90   This is not standard GNU format, but the test-suite likes the real
  91   message to be in front.
  92
  93  */
  94
  95
  96 void
  97 scm_i_input_error (char const *function,
  98                    SCM port, const char *message, SCM arg)
  99 {
 100   SCM fn = (scm_is_string (SCM_FILENAME(port))
 101             ? SCM_FILENAME(port)
 102             : scm_from_locale_string ("#<unknown port>"));
 103
 104   SCM string_port = scm_open_output_string ();
 105   SCM string = SCM_EOL;
 106   scm_simple_format (string_port,
 107                      scm_from_locale_string ("~A:~S:~S: ~A"),
 108                      scm_list_4 (fn,
 109                                  scm_from_long (SCM_LINUM (port) + 1),
 110                                  scm_from_int (SCM_COL (port) + 1),
 111                                  scm_from_locale_string (message)));
 112
 113   string = scm_get_output_string (string_port);
 114   scm_close_output_port (string_port);
 115   scm_error_scm (scm_from_locale_symbol ("read-error"),
 116                  function? scm_from_locale_string (function) : SCM_BOOL_F,
 117                  string,
 118                  arg,
 119                  SCM_BOOL_F);
 120 }
 121
 122
 123 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
 124             (SCM setting),
 125             "Option interface for the read options. Instead of using\n"
 126             "this procedure directly, use the procedures @code{read-enable},\n"
 127             "@code{read-disable}, @code{read-set!} and @code{read-options}.")
 128 #define FUNC_NAME s_scm_read_options
 129 {
 130   SCM ans = scm_options (setting,
 131                          scm_read_opts,
 132                          FUNC_NAME);
 133   if (SCM_COPY_SOURCE_P)
 134     SCM_RECORD_POSITIONS_P = 1;
 135   return ans;
 136 }
 137 #undef FUNC_NAME
 138
 139 /* An association list mapping extra hash characters to procedures.  */
 140 static SCM *scm_read_hash_procedures;
 141
 142
 143 \f
 144 /* Token readers.  */
 145
 146
 147 /* Size of the C buffer used to read symbols and numbers.  */
 148 #define READER_BUFFER_SIZE            128
 149
 150 /* Size of the C buffer used to read strings.  */
 151 #define READER_STRING_BUFFER_SIZE     512
 152
 153 /* The maximum size of Scheme character names.  */
 154 #define READER_CHAR_NAME_MAX_SIZE      50
 155
 156
 157 /* `isblank' is only in C99.  */
 158 #define CHAR_IS_BLANK_(_chr)                                    \
 159   (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n')      \
 160    || ((_chr) == '\f') || ((_chr) == '\r'))
 161
 162 #ifdef MSDOS
 163 # define CHAR_IS_BLANK(_chr)                    \
 164   ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
 165 #else
 166 # define CHAR_IS_BLANK CHAR_IS_BLANK_
 167 #endif
 168
 169
 170 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
 171    structure'').  */
 172 #define CHAR_IS_R5RS_DELIMITER(c)                               \
 173   (CHAR_IS_BLANK (c)                                            \
 174    || (c == ')') || (c == '(') || (c == ';') || (c == '"'))
 175
 176 #define CHAR_IS_DELIMITER  CHAR_IS_R5RS_DELIMITER
 177
 178 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
 179    Structure''.  */
 180 #define CHAR_IS_EXPONENT_MARKER(_chr)                           \
 181   (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f')        \
 182    || ((_chr) == 'd') || ((_chr) == 'l'))
 183
 184 /* Read an SCSH block comment.  */
 185 static inline SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
 186 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
 187 static SCM scm_read_commented_expression (scm_t_wchar, SCM);
 188 static SCM scm_get_hash_procedure (int);
 189
 190 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Return
 191    zero if the whole token fits in BUF, non-zero otherwise.  */
 192 static inline int
 193 read_token (SCM port, SCM buf, size_t *read)
 194 {
 195   scm_t_wchar chr;
 196   *read = 0;
 197
 198   while (*read < scm_i_string_length (buf))
 199     {
 200       chr = scm_getc (port);
 201
 202       if (chr == EOF)
 203         {
 204           scm_i_string_stop_writing ();
 205           return 0;
 206         }
 207
 208       chr = (SCM_CASE_INSENSITIVE_P ? uc_tolower (chr) : chr);
 209
 210       if (CHAR_IS_DELIMITER (chr))
 211         {
 212           scm_i_string_stop_writing ();
 213           scm_ungetc (chr, port);
 214           return 0;
 215         }
 216
 217       scm_i_string_set_x (buf, *read, chr);
 218       (*read)++;
 219     }
 220
 221   return 1;
 222 }
 223
 224 static SCM
 225 read_complete_token (SCM port, size_t *read)
 226 {
 227   SCM buffer, str = SCM_EOL;
 228   size_t len;
 229   int overflow;
 230
 231   buffer = scm_i_make_string (READER_BUFFER_SIZE, NULL);
 232   overflow = read_token (port, buffer, read);
 233   if (!overflow)
 234     return scm_i_substring (buffer, 0, *read);
 235
 236   str = scm_string_copy (buffer);
 237   do
 238     {
 239       overflow = read_token (port, buffer, &len);
 240       str = scm_string_append (scm_list_2 (str, buffer));
 241       *read += len;
 242     }
 243   while (overflow);
 244
 245   return scm_i_substring (str, 0, *read);
 246 }
 247
 248 /* Skip whitespace from PORT and return the first non-whitespace character
 249    read.  Raise an error on end-of-file.  */
 250 static int
 251 flush_ws (SCM port, const char *eoferr)
 252 {
 253   register scm_t_wchar c;
 254   while (1)
 255     switch (c = scm_getc (port))
 256       {
 257       case EOF:
 258       goteof:
 259         if (eoferr)
 260           {
 261             scm_i_input_error (eoferr,
 262                                port,
 263                                "end of file",
 264                                SCM_EOL);
 265           }
 266         return c;
 267
 268       case ';':
 269       lp:
 270         switch (c = scm_getc (port))
 271           {
 272           case EOF:
 273             goto goteof;
 274           default:
 275             goto lp;
 276           case SCM_LINE_INCREMENTORS:
 277             break;
 278           }
 279         break;
 280
 281       case '#':
 282         switch (c = scm_getc (port))
 283           {
 284           case EOF:
 285             eoferr = "read_sharp";
 286             goto goteof;
 287           case '!':
 288             scm_read_scsh_block_comment (c, port);
 289             break;
 290           case ';':
 291             scm_read_commented_expression (c, port);
 292             break;
 293           case '|':
 294             if (scm_is_false (scm_get_hash_procedure (c)))
 295               {
 296                 scm_read_r6rs_block_comment (c, port);
 297                 break;
 298               }
 299             /* fall through */
 300           default:
 301             scm_ungetc (c, port);
 302             return '#';
 303           }
 304         break;
 305
 306       case SCM_LINE_INCREMENTORS:
 307       case SCM_SINGLE_SPACES:
 308       case '\t':
 309         break;
 310
 311       default:
 312         return c;
 313       }
 314
 315   return 0;
 316 }
 317
 318
 319 \f
 320 /* Token readers.  */
 321
 322 static SCM scm_read_expression (SCM port);
 323 static SCM scm_read_sharp (int chr, SCM port);
 324 static SCM recsexpr (SCM obj, long line, int column, SCM filename);
 325
 326
 327 static SCM
 328 scm_read_sexp (scm_t_wchar chr, SCM port)
 329 #define FUNC_NAME "scm_i_lreadparen"
 330 {
 331   register int c;
 332   register SCM tmp;
 333   register SCM tl, ans = SCM_EOL;
 334   SCM tl2 = SCM_EOL, ans2 = SCM_EOL, copy = SCM_BOOL_F;
 335   static const int terminating_char = ')';
 336
 337   /* Need to capture line and column numbers here. */
 338   long line = SCM_LINUM (port);
 339   int column = SCM_COL (port) - 1;
 340
 341
 342   c = flush_ws (port, FUNC_NAME);
 343   if (terminating_char == c)
 344     return SCM_EOL;
 345
 346   scm_ungetc (c, port);
 347   if (scm_is_eq (scm_sym_dot,
 348                  (tmp = scm_read_expression (port))))
 349     {
 350       ans = scm_read_expression (port);
 351       if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
 352         scm_i_input_error (FUNC_NAME, port, "missing close paren",
 353                            SCM_EOL);
 354       return ans;
 355     }
 356
 357   /* Build the head of the list structure. */
 358   ans = tl = scm_cons (tmp, SCM_EOL);
 359
 360   if (SCM_COPY_SOURCE_P)
 361     ans2 = tl2 = scm_cons (scm_is_pair (tmp)
 362                            ? copy
 363                            : tmp,
 364                            SCM_EOL);
 365
 366   while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
 367     {
 368       SCM new_tail;
 369
 370       scm_ungetc (c, port);
 371       if (scm_is_eq (scm_sym_dot,
 372                      (tmp = scm_read_expression (port))))
 373         {
 374           SCM_SETCDR (tl, tmp = scm_read_expression (port));
 375
 376           if (SCM_COPY_SOURCE_P)
 377             SCM_SETCDR (tl2, scm_cons (scm_is_pair (tmp) ? copy : tmp,
 378                                        SCM_EOL));
 379
 380           c = flush_ws (port, FUNC_NAME);
 381           if (terminating_char != c)
 382             scm_i_input_error (FUNC_NAME, port,
 383                                "in pair: missing close paren", SCM_EOL);
 384           goto exit;
 385         }
 386
 387       new_tail = scm_cons (tmp, SCM_EOL);
 388       SCM_SETCDR (tl, new_tail);
 389       tl = new_tail;
 390
 391       if (SCM_COPY_SOURCE_P)
 392         {
 393           SCM new_tail2 = scm_cons (scm_is_pair (tmp)
 394                                     ? copy
 395                                     : tmp, SCM_EOL);
 396           SCM_SETCDR (tl2, new_tail2);
 397           tl2 = new_tail2;
 398         }
 399     }
 400
 401  exit:
 402   if (SCM_RECORD_POSITIONS_P)
 403     scm_whash_insert (scm_source_whash,
 404                       ans,
 405                       scm_make_srcprops (line, column,
 406                                          SCM_FILENAME (port),
 407                                          SCM_COPY_SOURCE_P
 408                                          ? ans2
 409                                          : SCM_UNDEFINED,
 410                                          SCM_EOL));
 411   return ans;
 412 }
 413 #undef FUNC_NAME
 414
 415 static SCM
 416 scm_read_string (int chr, SCM port)
 417 #define FUNC_NAME "scm_lreadr"
 418 {
 419   /* For strings smaller than C_STR, this function creates only one Scheme
 420      object (the string returned).  */
 421
 422   SCM str = SCM_BOOL_F;
 423   unsigned c_str_len = 0;
 424   scm_t_wchar c;
 425
 426   str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
 427   while ('"' != (c = scm_getc (port)))
 428     {
 429       if (c == EOF)
 430         {
 431         str_eof:
 432           scm_i_input_error (FUNC_NAME, port,
 433                              "end of file in string constant", SCM_EOL);
 434         }
 435
 436       if (c_str_len + 1 >= scm_i_string_length (str))
 437         {
 438           SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
 439
 440           str = scm_string_append (scm_list_2 (str, addy));
 441         }
 442
 443       if (c == '\\')
 444         {
 445           switch (c = scm_getc (port))
 446             {
 447             case EOF:
 448               goto str_eof;
 449             case '"':
 450             case '\\':
 451               break;
 452 #if SCM_ENABLE_ELISP
 453             case '(':
 454             case ')':
 455               if (SCM_ESCAPED_PARENS_P)
 456                 break;
 457               goto bad_escaped;
 458 #endif
 459             case '\n':
 460               continue;
 461             case '0':
 462               c = '\0';
 463               break;
 464             case 'f':
 465               c = '\f';
 466               break;
 467             case 'n':
 468               c = '\n';
 469               break;
 470             case 'r':
 471               c = '\r';
 472               break;
 473             case 't':
 474               c = '\t';
 475               break;
 476             case 'a':
 477               c = '\007';
 478               break;
 479             case 'v':
 480               c = '\v';
 481               break;
 482             case 'x':
 483               {
 484                 scm_t_wchar a, b;
 485                 a = scm_getc (port);
 486                 if (a == EOF)
 487                   goto str_eof;
 488                 b = scm_getc (port);
 489                 if (b == EOF)
 490                   goto str_eof;
 491                 if ('0' <= a && a <= '9')
 492                   a -= '0';
 493                 else if ('A' <= a && a <= 'F')
 494                   a = a - 'A' + 10;
 495                 else if ('a' <= a && a <= 'f')
 496                   a = a - 'a' + 10;
 497                 else
 498                   {
 499                     c = a;
 500                     goto bad_escaped;
 501                   }
 502                 if ('0' <= b && b <= '9')
 503                   b -= '0';
 504                 else if ('A' <= b && b <= 'F')
 505                   b = b - 'A' + 10;
 506                 else if ('a' <= b && b <= 'f')
 507                   b = b - 'a' + 10;
 508                 else
 509                   {
 510                     c = b;
 511                     goto bad_escaped;
 512                   }
 513                 c = a * 16 + b;
 514                 break;
 515               }
 516             case 'u':
 517               {
 518                 scm_t_wchar a;
 519                 int i;
 520                 c = 0;
 521                 for (i = 0; i < 4; i++)
 522                   {
 523                     a = scm_getc (port);
 524                     if (a == EOF)
 525                       goto str_eof;
 526                     if ('0' <= a && a <= '9')
 527                       a -= '0';
 528                     else if ('A' <= a && a <= 'F')
 529                       a = a - 'A' + 10;
 530                     else if ('a' <= a && a <= 'f')
 531                       a = a - 'a' + 10;
 532                     else
 533                       {
 534                         c = a;
 535                         goto bad_escaped;
 536                       }
 537                     c = c * 16 + a;
 538                   }
 539                 break;
 540               }
 541             case 'U':
 542               {
 543                 scm_t_wchar a;
 544                 int i;
 545                 c = 0;
 546                 for (i = 0; i < 6; i++)
 547                   {
 548                     a = scm_getc (port);
 549                     if (a == EOF)
 550                       goto str_eof;
 551                     if ('0' <= a && a <= '9')
 552                       a -= '0';
 553                     else if ('A' <= a && a <= 'F')
 554                       a = a - 'A' + 10;
 555                     else if ('a' <= a && a <= 'f')
 556                       a = a - 'a' + 10;
 557                     else
 558                       {
 559                         c = a;
 560                         goto bad_escaped;
 561                       }
 562                     c = c * 16 + a;
 563                   }
 564                 break;
 565               }
 566             default:
 567             bad_escaped:
 568               scm_i_input_error (FUNC_NAME, port,
 569                                  "illegal character in escape sequence: ~S",
 570                                  scm_list_1 (SCM_MAKE_CHAR (c)));
 571             }
 572         }
 573       str = scm_i_string_start_writing (str);
 574       scm_i_string_set_x (str, c_str_len++, c);
 575       scm_i_string_stop_writing ();
 576     }
 577
 578   if (c_str_len > 0)
 579     {
 580       return scm_i_substring_copy (str, 0, c_str_len);
 581     }
 582
 583   return scm_nullstr;
 584 }
 585 #undef FUNC_NAME
 586
 587
 588 static SCM
 589 scm_read_number (scm_t_wchar chr, SCM port)
 590 {
 591   SCM result;
 592   SCM buffer;
 593   size_t read;
 594
 595   scm_ungetc (chr, port);
 596   buffer = read_complete_token (port, &read);
 597   result = scm_string_to_number (buffer, SCM_UNDEFINED);
 598   if (!scm_is_true (result))
 599     /* Return a symbol instead of a number.  */
 600     result = scm_string_to_symbol (buffer);
 601
 602   return result;
 603 }
 604
 605 static SCM
 606 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
 607 {
 608   SCM result;
 609   int ends_with_colon = 0;
 610   SCM buffer;
 611   size_t read = 0;
 612   int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
 613
 614   scm_ungetc (chr, port);
 615   buffer = read_complete_token (port, &read);
 616   if (read > 0)
 617     ends_with_colon = scm_i_string_ref (buffer, read - 1) == ':';
 618
 619   if (postfix && ends_with_colon && (read > 1))
 620     result = scm_symbol_to_keyword (scm_string_to_symbol (scm_i_substring (buffer, 0, read - 1)));
 621   else
 622     result = scm_string_to_symbol (buffer);
 623
 624   return result;
 625 }
 626
 627 static SCM
 628 scm_read_number_and_radix (scm_t_wchar chr, SCM port)
 629 #define FUNC_NAME "scm_lreadr"
 630 {
 631   SCM result;
 632   size_t read;
 633   SCM buffer = scm_i_make_string (READER_BUFFER_SIZE, NULL);
 634   unsigned int radix;
 635
 636   switch (chr)
 637     {
 638     case 'B':
 639     case 'b':
 640       radix = 2;
 641       break;
 642
 643     case 'o':
 644     case 'O':
 645       radix = 8;
 646       break;
 647
 648     case 'd':
 649     case 'D':
 650       radix = 10;
 651       break;
 652
 653     case 'x':
 654     case 'X':
 655       radix = 16;
 656       break;
 657
 658     default:
 659       scm_ungetc (chr, port);
 660       scm_ungetc ('#', port);
 661       radix = 10;
 662     }
 663
 664   buffer = read_complete_token (port, &read);
 665   result = scm_string_to_number (buffer, scm_from_uint (radix));
 666
 667   if (scm_is_true (result))
 668     return result;
 669
 670   scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
 671
 672   return SCM_BOOL_F;
 673 }
 674 #undef FUNC_NAME
 675
 676 static SCM
 677 scm_read_quote (int chr, SCM port)
 678 {
 679   SCM p;
 680   long line = SCM_LINUM (port);
 681   int column = SCM_COL (port) - 1;
 682
 683   switch (chr)
 684     {
 685     case '`':
 686       p = scm_sym_quasiquote;
 687       break;
 688
 689     case '\'':
 690       p = scm_sym_quote;
 691       break;
 692
 693     case ',':
 694       {
 695         scm_t_wchar c;
 696
 697         c = scm_getc (port);
 698         if ('@' == c)
 699           p = scm_sym_uq_splicing;
 700         else
 701           {
 702             scm_ungetc (c, port);
 703             p = scm_sym_unquote;
 704           }
 705         break;
 706       }
 707
 708     default:
 709       fprintf (stderr, "%s: unhandled quote character (%i)\n",
 710                "scm_read_quote", chr);
 711       abort ();
 712     }
 713
 714   p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
 715   if (SCM_RECORD_POSITIONS_P)
 716     scm_whash_insert (scm_source_whash, p,
 717                       scm_make_srcprops (line, column,
 718                                          SCM_FILENAME (port),
 719                                          SCM_COPY_SOURCE_P
 720                                          ? (scm_cons2 (SCM_CAR (p),
 721                                                        SCM_CAR (SCM_CDR (p)),
 722                                                        SCM_EOL))
 723                                          : SCM_UNDEFINED,
 724                                          SCM_EOL));
 725
 726
 727   return p;
 728 }
 729
 730 SCM_SYMBOL (sym_syntax, "syntax");
 731 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
 732 SCM_SYMBOL (sym_unsyntax, "unsyntax");
 733 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
 734
 735 static SCM
 736 scm_read_syntax (int chr, SCM port)
 737 {
 738   SCM p;
 739   long line = SCM_LINUM (port);
 740   int column = SCM_COL (port) - 1;
 741
 742   switch (chr)
 743     {
 744     case '`':
 745       p = sym_quasisyntax;
 746       break;
 747
 748     case '\'':
 749       p = sym_syntax;
 750       break;
 751
 752     case ',':
 753       {
 754         int c;
 755
 756         c = scm_getc (port);
 757         if ('@' == c)
 758           p = sym_unsyntax_splicing;
 759         else
 760           {
 761             scm_ungetc (c, port);
 762             p = sym_unsyntax;
 763           }
 764         break;
 765       }
 766
 767     default:
 768       fprintf (stderr, "%s: unhandled syntax character (%i)\n",
 769                "scm_read_syntax", chr);
 770       abort ();
 771     }
 772
 773   p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
 774   if (SCM_RECORD_POSITIONS_P)
 775     scm_whash_insert (scm_source_whash, p,
 776                       scm_make_srcprops (line, column,
 777                                          SCM_FILENAME (port),
 778                                          SCM_COPY_SOURCE_P
 779                                          ? (scm_cons2 (SCM_CAR (p),
 780                                                        SCM_CAR (SCM_CDR (p)),
 781                                                        SCM_EOL))
 782                                          : SCM_UNDEFINED,
 783                                          SCM_EOL));
 784
 785
 786   return p;
 787 }
 788
 789 static inline SCM
 790 scm_read_semicolon_comment (int chr, SCM port)
 791 {
 792   int c;
 793
 794   /* We use the get_byte here because there is no need to get the
 795      locale correct with comment input. This presumes that newline
 796      always represents itself no matter what the encoding is.  */
 797   for (c = scm_get_byte_or_eof (port);
 798        (c != EOF) && (c != '\n');
 799        c = scm_getc (port));
 800
 801   return SCM_UNSPECIFIED;
 802 }
 803
 804 \f
 805 /* Sharp readers, i.e. readers called after a `#' sign has been read.  */
 806
 807 static SCM
 808 scm_read_boolean (int chr, SCM port)
 809 {
 810   switch (chr)
 811     {
 812     case 't':
 813     case 'T':
 814       return SCM_BOOL_T;
 815
 816     case 'f':
 817     case 'F':
 818       return SCM_BOOL_F;
 819     }
 820
 821   return SCM_UNSPECIFIED;
 822 }
 823
 824 static SCM
 825 scm_read_character (scm_t_wchar chr, SCM port)
 826 #define FUNC_NAME "scm_lreadr"
 827 {
 828   SCM charname = scm_i_make_string (READER_CHAR_NAME_MAX_SIZE, NULL);
 829   size_t charname_len;
 830   scm_t_wchar cp;
 831   int overflow;
 832
 833   overflow = read_token (port, charname, &charname_len);
 834   charname = scm_c_substring (charname, 0, charname_len);
 835
 836   if (overflow)
 837     goto char_error;
 838
 839   if (charname_len == 0)
 840     {
 841       chr = scm_getc (port);
 842       if (chr == EOF)
 843         scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
 844                            "while reading character", SCM_EOL);
 845
 846       /* CHR must be a token delimiter, like a whitespace.  */
 847       return (SCM_MAKE_CHAR (chr));
 848     }
 849
 850   if (charname_len == 1)
 851     return SCM_MAKE_CHAR (scm_i_string_ref (charname, 0));
 852
 853   cp = scm_i_string_ref (charname, 0);
 854   if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
 855     return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
 856
 857   if (cp >= '0' && cp < '8')
 858     {
 859       /* Dirk:FIXME::  This type of character syntax is not R5RS
 860        * compliant.  Further, it should be verified that the constant
 861        * does only consist of octal digits.  */
 862       SCM p = scm_string_to_number (charname, scm_from_uint (8));
 863       if (SCM_I_INUMP (p))
 864         {
 865           scm_t_wchar c = SCM_I_INUM (p);
 866           if (SCM_IS_UNICODE_CHAR (c))
 867             return SCM_MAKE_CHAR (c);
 868           else
 869             scm_i_input_error (FUNC_NAME, port,
 870                                "out-of-range octal character escape: ~a",
 871                                scm_list_1 (charname));
 872         }
 873     }
 874
 875   /* The names of characters should never have non-Latin1
 876      characters.  */
 877   if (scm_i_is_narrow_string (charname)
 878       || scm_i_try_narrow_string (charname))
 879     { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
 880                                        charname_len);
 881       if (scm_is_true (ch))
 882         return ch;
 883     }
 884
 885  char_error:
 886   scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
 887                      scm_list_1 (charname));
 888
 889   return SCM_UNSPECIFIED;
 890 }
 891 #undef FUNC_NAME
 892
 893 static inline SCM
 894 scm_read_keyword (int chr, SCM port)
 895 {
 896   SCM symbol;
 897
 898   /* Read the symbol that comprises the keyword.  Doing this instead of
 899      invoking a specific symbol reader function allows `scm_read_keyword ()'
 900      to adapt to the delimiters currently valid of symbols.
 901
 902      XXX: This implementation allows sloppy syntaxes like `#:  key'.  */
 903   symbol = scm_read_expression (port);
 904   if (!scm_is_symbol (symbol))
 905     scm_i_input_error ("scm_read_keyword", port,
 906                        "keyword prefix `~a' not followed by a symbol: ~s",
 907                        scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
 908
 909   return (scm_symbol_to_keyword (symbol));
 910 }
 911
 912 static inline SCM
 913 scm_read_vector (int chr, SCM port)
 914 {
 915   /* Note: We call `scm_read_sexp ()' rather than READER here in order to
 916      guarantee that it's going to do what we want.  After all, this is an
 917      implementation detail of `scm_read_vector ()', not a desirable
 918      property.  */
 919   return (scm_vector (scm_read_sexp (chr, port)));
 920 }
 921
 922 static inline SCM
 923 scm_read_srfi4_vector (int chr, SCM port)
 924 {
 925   return scm_i_read_array (port, chr);
 926 }
 927
 928 static SCM
 929 scm_read_bytevector (scm_t_wchar chr, SCM port)
 930 {
 931   chr = scm_getc (port);
 932   if (chr != 'u')
 933     goto syntax;
 934
 935   chr = scm_getc (port);
 936   if (chr != '8')
 937     goto syntax;
 938
 939   chr = scm_getc (port);
 940   if (chr != '(')
 941     goto syntax;
 942
 943   return scm_u8_list_to_bytevector (scm_read_sexp (chr, port));
 944
 945  syntax:
 946   scm_i_input_error ("read_bytevector", port,
 947                      "invalid bytevector prefix",
 948                      SCM_MAKE_CHAR (chr));
 949   return SCM_UNSPECIFIED;
 950 }
 951
 952 static SCM
 953 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port)
 954 {
 955   /* Read the `#*10101'-style read syntax for bit vectors in Guile.  This is
 956      terribly inefficient but who cares?  */
 957   SCM s_bits = SCM_EOL;
 958
 959   for (chr = scm_getc (port);
 960        (chr != EOF) && ((chr == '0') || (chr == '1'));
 961        chr = scm_getc (port))
 962     {
 963       s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
 964     }
 965
 966   if (chr != EOF)
 967     scm_ungetc (chr, port);
 968
 969   return scm_bitvector (scm_reverse_x (s_bits, SCM_EOL));
 970 }
 971
 972 static inline SCM
 973 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
 974 {
 975   int bang_seen = 0;
 976
 977   /* We can use the get_byte here because there is no need to get the
 978      locale correct when reading comments. This presumes that
 979      hash and exclamation points always represent themselves no
 980      matter what the source encoding is.*/
 981   for (;;)
 982     {
 983       int c = scm_get_byte_or_eof (port);
 984
 985       if (c == EOF)
 986         scm_i_input_error ("skip_block_comment", port,
 987                            "unterminated `#! ... !#' comment", SCM_EOL);
 988
 989       if (c == '!')
 990         bang_seen = 1;
 991       else if (c == '#' && bang_seen)
 992         break;
 993       else
 994         bang_seen = 0;
 995     }
 996
 997   return SCM_UNSPECIFIED;
 998 }
 999
1000 static SCM
1001 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1002 {
1003   /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1004      nested.  So care must be taken.  */
1005   int nesting_level = 1;
1006   int opening_seen = 0, closing_seen = 0;
1007
1008   while (nesting_level > 0)
1009     {
1010       int c = scm_getc (port);
1011
1012       if (c == EOF)
1013         scm_i_input_error ("scm_read_r6rs_block_comment", port,
1014                            "unterminated `#| ... |#' comment", SCM_EOL);
1015
1016       if (opening_seen)
1017         {
1018           if (c == '|')
1019             nesting_level++;
1020           opening_seen = 0;
1021         }
1022       else if (closing_seen)
1023         {
1024           if (c == '#')
1025             nesting_level--;
1026           closing_seen = 0;
1027         }
1028       else if (c == '|')
1029         closing_seen = 1;
1030       else if (c == '#')
1031         opening_seen = 1;
1032       else
1033         opening_seen = closing_seen = 0;
1034     }
1035
1036   return SCM_UNSPECIFIED;
1037 }
1038
1039 static SCM
1040 scm_read_commented_expression (scm_t_wchar chr, SCM port)
1041 {
1042   scm_t_wchar c;
1043
1044   c = flush_ws (port, (char *) NULL);
1045   if (EOF == c)
1046     scm_i_input_error ("read_commented_expression", port,
1047                        "no expression after #; comment", SCM_EOL);
1048   scm_ungetc (c, port);
1049   scm_read_expression (port);
1050   return SCM_UNSPECIFIED;
1051 }
1052
1053 static SCM
1054 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1055 {
1056   /* Guile's extended symbol read syntax looks like this:
1057
1058        #{This is all a symbol name}#
1059
1060      So here, CHR is expected to be `{'.  */
1061   int saw_brace = 0, finished = 0;
1062   size_t len = 0;
1063   SCM buf = scm_i_make_string (1024, NULL);
1064
1065   buf = scm_i_string_start_writing (buf);
1066
1067   while ((chr = scm_getc (port)) != EOF)
1068     {
1069       if (saw_brace)
1070         {
1071           if (chr == '#')
1072             {
1073               finished = 1;
1074               break;
1075             }
1076           else
1077             {
1078               saw_brace = 0;
1079               scm_i_string_set_x (buf, len++, '}');
1080               scm_i_string_set_x (buf, len++, chr);
1081             }
1082         }
1083       else if (chr == '}')
1084         saw_brace = 1;
1085       else
1086         scm_i_string_set_x (buf, len++, chr);
1087
1088       if (len >= scm_i_string_length (buf) - 2)
1089         {
1090           SCM addy;
1091
1092           scm_i_string_stop_writing ();
1093           addy = scm_i_make_string (1024, NULL);
1094           buf = scm_string_append (scm_list_2 (buf, addy));
1095           len = 0;
1096           buf = scm_i_string_start_writing (buf);
1097         }
1098
1099       if (finished)
1100         break;
1101     }
1102   scm_i_string_stop_writing ();
1103
1104   return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1105 }
1106
1107
1108 \f
1109 /* Top-level token readers, i.e., dispatchers.  */
1110
1111 static SCM
1112 scm_read_sharp_extension (int chr, SCM port)
1113 {
1114   SCM proc;
1115
1116   proc = scm_get_hash_procedure (chr);
1117   if (scm_is_true (scm_procedure_p (proc)))
1118     {
1119       long line = SCM_LINUM (port);
1120       int column = SCM_COL (port) - 2;
1121       SCM got;
1122
1123       got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1124       if (!scm_is_eq (got, SCM_UNSPECIFIED))
1125         {
1126           if (SCM_RECORD_POSITIONS_P)
1127             return (recsexpr (got, line, column,
1128                               SCM_FILENAME (port)));
1129           else
1130             return got;
1131         }
1132     }
1133
1134   return SCM_UNSPECIFIED;
1135 }
1136
1137 /* The reader for the sharp `#' character.  It basically dispatches reads
1138    among the above token readers.   */
1139 static SCM
1140 scm_read_sharp (scm_t_wchar chr, SCM port)
1141 #define FUNC_NAME "scm_lreadr"
1142 {
1143   SCM result;
1144
1145   chr = scm_getc (port);
1146
1147   result = scm_read_sharp_extension (chr, port);
1148   if (!scm_is_eq (result, SCM_UNSPECIFIED))
1149     return result;
1150
1151   switch (chr)
1152     {
1153     case '\\':
1154       return (scm_read_character (chr, port));
1155     case '(':
1156       return (scm_read_vector (chr, port));
1157     case 's':
1158     case 'u':
1159     case 'f':
1160       /* This one may return either a boolean or an SRFI-4 vector.  */
1161       return (scm_read_srfi4_vector (chr, port));
1162     case 'v':
1163       return (scm_read_bytevector (chr, port));
1164     case '*':
1165       return (scm_read_guile_bit_vector (chr, port));
1166     case 't':
1167     case 'T':
1168     case 'F':
1169       /* This one may return either a boolean or an SRFI-4 vector.  */
1170       return (scm_read_boolean (chr, port));
1171     case ':':
1172       return (scm_read_keyword (chr, port));
1173     case '0': case '1': case '2': case '3': case '4':
1174     case '5': case '6': case '7': case '8': case '9':
1175     case '@':
1176 #if SCM_ENABLE_DEPRECATED
1177       /* See below for 'i' and 'e'. */
1178     case 'a':
1179     case 'c':
1180     case 'y':
1181     case 'h':
1182     case 'l':
1183 #endif
1184       return (scm_i_read_array (port, chr));
1185
1186     case 'i':
1187     case 'e':
1188 #if SCM_ENABLE_DEPRECATED
1189       {
1190         /* When next char is '(', it really is an old-style
1191            uniform array. */
1192         scm_t_wchar next_c = scm_getc (port);
1193         if (next_c != EOF)
1194           scm_ungetc (next_c, port);
1195         if (next_c == '(')
1196           return scm_i_read_array (port, chr);
1197         /* Fall through. */
1198       }
1199 #endif
1200     case 'b':
1201     case 'B':
1202     case 'o':
1203     case 'O':
1204     case 'd':
1205     case 'D':
1206     case 'x':
1207     case 'X':
1208     case 'I':
1209     case 'E':
1210       return (scm_read_number_and_radix (chr, port));
1211     case '{':
1212       return (scm_read_extended_symbol (chr, port));
1213     case '!':
1214       return (scm_read_scsh_block_comment (chr, port));
1215     case ';':
1216       return (scm_read_commented_expression (chr, port));
1217     case '`':
1218     case '\'':
1219     case ',':
1220       return (scm_read_syntax (chr, port));
1221     default:
1222       result = scm_read_sharp_extension (chr, port);
1223       if (scm_is_eq (result, SCM_UNSPECIFIED))
1224         {
1225           /* To remain compatible with 1.8 and earlier, the following
1226              characters have lower precedence than `read-hash-extend'
1227              characters.  */
1228           switch (chr)
1229             {
1230             case '|':
1231               return scm_read_r6rs_block_comment (chr, port);
1232             default:
1233               scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1234                                  scm_list_1 (SCM_MAKE_CHAR (chr)));
1235             }
1236         }
1237       else
1238         return result;
1239     }
1240
1241   return SCM_UNSPECIFIED;
1242 }
1243 #undef FUNC_NAME
1244
1245 static SCM
1246 scm_read_expression (SCM port)
1247 #define FUNC_NAME "scm_read_expression"
1248 {
1249   while (1)
1250     {
1251       register scm_t_wchar chr;
1252
1253       chr = scm_getc (port);
1254
1255       switch (chr)
1256         {
1257         case SCM_WHITE_SPACES:
1258         case SCM_LINE_INCREMENTORS:
1259           break;
1260         case ';':
1261           (void) scm_read_semicolon_comment (chr, port);
1262           break;
1263         case '(':
1264           return (scm_read_sexp (chr, port));
1265         case '"':
1266           return (scm_read_string (chr, port));
1267         case '\'':
1268         case '`':
1269         case ',':
1270           return (scm_read_quote (chr, port));
1271         case '#':
1272           {
1273             SCM result;
1274             result = scm_read_sharp (chr, port);
1275             if (scm_is_eq (result, SCM_UNSPECIFIED))
1276               /* We read a comment or some such.  */
1277               break;
1278             else
1279               return result;
1280           }
1281         case ')':
1282           scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1283           break;
1284         case EOF:
1285           return SCM_EOF_VAL;
1286         case ':':
1287           if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1288             return scm_symbol_to_keyword (scm_read_expression (port));
1289           /* Fall through.  */
1290
1291         default:
1292           {
1293             if (((chr >= '0') && (chr <= '9'))
1294                 || (strchr ("+-.", chr)))
1295               return (scm_read_number (chr, port));
1296             else
1297               return (scm_read_mixed_case_symbol (chr, port));
1298           }
1299         }
1300     }
1301 }
1302 #undef FUNC_NAME
1303
1304 \f
1305 /* Actual reader.  */
1306
1307 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1308             (SCM port),
1309             "Read an s-expression from the input port @var{port}, or from\n"
1310             "the current input port if @var{port} is not specified.\n"
1311             "Any whitespace before the next token is discarded.")
1312 #define FUNC_NAME s_scm_read
1313 {
1314   int c;
1315
1316   if (SCM_UNBNDP (port))
1317     port = scm_current_input_port ();
1318   SCM_VALIDATE_OPINPORT (1, port);
1319
1320   c = flush_ws (port, (char *) NULL);
1321   if (EOF == c)
1322     return SCM_EOF_VAL;
1323   scm_ungetc (c, port);
1324
1325   return (scm_read_expression (port));
1326 }
1327 #undef FUNC_NAME
1328
1329
1330 \f
1331
1332 /* Used when recording expressions constructed by `scm_read_sharp ()'.  */
1333 static SCM
1334 recsexpr (SCM obj, long line, int column, SCM filename)
1335 {
1336   if (!scm_is_pair(obj)) {
1337     return obj;
1338   } else {
1339     SCM tmp = obj, copy;
1340     /* If this sexpr is visible in the read:sharp source, we want to
1341        keep that information, so only record non-constant cons cells
1342        which haven't previously been read by the reader. */
1343     if (scm_is_false (scm_whash_lookup (scm_source_whash, obj)))
1344       {
1345         if (SCM_COPY_SOURCE_P)
1346           {
1347             copy = scm_cons (recsexpr (SCM_CAR (obj), line, column, filename),
1348                              SCM_UNDEFINED);
1349             while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1350               {
1351                 SCM_SETCDR (copy, scm_cons (recsexpr (SCM_CAR (tmp),
1352                                                       line,
1353                                                       column,
1354                                                       filename),
1355                                             SCM_UNDEFINED));
1356                 copy = SCM_CDR (copy);
1357               }
1358             SCM_SETCDR (copy, tmp);
1359           }
1360         else
1361           {
1362             recsexpr (SCM_CAR (obj), line, column, filename);
1363             while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1364               recsexpr (SCM_CAR (tmp), line, column, filename);
1365             copy = SCM_UNDEFINED;
1366           }
1367         scm_whash_insert (scm_source_whash,
1368                           obj,
1369                           scm_make_srcprops (line,
1370                                              column,
1371                                              filename,
1372                                              copy,
1373                                              SCM_EOL));
1374       }
1375     return obj;
1376   }
1377 }
1378
1379 /* Manipulate the read-hash-procedures alist.  This could be written in
1380    Scheme, but maybe it will also be used by C code during initialisation.  */
1381 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1382             (SCM chr, SCM proc),
1383             "Install the procedure @var{proc} for reading expressions\n"
1384             "starting with the character sequence @code{#} and @var{chr}.\n"
1385             "@var{proc} will be called with two arguments:  the character\n"
1386             "@var{chr} and the port to read further data from. The object\n"
1387             "returned will be the return value of @code{read}. \n"
1388             "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1389             )
1390 #define FUNC_NAME s_scm_read_hash_extend
1391 {
1392   SCM this;
1393   SCM prev;
1394
1395   SCM_VALIDATE_CHAR (1, chr);
1396   SCM_ASSERT (scm_is_false (proc)
1397               || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1398               proc, SCM_ARG2, FUNC_NAME);
1399
1400   /* Check if chr is already in the alist.  */
1401   this = *scm_read_hash_procedures;
1402   prev = SCM_BOOL_F;
1403   while (1)
1404     {
1405       if (scm_is_null (this))
1406         {
1407           /* not found, so add it to the beginning.  */
1408           if (scm_is_true (proc))
1409             {
1410               *scm_read_hash_procedures =
1411                 scm_cons (scm_cons (chr, proc), *scm_read_hash_procedures);
1412             }
1413           break;
1414         }
1415       if (scm_is_eq (chr, SCM_CAAR (this)))
1416         {
1417           /* already in the alist.  */
1418           if (scm_is_false (proc))
1419             {
1420               /* remove it.  */
1421               if (scm_is_false (prev))
1422                 {
1423                   *scm_read_hash_procedures =
1424                     SCM_CDR (*scm_read_hash_procedures);
1425                 }
1426               else
1427                 scm_set_cdr_x (prev, SCM_CDR (this));
1428             }
1429           else
1430             {
1431               /* replace it.  */
1432               scm_set_cdr_x (SCM_CAR (this), proc);
1433             }
1434           break;
1435         }
1436       prev = this;
1437       this = SCM_CDR (this);
1438     }
1439
1440   return SCM_UNSPECIFIED;
1441 }
1442 #undef FUNC_NAME
1443
1444 /* Recover the read-hash procedure corresponding to char c.  */
1445 static SCM
1446 scm_get_hash_procedure (int c)
1447 {
1448   SCM rest = *scm_read_hash_procedures;
1449
1450   while (1)
1451     {
1452       if (scm_is_null (rest))
1453         return SCM_BOOL_F;
1454
1455       if (SCM_CHAR (SCM_CAAR (rest)) == c)
1456         return SCM_CDAR (rest);
1457
1458       rest = SCM_CDR (rest);
1459     }
1460 }
1461
1462 #define SCM_ENCODING_SEARCH_SIZE (500)
1463
1464 /* Search the first few hundred characters of a file for an Emacs-like coding
1465    declaration.  Returns either NULL or a string whose storage has been
1466    allocated with `scm_gc_malloc ()'.  */
1467 char *
1468 scm_i_scan_for_encoding (SCM port)
1469 {
1470   char header[SCM_ENCODING_SEARCH_SIZE+1];
1471   size_t bytes_read;
1472   char *encoding = NULL;
1473   int utf8_bom = 0;
1474   char *pos;
1475   int i;
1476   int in_comment;
1477
1478   if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1479     /* PORT is a non-seekable file port (e.g., as created by Bash when using
1480        "guile <(echo '(display "hello")')") so bail out.  */
1481     return NULL;
1482
1483   bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1484
1485   scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1486
1487   if (bytes_read > 3
1488       && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1489     utf8_bom = 1;
1490
1491   /* search past "coding[:=]" */
1492   pos = header;
1493   while (1)
1494     {
1495       if ((pos = strstr(pos, "coding")) == NULL)
1496         return NULL;
1497
1498       pos += strlen("coding");
1499       if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1500           (*pos == ':' || *pos == '='))
1501         {
1502           pos ++;
1503           break;
1504         }
1505     }
1506
1507   /* skip spaces */
1508   while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1509          (*pos == ' ' || *pos == '\t'))
1510     pos ++;
1511
1512   /* grab the next token */
1513   i = 0;
1514   while (pos + i - header <= SCM_ENCODING_SEARCH_SIZE
1515          && pos + i - header < bytes_read
1516          && (isalnum ((int) pos[i]) || strchr ("_-.:/,+=()", pos[i]) != NULL))
1517     i++;
1518
1519   if (i == 0)
1520     return NULL;
1521
1522   encoding = scm_gc_strndup (pos, i, "encoding");
1523   for (i = 0; i < strlen (encoding); i++)
1524     encoding[i] = toupper ((int) encoding[i]);
1525
1526   /* push backwards to make sure we were in a comment */
1527   in_comment = 0;
1528   while (pos - i - header > 0)
1529     {
1530       if (*(pos - i) == '\n')
1531         {
1532           /* This wasn't in a semicolon comment. Check for a
1533            hash-bang comment. */
1534           char *beg = strstr (header, "#!");
1535           char *end = strstr (header, "!#");
1536           if (beg < pos && pos < end)
1537             in_comment = 1;
1538           break;
1539         }
1540       if (*(pos - i) == ';')
1541         {
1542           in_comment = 1;
1543           break;
1544         }
1545       i ++;
1546     }
1547   if (!in_comment)
1548     /* This wasn't in a comment */
1549     return NULL;
1550
1551   if (utf8_bom && strcmp(encoding, "UTF-8"))
1552     scm_misc_error (NULL,
1553                     "the port input declares the encoding ~s but is encoded as UTF-8",
1554                     scm_list_1 (scm_from_locale_string (encoding)));
1555
1556   return encoding;
1557 }
1558
1559 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1560             (SCM port),
1561             "Scans the port for an Emacs-like character coding declaration\n"
1562             "near the top of the contents of a port with random-acessible contents.\n"
1563             "The coding declaration is of the form\n"
1564             "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1565             "\n"
1566             "Returns a string containing the character encoding of the file\n"
1567             "if a declaration was found, or @code{#f} otherwise.\n")
1568 #define FUNC_NAME s_scm_file_encoding
1569 {
1570   char *enc;
1571   SCM s_enc;
1572
1573   enc = scm_i_scan_for_encoding (port);
1574   if (enc == NULL)
1575     return SCM_BOOL_F;
1576   else
1577     {
1578       s_enc = scm_from_locale_string (enc);
1579       return s_enc;
1580     }
1581
1582   return SCM_BOOL_F;
1583 }
1584 #undef FUNC_NAME
1585
1586 void
1587 scm_init_read ()
1588 {
1589   scm_read_hash_procedures =
1590     SCM_VARIABLE_LOC (scm_c_define ("read-hash-procedures", SCM_EOL));
1591
1592   scm_init_opts (scm_read_options, scm_read_opts);
1593 #include "libguile/read.x"
1594 }
1595
1596 /*
1597   Local Variables:
1598   c-file-style: "gnu"
1599   End:
1600 */