libguile/read.c

   1 /* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009 Free Software
   2  * Foundation, Inc.
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public License
   6  * as published by the Free Software Foundation; either version 3 of
   7  * the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful, but
  10  * WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  17  * 02110-1301 USA
  18  */
  19
  20
  21 \f
  22
  23 #ifdef HAVE_CONFIG_H
  24 # include <config.h>
  25 #endif
  26
  27 #include <stdio.h>
  28 #include <ctype.h>
  29 #include <string.h>
  30 #include <unistd.h>
  31 #include <unicase.h>
  32
  33 #include "libguile/_scm.h"
  34 #include "libguile/bytevectors.h"
  35 #include "libguile/chars.h"
  36 #include "libguile/eval.h"
  37 #include "libguile/arrays.h"
  38 #include "libguile/bitvectors.h"
  39 #include "libguile/keywords.h"
  40 #include "libguile/alist.h"
  41 #include "libguile/srcprop.h"
  42 #include "libguile/hashtab.h"
  43 #include "libguile/hash.h"
  44 #include "libguile/ports.h"
  45 #include "libguile/root.h"
  46 #include "libguile/strings.h"
  47 #include "libguile/strports.h"
  48 #include "libguile/vectors.h"
  49 #include "libguile/validate.h"
  50 #include "libguile/srfi-4.h"
  51 #include "libguile/srfi-13.h"
  52
  53 #include "libguile/read.h"
  54 #include "libguile/private-options.h"
  55
  56
  57 \f
  58
  59 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
  60 SCM_SYMBOL (scm_keyword_prefix, "prefix");
  61 SCM_SYMBOL (scm_keyword_postfix, "postfix");
  62
  63 scm_t_option scm_read_opts[] = {
  64   { SCM_OPTION_BOOLEAN, "copy", 0,
  65     "Copy source code expressions." },
  66   { SCM_OPTION_BOOLEAN, "positions", 0,
  67     "Record positions of source code expressions." },
  68   { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
  69     "Convert symbols to lower case."},
  70   { SCM_OPTION_SCM, "keywords", SCM_UNPACK (SCM_BOOL_F),
  71     "Style of keyword recognition: #f, 'prefix or 'postfix."},
  72 #if SCM_ENABLE_ELISP
  73   { SCM_OPTION_BOOLEAN, "elisp-vectors", 0,
  74     "Support Elisp vector syntax, namely `[...]'."},
  75   { SCM_OPTION_BOOLEAN, "elisp-strings", 0,
  76     "Support `\\(' and `\\)' in strings."},
  77 #endif
  78   { 0, },
  79 };
  80
  81 /*
  82   Give meaningful error messages for errors
  83
  84   We use the format
  85
  86   FILE:LINE:COL: MESSAGE
  87   This happened in ....
  88
  89   This is not standard GNU format, but the test-suite likes the real
  90   message to be in front.
  91
  92  */
  93
  94
  95 void
  96 scm_i_input_error (char const *function,
  97                    SCM port, const char *message, SCM arg)
  98 {
  99   SCM fn = (scm_is_string (SCM_FILENAME(port))
 100             ? SCM_FILENAME(port)
 101             : scm_from_locale_string ("#<unknown port>"));
 102
 103   SCM string_port = scm_open_output_string ();
 104   SCM string = SCM_EOL;
 105   scm_simple_format (string_port,
 106                      scm_from_locale_string ("~A:~S:~S: ~A"),
 107                      scm_list_4 (fn,
 108                                  scm_from_long (SCM_LINUM (port) + 1),
 109                                  scm_from_int (SCM_COL (port) + 1),
 110                                  scm_from_locale_string (message)));
 111
 112   string = scm_get_output_string (string_port);
 113   scm_close_output_port (string_port);
 114   scm_error_scm (scm_from_locale_symbol ("read-error"),
 115                  function? scm_from_locale_string (function) : SCM_BOOL_F,
 116                  string,
 117                  arg,
 118                  SCM_BOOL_F);
 119 }
 120
 121
 122 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
 123             (SCM setting),
 124             "Option interface for the read options. Instead of using\n"
 125             "this procedure directly, use the procedures @code{read-enable},\n"
 126             "@code{read-disable}, @code{read-set!} and @code{read-options}.")
 127 #define FUNC_NAME s_scm_read_options
 128 {
 129   SCM ans = scm_options (setting,
 130                          scm_read_opts,
 131                          FUNC_NAME);
 132   if (SCM_COPY_SOURCE_P)
 133     SCM_RECORD_POSITIONS_P = 1;
 134   return ans;
 135 }
 136 #undef FUNC_NAME
 137
 138 /* An association list mapping extra hash characters to procedures.  */
 139 static SCM *scm_read_hash_procedures;
 140
 141
 142 \f
 143 /* Token readers.  */
 144
 145
 146 /* Size of the C buffer used to read symbols and numbers.  */
 147 #define READER_BUFFER_SIZE            128
 148
 149 /* Size of the C buffer used to read strings.  */
 150 #define READER_STRING_BUFFER_SIZE     512
 151
 152 /* The maximum size of Scheme character names.  */
 153 #define READER_CHAR_NAME_MAX_SIZE      50
 154
 155
 156 /* `isblank' is only in C99.  */
 157 #define CHAR_IS_BLANK_(_chr)                                    \
 158   (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n')      \
 159    || ((_chr) == '\f') || ((_chr) == '\r'))
 160
 161 #ifdef MSDOS
 162 # define CHAR_IS_BLANK(_chr)                    \
 163   ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
 164 #else
 165 # define CHAR_IS_BLANK CHAR_IS_BLANK_
 166 #endif
 167
 168
 169 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
 170    structure'').  */
 171 #define CHAR_IS_R5RS_DELIMITER(c)                               \
 172   (CHAR_IS_BLANK (c)                                            \
 173    || (c == ')') || (c == '(') || (c == ';') || (c == '"'))
 174
 175 #define CHAR_IS_DELIMITER  CHAR_IS_R5RS_DELIMITER
 176
 177 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
 178    Structure''.  */
 179 #define CHAR_IS_EXPONENT_MARKER(_chr)                           \
 180   (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f')        \
 181    || ((_chr) == 'd') || ((_chr) == 'l'))
 182
 183 /* Read an SCSH block comment.  */
 184 static inline SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
 185 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
 186 static SCM scm_read_commented_expression (scm_t_wchar, SCM);
 187 static SCM scm_get_hash_procedure (int);
 188
 189 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Return
 190    zero if the whole token fits in BUF, non-zero otherwise.  */
 191 static inline int
 192 read_token (SCM port, SCM buf, size_t *read)
 193 {
 194   scm_t_wchar chr;
 195   *read = 0;
 196
 197   buf = scm_i_string_start_writing (buf);
 198   while (*read < scm_i_string_length (buf))
 199     {
 200       chr = scm_getc (port);
 201
 202       if (chr == EOF)
 203         {
 204           scm_i_string_stop_writing ();
 205           return 0;
 206         }
 207
 208       chr = (SCM_CASE_INSENSITIVE_P ? uc_tolower (chr) : chr);
 209
 210       if (CHAR_IS_DELIMITER (chr))
 211         {
 212           scm_i_string_stop_writing ();
 213           scm_ungetc (chr, port);
 214           return 0;
 215         }
 216
 217       scm_i_string_set_x (buf, *read, chr);
 218       (*read)++;
 219     }
 220   scm_i_string_stop_writing ();
 221
 222   return 1;
 223 }
 224
 225 static SCM
 226 read_complete_token (SCM port, size_t *read)
 227 {
 228   SCM buffer, str = SCM_EOL;
 229   size_t len;
 230   int overflow;
 231
 232   buffer = scm_i_make_string (READER_BUFFER_SIZE, NULL);
 233   overflow = read_token (port, buffer, read);
 234   if (!overflow)
 235     return scm_i_substring (buffer, 0, *read);
 236
 237   str = scm_string_copy (buffer);
 238   do
 239     {
 240       overflow = read_token (port, buffer, &len);
 241       str = scm_string_append (scm_list_2 (str, buffer));
 242       *read += len;
 243     }
 244   while (overflow);
 245
 246   return scm_i_substring (str, 0, *read);
 247 }
 248
 249 /* Skip whitespace from PORT and return the first non-whitespace character
 250    read.  Raise an error on end-of-file.  */
 251 static int
 252 flush_ws (SCM port, const char *eoferr)
 253 {
 254   register scm_t_wchar c;
 255   while (1)
 256     switch (c = scm_getc (port))
 257       {
 258       case EOF:
 259       goteof:
 260         if (eoferr)
 261           {
 262             scm_i_input_error (eoferr,
 263                                port,
 264                                "end of file",
 265                                SCM_EOL);
 266           }
 267         return c;
 268
 269       case ';':
 270       lp:
 271         switch (c = scm_getc (port))
 272           {
 273           case EOF:
 274             goto goteof;
 275           default:
 276             goto lp;
 277           case SCM_LINE_INCREMENTORS:
 278             break;
 279           }
 280         break;
 281
 282       case '#':
 283         switch (c = scm_getc (port))
 284           {
 285           case EOF:
 286             eoferr = "read_sharp";
 287             goto goteof;
 288           case '!':
 289             scm_read_scsh_block_comment (c, port);
 290             break;
 291           case ';':
 292             scm_read_commented_expression (c, port);
 293             break;
 294           case '|':
 295             if (scm_is_false (scm_get_hash_procedure (c)))
 296               {
 297                 scm_read_r6rs_block_comment (c, port);
 298                 break;
 299               }
 300             /* fall through */
 301           default:
 302             scm_ungetc (c, port);
 303             return '#';
 304           }
 305         break;
 306
 307       case SCM_LINE_INCREMENTORS:
 308       case SCM_SINGLE_SPACES:
 309       case '\t':
 310         break;
 311
 312       default:
 313         return c;
 314       }
 315
 316   return 0;
 317 }
 318
 319
 320 \f
 321 /* Token readers.  */
 322
 323 static SCM scm_read_expression (SCM port);
 324 static SCM scm_read_sharp (int chr, SCM port);
 325 static SCM recsexpr (SCM obj, long line, int column, SCM filename);
 326
 327
 328 static SCM
 329 scm_read_sexp (scm_t_wchar chr, SCM port)
 330 #define FUNC_NAME "scm_i_lreadparen"
 331 {
 332   register int c;
 333   register SCM tmp;
 334   register SCM tl, ans = SCM_EOL;
 335   SCM tl2 = SCM_EOL, ans2 = SCM_EOL, copy = SCM_BOOL_F;
 336   static const int terminating_char = ')';
 337
 338   /* Need to capture line and column numbers here. */
 339   long line = SCM_LINUM (port);
 340   int column = SCM_COL (port) - 1;
 341
 342
 343   c = flush_ws (port, FUNC_NAME);
 344   if (terminating_char == c)
 345     return SCM_EOL;
 346
 347   scm_ungetc (c, port);
 348   if (scm_is_eq (scm_sym_dot,
 349                  (tmp = scm_read_expression (port))))
 350     {
 351       ans = scm_read_expression (port);
 352       if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
 353         scm_i_input_error (FUNC_NAME, port, "missing close paren",
 354                            SCM_EOL);
 355       return ans;
 356     }
 357
 358   /* Build the head of the list structure. */
 359   ans = tl = scm_cons (tmp, SCM_EOL);
 360
 361   if (SCM_COPY_SOURCE_P)
 362     ans2 = tl2 = scm_cons (scm_is_pair (tmp)
 363                            ? copy
 364                            : tmp,
 365                            SCM_EOL);
 366
 367   while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
 368     {
 369       SCM new_tail;
 370
 371       scm_ungetc (c, port);
 372       if (scm_is_eq (scm_sym_dot,
 373                      (tmp = scm_read_expression (port))))
 374         {
 375           SCM_SETCDR (tl, tmp = scm_read_expression (port));
 376
 377           if (SCM_COPY_SOURCE_P)
 378             SCM_SETCDR (tl2, scm_cons (scm_is_pair (tmp) ? copy : tmp,
 379                                        SCM_EOL));
 380
 381           c = flush_ws (port, FUNC_NAME);
 382           if (terminating_char != c)
 383             scm_i_input_error (FUNC_NAME, port,
 384                                "in pair: missing close paren", SCM_EOL);
 385           goto exit;
 386         }
 387
 388       new_tail = scm_cons (tmp, SCM_EOL);
 389       SCM_SETCDR (tl, new_tail);
 390       tl = new_tail;
 391
 392       if (SCM_COPY_SOURCE_P)
 393         {
 394           SCM new_tail2 = scm_cons (scm_is_pair (tmp)
 395                                     ? copy
 396                                     : tmp, SCM_EOL);
 397           SCM_SETCDR (tl2, new_tail2);
 398           tl2 = new_tail2;
 399         }
 400     }
 401
 402  exit:
 403   if (SCM_RECORD_POSITIONS_P)
 404     scm_whash_insert (scm_source_whash,
 405                       ans,
 406                       scm_make_srcprops (line, column,
 407                                          SCM_FILENAME (port),
 408                                          SCM_COPY_SOURCE_P
 409                                          ? ans2
 410                                          : SCM_UNDEFINED,
 411                                          SCM_EOL));
 412   return ans;
 413 }
 414 #undef FUNC_NAME
 415
 416 static SCM
 417 scm_read_string (int chr, SCM port)
 418 #define FUNC_NAME "scm_lreadr"
 419 {
 420   /* For strings smaller than C_STR, this function creates only one Scheme
 421      object (the string returned).  */
 422
 423   SCM str = SCM_BOOL_F;
 424   unsigned c_str_len = 0;
 425   scm_t_wchar c;
 426
 427   str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
 428   while ('"' != (c = scm_getc (port)))
 429     {
 430       if (c == EOF)
 431         {
 432         str_eof:
 433           scm_i_input_error (FUNC_NAME, port,
 434                              "end of file in string constant", SCM_EOL);
 435         }
 436
 437       if (c_str_len + 1 >= scm_i_string_length (str))
 438         {
 439           SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
 440
 441           str = scm_string_append (scm_list_2 (str, addy));
 442         }
 443
 444       if (c == '\\')
 445         {
 446           switch (c = scm_getc (port))
 447             {
 448             case EOF:
 449               goto str_eof;
 450             case '"':
 451             case '\\':
 452               break;
 453 #if SCM_ENABLE_ELISP
 454             case '(':
 455             case ')':
 456               if (SCM_ESCAPED_PARENS_P)
 457                 break;
 458               goto bad_escaped;
 459 #endif
 460             case '\n':
 461               continue;
 462             case '0':
 463               c = '\0';
 464               break;
 465             case 'f':
 466               c = '\f';
 467               break;
 468             case 'n':
 469               c = '\n';
 470               break;
 471             case 'r':
 472               c = '\r';
 473               break;
 474             case 't':
 475               c = '\t';
 476               break;
 477             case 'a':
 478               c = '\007';
 479               break;
 480             case 'v':
 481               c = '\v';
 482               break;
 483             case 'x':
 484               {
 485                 scm_t_wchar a, b;
 486                 a = scm_getc (port);
 487                 if (a == EOF)
 488                   goto str_eof;
 489                 b = scm_getc (port);
 490                 if (b == EOF)
 491                   goto str_eof;
 492                 if ('0' <= a && a <= '9')
 493                   a -= '0';
 494                 else if ('A' <= a && a <= 'F')
 495                   a = a - 'A' + 10;
 496                 else if ('a' <= a && a <= 'f')
 497                   a = a - 'a' + 10;
 498                 else
 499                   {
 500                     c = a;
 501                     goto bad_escaped;
 502                   }
 503                 if ('0' <= b && b <= '9')
 504                   b -= '0';
 505                 else if ('A' <= b && b <= 'F')
 506                   b = b - 'A' + 10;
 507                 else if ('a' <= b && b <= 'f')
 508                   b = b - 'a' + 10;
 509                 else
 510                   {
 511                     c = b;
 512                     goto bad_escaped;
 513                   }
 514                 c = a * 16 + b;
 515                 break;
 516               }
 517             case 'u':
 518               {
 519                 scm_t_wchar a;
 520                 int i;
 521                 c = 0;
 522                 for (i = 0; i < 4; i++)
 523                   {
 524                     a = scm_getc (port);
 525                     if (a == EOF)
 526                       goto str_eof;
 527                     if ('0' <= a && a <= '9')
 528                       a -= '0';
 529                     else if ('A' <= a && a <= 'F')
 530                       a = a - 'A' + 10;
 531                     else if ('a' <= a && a <= 'f')
 532                       a = a - 'a' + 10;
 533                     else
 534                       {
 535                         c = a;
 536                         goto bad_escaped;
 537                       }
 538                     c = c * 16 + a;
 539                   }
 540                 break;
 541               }
 542             case 'U':
 543               {
 544                 scm_t_wchar a;
 545                 int i;
 546                 c = 0;
 547                 for (i = 0; i < 6; i++)
 548                   {
 549                     a = scm_getc (port);
 550                     if (a == EOF)
 551                       goto str_eof;
 552                     if ('0' <= a && a <= '9')
 553                       a -= '0';
 554                     else if ('A' <= a && a <= 'F')
 555                       a = a - 'A' + 10;
 556                     else if ('a' <= a && a <= 'f')
 557                       a = a - 'a' + 10;
 558                     else
 559                       {
 560                         c = a;
 561                         goto bad_escaped;
 562                       }
 563                     c = c * 16 + a;
 564                   }
 565                 break;
 566               }
 567             default:
 568             bad_escaped:
 569               scm_i_input_error (FUNC_NAME, port,
 570                                  "illegal character in escape sequence: ~S",
 571                                  scm_list_1 (SCM_MAKE_CHAR (c)));
 572             }
 573         }
 574       str = scm_i_string_start_writing (str);
 575       scm_i_string_set_x (str, c_str_len++, c);
 576       scm_i_string_stop_writing ();
 577     }
 578
 579   if (c_str_len > 0)
 580     {
 581       return scm_i_substring_copy (str, 0, c_str_len);
 582     }
 583
 584   return scm_nullstr;
 585 }
 586 #undef FUNC_NAME
 587
 588
 589 static SCM
 590 scm_read_number (scm_t_wchar chr, SCM port)
 591 {
 592   SCM result;
 593   SCM buffer;
 594   size_t read;
 595
 596   scm_ungetc (chr, port);
 597   buffer = read_complete_token (port, &read);
 598   result = scm_string_to_number (buffer, SCM_UNDEFINED);
 599   if (!scm_is_true (result))
 600     /* Return a symbol instead of a number.  */
 601     result = scm_string_to_symbol (buffer);
 602
 603   return result;
 604 }
 605
 606 static SCM
 607 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
 608 {
 609   SCM result;
 610   int ends_with_colon = 0;
 611   SCM buffer;
 612   size_t read = 0;
 613   int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
 614
 615   scm_ungetc (chr, port);
 616   buffer = read_complete_token (port, &read);
 617   if (read > 0)
 618     ends_with_colon = scm_i_string_ref (buffer, read - 1) == ':';
 619
 620   if (postfix && ends_with_colon && (read > 1))
 621     result = scm_symbol_to_keyword (scm_string_to_symbol (scm_i_substring (buffer, 0, read - 1)));
 622   else
 623     result = scm_string_to_symbol (buffer);
 624
 625   return result;
 626 }
 627
 628 static SCM
 629 scm_read_number_and_radix (scm_t_wchar chr, SCM port)
 630 #define FUNC_NAME "scm_lreadr"
 631 {
 632   SCM result;
 633   size_t read;
 634   SCM buffer = scm_i_make_string (READER_BUFFER_SIZE, NULL);
 635   unsigned int radix;
 636
 637   switch (chr)
 638     {
 639     case 'B':
 640     case 'b':
 641       radix = 2;
 642       break;
 643
 644     case 'o':
 645     case 'O':
 646       radix = 8;
 647       break;
 648
 649     case 'd':
 650     case 'D':
 651       radix = 10;
 652       break;
 653
 654     case 'x':
 655     case 'X':
 656       radix = 16;
 657       break;
 658
 659     default:
 660       scm_ungetc (chr, port);
 661       scm_ungetc ('#', port);
 662       radix = 10;
 663     }
 664
 665   buffer = read_complete_token (port, &read);
 666   result = scm_string_to_number (buffer, scm_from_uint (radix));
 667
 668   if (scm_is_true (result))
 669     return result;
 670
 671   scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
 672
 673   return SCM_BOOL_F;
 674 }
 675 #undef FUNC_NAME
 676
 677 static SCM
 678 scm_read_quote (int chr, SCM port)
 679 {
 680   SCM p;
 681   long line = SCM_LINUM (port);
 682   int column = SCM_COL (port) - 1;
 683
 684   switch (chr)
 685     {
 686     case '`':
 687       p = scm_sym_quasiquote;
 688       break;
 689
 690     case '\'':
 691       p = scm_sym_quote;
 692       break;
 693
 694     case ',':
 695       {
 696         scm_t_wchar c;
 697
 698         c = scm_getc (port);
 699         if ('@' == c)
 700           p = scm_sym_uq_splicing;
 701         else
 702           {
 703             scm_ungetc (c, port);
 704             p = scm_sym_unquote;
 705           }
 706         break;
 707       }
 708
 709     default:
 710       fprintf (stderr, "%s: unhandled quote character (%i)\n",
 711                "scm_read_quote", chr);
 712       abort ();
 713     }
 714
 715   p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
 716   if (SCM_RECORD_POSITIONS_P)
 717     scm_whash_insert (scm_source_whash, p,
 718                       scm_make_srcprops (line, column,
 719                                          SCM_FILENAME (port),
 720                                          SCM_COPY_SOURCE_P
 721                                          ? (scm_cons2 (SCM_CAR (p),
 722                                                        SCM_CAR (SCM_CDR (p)),
 723                                                        SCM_EOL))
 724                                          : SCM_UNDEFINED,
 725                                          SCM_EOL));
 726
 727
 728   return p;
 729 }
 730
 731 SCM_SYMBOL (sym_syntax, "syntax");
 732 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
 733 SCM_SYMBOL (sym_unsyntax, "unsyntax");
 734 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
 735
 736 static SCM
 737 scm_read_syntax (int chr, SCM port)
 738 {
 739   SCM p;
 740   long line = SCM_LINUM (port);
 741   int column = SCM_COL (port) - 1;
 742
 743   switch (chr)
 744     {
 745     case '`':
 746       p = sym_quasisyntax;
 747       break;
 748
 749     case '\'':
 750       p = sym_syntax;
 751       break;
 752
 753     case ',':
 754       {
 755         int c;
 756
 757         c = scm_getc (port);
 758         if ('@' == c)
 759           p = sym_unsyntax_splicing;
 760         else
 761           {
 762             scm_ungetc (c, port);
 763             p = sym_unsyntax;
 764           }
 765         break;
 766       }
 767
 768     default:
 769       fprintf (stderr, "%s: unhandled syntax character (%i)\n",
 770                "scm_read_syntax", chr);
 771       abort ();
 772     }
 773
 774   p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
 775   if (SCM_RECORD_POSITIONS_P)
 776     scm_whash_insert (scm_source_whash, p,
 777                       scm_make_srcprops (line, column,
 778                                          SCM_FILENAME (port),
 779                                          SCM_COPY_SOURCE_P
 780                                          ? (scm_cons2 (SCM_CAR (p),
 781                                                        SCM_CAR (SCM_CDR (p)),
 782                                                        SCM_EOL))
 783                                          : SCM_UNDEFINED,
 784                                          SCM_EOL));
 785
 786
 787   return p;
 788 }
 789
 790 static inline SCM
 791 scm_read_semicolon_comment (int chr, SCM port)
 792 {
 793   int c;
 794
 795   /* We use the get_byte here because there is no need to get the
 796      locale correct with comment input. This presumes that newline
 797      always represents itself no matter what the encoding is.  */
 798   for (c = scm_get_byte_or_eof (port);
 799        (c != EOF) && (c != '\n');
 800        c = scm_getc (port));
 801
 802   return SCM_UNSPECIFIED;
 803 }
 804
 805 \f
 806 /* Sharp readers, i.e. readers called after a `#' sign has been read.  */
 807
 808 static SCM
 809 scm_read_boolean (int chr, SCM port)
 810 {
 811   switch (chr)
 812     {
 813     case 't':
 814     case 'T':
 815       return SCM_BOOL_T;
 816
 817     case 'f':
 818     case 'F':
 819       return SCM_BOOL_F;
 820     }
 821
 822   return SCM_UNSPECIFIED;
 823 }
 824
 825 static SCM
 826 scm_read_character (scm_t_wchar chr, SCM port)
 827 #define FUNC_NAME "scm_lreadr"
 828 {
 829   SCM charname = scm_i_make_string (READER_CHAR_NAME_MAX_SIZE, NULL);
 830   size_t charname_len;
 831   scm_t_wchar cp;
 832   int overflow;
 833
 834   overflow = read_token (port, charname, &charname_len);
 835   charname = scm_c_substring (charname, 0, charname_len);
 836
 837   if (overflow)
 838     goto char_error;
 839
 840   if (charname_len == 0)
 841     {
 842       chr = scm_getc (port);
 843       if (chr == EOF)
 844         scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
 845                            "while reading character", SCM_EOL);
 846
 847       /* CHR must be a token delimiter, like a whitespace.  */
 848       return (SCM_MAKE_CHAR (chr));
 849     }
 850
 851   if (charname_len == 1)
 852     return SCM_MAKE_CHAR (scm_i_string_ref (charname, 0));
 853
 854   cp = scm_i_string_ref (charname, 0);
 855   if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
 856     return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
 857
 858   if (cp >= '0' && cp < '8')
 859     {
 860       /* Dirk:FIXME::  This type of character syntax is not R5RS
 861        * compliant.  Further, it should be verified that the constant
 862        * does only consist of octal digits.  */
 863       SCM p = scm_string_to_number (charname, scm_from_uint (8));
 864       if (SCM_I_INUMP (p))
 865         {
 866           scm_t_wchar c = SCM_I_INUM (p);
 867           if (SCM_IS_UNICODE_CHAR (c))
 868             return SCM_MAKE_CHAR (c);
 869           else
 870             scm_i_input_error (FUNC_NAME, port,
 871                                "out-of-range octal character escape: ~a",
 872                                scm_list_1 (charname));
 873         }
 874     }
 875
 876   /* The names of characters should never have non-Latin1
 877      characters.  */
 878   if (scm_i_is_narrow_string (charname)
 879       || scm_i_try_narrow_string (charname))
 880     { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
 881                                        charname_len);
 882       if (scm_is_true (ch))
 883         return ch;
 884     }
 885
 886  char_error:
 887   scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
 888                      scm_list_1 (charname));
 889
 890   return SCM_UNSPECIFIED;
 891 }
 892 #undef FUNC_NAME
 893
 894 static inline SCM
 895 scm_read_keyword (int chr, SCM port)
 896 {
 897   SCM symbol;
 898
 899   /* Read the symbol that comprises the keyword.  Doing this instead of
 900      invoking a specific symbol reader function allows `scm_read_keyword ()'
 901      to adapt to the delimiters currently valid of symbols.
 902
 903      XXX: This implementation allows sloppy syntaxes like `#:  key'.  */
 904   symbol = scm_read_expression (port);
 905   if (!scm_is_symbol (symbol))
 906     scm_i_input_error ("scm_read_keyword", port,
 907                        "keyword prefix `~a' not followed by a symbol: ~s",
 908                        scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
 909
 910   return (scm_symbol_to_keyword (symbol));
 911 }
 912
 913 static inline SCM
 914 scm_read_vector (int chr, SCM port)
 915 {
 916   /* Note: We call `scm_read_sexp ()' rather than READER here in order to
 917      guarantee that it's going to do what we want.  After all, this is an
 918      implementation detail of `scm_read_vector ()', not a desirable
 919      property.  */
 920   return (scm_vector (scm_read_sexp (chr, port)));
 921 }
 922
 923 static inline SCM
 924 scm_read_srfi4_vector (int chr, SCM port)
 925 {
 926   return scm_i_read_array (port, chr);
 927 }
 928
 929 static SCM
 930 scm_read_bytevector (scm_t_wchar chr, SCM port)
 931 {
 932   chr = scm_getc (port);
 933   if (chr != 'u')
 934     goto syntax;
 935
 936   chr = scm_getc (port);
 937   if (chr != '8')
 938     goto syntax;
 939
 940   chr = scm_getc (port);
 941   if (chr != '(')
 942     goto syntax;
 943
 944   return scm_u8_list_to_bytevector (scm_read_sexp (chr, port));
 945
 946  syntax:
 947   scm_i_input_error ("read_bytevector", port,
 948                      "invalid bytevector prefix",
 949                      SCM_MAKE_CHAR (chr));
 950   return SCM_UNSPECIFIED;
 951 }
 952
 953 static SCM
 954 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port)
 955 {
 956   /* Read the `#*10101'-style read syntax for bit vectors in Guile.  This is
 957      terribly inefficient but who cares?  */
 958   SCM s_bits = SCM_EOL;
 959
 960   for (chr = scm_getc (port);
 961        (chr != EOF) && ((chr == '0') || (chr == '1'));
 962        chr = scm_getc (port))
 963     {
 964       s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
 965     }
 966
 967   if (chr != EOF)
 968     scm_ungetc (chr, port);
 969
 970   return scm_bitvector (scm_reverse_x (s_bits, SCM_EOL));
 971 }
 972
 973 static inline SCM
 974 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
 975 {
 976   int bang_seen = 0;
 977
 978   /* We can use the get_byte here because there is no need to get the
 979      locale correct when reading comments. This presumes that
 980      hash and exclamation points always represent themselves no
 981      matter what the source encoding is.*/
 982   for (;;)
 983     {
 984       int c = scm_get_byte_or_eof (port);
 985
 986       if (c == EOF)
 987         scm_i_input_error ("skip_block_comment", port,
 988                            "unterminated `#! ... !#' comment", SCM_EOL);
 989
 990       if (c == '!')
 991         bang_seen = 1;
 992       else if (c == '#' && bang_seen)
 993         break;
 994       else
 995         bang_seen = 0;
 996     }
 997
 998   return SCM_UNSPECIFIED;
 999 }
1000
1001 static SCM
1002 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1003 {
1004   /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1005      nested.  So care must be taken.  */
1006   int nesting_level = 1;
1007   int opening_seen = 0, closing_seen = 0;
1008
1009   while (nesting_level > 0)
1010     {
1011       int c = scm_getc (port);
1012
1013       if (c == EOF)
1014         scm_i_input_error (__FUNCTION__, port,
1015                            "unterminated `#| ... |#' comment", SCM_EOL);
1016
1017       if (opening_seen)
1018         {
1019           if (c == '|')
1020             nesting_level++;
1021           opening_seen = 0;
1022         }
1023       else if (closing_seen)
1024         {
1025           if (c == '#')
1026             nesting_level--;
1027           closing_seen = 0;
1028         }
1029       else if (c == '|')
1030         closing_seen = 1;
1031       else if (c == '#')
1032         opening_seen = 1;
1033       else
1034         opening_seen = closing_seen = 0;
1035     }
1036
1037   return SCM_UNSPECIFIED;
1038 }
1039
1040 static SCM
1041 scm_read_commented_expression (scm_t_wchar chr, SCM port)
1042 {
1043   scm_t_wchar c;
1044
1045   c = flush_ws (port, (char *) NULL);
1046   if (EOF == c)
1047     scm_i_input_error ("read_commented_expression", port,
1048                        "no expression after #; comment", SCM_EOL);
1049   scm_ungetc (c, port);
1050   scm_read_expression (port);
1051   return SCM_UNSPECIFIED;
1052 }
1053
1054 static SCM
1055 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1056 {
1057   /* Guile's extended symbol read syntax looks like this:
1058
1059        #{This is all a symbol name}#
1060
1061      So here, CHR is expected to be `{'.  */
1062   int saw_brace = 0, finished = 0;
1063   size_t len = 0;
1064   SCM buf = scm_i_make_string (1024, NULL);
1065
1066   buf = scm_i_string_start_writing (buf);
1067
1068   while ((chr = scm_getc (port)) != EOF)
1069     {
1070       if (saw_brace)
1071         {
1072           if (chr == '#')
1073             {
1074               finished = 1;
1075               break;
1076             }
1077           else
1078             {
1079               saw_brace = 0;
1080               scm_i_string_set_x (buf, len++, '}');
1081               scm_i_string_set_x (buf, len++, chr);
1082             }
1083         }
1084       else if (chr == '}')
1085         saw_brace = 1;
1086       else
1087         scm_i_string_set_x (buf, len++, chr);
1088
1089       if (len >= scm_i_string_length (buf) - 2)
1090         {
1091           scm_i_string_stop_writing ();
1092           SCM addy = scm_i_make_string (1024, NULL);
1093           buf = scm_string_append (scm_list_2 (buf, addy));
1094           len = 0;
1095           buf = scm_i_string_start_writing (buf);
1096         }
1097
1098       if (finished)
1099         break;
1100     }
1101   scm_i_string_stop_writing ();
1102
1103   return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1104 }
1105
1106
1107 \f
1108 /* Top-level token readers, i.e., dispatchers.  */
1109
1110 static SCM
1111 scm_read_sharp_extension (int chr, SCM port)
1112 {
1113   SCM proc;
1114
1115   proc = scm_get_hash_procedure (chr);
1116   if (scm_is_true (scm_procedure_p (proc)))
1117     {
1118       long line = SCM_LINUM (port);
1119       int column = SCM_COL (port) - 2;
1120       SCM got;
1121
1122       got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1123       if (!scm_is_eq (got, SCM_UNSPECIFIED))
1124         {
1125           if (SCM_RECORD_POSITIONS_P)
1126             return (recsexpr (got, line, column,
1127                               SCM_FILENAME (port)));
1128           else
1129             return got;
1130         }
1131     }
1132
1133   return SCM_UNSPECIFIED;
1134 }
1135
1136 /* The reader for the sharp `#' character.  It basically dispatches reads
1137    among the above token readers.   */
1138 static SCM
1139 scm_read_sharp (scm_t_wchar chr, SCM port)
1140 #define FUNC_NAME "scm_lreadr"
1141 {
1142   SCM result;
1143
1144   chr = scm_getc (port);
1145
1146   result = scm_read_sharp_extension (chr, port);
1147   if (!scm_is_eq (result, SCM_UNSPECIFIED))
1148     return result;
1149
1150   switch (chr)
1151     {
1152     case '\\':
1153       return (scm_read_character (chr, port));
1154     case '(':
1155       return (scm_read_vector (chr, port));
1156     case 's':
1157     case 'u':
1158     case 'f':
1159       /* This one may return either a boolean or an SRFI-4 vector.  */
1160       return (scm_read_srfi4_vector (chr, port));
1161     case 'v':
1162       return (scm_read_bytevector (chr, port));
1163     case '*':
1164       return (scm_read_guile_bit_vector (chr, port));
1165     case 't':
1166     case 'T':
1167     case 'F':
1168       /* This one may return either a boolean or an SRFI-4 vector.  */
1169       return (scm_read_boolean (chr, port));
1170     case ':':
1171       return (scm_read_keyword (chr, port));
1172     case '0': case '1': case '2': case '3': case '4':
1173     case '5': case '6': case '7': case '8': case '9':
1174     case '@':
1175 #if SCM_ENABLE_DEPRECATED
1176       /* See below for 'i' and 'e'. */
1177     case 'a':
1178     case 'c':
1179     case 'y':
1180     case 'h':
1181     case 'l':
1182 #endif
1183       return (scm_i_read_array (port, chr));
1184
1185     case 'i':
1186     case 'e':
1187 #if SCM_ENABLE_DEPRECATED
1188       {
1189         /* When next char is '(', it really is an old-style
1190            uniform array. */
1191         scm_t_wchar next_c = scm_getc (port);
1192         if (next_c != EOF)
1193           scm_ungetc (next_c, port);
1194         if (next_c == '(')
1195           return scm_i_read_array (port, chr);
1196         /* Fall through. */
1197       }
1198 #endif
1199     case 'b':
1200     case 'B':
1201     case 'o':
1202     case 'O':
1203     case 'd':
1204     case 'D':
1205     case 'x':
1206     case 'X':
1207     case 'I':
1208     case 'E':
1209       return (scm_read_number_and_radix (chr, port));
1210     case '{':
1211       return (scm_read_extended_symbol (chr, port));
1212     case '!':
1213       return (scm_read_scsh_block_comment (chr, port));
1214     case ';':
1215       return (scm_read_commented_expression (chr, port));
1216     case '`':
1217     case '\'':
1218     case ',':
1219       return (scm_read_syntax (chr, port));
1220     default:
1221       result = scm_read_sharp_extension (chr, port);
1222       if (scm_is_eq (result, SCM_UNSPECIFIED))
1223         {
1224           /* To remain compatible with 1.8 and earlier, the following
1225              characters have lower precedence than `read-hash-extend'
1226              characters.  */
1227           switch (chr)
1228             {
1229             case '|':
1230               return scm_read_r6rs_block_comment (chr, port);
1231             default:
1232               scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1233                                  scm_list_1 (SCM_MAKE_CHAR (chr)));
1234             }
1235         }
1236       else
1237         return result;
1238     }
1239
1240   return SCM_UNSPECIFIED;
1241 }
1242 #undef FUNC_NAME
1243
1244 static SCM
1245 scm_read_expression (SCM port)
1246 #define FUNC_NAME "scm_read_expression"
1247 {
1248   while (1)
1249     {
1250       register scm_t_wchar chr;
1251
1252       chr = scm_getc (port);
1253
1254       switch (chr)
1255         {
1256         case SCM_WHITE_SPACES:
1257         case SCM_LINE_INCREMENTORS:
1258           break;
1259         case ';':
1260           (void) scm_read_semicolon_comment (chr, port);
1261           break;
1262         case '(':
1263           return (scm_read_sexp (chr, port));
1264         case '"':
1265           return (scm_read_string (chr, port));
1266         case '\'':
1267         case '`':
1268         case ',':
1269           return (scm_read_quote (chr, port));
1270         case '#':
1271           {
1272             SCM result;
1273             result = scm_read_sharp (chr, port);
1274             if (scm_is_eq (result, SCM_UNSPECIFIED))
1275               /* We read a comment or some such.  */
1276               break;
1277             else
1278               return result;
1279           }
1280         case ')':
1281           scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1282           break;
1283         case EOF:
1284           return SCM_EOF_VAL;
1285         case ':':
1286           if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1287             return scm_symbol_to_keyword (scm_read_expression (port));
1288           /* Fall through.  */
1289
1290         default:
1291           {
1292             if (((chr >= '0') && (chr <= '9'))
1293                 || (strchr ("+-.", chr)))
1294               return (scm_read_number (chr, port));
1295             else
1296               return (scm_read_mixed_case_symbol (chr, port));
1297           }
1298         }
1299     }
1300 }
1301 #undef FUNC_NAME
1302
1303 \f
1304 /* Actual reader.  */
1305
1306 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1307             (SCM port),
1308             "Read an s-expression from the input port @var{port}, or from\n"
1309             "the current input port if @var{port} is not specified.\n"
1310             "Any whitespace before the next token is discarded.")
1311 #define FUNC_NAME s_scm_read
1312 {
1313   int c;
1314
1315   if (SCM_UNBNDP (port))
1316     port = scm_current_input_port ();
1317   SCM_VALIDATE_OPINPORT (1, port);
1318
1319   c = flush_ws (port, (char *) NULL);
1320   if (EOF == c)
1321     return SCM_EOF_VAL;
1322   scm_ungetc (c, port);
1323
1324   return (scm_read_expression (port));
1325 }
1326 #undef FUNC_NAME
1327
1328
1329 \f
1330
1331 /* Used when recording expressions constructed by `scm_read_sharp ()'.  */
1332 static SCM
1333 recsexpr (SCM obj, long line, int column, SCM filename)
1334 {
1335   if (!scm_is_pair(obj)) {
1336     return obj;
1337   } else {
1338     SCM tmp = obj, copy;
1339     /* If this sexpr is visible in the read:sharp source, we want to
1340        keep that information, so only record non-constant cons cells
1341        which haven't previously been read by the reader. */
1342     if (scm_is_false (scm_whash_lookup (scm_source_whash, obj)))
1343       {
1344         if (SCM_COPY_SOURCE_P)
1345           {
1346             copy = scm_cons (recsexpr (SCM_CAR (obj), line, column, filename),
1347                              SCM_UNDEFINED);
1348             while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1349               {
1350                 SCM_SETCDR (copy, scm_cons (recsexpr (SCM_CAR (tmp),
1351                                                       line,
1352                                                       column,
1353                                                       filename),
1354                                             SCM_UNDEFINED));
1355                 copy = SCM_CDR (copy);
1356               }
1357             SCM_SETCDR (copy, tmp);
1358           }
1359         else
1360           {
1361             recsexpr (SCM_CAR (obj), line, column, filename);
1362             while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1363               recsexpr (SCM_CAR (tmp), line, column, filename);
1364             copy = SCM_UNDEFINED;
1365           }
1366         scm_whash_insert (scm_source_whash,
1367                           obj,
1368                           scm_make_srcprops (line,
1369                                              column,
1370                                              filename,
1371                                              copy,
1372                                              SCM_EOL));
1373       }
1374     return obj;
1375   }
1376 }
1377
1378 /* Manipulate the read-hash-procedures alist.  This could be written in
1379    Scheme, but maybe it will also be used by C code during initialisation.  */
1380 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1381             (SCM chr, SCM proc),
1382             "Install the procedure @var{proc} for reading expressions\n"
1383             "starting with the character sequence @code{#} and @var{chr}.\n"
1384             "@var{proc} will be called with two arguments:  the character\n"
1385             "@var{chr} and the port to read further data from. The object\n"
1386             "returned will be the return value of @code{read}. \n"
1387             "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1388             )
1389 #define FUNC_NAME s_scm_read_hash_extend
1390 {
1391   SCM this;
1392   SCM prev;
1393
1394   SCM_VALIDATE_CHAR (1, chr);
1395   SCM_ASSERT (scm_is_false (proc)
1396               || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1397               proc, SCM_ARG2, FUNC_NAME);
1398
1399   /* Check if chr is already in the alist.  */
1400   this = *scm_read_hash_procedures;
1401   prev = SCM_BOOL_F;
1402   while (1)
1403     {
1404       if (scm_is_null (this))
1405         {
1406           /* not found, so add it to the beginning.  */
1407           if (scm_is_true (proc))
1408             {
1409               *scm_read_hash_procedures =
1410                 scm_cons (scm_cons (chr, proc), *scm_read_hash_procedures);
1411             }
1412           break;
1413         }
1414       if (scm_is_eq (chr, SCM_CAAR (this)))
1415         {
1416           /* already in the alist.  */
1417           if (scm_is_false (proc))
1418             {
1419               /* remove it.  */
1420               if (scm_is_false (prev))
1421                 {
1422                   *scm_read_hash_procedures =
1423                     SCM_CDR (*scm_read_hash_procedures);
1424                 }
1425               else
1426                 scm_set_cdr_x (prev, SCM_CDR (this));
1427             }
1428           else
1429             {
1430               /* replace it.  */
1431               scm_set_cdr_x (SCM_CAR (this), proc);
1432             }
1433           break;
1434         }
1435       prev = this;
1436       this = SCM_CDR (this);
1437     }
1438
1439   return SCM_UNSPECIFIED;
1440 }
1441 #undef FUNC_NAME
1442
1443 /* Recover the read-hash procedure corresponding to char c.  */
1444 static SCM
1445 scm_get_hash_procedure (int c)
1446 {
1447   SCM rest = *scm_read_hash_procedures;
1448
1449   while (1)
1450     {
1451       if (scm_is_null (rest))
1452         return SCM_BOOL_F;
1453
1454       if (SCM_CHAR (SCM_CAAR (rest)) == c)
1455         return SCM_CDAR (rest);
1456
1457       rest = SCM_CDR (rest);
1458     }
1459 }
1460
1461 #define SCM_ENCODING_SEARCH_SIZE (500)
1462
1463 /* Search the first few hundred characters of a file for an Emacs-like coding
1464    declaration.  Returns either NULL or a string whose storage has been
1465    allocated with `scm_gc_malloc ()'.  */
1466 char *
1467 scm_i_scan_for_encoding (SCM port)
1468 {
1469   char header[SCM_ENCODING_SEARCH_SIZE+1];
1470   size_t bytes_read;
1471   char *encoding = NULL;
1472   int utf8_bom = 0;
1473   char *pos;
1474   int i;
1475   int in_comment;
1476
1477   bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1478   scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1479
1480   if (bytes_read > 3
1481       && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1482     utf8_bom = 1;
1483
1484   /* search past "coding[:=]" */
1485   pos = header;
1486   while (1)
1487     {
1488       if ((pos = strstr(pos, "coding")) == NULL)
1489         return NULL;
1490
1491       pos += strlen("coding");
1492       if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1493           (*pos == ':' || *pos == '='))
1494         {
1495           pos ++;
1496           break;
1497         }
1498     }
1499
1500   /* skip spaces */
1501   while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1502          (*pos == ' ' || *pos == '\t'))
1503     pos ++;
1504
1505   /* grab the next token */
1506   i = 0;
1507   while (pos + i - header <= SCM_ENCODING_SEARCH_SIZE
1508          && pos + i - header < bytes_read
1509          && (isalnum((int) pos[i]) || pos[i] == '_' || pos[i] == '-'
1510              || pos[i] == '.'))
1511     i++;
1512
1513   if (i == 0)
1514     return NULL;
1515
1516   encoding = scm_gc_strndup (pos, i + 1, "encoding");
1517   for (i = 0; i < strlen (encoding); i++)
1518     encoding[i] = toupper ((int) encoding[i]);
1519
1520   /* push backwards to make sure we were in a comment */
1521   in_comment = 0;
1522   while (pos - i - header > 0)
1523     {
1524       if (*(pos - i) == '\n')
1525         {
1526           /* This wasn't in a semicolon comment. Check for a
1527            hash-bang comment. */
1528           char *beg = strstr (header, "#!");
1529           char *end = strstr (header, "!#");
1530           if (beg < pos && pos < end)
1531             in_comment = 1;
1532           break;
1533         }
1534       if (*(pos - i) == ';')
1535         {
1536           in_comment = 1;
1537           break;
1538         }
1539       i ++;
1540     }
1541   if (!in_comment)
1542     /* This wasn't in a comment */
1543     return NULL;
1544
1545   if (utf8_bom && strcmp(encoding, "UTF-8"))
1546     scm_misc_error (NULL,
1547                     "the port input declares the encoding ~s but is encoded as UTF-8",
1548                     scm_list_1 (scm_from_locale_string (encoding)));
1549
1550   return encoding;
1551 }
1552
1553 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1554             (SCM port),
1555             "Scans the port for an EMACS-like character coding declaration\n"
1556             "near the top of the contents of a port with random-acessible contents.\n"
1557             "The coding declaration is of the form\n"
1558             "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1559             "\n"
1560             "Returns a string containing the character encoding of the file\n"
1561             "if a declaration was found, or @code{#f} otherwise.\n")
1562 #define FUNC_NAME s_scm_file_encoding
1563 {
1564   char *enc;
1565   SCM s_enc;
1566
1567   enc = scm_i_scan_for_encoding (port);
1568   if (enc == NULL)
1569     return SCM_BOOL_F;
1570   else
1571     {
1572       s_enc = scm_from_locale_string (enc);
1573       return s_enc;
1574     }
1575
1576   return SCM_BOOL_F;
1577 }
1578 #undef FUNC_NAME
1579
1580 void
1581 scm_init_read ()
1582 {
1583   scm_read_hash_procedures =
1584     SCM_VARIABLE_LOC (scm_c_define ("read-hash-procedures", SCM_EOL));
1585
1586   scm_init_opts (scm_read_options, scm_read_opts);
1587 #include "libguile/read.x"
1588 }
1589
1590 /*
1591   Local Variables:
1592   c-file-style: "gnu"
1593   End:
1594 */