libguile/read.c

   1 /* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009 Free Software
   2  * Foundation, Inc.
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public License
   6  * as published by the Free Software Foundation; either version 3 of
   7  * the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful, but
  10  * WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  17  * 02110-1301 USA
  18  */
  19
  20
  21 \f
  22
  23 #ifdef HAVE_CONFIG_H
  24 # include <config.h>
  25 #endif
  26
  27 #include <stdio.h>
  28 #include <ctype.h>
  29 #include <string.h>
  30 #include <unistd.h>
  31 #include <unicase.h>
  32
  33 #include "libguile/_scm.h"
  34 #include "libguile/bytevectors.h"
  35 #include "libguile/chars.h"
  36 #include "libguile/eval.h"
  37 #include "libguile/arrays.h"
  38 #include "libguile/bitvectors.h"
  39 #include "libguile/keywords.h"
  40 #include "libguile/alist.h"
  41 #include "libguile/srcprop.h"
  42 #include "libguile/hashtab.h"
  43 #include "libguile/hash.h"
  44 #include "libguile/ports.h"
  45 #include "libguile/fports.h"
  46 #include "libguile/root.h"
  47 #include "libguile/strings.h"
  48 #include "libguile/strports.h"
  49 #include "libguile/vectors.h"
  50 #include "libguile/validate.h"
  51 #include "libguile/srfi-4.h"
  52 #include "libguile/srfi-13.h"
  53
  54 #include "libguile/read.h"
  55 #include "libguile/private-options.h"
  56
  57
  58 \f
  59
  60 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
  61 SCM_SYMBOL (scm_keyword_prefix, "prefix");
  62 SCM_SYMBOL (scm_keyword_postfix, "postfix");
  63
  64 scm_t_option scm_read_opts[] = {
  65   { SCM_OPTION_BOOLEAN, "copy", 0,
  66     "Copy source code expressions." },
  67   { SCM_OPTION_BOOLEAN, "positions", 0,
  68     "Record positions of source code expressions." },
  69   { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
  70     "Convert symbols to lower case."},
  71   { SCM_OPTION_SCM, "keywords", (unsigned long) SCM_BOOL_F,
  72     "Style of keyword recognition: #f, 'prefix or 'postfix."},
  73 #if SCM_ENABLE_ELISP
  74   { SCM_OPTION_BOOLEAN, "elisp-vectors", 0,
  75     "Support Elisp vector syntax, namely `[...]'."},
  76   { SCM_OPTION_BOOLEAN, "elisp-strings", 0,
  77     "Support `\\(' and `\\)' in strings."},
  78 #endif
  79   { 0, },
  80 };
  81
  82 /*
  83   Give meaningful error messages for errors
  84
  85   We use the format
  86
  87   FILE:LINE:COL: MESSAGE
  88   This happened in ....
  89
  90   This is not standard GNU format, but the test-suite likes the real
  91   message to be in front.
  92
  93  */
  94
  95
  96 void
  97 scm_i_input_error (char const *function,
  98                    SCM port, const char *message, SCM arg)
  99 {
 100   SCM fn = (scm_is_string (SCM_FILENAME(port))
 101             ? SCM_FILENAME(port)
 102             : scm_from_locale_string ("#<unknown port>"));
 103
 104   SCM string_port = scm_open_output_string ();
 105   SCM string = SCM_EOL;
 106   scm_simple_format (string_port,
 107                      scm_from_locale_string ("~A:~S:~S: ~A"),
 108                      scm_list_4 (fn,
 109                                  scm_from_long (SCM_LINUM (port) + 1),
 110                                  scm_from_int (SCM_COL (port) + 1),
 111                                  scm_from_locale_string (message)));
 112
 113   string = scm_get_output_string (string_port);
 114   scm_close_output_port (string_port);
 115   scm_error_scm (scm_from_locale_symbol ("read-error"),
 116                  function? scm_from_locale_string (function) : SCM_BOOL_F,
 117                  string,
 118                  arg,
 119                  SCM_BOOL_F);
 120 }
 121
 122
 123 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
 124             (SCM setting),
 125             "Option interface for the read options. Instead of using\n"
 126             "this procedure directly, use the procedures @code{read-enable},\n"
 127             "@code{read-disable}, @code{read-set!} and @code{read-options}.")
 128 #define FUNC_NAME s_scm_read_options
 129 {
 130   SCM ans = scm_options (setting,
 131                          scm_read_opts,
 132                          FUNC_NAME);
 133   if (SCM_COPY_SOURCE_P)
 134     SCM_RECORD_POSITIONS_P = 1;
 135   return ans;
 136 }
 137 #undef FUNC_NAME
 138
 139 /* An association list mapping extra hash characters to procedures.  */
 140 static SCM *scm_read_hash_procedures;
 141
 142
 143 \f
 144 /* Token readers.  */
 145
 146
 147 /* Size of the C buffer used to read symbols and numbers.  */
 148 #define READER_BUFFER_SIZE            128
 149
 150 /* Size of the C buffer used to read strings.  */
 151 #define READER_STRING_BUFFER_SIZE     512
 152
 153 /* The maximum size of Scheme character names.  */
 154 #define READER_CHAR_NAME_MAX_SIZE      50
 155
 156
 157 /* `isblank' is only in C99.  */
 158 #define CHAR_IS_BLANK_(_chr)                                    \
 159   (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n')      \
 160    || ((_chr) == '\f') || ((_chr) == '\r'))
 161
 162 #ifdef MSDOS
 163 # define CHAR_IS_BLANK(_chr)                    \
 164   ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
 165 #else
 166 # define CHAR_IS_BLANK CHAR_IS_BLANK_
 167 #endif
 168
 169
 170 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
 171    structure'').  */
 172 #define CHAR_IS_R5RS_DELIMITER(c)                               \
 173   (CHAR_IS_BLANK (c)                                            \
 174    || (c == ')') || (c == '(') || (c == ';') || (c == '"'))
 175
 176 #define CHAR_IS_DELIMITER  CHAR_IS_R5RS_DELIMITER
 177
 178 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
 179    Structure''.  */
 180 #define CHAR_IS_EXPONENT_MARKER(_chr)                           \
 181   (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f')        \
 182    || ((_chr) == 'd') || ((_chr) == 'l'))
 183
 184 /* Read an SCSH block comment.  */
 185 static inline SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
 186 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
 187 static SCM scm_read_commented_expression (scm_t_wchar, SCM);
 188 static SCM scm_get_hash_procedure (int);
 189
 190 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Return
 191    zero if the whole token fits in BUF, non-zero otherwise.  */
 192 static inline int
 193 read_token (SCM port, SCM buf, size_t *read)
 194 {
 195   scm_t_wchar chr;
 196   *read = 0;
 197
 198   while (*read < scm_i_string_length (buf))
 199     {
 200       chr = scm_getc (port);
 201
 202       if (chr == EOF)
 203         return 0;
 204
 205       chr = (SCM_CASE_INSENSITIVE_P ? uc_tolower (chr) : chr);
 206
 207       if (CHAR_IS_DELIMITER (chr))
 208         {
 209           scm_ungetc (chr, port);
 210           return 0;
 211         }
 212
 213       scm_i_string_set_x (buf, *read, chr);
 214       (*read)++;
 215     }
 216
 217   return 1;
 218 }
 219
 220 static SCM
 221 read_complete_token (SCM port, size_t *read)
 222 {
 223   SCM buffer;
 224   int overflow;
 225   size_t overflow_read;
 226   SCM tail = SCM_EOL;
 227
 228   buffer = scm_i_make_string (READER_BUFFER_SIZE, NULL);
 229   overflow = read_token (port, buffer, read);
 230   while (overflow)
 231     {
 232       tail = scm_cons (buffer, tail);
 233       buffer = scm_i_make_string (READER_BUFFER_SIZE, NULL);
 234       overflow = read_token (port, buffer, &overflow_read);
 235       *read += overflow_read;
 236     }
 237
 238   if (scm_is_null (tail))
 239     return scm_i_substring (buffer, 0, *read);
 240   else
 241     return scm_string_append
 242       (scm_reverse (scm_cons (scm_i_substring (buffer, 0, overflow_read),
 243                               tail)));
 244 }
 245
 246 /* Skip whitespace from PORT and return the first non-whitespace character
 247    read.  Raise an error on end-of-file.  */
 248 static int
 249 flush_ws (SCM port, const char *eoferr)
 250 {
 251   register scm_t_wchar c;
 252   while (1)
 253     switch (c = scm_getc (port))
 254       {
 255       case EOF:
 256       goteof:
 257         if (eoferr)
 258           {
 259             scm_i_input_error (eoferr,
 260                                port,
 261                                "end of file",
 262                                SCM_EOL);
 263           }
 264         return c;
 265
 266       case ';':
 267       lp:
 268         switch (c = scm_getc (port))
 269           {
 270           case EOF:
 271             goto goteof;
 272           default:
 273             goto lp;
 274           case SCM_LINE_INCREMENTORS:
 275             break;
 276           }
 277         break;
 278
 279       case '#':
 280         switch (c = scm_getc (port))
 281           {
 282           case EOF:
 283             eoferr = "read_sharp";
 284             goto goteof;
 285           case '!':
 286             scm_read_scsh_block_comment (c, port);
 287             break;
 288           case ';':
 289             scm_read_commented_expression (c, port);
 290             break;
 291           case '|':
 292             if (scm_is_false (scm_get_hash_procedure (c)))
 293               {
 294                 scm_read_r6rs_block_comment (c, port);
 295                 break;
 296               }
 297             /* fall through */
 298           default:
 299             scm_ungetc (c, port);
 300             return '#';
 301           }
 302         break;
 303
 304       case SCM_LINE_INCREMENTORS:
 305       case SCM_SINGLE_SPACES:
 306       case '\t':
 307         break;
 308
 309       default:
 310         return c;
 311       }
 312
 313   return 0;
 314 }
 315
 316
 317 \f
 318 /* Token readers.  */
 319
 320 static SCM scm_read_expression (SCM port);
 321 static SCM scm_read_sharp (int chr, SCM port);
 322 static SCM recsexpr (SCM obj, long line, int column, SCM filename);
 323
 324
 325 static SCM
 326 scm_read_sexp (scm_t_wchar chr, SCM port)
 327 #define FUNC_NAME "scm_i_lreadparen"
 328 {
 329   register int c;
 330   register SCM tmp;
 331   register SCM tl, ans = SCM_EOL;
 332   SCM tl2 = SCM_EOL, ans2 = SCM_EOL, copy = SCM_BOOL_F;
 333   static const int terminating_char = ')';
 334
 335   /* Need to capture line and column numbers here. */
 336   long line = SCM_LINUM (port);
 337   int column = SCM_COL (port) - 1;
 338
 339
 340   c = flush_ws (port, FUNC_NAME);
 341   if (terminating_char == c)
 342     return SCM_EOL;
 343
 344   scm_ungetc (c, port);
 345   if (scm_is_eq (scm_sym_dot,
 346                  (tmp = scm_read_expression (port))))
 347     {
 348       ans = scm_read_expression (port);
 349       if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
 350         scm_i_input_error (FUNC_NAME, port, "missing close paren",
 351                            SCM_EOL);
 352       return ans;
 353     }
 354
 355   /* Build the head of the list structure. */
 356   ans = tl = scm_cons (tmp, SCM_EOL);
 357
 358   if (SCM_COPY_SOURCE_P)
 359     ans2 = tl2 = scm_cons (scm_is_pair (tmp)
 360                            ? copy
 361                            : tmp,
 362                            SCM_EOL);
 363
 364   while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
 365     {
 366       SCM new_tail;
 367
 368       scm_ungetc (c, port);
 369       if (scm_is_eq (scm_sym_dot,
 370                      (tmp = scm_read_expression (port))))
 371         {
 372           SCM_SETCDR (tl, tmp = scm_read_expression (port));
 373
 374           if (SCM_COPY_SOURCE_P)
 375             SCM_SETCDR (tl2, scm_cons (scm_is_pair (tmp) ? copy : tmp,
 376                                        SCM_EOL));
 377
 378           c = flush_ws (port, FUNC_NAME);
 379           if (terminating_char != c)
 380             scm_i_input_error (FUNC_NAME, port,
 381                                "in pair: missing close paren", SCM_EOL);
 382           goto exit;
 383         }
 384
 385       new_tail = scm_cons (tmp, SCM_EOL);
 386       SCM_SETCDR (tl, new_tail);
 387       tl = new_tail;
 388
 389       if (SCM_COPY_SOURCE_P)
 390         {
 391           SCM new_tail2 = scm_cons (scm_is_pair (tmp)
 392                                     ? copy
 393                                     : tmp, SCM_EOL);
 394           SCM_SETCDR (tl2, new_tail2);
 395           tl2 = new_tail2;
 396         }
 397     }
 398
 399  exit:
 400   if (SCM_RECORD_POSITIONS_P)
 401     scm_whash_insert (scm_source_whash,
 402                       ans,
 403                       scm_make_srcprops (line, column,
 404                                          SCM_FILENAME (port),
 405                                          SCM_COPY_SOURCE_P
 406                                          ? ans2
 407                                          : SCM_UNDEFINED,
 408                                          SCM_EOL));
 409   return ans;
 410 }
 411 #undef FUNC_NAME
 412
 413 static SCM
 414 scm_read_string (int chr, SCM port)
 415 #define FUNC_NAME "scm_lreadr"
 416 {
 417   /* For strings smaller than C_STR, this function creates only one Scheme
 418      object (the string returned).  */
 419
 420   SCM str = SCM_BOOL_F;
 421   unsigned c_str_len = 0;
 422   scm_t_wchar c;
 423
 424   str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
 425   while ('"' != (c = scm_getc (port)))
 426     {
 427       if (c == EOF)
 428         {
 429         str_eof:
 430           scm_i_input_error (FUNC_NAME, port,
 431                              "end of file in string constant", SCM_EOL);
 432         }
 433
 434       if (c_str_len + 1 >= scm_i_string_length (str))
 435         {
 436           SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
 437
 438           str = scm_string_append (scm_list_2 (str, addy));
 439         }
 440
 441       if (c == '\\')
 442         {
 443           switch (c = scm_getc (port))
 444             {
 445             case EOF:
 446               goto str_eof;
 447             case '"':
 448             case '\\':
 449               break;
 450 #if SCM_ENABLE_ELISP
 451             case '(':
 452             case ')':
 453               if (SCM_ESCAPED_PARENS_P)
 454                 break;
 455               goto bad_escaped;
 456 #endif
 457             case '\n':
 458               continue;
 459             case '0':
 460               c = '\0';
 461               break;
 462             case 'f':
 463               c = '\f';
 464               break;
 465             case 'n':
 466               c = '\n';
 467               break;
 468             case 'r':
 469               c = '\r';
 470               break;
 471             case 't':
 472               c = '\t';
 473               break;
 474             case 'a':
 475               c = '\007';
 476               break;
 477             case 'v':
 478               c = '\v';
 479               break;
 480             case 'x':
 481               {
 482                 scm_t_wchar a, b;
 483                 a = scm_getc (port);
 484                 if (a == EOF)
 485                   goto str_eof;
 486                 b = scm_getc (port);
 487                 if (b == EOF)
 488                   goto str_eof;
 489                 if ('0' <= a && a <= '9')
 490                   a -= '0';
 491                 else if ('A' <= a && a <= 'F')
 492                   a = a - 'A' + 10;
 493                 else if ('a' <= a && a <= 'f')
 494                   a = a - 'a' + 10;
 495                 else
 496                   {
 497                     c = a;
 498                     goto bad_escaped;
 499                   }
 500                 if ('0' <= b && b <= '9')
 501                   b -= '0';
 502                 else if ('A' <= b && b <= 'F')
 503                   b = b - 'A' + 10;
 504                 else if ('a' <= b && b <= 'f')
 505                   b = b - 'a' + 10;
 506                 else
 507                   {
 508                     c = b;
 509                     goto bad_escaped;
 510                   }
 511                 c = a * 16 + b;
 512                 break;
 513               }
 514             case 'u':
 515               {
 516                 scm_t_wchar a;
 517                 int i;
 518                 c = 0;
 519                 for (i = 0; i < 4; i++)
 520                   {
 521                     a = scm_getc (port);
 522                     if (a == EOF)
 523                       goto str_eof;
 524                     if ('0' <= a && a <= '9')
 525                       a -= '0';
 526                     else if ('A' <= a && a <= 'F')
 527                       a = a - 'A' + 10;
 528                     else if ('a' <= a && a <= 'f')
 529                       a = a - 'a' + 10;
 530                     else
 531                       {
 532                         c = a;
 533                         goto bad_escaped;
 534                       }
 535                     c = c * 16 + a;
 536                   }
 537                 break;
 538               }
 539             case 'U':
 540               {
 541                 scm_t_wchar a;
 542                 int i;
 543                 c = 0;
 544                 for (i = 0; i < 6; i++)
 545                   {
 546                     a = scm_getc (port);
 547                     if (a == EOF)
 548                       goto str_eof;
 549                     if ('0' <= a && a <= '9')
 550                       a -= '0';
 551                     else if ('A' <= a && a <= 'F')
 552                       a = a - 'A' + 10;
 553                     else if ('a' <= a && a <= 'f')
 554                       a = a - 'a' + 10;
 555                     else
 556                       {
 557                         c = a;
 558                         goto bad_escaped;
 559                       }
 560                     c = c * 16 + a;
 561                   }
 562                 break;
 563               }
 564             default:
 565             bad_escaped:
 566               scm_i_input_error (FUNC_NAME, port,
 567                                  "illegal character in escape sequence: ~S",
 568                                  scm_list_1 (SCM_MAKE_CHAR (c)));
 569             }
 570         }
 571       str = scm_i_string_start_writing (str);
 572       scm_i_string_set_x (str, c_str_len++, c);
 573       scm_i_string_stop_writing ();
 574     }
 575
 576   if (c_str_len > 0)
 577     {
 578       return scm_i_substring_copy (str, 0, c_str_len);
 579     }
 580
 581   return scm_nullstr;
 582 }
 583 #undef FUNC_NAME
 584
 585
 586 static SCM
 587 scm_read_number (scm_t_wchar chr, SCM port)
 588 {
 589   SCM result;
 590   SCM buffer;
 591   size_t read;
 592
 593   scm_ungetc (chr, port);
 594   buffer = read_complete_token (port, &read);
 595   result = scm_string_to_number (buffer, SCM_UNDEFINED);
 596   if (!scm_is_true (result))
 597     /* Return a symbol instead of a number.  */
 598     result = scm_string_to_symbol (buffer);
 599
 600   return result;
 601 }
 602
 603 static SCM
 604 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
 605 {
 606   SCM result;
 607   int ends_with_colon = 0;
 608   SCM buffer;
 609   size_t read = 0;
 610   int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
 611
 612   scm_ungetc (chr, port);
 613   buffer = read_complete_token (port, &read);
 614   if (read > 0)
 615     ends_with_colon = scm_i_string_ref (buffer, read - 1) == ':';
 616
 617   if (postfix && ends_with_colon && (read > 1))
 618     result = scm_symbol_to_keyword (scm_string_to_symbol (scm_i_substring (buffer, 0, read - 1)));
 619   else
 620     result = scm_string_to_symbol (buffer);
 621
 622   return result;
 623 }
 624
 625 static SCM
 626 scm_read_number_and_radix (scm_t_wchar chr, SCM port)
 627 #define FUNC_NAME "scm_lreadr"
 628 {
 629   SCM result;
 630   size_t read;
 631   SCM buffer = scm_i_make_string (READER_BUFFER_SIZE, NULL);
 632   unsigned int radix;
 633
 634   switch (chr)
 635     {
 636     case 'B':
 637     case 'b':
 638       radix = 2;
 639       break;
 640
 641     case 'o':
 642     case 'O':
 643       radix = 8;
 644       break;
 645
 646     case 'd':
 647     case 'D':
 648       radix = 10;
 649       break;
 650
 651     case 'x':
 652     case 'X':
 653       radix = 16;
 654       break;
 655
 656     default:
 657       scm_ungetc (chr, port);
 658       scm_ungetc ('#', port);
 659       radix = 10;
 660     }
 661
 662   buffer = read_complete_token (port, &read);
 663   result = scm_string_to_number (buffer, scm_from_uint (radix));
 664
 665   if (scm_is_true (result))
 666     return result;
 667
 668   scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
 669
 670   return SCM_BOOL_F;
 671 }
 672 #undef FUNC_NAME
 673
 674 static SCM
 675 scm_read_quote (int chr, SCM port)
 676 {
 677   SCM p;
 678   long line = SCM_LINUM (port);
 679   int column = SCM_COL (port) - 1;
 680
 681   switch (chr)
 682     {
 683     case '`':
 684       p = scm_sym_quasiquote;
 685       break;
 686
 687     case '\'':
 688       p = scm_sym_quote;
 689       break;
 690
 691     case ',':
 692       {
 693         scm_t_wchar c;
 694
 695         c = scm_getc (port);
 696         if ('@' == c)
 697           p = scm_sym_uq_splicing;
 698         else
 699           {
 700             scm_ungetc (c, port);
 701             p = scm_sym_unquote;
 702           }
 703         break;
 704       }
 705
 706     default:
 707       fprintf (stderr, "%s: unhandled quote character (%i)\n",
 708                "scm_read_quote", chr);
 709       abort ();
 710     }
 711
 712   p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
 713   if (SCM_RECORD_POSITIONS_P)
 714     scm_whash_insert (scm_source_whash, p,
 715                       scm_make_srcprops (line, column,
 716                                          SCM_FILENAME (port),
 717                                          SCM_COPY_SOURCE_P
 718                                          ? (scm_cons2 (SCM_CAR (p),
 719                                                        SCM_CAR (SCM_CDR (p)),
 720                                                        SCM_EOL))
 721                                          : SCM_UNDEFINED,
 722                                          SCM_EOL));
 723
 724
 725   return p;
 726 }
 727
 728 SCM_SYMBOL (sym_syntax, "syntax");
 729 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
 730 SCM_SYMBOL (sym_unsyntax, "unsyntax");
 731 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
 732
 733 static SCM
 734 scm_read_syntax (int chr, SCM port)
 735 {
 736   SCM p;
 737   long line = SCM_LINUM (port);
 738   int column = SCM_COL (port) - 1;
 739
 740   switch (chr)
 741     {
 742     case '`':
 743       p = sym_quasisyntax;
 744       break;
 745
 746     case '\'':
 747       p = sym_syntax;
 748       break;
 749
 750     case ',':
 751       {
 752         int c;
 753
 754         c = scm_getc (port);
 755         if ('@' == c)
 756           p = sym_unsyntax_splicing;
 757         else
 758           {
 759             scm_ungetc (c, port);
 760             p = sym_unsyntax;
 761           }
 762         break;
 763       }
 764
 765     default:
 766       fprintf (stderr, "%s: unhandled syntax character (%i)\n",
 767                "scm_read_syntax", chr);
 768       abort ();
 769     }
 770
 771   p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
 772   if (SCM_RECORD_POSITIONS_P)
 773     scm_whash_insert (scm_source_whash, p,
 774                       scm_make_srcprops (line, column,
 775                                          SCM_FILENAME (port),
 776                                          SCM_COPY_SOURCE_P
 777                                          ? (scm_cons2 (SCM_CAR (p),
 778                                                        SCM_CAR (SCM_CDR (p)),
 779                                                        SCM_EOL))
 780                                          : SCM_UNDEFINED,
 781                                          SCM_EOL));
 782
 783
 784   return p;
 785 }
 786
 787 static inline SCM
 788 scm_read_semicolon_comment (int chr, SCM port)
 789 {
 790   int c;
 791
 792   /* We use the get_byte here because there is no need to get the
 793      locale correct with comment input. This presumes that newline
 794      always represents itself no matter what the encoding is.  */
 795   for (c = scm_get_byte_or_eof (port);
 796        (c != EOF) && (c != '\n');
 797        c = scm_getc (port));
 798
 799   return SCM_UNSPECIFIED;
 800 }
 801
 802 \f
 803 /* Sharp readers, i.e. readers called after a `#' sign has been read.  */
 804
 805 static SCM
 806 scm_read_boolean (int chr, SCM port)
 807 {
 808   switch (chr)
 809     {
 810     case 't':
 811     case 'T':
 812       return SCM_BOOL_T;
 813
 814     case 'f':
 815     case 'F':
 816       return SCM_BOOL_F;
 817     }
 818
 819   return SCM_UNSPECIFIED;
 820 }
 821
 822 static SCM
 823 scm_read_character (scm_t_wchar chr, SCM port)
 824 #define FUNC_NAME "scm_lreadr"
 825 {
 826   SCM charname = scm_i_make_string (READER_CHAR_NAME_MAX_SIZE, NULL);
 827   size_t charname_len;
 828   scm_t_wchar cp;
 829   int overflow;
 830
 831   overflow = read_token (port, charname, &charname_len);
 832   charname = scm_c_substring (charname, 0, charname_len);
 833
 834   if (overflow)
 835     goto char_error;
 836
 837   if (charname_len == 0)
 838     {
 839       chr = scm_getc (port);
 840       if (chr == EOF)
 841         scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
 842                            "while reading character", SCM_EOL);
 843
 844       /* CHR must be a token delimiter, like a whitespace.  */
 845       return (SCM_MAKE_CHAR (chr));
 846     }
 847
 848   if (charname_len == 1)
 849     return SCM_MAKE_CHAR (scm_i_string_ref (charname, 0));
 850
 851   cp = scm_i_string_ref (charname, 0);
 852   if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
 853     return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
 854
 855   if (cp >= '0' && cp < '8')
 856     {
 857       /* Dirk:FIXME::  This type of character syntax is not R5RS
 858        * compliant.  Further, it should be verified that the constant
 859        * does only consist of octal digits.  */
 860       SCM p = scm_string_to_number (charname, scm_from_uint (8));
 861       if (SCM_I_INUMP (p))
 862         {
 863           scm_t_wchar c = SCM_I_INUM (p);
 864           if (SCM_IS_UNICODE_CHAR (c))
 865             return SCM_MAKE_CHAR (c);
 866           else
 867             scm_i_input_error (FUNC_NAME, port,
 868                                "out-of-range octal character escape: ~a",
 869                                scm_list_1 (charname));
 870         }
 871     }
 872
 873   /* The names of characters should never have non-Latin1
 874      characters.  */
 875   if (scm_i_is_narrow_string (charname)
 876       || scm_i_try_narrow_string (charname))
 877     { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
 878                                        charname_len);
 879       if (scm_is_true (ch))
 880         return ch;
 881     }
 882
 883  char_error:
 884   scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
 885                      scm_list_1 (charname));
 886
 887   return SCM_UNSPECIFIED;
 888 }
 889 #undef FUNC_NAME
 890
 891 static inline SCM
 892 scm_read_keyword (int chr, SCM port)
 893 {
 894   SCM symbol;
 895
 896   /* Read the symbol that comprises the keyword.  Doing this instead of
 897      invoking a specific symbol reader function allows `scm_read_keyword ()'
 898      to adapt to the delimiters currently valid of symbols.
 899
 900      XXX: This implementation allows sloppy syntaxes like `#:  key'.  */
 901   symbol = scm_read_expression (port);
 902   if (!scm_is_symbol (symbol))
 903     scm_i_input_error ("scm_read_keyword", port,
 904                        "keyword prefix `~a' not followed by a symbol: ~s",
 905                        scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
 906
 907   return (scm_symbol_to_keyword (symbol));
 908 }
 909
 910 static inline SCM
 911 scm_read_vector (int chr, SCM port)
 912 {
 913   /* Note: We call `scm_read_sexp ()' rather than READER here in order to
 914      guarantee that it's going to do what we want.  After all, this is an
 915      implementation detail of `scm_read_vector ()', not a desirable
 916      property.  */
 917   return (scm_vector (scm_read_sexp (chr, port)));
 918 }
 919
 920 static inline SCM
 921 scm_read_srfi4_vector (int chr, SCM port)
 922 {
 923   return scm_i_read_array (port, chr);
 924 }
 925
 926 static SCM
 927 scm_read_bytevector (scm_t_wchar chr, SCM port)
 928 {
 929   chr = scm_getc (port);
 930   if (chr != 'u')
 931     goto syntax;
 932
 933   chr = scm_getc (port);
 934   if (chr != '8')
 935     goto syntax;
 936
 937   chr = scm_getc (port);
 938   if (chr != '(')
 939     goto syntax;
 940
 941   return scm_u8_list_to_bytevector (scm_read_sexp (chr, port));
 942
 943  syntax:
 944   scm_i_input_error ("read_bytevector", port,
 945                      "invalid bytevector prefix",
 946                      SCM_MAKE_CHAR (chr));
 947   return SCM_UNSPECIFIED;
 948 }
 949
 950 static SCM
 951 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port)
 952 {
 953   /* Read the `#*10101'-style read syntax for bit vectors in Guile.  This is
 954      terribly inefficient but who cares?  */
 955   SCM s_bits = SCM_EOL;
 956
 957   for (chr = scm_getc (port);
 958        (chr != EOF) && ((chr == '0') || (chr == '1'));
 959        chr = scm_getc (port))
 960     {
 961       s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
 962     }
 963
 964   if (chr != EOF)
 965     scm_ungetc (chr, port);
 966
 967   return scm_bitvector (scm_reverse_x (s_bits, SCM_EOL));
 968 }
 969
 970 static inline SCM
 971 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
 972 {
 973   int bang_seen = 0;
 974
 975   /* We can use the get_byte here because there is no need to get the
 976      locale correct when reading comments. This presumes that
 977      hash and exclamation points always represent themselves no
 978      matter what the source encoding is.*/
 979   for (;;)
 980     {
 981       int c = scm_get_byte_or_eof (port);
 982
 983       if (c == EOF)
 984         scm_i_input_error ("skip_block_comment", port,
 985                            "unterminated `#! ... !#' comment", SCM_EOL);
 986
 987       if (c == '!')
 988         bang_seen = 1;
 989       else if (c == '#' && bang_seen)
 990         break;
 991       else
 992         bang_seen = 0;
 993     }
 994
 995   return SCM_UNSPECIFIED;
 996 }
 997
 998 static SCM
 999 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1000 {
1001   /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1002      nested.  So care must be taken.  */
1003   int nesting_level = 1;
1004   int opening_seen = 0, closing_seen = 0;
1005
1006   while (nesting_level > 0)
1007     {
1008       int c = scm_getc (port);
1009
1010       if (c == EOF)
1011         scm_i_input_error ("scm_read_r6rs_block_comment", port,
1012                            "unterminated `#| ... |#' comment", SCM_EOL);
1013
1014       if (opening_seen)
1015         {
1016           if (c == '|')
1017             nesting_level++;
1018           opening_seen = 0;
1019         }
1020       else if (closing_seen)
1021         {
1022           if (c == '#')
1023             nesting_level--;
1024           closing_seen = 0;
1025         }
1026       else if (c == '|')
1027         closing_seen = 1;
1028       else if (c == '#')
1029         opening_seen = 1;
1030       else
1031         opening_seen = closing_seen = 0;
1032     }
1033
1034   return SCM_UNSPECIFIED;
1035 }
1036
1037 static SCM
1038 scm_read_commented_expression (scm_t_wchar chr, SCM port)
1039 {
1040   scm_t_wchar c;
1041
1042   c = flush_ws (port, (char *) NULL);
1043   if (EOF == c)
1044     scm_i_input_error ("read_commented_expression", port,
1045                        "no expression after #; comment", SCM_EOL);
1046   scm_ungetc (c, port);
1047   scm_read_expression (port);
1048   return SCM_UNSPECIFIED;
1049 }
1050
1051 static SCM
1052 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1053 {
1054   /* Guile's extended symbol read syntax looks like this:
1055
1056        #{This is all a symbol name}#
1057
1058      So here, CHR is expected to be `{'.  */
1059   int saw_brace = 0, finished = 0;
1060   size_t len = 0;
1061   SCM buf = scm_i_make_string (1024, NULL);
1062
1063   buf = scm_i_string_start_writing (buf);
1064
1065   while ((chr = scm_getc (port)) != EOF)
1066     {
1067       if (saw_brace)
1068         {
1069           if (chr == '#')
1070             {
1071               finished = 1;
1072               break;
1073             }
1074           else
1075             {
1076               saw_brace = 0;
1077               scm_i_string_set_x (buf, len++, '}');
1078               scm_i_string_set_x (buf, len++, chr);
1079             }
1080         }
1081       else if (chr == '}')
1082         saw_brace = 1;
1083       else
1084         scm_i_string_set_x (buf, len++, chr);
1085
1086       if (len >= scm_i_string_length (buf) - 2)
1087         {
1088           SCM addy;
1089
1090           scm_i_string_stop_writing ();
1091           addy = scm_i_make_string (1024, NULL);
1092           buf = scm_string_append (scm_list_2 (buf, addy));
1093           len = 0;
1094           buf = scm_i_string_start_writing (buf);
1095         }
1096
1097       if (finished)
1098         break;
1099     }
1100   scm_i_string_stop_writing ();
1101
1102   return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1103 }
1104
1105
1106 \f
1107 /* Top-level token readers, i.e., dispatchers.  */
1108
1109 static SCM
1110 scm_read_sharp_extension (int chr, SCM port)
1111 {
1112   SCM proc;
1113
1114   proc = scm_get_hash_procedure (chr);
1115   if (scm_is_true (scm_procedure_p (proc)))
1116     {
1117       long line = SCM_LINUM (port);
1118       int column = SCM_COL (port) - 2;
1119       SCM got;
1120
1121       got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1122       if (!scm_is_eq (got, SCM_UNSPECIFIED))
1123         {
1124           if (SCM_RECORD_POSITIONS_P)
1125             return (recsexpr (got, line, column,
1126                               SCM_FILENAME (port)));
1127           else
1128             return got;
1129         }
1130     }
1131
1132   return SCM_UNSPECIFIED;
1133 }
1134
1135 /* The reader for the sharp `#' character.  It basically dispatches reads
1136    among the above token readers.   */
1137 static SCM
1138 scm_read_sharp (scm_t_wchar chr, SCM port)
1139 #define FUNC_NAME "scm_lreadr"
1140 {
1141   SCM result;
1142
1143   chr = scm_getc (port);
1144
1145   result = scm_read_sharp_extension (chr, port);
1146   if (!scm_is_eq (result, SCM_UNSPECIFIED))
1147     return result;
1148
1149   switch (chr)
1150     {
1151     case '\\':
1152       return (scm_read_character (chr, port));
1153     case '(':
1154       return (scm_read_vector (chr, port));
1155     case 's':
1156     case 'u':
1157     case 'f':
1158       /* This one may return either a boolean or an SRFI-4 vector.  */
1159       return (scm_read_srfi4_vector (chr, port));
1160     case 'v':
1161       return (scm_read_bytevector (chr, port));
1162     case '*':
1163       return (scm_read_guile_bit_vector (chr, port));
1164     case 't':
1165     case 'T':
1166     case 'F':
1167       /* This one may return either a boolean or an SRFI-4 vector.  */
1168       return (scm_read_boolean (chr, port));
1169     case ':':
1170       return (scm_read_keyword (chr, port));
1171     case '0': case '1': case '2': case '3': case '4':
1172     case '5': case '6': case '7': case '8': case '9':
1173     case '@':
1174 #if SCM_ENABLE_DEPRECATED
1175       /* See below for 'i' and 'e'. */
1176     case 'a':
1177     case 'c':
1178     case 'y':
1179     case 'h':
1180     case 'l':
1181 #endif
1182       return (scm_i_read_array (port, chr));
1183
1184     case 'i':
1185     case 'e':
1186 #if SCM_ENABLE_DEPRECATED
1187       {
1188         /* When next char is '(', it really is an old-style
1189            uniform array. */
1190         scm_t_wchar next_c = scm_getc (port);
1191         if (next_c != EOF)
1192           scm_ungetc (next_c, port);
1193         if (next_c == '(')
1194           return scm_i_read_array (port, chr);
1195         /* Fall through. */
1196       }
1197 #endif
1198     case 'b':
1199     case 'B':
1200     case 'o':
1201     case 'O':
1202     case 'd':
1203     case 'D':
1204     case 'x':
1205     case 'X':
1206     case 'I':
1207     case 'E':
1208       return (scm_read_number_and_radix (chr, port));
1209     case '{':
1210       return (scm_read_extended_symbol (chr, port));
1211     case '!':
1212       return (scm_read_scsh_block_comment (chr, port));
1213     case ';':
1214       return (scm_read_commented_expression (chr, port));
1215     case '`':
1216     case '\'':
1217     case ',':
1218       return (scm_read_syntax (chr, port));
1219     default:
1220       result = scm_read_sharp_extension (chr, port);
1221       if (scm_is_eq (result, SCM_UNSPECIFIED))
1222         {
1223           /* To remain compatible with 1.8 and earlier, the following
1224              characters have lower precedence than `read-hash-extend'
1225              characters.  */
1226           switch (chr)
1227             {
1228             case '|':
1229               return scm_read_r6rs_block_comment (chr, port);
1230             default:
1231               scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1232                                  scm_list_1 (SCM_MAKE_CHAR (chr)));
1233             }
1234         }
1235       else
1236         return result;
1237     }
1238
1239   return SCM_UNSPECIFIED;
1240 }
1241 #undef FUNC_NAME
1242
1243 static SCM
1244 scm_read_expression (SCM port)
1245 #define FUNC_NAME "scm_read_expression"
1246 {
1247   while (1)
1248     {
1249       register scm_t_wchar chr;
1250
1251       chr = scm_getc (port);
1252
1253       switch (chr)
1254         {
1255         case SCM_WHITE_SPACES:
1256         case SCM_LINE_INCREMENTORS:
1257           break;
1258         case ';':
1259           (void) scm_read_semicolon_comment (chr, port);
1260           break;
1261         case '(':
1262           return (scm_read_sexp (chr, port));
1263         case '"':
1264           return (scm_read_string (chr, port));
1265         case '\'':
1266         case '`':
1267         case ',':
1268           return (scm_read_quote (chr, port));
1269         case '#':
1270           {
1271             SCM result;
1272             result = scm_read_sharp (chr, port);
1273             if (scm_is_eq (result, SCM_UNSPECIFIED))
1274               /* We read a comment or some such.  */
1275               break;
1276             else
1277               return result;
1278           }
1279         case ')':
1280           scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1281           break;
1282         case EOF:
1283           return SCM_EOF_VAL;
1284         case ':':
1285           if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1286             return scm_symbol_to_keyword (scm_read_expression (port));
1287           /* Fall through.  */
1288
1289         default:
1290           {
1291             if (((chr >= '0') && (chr <= '9'))
1292                 || (strchr ("+-.", chr)))
1293               return (scm_read_number (chr, port));
1294             else
1295               return (scm_read_mixed_case_symbol (chr, port));
1296           }
1297         }
1298     }
1299 }
1300 #undef FUNC_NAME
1301
1302 \f
1303 /* Actual reader.  */
1304
1305 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1306             (SCM port),
1307             "Read an s-expression from the input port @var{port}, or from\n"
1308             "the current input port if @var{port} is not specified.\n"
1309             "Any whitespace before the next token is discarded.")
1310 #define FUNC_NAME s_scm_read
1311 {
1312   int c;
1313
1314   if (SCM_UNBNDP (port))
1315     port = scm_current_input_port ();
1316   SCM_VALIDATE_OPINPORT (1, port);
1317
1318   c = flush_ws (port, (char *) NULL);
1319   if (EOF == c)
1320     return SCM_EOF_VAL;
1321   scm_ungetc (c, port);
1322
1323   return (scm_read_expression (port));
1324 }
1325 #undef FUNC_NAME
1326
1327
1328 \f
1329
1330 /* Used when recording expressions constructed by `scm_read_sharp ()'.  */
1331 static SCM
1332 recsexpr (SCM obj, long line, int column, SCM filename)
1333 {
1334   if (!scm_is_pair(obj)) {
1335     return obj;
1336   } else {
1337     SCM tmp = obj, copy;
1338     /* If this sexpr is visible in the read:sharp source, we want to
1339        keep that information, so only record non-constant cons cells
1340        which haven't previously been read by the reader. */
1341     if (scm_is_false (scm_whash_lookup (scm_source_whash, obj)))
1342       {
1343         if (SCM_COPY_SOURCE_P)
1344           {
1345             copy = scm_cons (recsexpr (SCM_CAR (obj), line, column, filename),
1346                              SCM_UNDEFINED);
1347             while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1348               {
1349                 SCM_SETCDR (copy, scm_cons (recsexpr (SCM_CAR (tmp),
1350                                                       line,
1351                                                       column,
1352                                                       filename),
1353                                             SCM_UNDEFINED));
1354                 copy = SCM_CDR (copy);
1355               }
1356             SCM_SETCDR (copy, tmp);
1357           }
1358         else
1359           {
1360             recsexpr (SCM_CAR (obj), line, column, filename);
1361             while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1362               recsexpr (SCM_CAR (tmp), line, column, filename);
1363             copy = SCM_UNDEFINED;
1364           }
1365         scm_whash_insert (scm_source_whash,
1366                           obj,
1367                           scm_make_srcprops (line,
1368                                              column,
1369                                              filename,
1370                                              copy,
1371                                              SCM_EOL));
1372       }
1373     return obj;
1374   }
1375 }
1376
1377 /* Manipulate the read-hash-procedures alist.  This could be written in
1378    Scheme, but maybe it will also be used by C code during initialisation.  */
1379 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1380             (SCM chr, SCM proc),
1381             "Install the procedure @var{proc} for reading expressions\n"
1382             "starting with the character sequence @code{#} and @var{chr}.\n"
1383             "@var{proc} will be called with two arguments:  the character\n"
1384             "@var{chr} and the port to read further data from. The object\n"
1385             "returned will be the return value of @code{read}. \n"
1386             "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1387             )
1388 #define FUNC_NAME s_scm_read_hash_extend
1389 {
1390   SCM this;
1391   SCM prev;
1392
1393   SCM_VALIDATE_CHAR (1, chr);
1394   SCM_ASSERT (scm_is_false (proc)
1395               || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1396               proc, SCM_ARG2, FUNC_NAME);
1397
1398   /* Check if chr is already in the alist.  */
1399   this = *scm_read_hash_procedures;
1400   prev = SCM_BOOL_F;
1401   while (1)
1402     {
1403       if (scm_is_null (this))
1404         {
1405           /* not found, so add it to the beginning.  */
1406           if (scm_is_true (proc))
1407             {
1408               *scm_read_hash_procedures =
1409                 scm_cons (scm_cons (chr, proc), *scm_read_hash_procedures);
1410             }
1411           break;
1412         }
1413       if (scm_is_eq (chr, SCM_CAAR (this)))
1414         {
1415           /* already in the alist.  */
1416           if (scm_is_false (proc))
1417             {
1418               /* remove it.  */
1419               if (scm_is_false (prev))
1420                 {
1421                   *scm_read_hash_procedures =
1422                     SCM_CDR (*scm_read_hash_procedures);
1423                 }
1424               else
1425                 scm_set_cdr_x (prev, SCM_CDR (this));
1426             }
1427           else
1428             {
1429               /* replace it.  */
1430               scm_set_cdr_x (SCM_CAR (this), proc);
1431             }
1432           break;
1433         }
1434       prev = this;
1435       this = SCM_CDR (this);
1436     }
1437
1438   return SCM_UNSPECIFIED;
1439 }
1440 #undef FUNC_NAME
1441
1442 /* Recover the read-hash procedure corresponding to char c.  */
1443 static SCM
1444 scm_get_hash_procedure (int c)
1445 {
1446   SCM rest = *scm_read_hash_procedures;
1447
1448   while (1)
1449     {
1450       if (scm_is_null (rest))
1451         return SCM_BOOL_F;
1452
1453       if (SCM_CHAR (SCM_CAAR (rest)) == c)
1454         return SCM_CDAR (rest);
1455
1456       rest = SCM_CDR (rest);
1457     }
1458 }
1459
1460 #define SCM_ENCODING_SEARCH_SIZE (500)
1461
1462 /* Search the first few hundred characters of a file for an Emacs-like coding
1463    declaration.  Returns either NULL or a string whose storage has been
1464    allocated with `scm_gc_malloc ()'.  */
1465 char *
1466 scm_i_scan_for_encoding (SCM port)
1467 {
1468   char header[SCM_ENCODING_SEARCH_SIZE+1];
1469   size_t bytes_read;
1470   char *encoding = NULL;
1471   int utf8_bom = 0;
1472   char *pos;
1473   int i;
1474   int in_comment;
1475
1476   if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1477     /* PORT is a non-seekable file port (e.g., as created by Bash when using
1478        "guile <(echo '(display "hello")')") so bail out.  */
1479     return NULL;
1480
1481   bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1482
1483   scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1484
1485   if (bytes_read > 3
1486       && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1487     utf8_bom = 1;
1488
1489   /* search past "coding[:=]" */
1490   pos = header;
1491   while (1)
1492     {
1493       if ((pos = strstr(pos, "coding")) == NULL)
1494         return NULL;
1495
1496       pos += strlen("coding");
1497       if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1498           (*pos == ':' || *pos == '='))
1499         {
1500           pos ++;
1501           break;
1502         }
1503     }
1504
1505   /* skip spaces */
1506   while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1507          (*pos == ' ' || *pos == '\t'))
1508     pos ++;
1509
1510   /* grab the next token */
1511   i = 0;
1512   while (pos + i - header <= SCM_ENCODING_SEARCH_SIZE
1513          && pos + i - header < bytes_read
1514          && (isalnum ((int) pos[i]) || strchr ("_-.:/,+=()", pos[i]) != NULL))
1515     i++;
1516
1517   if (i == 0)
1518     return NULL;
1519
1520   encoding = scm_gc_strndup (pos, i, "encoding");
1521   for (i = 0; i < strlen (encoding); i++)
1522     encoding[i] = toupper ((int) encoding[i]);
1523
1524   /* push backwards to make sure we were in a comment */
1525   in_comment = 0;
1526   while (pos - i - header > 0)
1527     {
1528       if (*(pos - i) == '\n')
1529         {
1530           /* This wasn't in a semicolon comment. Check for a
1531            hash-bang comment. */
1532           char *beg = strstr (header, "#!");
1533           char *end = strstr (header, "!#");
1534           if (beg < pos && pos < end)
1535             in_comment = 1;
1536           break;
1537         }
1538       if (*(pos - i) == ';')
1539         {
1540           in_comment = 1;
1541           break;
1542         }
1543       i ++;
1544     }
1545   if (!in_comment)
1546     /* This wasn't in a comment */
1547     return NULL;
1548
1549   if (utf8_bom && strcmp(encoding, "UTF-8"))
1550     scm_misc_error (NULL,
1551                     "the port input declares the encoding ~s but is encoded as UTF-8",
1552                     scm_list_1 (scm_from_locale_string (encoding)));
1553
1554   return encoding;
1555 }
1556
1557 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1558             (SCM port),
1559             "Scans the port for an Emacs-like character coding declaration\n"
1560             "near the top of the contents of a port with random-acessible contents.\n"
1561             "The coding declaration is of the form\n"
1562             "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1563             "\n"
1564             "Returns a string containing the character encoding of the file\n"
1565             "if a declaration was found, or @code{#f} otherwise.\n")
1566 #define FUNC_NAME s_scm_file_encoding
1567 {
1568   char *enc;
1569   SCM s_enc;
1570
1571   enc = scm_i_scan_for_encoding (port);
1572   if (enc == NULL)
1573     return SCM_BOOL_F;
1574   else
1575     {
1576       s_enc = scm_from_locale_string (enc);
1577       return s_enc;
1578     }
1579
1580   return SCM_BOOL_F;
1581 }
1582 #undef FUNC_NAME
1583
1584 void
1585 scm_init_read ()
1586 {
1587   scm_read_hash_procedures =
1588     SCM_VARIABLE_LOC (scm_c_define ("read-hash-procedures", SCM_EOL));
1589
1590   scm_init_opts (scm_read_options, scm_read_opts);
1591 #include "libguile/read.x"
1592 }
1593
1594 /*
1595   Local Variables:
1596   c-file-style: "gnu"
1597   End:
1598 */