libguile/read.c

   1 /* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009 Free Software
   2  * Foundation, Inc.
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public License
   6  * as published by the Free Software Foundation; either version 3 of
   7  * the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful, but
  10  * WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  17  * 02110-1301 USA
  18  */
  19
  20
  21 \f
  22
  23 #ifdef HAVE_CONFIG_H
  24 # include <config.h>
  25 #endif
  26
  27 #include <stdio.h>
  28 #include <ctype.h>
  29 #include <string.h>
  30 #include <unistd.h>
  31 #include <unicase.h>
  32
  33 #include "libguile/_scm.h"
  34 #include "libguile/bytevectors.h"
  35 #include "libguile/chars.h"
  36 #include "libguile/eval.h"
  37 #include "libguile/arrays.h"
  38 #include "libguile/bitvectors.h"
  39 #include "libguile/keywords.h"
  40 #include "libguile/alist.h"
  41 #include "libguile/srcprop.h"
  42 #include "libguile/hashtab.h"
  43 #include "libguile/hash.h"
  44 #include "libguile/ports.h"
  45 #include "libguile/fports.h"
  46 #include "libguile/root.h"
  47 #include "libguile/strings.h"
  48 #include "libguile/strports.h"
  49 #include "libguile/vectors.h"
  50 #include "libguile/validate.h"
  51 #include "libguile/srfi-4.h"
  52 #include "libguile/srfi-13.h"
  53
  54 #include "libguile/read.h"
  55 #include "libguile/private-options.h"
  56
  57
  58 \f
  59
  60 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
  61 SCM_SYMBOL (scm_keyword_prefix, "prefix");
  62 SCM_SYMBOL (scm_keyword_postfix, "postfix");
  63
  64 scm_t_option scm_read_opts[] = {
  65   { SCM_OPTION_BOOLEAN, "copy", 0,
  66     "Copy source code expressions." },
  67   { SCM_OPTION_BOOLEAN, "positions", 0,
  68     "Record positions of source code expressions." },
  69   { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
  70     "Convert symbols to lower case."},
  71   { SCM_OPTION_SCM, "keywords", (unsigned long) SCM_BOOL_F,
  72     "Style of keyword recognition: #f, 'prefix or 'postfix."},
  73 #if SCM_ENABLE_ELISP
  74   { SCM_OPTION_BOOLEAN, "elisp-vectors", 0,
  75     "Support Elisp vector syntax, namely `[...]'."},
  76   { SCM_OPTION_BOOLEAN, "elisp-strings", 0,
  77     "Support `\\(' and `\\)' in strings."},
  78 #endif
  79   { 0, },
  80 };
  81
  82 /*
  83   Give meaningful error messages for errors
  84
  85   We use the format
  86
  87   FILE:LINE:COL: MESSAGE
  88   This happened in ....
  89
  90   This is not standard GNU format, but the test-suite likes the real
  91   message to be in front.
  92
  93  */
  94
  95
  96 void
  97 scm_i_input_error (char const *function,
  98                    SCM port, const char *message, SCM arg)
  99 {
 100   SCM fn = (scm_is_string (SCM_FILENAME(port))
 101             ? SCM_FILENAME(port)
 102             : scm_from_locale_string ("#<unknown port>"));
 103
 104   SCM string_port = scm_open_output_string ();
 105   SCM string = SCM_EOL;
 106   scm_simple_format (string_port,
 107                      scm_from_locale_string ("~A:~S:~S: ~A"),
 108                      scm_list_4 (fn,
 109                                  scm_from_long (SCM_LINUM (port) + 1),
 110                                  scm_from_int (SCM_COL (port) + 1),
 111                                  scm_from_locale_string (message)));
 112
 113   string = scm_get_output_string (string_port);
 114   scm_close_output_port (string_port);
 115   scm_error_scm (scm_from_locale_symbol ("read-error"),
 116                  function? scm_from_locale_string (function) : SCM_BOOL_F,
 117                  string,
 118                  arg,
 119                  SCM_BOOL_F);
 120 }
 121
 122
 123 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
 124             (SCM setting),
 125             "Option interface for the read options. Instead of using\n"
 126             "this procedure directly, use the procedures @code{read-enable},\n"
 127             "@code{read-disable}, @code{read-set!} and @code{read-options}.")
 128 #define FUNC_NAME s_scm_read_options
 129 {
 130   SCM ans = scm_options (setting,
 131                          scm_read_opts,
 132                          FUNC_NAME);
 133   if (SCM_COPY_SOURCE_P)
 134     SCM_RECORD_POSITIONS_P = 1;
 135   return ans;
 136 }
 137 #undef FUNC_NAME
 138
 139 /* An association list mapping extra hash characters to procedures.  */
 140 static SCM *scm_read_hash_procedures;
 141
 142
 143 \f
 144 /* Token readers.  */
 145
 146
 147 /* Size of the C buffer used to read symbols and numbers.  */
 148 #define READER_BUFFER_SIZE            128
 149
 150 /* Size of the C buffer used to read strings.  */
 151 #define READER_STRING_BUFFER_SIZE     512
 152
 153 /* The maximum size of Scheme character names.  */
 154 #define READER_CHAR_NAME_MAX_SIZE      50
 155
 156
 157 /* `isblank' is only in C99.  */
 158 #define CHAR_IS_BLANK_(_chr)                                    \
 159   (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n')      \
 160    || ((_chr) == '\f') || ((_chr) == '\r'))
 161
 162 #ifdef MSDOS
 163 # define CHAR_IS_BLANK(_chr)                    \
 164   ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
 165 #else
 166 # define CHAR_IS_BLANK CHAR_IS_BLANK_
 167 #endif
 168
 169
 170 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
 171    structure'').  */
 172 #define CHAR_IS_R5RS_DELIMITER(c)                               \
 173   (CHAR_IS_BLANK (c)                                            \
 174    || (c == ')') || (c == '(') || (c == ';') || (c == '"'))
 175
 176 #define CHAR_IS_DELIMITER  CHAR_IS_R5RS_DELIMITER
 177
 178 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
 179    Structure''.  */
 180 #define CHAR_IS_EXPONENT_MARKER(_chr)                           \
 181   (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f')        \
 182    || ((_chr) == 'd') || ((_chr) == 'l'))
 183
 184 /* Read an SCSH block comment.  */
 185 static inline SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
 186 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
 187 static SCM scm_read_commented_expression (scm_t_wchar, SCM);
 188 static SCM scm_get_hash_procedure (int);
 189
 190 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Return
 191    zero if the whole token fits in BUF, non-zero otherwise.  */
 192 static inline int
 193 read_token (SCM port, SCM buf, size_t *read)
 194 {
 195   scm_t_wchar chr;
 196   *read = 0;
 197
 198   buf = scm_i_string_start_writing (buf);
 199   while (*read < scm_i_string_length (buf))
 200     {
 201       chr = scm_getc (port);
 202
 203       if (chr == EOF)
 204         {
 205           scm_i_string_stop_writing ();
 206           return 0;
 207         }
 208
 209       chr = (SCM_CASE_INSENSITIVE_P ? uc_tolower (chr) : chr);
 210
 211       if (CHAR_IS_DELIMITER (chr))
 212         {
 213           scm_i_string_stop_writing ();
 214           scm_ungetc (chr, port);
 215           return 0;
 216         }
 217
 218       scm_i_string_set_x (buf, *read, chr);
 219       (*read)++;
 220     }
 221   scm_i_string_stop_writing ();
 222
 223   return 1;
 224 }
 225
 226 static SCM
 227 read_complete_token (SCM port, size_t *read)
 228 {
 229   SCM buffer, str = SCM_EOL;
 230   size_t len;
 231   int overflow;
 232
 233   buffer = scm_i_make_string (READER_BUFFER_SIZE, NULL);
 234   overflow = read_token (port, buffer, read);
 235   if (!overflow)
 236     return scm_i_substring (buffer, 0, *read);
 237
 238   str = scm_string_copy (buffer);
 239   do
 240     {
 241       overflow = read_token (port, buffer, &len);
 242       str = scm_string_append (scm_list_2 (str, buffer));
 243       *read += len;
 244     }
 245   while (overflow);
 246
 247   return scm_i_substring (str, 0, *read);
 248 }
 249
 250 /* Skip whitespace from PORT and return the first non-whitespace character
 251    read.  Raise an error on end-of-file.  */
 252 static int
 253 flush_ws (SCM port, const char *eoferr)
 254 {
 255   register scm_t_wchar c;
 256   while (1)
 257     switch (c = scm_getc (port))
 258       {
 259       case EOF:
 260       goteof:
 261         if (eoferr)
 262           {
 263             scm_i_input_error (eoferr,
 264                                port,
 265                                "end of file",
 266                                SCM_EOL);
 267           }
 268         return c;
 269
 270       case ';':
 271       lp:
 272         switch (c = scm_getc (port))
 273           {
 274           case EOF:
 275             goto goteof;
 276           default:
 277             goto lp;
 278           case SCM_LINE_INCREMENTORS:
 279             break;
 280           }
 281         break;
 282
 283       case '#':
 284         switch (c = scm_getc (port))
 285           {
 286           case EOF:
 287             eoferr = "read_sharp";
 288             goto goteof;
 289           case '!':
 290             scm_read_scsh_block_comment (c, port);
 291             break;
 292           case ';':
 293             scm_read_commented_expression (c, port);
 294             break;
 295           case '|':
 296             if (scm_is_false (scm_get_hash_procedure (c)))
 297               {
 298                 scm_read_r6rs_block_comment (c, port);
 299                 break;
 300               }
 301             /* fall through */
 302           default:
 303             scm_ungetc (c, port);
 304             return '#';
 305           }
 306         break;
 307
 308       case SCM_LINE_INCREMENTORS:
 309       case SCM_SINGLE_SPACES:
 310       case '\t':
 311         break;
 312
 313       default:
 314         return c;
 315       }
 316
 317   return 0;
 318 }
 319
 320
 321 \f
 322 /* Token readers.  */
 323
 324 static SCM scm_read_expression (SCM port);
 325 static SCM scm_read_sharp (int chr, SCM port);
 326 static SCM recsexpr (SCM obj, long line, int column, SCM filename);
 327
 328
 329 static SCM
 330 scm_read_sexp (scm_t_wchar chr, SCM port)
 331 #define FUNC_NAME "scm_i_lreadparen"
 332 {
 333   register int c;
 334   register SCM tmp;
 335   register SCM tl, ans = SCM_EOL;
 336   SCM tl2 = SCM_EOL, ans2 = SCM_EOL, copy = SCM_BOOL_F;
 337   static const int terminating_char = ')';
 338
 339   /* Need to capture line and column numbers here. */
 340   long line = SCM_LINUM (port);
 341   int column = SCM_COL (port) - 1;
 342
 343
 344   c = flush_ws (port, FUNC_NAME);
 345   if (terminating_char == c)
 346     return SCM_EOL;
 347
 348   scm_ungetc (c, port);
 349   if (scm_is_eq (scm_sym_dot,
 350                  (tmp = scm_read_expression (port))))
 351     {
 352       ans = scm_read_expression (port);
 353       if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
 354         scm_i_input_error (FUNC_NAME, port, "missing close paren",
 355                            SCM_EOL);
 356       return ans;
 357     }
 358
 359   /* Build the head of the list structure. */
 360   ans = tl = scm_cons (tmp, SCM_EOL);
 361
 362   if (SCM_COPY_SOURCE_P)
 363     ans2 = tl2 = scm_cons (scm_is_pair (tmp)
 364                            ? copy
 365                            : tmp,
 366                            SCM_EOL);
 367
 368   while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
 369     {
 370       SCM new_tail;
 371
 372       scm_ungetc (c, port);
 373       if (scm_is_eq (scm_sym_dot,
 374                      (tmp = scm_read_expression (port))))
 375         {
 376           SCM_SETCDR (tl, tmp = scm_read_expression (port));
 377
 378           if (SCM_COPY_SOURCE_P)
 379             SCM_SETCDR (tl2, scm_cons (scm_is_pair (tmp) ? copy : tmp,
 380                                        SCM_EOL));
 381
 382           c = flush_ws (port, FUNC_NAME);
 383           if (terminating_char != c)
 384             scm_i_input_error (FUNC_NAME, port,
 385                                "in pair: missing close paren", SCM_EOL);
 386           goto exit;
 387         }
 388
 389       new_tail = scm_cons (tmp, SCM_EOL);
 390       SCM_SETCDR (tl, new_tail);
 391       tl = new_tail;
 392
 393       if (SCM_COPY_SOURCE_P)
 394         {
 395           SCM new_tail2 = scm_cons (scm_is_pair (tmp)
 396                                     ? copy
 397                                     : tmp, SCM_EOL);
 398           SCM_SETCDR (tl2, new_tail2);
 399           tl2 = new_tail2;
 400         }
 401     }
 402
 403  exit:
 404   if (SCM_RECORD_POSITIONS_P)
 405     scm_whash_insert (scm_source_whash,
 406                       ans,
 407                       scm_make_srcprops (line, column,
 408                                          SCM_FILENAME (port),
 409                                          SCM_COPY_SOURCE_P
 410                                          ? ans2
 411                                          : SCM_UNDEFINED,
 412                                          SCM_EOL));
 413   return ans;
 414 }
 415 #undef FUNC_NAME
 416
 417 static SCM
 418 scm_read_string (int chr, SCM port)
 419 #define FUNC_NAME "scm_lreadr"
 420 {
 421   /* For strings smaller than C_STR, this function creates only one Scheme
 422      object (the string returned).  */
 423
 424   SCM str = SCM_BOOL_F;
 425   unsigned c_str_len = 0;
 426   scm_t_wchar c;
 427
 428   str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
 429   while ('"' != (c = scm_getc (port)))
 430     {
 431       if (c == EOF)
 432         {
 433         str_eof:
 434           scm_i_input_error (FUNC_NAME, port,
 435                              "end of file in string constant", SCM_EOL);
 436         }
 437
 438       if (c_str_len + 1 >= scm_i_string_length (str))
 439         {
 440           SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
 441
 442           str = scm_string_append (scm_list_2 (str, addy));
 443         }
 444
 445       if (c == '\\')
 446         {
 447           switch (c = scm_getc (port))
 448             {
 449             case EOF:
 450               goto str_eof;
 451             case '"':
 452             case '\\':
 453               break;
 454 #if SCM_ENABLE_ELISP
 455             case '(':
 456             case ')':
 457               if (SCM_ESCAPED_PARENS_P)
 458                 break;
 459               goto bad_escaped;
 460 #endif
 461             case '\n':
 462               continue;
 463             case '0':
 464               c = '\0';
 465               break;
 466             case 'f':
 467               c = '\f';
 468               break;
 469             case 'n':
 470               c = '\n';
 471               break;
 472             case 'r':
 473               c = '\r';
 474               break;
 475             case 't':
 476               c = '\t';
 477               break;
 478             case 'a':
 479               c = '\007';
 480               break;
 481             case 'v':
 482               c = '\v';
 483               break;
 484             case 'x':
 485               {
 486                 scm_t_wchar a, b;
 487                 a = scm_getc (port);
 488                 if (a == EOF)
 489                   goto str_eof;
 490                 b = scm_getc (port);
 491                 if (b == EOF)
 492                   goto str_eof;
 493                 if ('0' <= a && a <= '9')
 494                   a -= '0';
 495                 else if ('A' <= a && a <= 'F')
 496                   a = a - 'A' + 10;
 497                 else if ('a' <= a && a <= 'f')
 498                   a = a - 'a' + 10;
 499                 else
 500                   {
 501                     c = a;
 502                     goto bad_escaped;
 503                   }
 504                 if ('0' <= b && b <= '9')
 505                   b -= '0';
 506                 else if ('A' <= b && b <= 'F')
 507                   b = b - 'A' + 10;
 508                 else if ('a' <= b && b <= 'f')
 509                   b = b - 'a' + 10;
 510                 else
 511                   {
 512                     c = b;
 513                     goto bad_escaped;
 514                   }
 515                 c = a * 16 + b;
 516                 break;
 517               }
 518             case 'u':
 519               {
 520                 scm_t_wchar a;
 521                 int i;
 522                 c = 0;
 523                 for (i = 0; i < 4; i++)
 524                   {
 525                     a = scm_getc (port);
 526                     if (a == EOF)
 527                       goto str_eof;
 528                     if ('0' <= a && a <= '9')
 529                       a -= '0';
 530                     else if ('A' <= a && a <= 'F')
 531                       a = a - 'A' + 10;
 532                     else if ('a' <= a && a <= 'f')
 533                       a = a - 'a' + 10;
 534                     else
 535                       {
 536                         c = a;
 537                         goto bad_escaped;
 538                       }
 539                     c = c * 16 + a;
 540                   }
 541                 break;
 542               }
 543             case 'U':
 544               {
 545                 scm_t_wchar a;
 546                 int i;
 547                 c = 0;
 548                 for (i = 0; i < 6; i++)
 549                   {
 550                     a = scm_getc (port);
 551                     if (a == EOF)
 552                       goto str_eof;
 553                     if ('0' <= a && a <= '9')
 554                       a -= '0';
 555                     else if ('A' <= a && a <= 'F')
 556                       a = a - 'A' + 10;
 557                     else if ('a' <= a && a <= 'f')
 558                       a = a - 'a' + 10;
 559                     else
 560                       {
 561                         c = a;
 562                         goto bad_escaped;
 563                       }
 564                     c = c * 16 + a;
 565                   }
 566                 break;
 567               }
 568             default:
 569             bad_escaped:
 570               scm_i_input_error (FUNC_NAME, port,
 571                                  "illegal character in escape sequence: ~S",
 572                                  scm_list_1 (SCM_MAKE_CHAR (c)));
 573             }
 574         }
 575       str = scm_i_string_start_writing (str);
 576       scm_i_string_set_x (str, c_str_len++, c);
 577       scm_i_string_stop_writing ();
 578     }
 579
 580   if (c_str_len > 0)
 581     {
 582       return scm_i_substring_copy (str, 0, c_str_len);
 583     }
 584
 585   return scm_nullstr;
 586 }
 587 #undef FUNC_NAME
 588
 589
 590 static SCM
 591 scm_read_number (scm_t_wchar chr, SCM port)
 592 {
 593   SCM result;
 594   SCM buffer;
 595   size_t read;
 596
 597   scm_ungetc (chr, port);
 598   buffer = read_complete_token (port, &read);
 599   result = scm_string_to_number (buffer, SCM_UNDEFINED);
 600   if (!scm_is_true (result))
 601     /* Return a symbol instead of a number.  */
 602     result = scm_string_to_symbol (buffer);
 603
 604   return result;
 605 }
 606
 607 static SCM
 608 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
 609 {
 610   SCM result;
 611   int ends_with_colon = 0;
 612   SCM buffer;
 613   size_t read = 0;
 614   int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
 615
 616   scm_ungetc (chr, port);
 617   buffer = read_complete_token (port, &read);
 618   if (read > 0)
 619     ends_with_colon = scm_i_string_ref (buffer, read - 1) == ':';
 620
 621   if (postfix && ends_with_colon && (read > 1))
 622     result = scm_symbol_to_keyword (scm_string_to_symbol (scm_i_substring (buffer, 0, read - 1)));
 623   else
 624     result = scm_string_to_symbol (buffer);
 625
 626   return result;
 627 }
 628
 629 static SCM
 630 scm_read_number_and_radix (scm_t_wchar chr, SCM port)
 631 #define FUNC_NAME "scm_lreadr"
 632 {
 633   SCM result;
 634   size_t read;
 635   SCM buffer = scm_i_make_string (READER_BUFFER_SIZE, NULL);
 636   unsigned int radix;
 637
 638   switch (chr)
 639     {
 640     case 'B':
 641     case 'b':
 642       radix = 2;
 643       break;
 644
 645     case 'o':
 646     case 'O':
 647       radix = 8;
 648       break;
 649
 650     case 'd':
 651     case 'D':
 652       radix = 10;
 653       break;
 654
 655     case 'x':
 656     case 'X':
 657       radix = 16;
 658       break;
 659
 660     default:
 661       scm_ungetc (chr, port);
 662       scm_ungetc ('#', port);
 663       radix = 10;
 664     }
 665
 666   buffer = read_complete_token (port, &read);
 667   result = scm_string_to_number (buffer, scm_from_uint (radix));
 668
 669   if (scm_is_true (result))
 670     return result;
 671
 672   scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
 673
 674   return SCM_BOOL_F;
 675 }
 676 #undef FUNC_NAME
 677
 678 static SCM
 679 scm_read_quote (int chr, SCM port)
 680 {
 681   SCM p;
 682   long line = SCM_LINUM (port);
 683   int column = SCM_COL (port) - 1;
 684
 685   switch (chr)
 686     {
 687     case '`':
 688       p = scm_sym_quasiquote;
 689       break;
 690
 691     case '\'':
 692       p = scm_sym_quote;
 693       break;
 694
 695     case ',':
 696       {
 697         scm_t_wchar c;
 698
 699         c = scm_getc (port);
 700         if ('@' == c)
 701           p = scm_sym_uq_splicing;
 702         else
 703           {
 704             scm_ungetc (c, port);
 705             p = scm_sym_unquote;
 706           }
 707         break;
 708       }
 709
 710     default:
 711       fprintf (stderr, "%s: unhandled quote character (%i)\n",
 712                "scm_read_quote", chr);
 713       abort ();
 714     }
 715
 716   p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
 717   if (SCM_RECORD_POSITIONS_P)
 718     scm_whash_insert (scm_source_whash, p,
 719                       scm_make_srcprops (line, column,
 720                                          SCM_FILENAME (port),
 721                                          SCM_COPY_SOURCE_P
 722                                          ? (scm_cons2 (SCM_CAR (p),
 723                                                        SCM_CAR (SCM_CDR (p)),
 724                                                        SCM_EOL))
 725                                          : SCM_UNDEFINED,
 726                                          SCM_EOL));
 727
 728
 729   return p;
 730 }
 731
 732 SCM_SYMBOL (sym_syntax, "syntax");
 733 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
 734 SCM_SYMBOL (sym_unsyntax, "unsyntax");
 735 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
 736
 737 static SCM
 738 scm_read_syntax (int chr, SCM port)
 739 {
 740   SCM p;
 741   long line = SCM_LINUM (port);
 742   int column = SCM_COL (port) - 1;
 743
 744   switch (chr)
 745     {
 746     case '`':
 747       p = sym_quasisyntax;
 748       break;
 749
 750     case '\'':
 751       p = sym_syntax;
 752       break;
 753
 754     case ',':
 755       {
 756         int c;
 757
 758         c = scm_getc (port);
 759         if ('@' == c)
 760           p = sym_unsyntax_splicing;
 761         else
 762           {
 763             scm_ungetc (c, port);
 764             p = sym_unsyntax;
 765           }
 766         break;
 767       }
 768
 769     default:
 770       fprintf (stderr, "%s: unhandled syntax character (%i)\n",
 771                "scm_read_syntax", chr);
 772       abort ();
 773     }
 774
 775   p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
 776   if (SCM_RECORD_POSITIONS_P)
 777     scm_whash_insert (scm_source_whash, p,
 778                       scm_make_srcprops (line, column,
 779                                          SCM_FILENAME (port),
 780                                          SCM_COPY_SOURCE_P
 781                                          ? (scm_cons2 (SCM_CAR (p),
 782                                                        SCM_CAR (SCM_CDR (p)),
 783                                                        SCM_EOL))
 784                                          : SCM_UNDEFINED,
 785                                          SCM_EOL));
 786
 787
 788   return p;
 789 }
 790
 791 static inline SCM
 792 scm_read_semicolon_comment (int chr, SCM port)
 793 {
 794   int c;
 795
 796   /* We use the get_byte here because there is no need to get the
 797      locale correct with comment input. This presumes that newline
 798      always represents itself no matter what the encoding is.  */
 799   for (c = scm_get_byte_or_eof (port);
 800        (c != EOF) && (c != '\n');
 801        c = scm_getc (port));
 802
 803   return SCM_UNSPECIFIED;
 804 }
 805
 806 \f
 807 /* Sharp readers, i.e. readers called after a `#' sign has been read.  */
 808
 809 static SCM
 810 scm_read_boolean (int chr, SCM port)
 811 {
 812   switch (chr)
 813     {
 814     case 't':
 815     case 'T':
 816       return SCM_BOOL_T;
 817
 818     case 'f':
 819     case 'F':
 820       return SCM_BOOL_F;
 821     }
 822
 823   return SCM_UNSPECIFIED;
 824 }
 825
 826 static SCM
 827 scm_read_character (scm_t_wchar chr, SCM port)
 828 #define FUNC_NAME "scm_lreadr"
 829 {
 830   SCM charname = scm_i_make_string (READER_CHAR_NAME_MAX_SIZE, NULL);
 831   size_t charname_len;
 832   scm_t_wchar cp;
 833   int overflow;
 834
 835   overflow = read_token (port, charname, &charname_len);
 836   charname = scm_c_substring (charname, 0, charname_len);
 837
 838   if (overflow)
 839     goto char_error;
 840
 841   if (charname_len == 0)
 842     {
 843       chr = scm_getc (port);
 844       if (chr == EOF)
 845         scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
 846                            "while reading character", SCM_EOL);
 847
 848       /* CHR must be a token delimiter, like a whitespace.  */
 849       return (SCM_MAKE_CHAR (chr));
 850     }
 851
 852   if (charname_len == 1)
 853     return SCM_MAKE_CHAR (scm_i_string_ref (charname, 0));
 854
 855   cp = scm_i_string_ref (charname, 0);
 856   if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
 857     return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
 858
 859   if (cp >= '0' && cp < '8')
 860     {
 861       /* Dirk:FIXME::  This type of character syntax is not R5RS
 862        * compliant.  Further, it should be verified that the constant
 863        * does only consist of octal digits.  */
 864       SCM p = scm_string_to_number (charname, scm_from_uint (8));
 865       if (SCM_I_INUMP (p))
 866         {
 867           scm_t_wchar c = SCM_I_INUM (p);
 868           if (SCM_IS_UNICODE_CHAR (c))
 869             return SCM_MAKE_CHAR (c);
 870           else
 871             scm_i_input_error (FUNC_NAME, port,
 872                                "out-of-range octal character escape: ~a",
 873                                scm_list_1 (charname));
 874         }
 875     }
 876
 877   /* The names of characters should never have non-Latin1
 878      characters.  */
 879   if (scm_i_is_narrow_string (charname)
 880       || scm_i_try_narrow_string (charname))
 881     { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
 882                                        charname_len);
 883       if (scm_is_true (ch))
 884         return ch;
 885     }
 886
 887  char_error:
 888   scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
 889                      scm_list_1 (charname));
 890
 891   return SCM_UNSPECIFIED;
 892 }
 893 #undef FUNC_NAME
 894
 895 static inline SCM
 896 scm_read_keyword (int chr, SCM port)
 897 {
 898   SCM symbol;
 899
 900   /* Read the symbol that comprises the keyword.  Doing this instead of
 901      invoking a specific symbol reader function allows `scm_read_keyword ()'
 902      to adapt to the delimiters currently valid of symbols.
 903
 904      XXX: This implementation allows sloppy syntaxes like `#:  key'.  */
 905   symbol = scm_read_expression (port);
 906   if (!scm_is_symbol (symbol))
 907     scm_i_input_error ("scm_read_keyword", port,
 908                        "keyword prefix `~a' not followed by a symbol: ~s",
 909                        scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
 910
 911   return (scm_symbol_to_keyword (symbol));
 912 }
 913
 914 static inline SCM
 915 scm_read_vector (int chr, SCM port)
 916 {
 917   /* Note: We call `scm_read_sexp ()' rather than READER here in order to
 918      guarantee that it's going to do what we want.  After all, this is an
 919      implementation detail of `scm_read_vector ()', not a desirable
 920      property.  */
 921   return (scm_vector (scm_read_sexp (chr, port)));
 922 }
 923
 924 static inline SCM
 925 scm_read_srfi4_vector (int chr, SCM port)
 926 {
 927   return scm_i_read_array (port, chr);
 928 }
 929
 930 static SCM
 931 scm_read_bytevector (scm_t_wchar chr, SCM port)
 932 {
 933   chr = scm_getc (port);
 934   if (chr != 'u')
 935     goto syntax;
 936
 937   chr = scm_getc (port);
 938   if (chr != '8')
 939     goto syntax;
 940
 941   chr = scm_getc (port);
 942   if (chr != '(')
 943     goto syntax;
 944
 945   return scm_u8_list_to_bytevector (scm_read_sexp (chr, port));
 946
 947  syntax:
 948   scm_i_input_error ("read_bytevector", port,
 949                      "invalid bytevector prefix",
 950                      SCM_MAKE_CHAR (chr));
 951   return SCM_UNSPECIFIED;
 952 }
 953
 954 static SCM
 955 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port)
 956 {
 957   /* Read the `#*10101'-style read syntax for bit vectors in Guile.  This is
 958      terribly inefficient but who cares?  */
 959   SCM s_bits = SCM_EOL;
 960
 961   for (chr = scm_getc (port);
 962        (chr != EOF) && ((chr == '0') || (chr == '1'));
 963        chr = scm_getc (port))
 964     {
 965       s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
 966     }
 967
 968   if (chr != EOF)
 969     scm_ungetc (chr, port);
 970
 971   return scm_bitvector (scm_reverse_x (s_bits, SCM_EOL));
 972 }
 973
 974 static inline SCM
 975 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
 976 {
 977   int bang_seen = 0;
 978
 979   /* We can use the get_byte here because there is no need to get the
 980      locale correct when reading comments. This presumes that
 981      hash and exclamation points always represent themselves no
 982      matter what the source encoding is.*/
 983   for (;;)
 984     {
 985       int c = scm_get_byte_or_eof (port);
 986
 987       if (c == EOF)
 988         scm_i_input_error ("skip_block_comment", port,
 989                            "unterminated `#! ... !#' comment", SCM_EOL);
 990
 991       if (c == '!')
 992         bang_seen = 1;
 993       else if (c == '#' && bang_seen)
 994         break;
 995       else
 996         bang_seen = 0;
 997     }
 998
 999   return SCM_UNSPECIFIED;
1000 }
1001
1002 static SCM
1003 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1004 {
1005   /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1006      nested.  So care must be taken.  */
1007   int nesting_level = 1;
1008   int opening_seen = 0, closing_seen = 0;
1009
1010   while (nesting_level > 0)
1011     {
1012       int c = scm_getc (port);
1013
1014       if (c == EOF)
1015         scm_i_input_error ("scm_read_r6rs_block_comment", port,
1016                            "unterminated `#| ... |#' comment", SCM_EOL);
1017
1018       if (opening_seen)
1019         {
1020           if (c == '|')
1021             nesting_level++;
1022           opening_seen = 0;
1023         }
1024       else if (closing_seen)
1025         {
1026           if (c == '#')
1027             nesting_level--;
1028           closing_seen = 0;
1029         }
1030       else if (c == '|')
1031         closing_seen = 1;
1032       else if (c == '#')
1033         opening_seen = 1;
1034       else
1035         opening_seen = closing_seen = 0;
1036     }
1037
1038   return SCM_UNSPECIFIED;
1039 }
1040
1041 static SCM
1042 scm_read_commented_expression (scm_t_wchar chr, SCM port)
1043 {
1044   scm_t_wchar c;
1045
1046   c = flush_ws (port, (char *) NULL);
1047   if (EOF == c)
1048     scm_i_input_error ("read_commented_expression", port,
1049                        "no expression after #; comment", SCM_EOL);
1050   scm_ungetc (c, port);
1051   scm_read_expression (port);
1052   return SCM_UNSPECIFIED;
1053 }
1054
1055 static SCM
1056 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1057 {
1058   /* Guile's extended symbol read syntax looks like this:
1059
1060        #{This is all a symbol name}#
1061
1062      So here, CHR is expected to be `{'.  */
1063   int saw_brace = 0, finished = 0;
1064   size_t len = 0;
1065   SCM buf = scm_i_make_string (1024, NULL);
1066
1067   buf = scm_i_string_start_writing (buf);
1068
1069   while ((chr = scm_getc (port)) != EOF)
1070     {
1071       if (saw_brace)
1072         {
1073           if (chr == '#')
1074             {
1075               finished = 1;
1076               break;
1077             }
1078           else
1079             {
1080               saw_brace = 0;
1081               scm_i_string_set_x (buf, len++, '}');
1082               scm_i_string_set_x (buf, len++, chr);
1083             }
1084         }
1085       else if (chr == '}')
1086         saw_brace = 1;
1087       else
1088         scm_i_string_set_x (buf, len++, chr);
1089
1090       if (len >= scm_i_string_length (buf) - 2)
1091         {
1092           SCM addy;
1093
1094           scm_i_string_stop_writing ();
1095           addy = scm_i_make_string (1024, NULL);
1096           buf = scm_string_append (scm_list_2 (buf, addy));
1097           len = 0;
1098           buf = scm_i_string_start_writing (buf);
1099         }
1100
1101       if (finished)
1102         break;
1103     }
1104   scm_i_string_stop_writing ();
1105
1106   return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1107 }
1108
1109
1110 \f
1111 /* Top-level token readers, i.e., dispatchers.  */
1112
1113 static SCM
1114 scm_read_sharp_extension (int chr, SCM port)
1115 {
1116   SCM proc;
1117
1118   proc = scm_get_hash_procedure (chr);
1119   if (scm_is_true (scm_procedure_p (proc)))
1120     {
1121       long line = SCM_LINUM (port);
1122       int column = SCM_COL (port) - 2;
1123       SCM got;
1124
1125       got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1126       if (!scm_is_eq (got, SCM_UNSPECIFIED))
1127         {
1128           if (SCM_RECORD_POSITIONS_P)
1129             return (recsexpr (got, line, column,
1130                               SCM_FILENAME (port)));
1131           else
1132             return got;
1133         }
1134     }
1135
1136   return SCM_UNSPECIFIED;
1137 }
1138
1139 /* The reader for the sharp `#' character.  It basically dispatches reads
1140    among the above token readers.   */
1141 static SCM
1142 scm_read_sharp (scm_t_wchar chr, SCM port)
1143 #define FUNC_NAME "scm_lreadr"
1144 {
1145   SCM result;
1146
1147   chr = scm_getc (port);
1148
1149   result = scm_read_sharp_extension (chr, port);
1150   if (!scm_is_eq (result, SCM_UNSPECIFIED))
1151     return result;
1152
1153   switch (chr)
1154     {
1155     case '\\':
1156       return (scm_read_character (chr, port));
1157     case '(':
1158       return (scm_read_vector (chr, port));
1159     case 's':
1160     case 'u':
1161     case 'f':
1162       /* This one may return either a boolean or an SRFI-4 vector.  */
1163       return (scm_read_srfi4_vector (chr, port));
1164     case 'v':
1165       return (scm_read_bytevector (chr, port));
1166     case '*':
1167       return (scm_read_guile_bit_vector (chr, port));
1168     case 't':
1169     case 'T':
1170     case 'F':
1171       /* This one may return either a boolean or an SRFI-4 vector.  */
1172       return (scm_read_boolean (chr, port));
1173     case ':':
1174       return (scm_read_keyword (chr, port));
1175     case '0': case '1': case '2': case '3': case '4':
1176     case '5': case '6': case '7': case '8': case '9':
1177     case '@':
1178 #if SCM_ENABLE_DEPRECATED
1179       /* See below for 'i' and 'e'. */
1180     case 'a':
1181     case 'c':
1182     case 'y':
1183     case 'h':
1184     case 'l':
1185 #endif
1186       return (scm_i_read_array (port, chr));
1187
1188     case 'i':
1189     case 'e':
1190 #if SCM_ENABLE_DEPRECATED
1191       {
1192         /* When next char is '(', it really is an old-style
1193            uniform array. */
1194         scm_t_wchar next_c = scm_getc (port);
1195         if (next_c != EOF)
1196           scm_ungetc (next_c, port);
1197         if (next_c == '(')
1198           return scm_i_read_array (port, chr);
1199         /* Fall through. */
1200       }
1201 #endif
1202     case 'b':
1203     case 'B':
1204     case 'o':
1205     case 'O':
1206     case 'd':
1207     case 'D':
1208     case 'x':
1209     case 'X':
1210     case 'I':
1211     case 'E':
1212       return (scm_read_number_and_radix (chr, port));
1213     case '{':
1214       return (scm_read_extended_symbol (chr, port));
1215     case '!':
1216       return (scm_read_scsh_block_comment (chr, port));
1217     case ';':
1218       return (scm_read_commented_expression (chr, port));
1219     case '`':
1220     case '\'':
1221     case ',':
1222       return (scm_read_syntax (chr, port));
1223     default:
1224       result = scm_read_sharp_extension (chr, port);
1225       if (scm_is_eq (result, SCM_UNSPECIFIED))
1226         {
1227           /* To remain compatible with 1.8 and earlier, the following
1228              characters have lower precedence than `read-hash-extend'
1229              characters.  */
1230           switch (chr)
1231             {
1232             case '|':
1233               return scm_read_r6rs_block_comment (chr, port);
1234             default:
1235               scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1236                                  scm_list_1 (SCM_MAKE_CHAR (chr)));
1237             }
1238         }
1239       else
1240         return result;
1241     }
1242
1243   return SCM_UNSPECIFIED;
1244 }
1245 #undef FUNC_NAME
1246
1247 static SCM
1248 scm_read_expression (SCM port)
1249 #define FUNC_NAME "scm_read_expression"
1250 {
1251   while (1)
1252     {
1253       register scm_t_wchar chr;
1254
1255       chr = scm_getc (port);
1256
1257       switch (chr)
1258         {
1259         case SCM_WHITE_SPACES:
1260         case SCM_LINE_INCREMENTORS:
1261           break;
1262         case ';':
1263           (void) scm_read_semicolon_comment (chr, port);
1264           break;
1265         case '(':
1266           return (scm_read_sexp (chr, port));
1267         case '"':
1268           return (scm_read_string (chr, port));
1269         case '\'':
1270         case '`':
1271         case ',':
1272           return (scm_read_quote (chr, port));
1273         case '#':
1274           {
1275             SCM result;
1276             result = scm_read_sharp (chr, port);
1277             if (scm_is_eq (result, SCM_UNSPECIFIED))
1278               /* We read a comment or some such.  */
1279               break;
1280             else
1281               return result;
1282           }
1283         case ')':
1284           scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1285           break;
1286         case EOF:
1287           return SCM_EOF_VAL;
1288         case ':':
1289           if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1290             return scm_symbol_to_keyword (scm_read_expression (port));
1291           /* Fall through.  */
1292
1293         default:
1294           {
1295             if (((chr >= '0') && (chr <= '9'))
1296                 || (strchr ("+-.", chr)))
1297               return (scm_read_number (chr, port));
1298             else
1299               return (scm_read_mixed_case_symbol (chr, port));
1300           }
1301         }
1302     }
1303 }
1304 #undef FUNC_NAME
1305
1306 \f
1307 /* Actual reader.  */
1308
1309 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1310             (SCM port),
1311             "Read an s-expression from the input port @var{port}, or from\n"
1312             "the current input port if @var{port} is not specified.\n"
1313             "Any whitespace before the next token is discarded.")
1314 #define FUNC_NAME s_scm_read
1315 {
1316   int c;
1317
1318   if (SCM_UNBNDP (port))
1319     port = scm_current_input_port ();
1320   SCM_VALIDATE_OPINPORT (1, port);
1321
1322   c = flush_ws (port, (char *) NULL);
1323   if (EOF == c)
1324     return SCM_EOF_VAL;
1325   scm_ungetc (c, port);
1326
1327   return (scm_read_expression (port));
1328 }
1329 #undef FUNC_NAME
1330
1331
1332 \f
1333
1334 /* Used when recording expressions constructed by `scm_read_sharp ()'.  */
1335 static SCM
1336 recsexpr (SCM obj, long line, int column, SCM filename)
1337 {
1338   if (!scm_is_pair(obj)) {
1339     return obj;
1340   } else {
1341     SCM tmp = obj, copy;
1342     /* If this sexpr is visible in the read:sharp source, we want to
1343        keep that information, so only record non-constant cons cells
1344        which haven't previously been read by the reader. */
1345     if (scm_is_false (scm_whash_lookup (scm_source_whash, obj)))
1346       {
1347         if (SCM_COPY_SOURCE_P)
1348           {
1349             copy = scm_cons (recsexpr (SCM_CAR (obj), line, column, filename),
1350                              SCM_UNDEFINED);
1351             while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1352               {
1353                 SCM_SETCDR (copy, scm_cons (recsexpr (SCM_CAR (tmp),
1354                                                       line,
1355                                                       column,
1356                                                       filename),
1357                                             SCM_UNDEFINED));
1358                 copy = SCM_CDR (copy);
1359               }
1360             SCM_SETCDR (copy, tmp);
1361           }
1362         else
1363           {
1364             recsexpr (SCM_CAR (obj), line, column, filename);
1365             while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1366               recsexpr (SCM_CAR (tmp), line, column, filename);
1367             copy = SCM_UNDEFINED;
1368           }
1369         scm_whash_insert (scm_source_whash,
1370                           obj,
1371                           scm_make_srcprops (line,
1372                                              column,
1373                                              filename,
1374                                              copy,
1375                                              SCM_EOL));
1376       }
1377     return obj;
1378   }
1379 }
1380
1381 /* Manipulate the read-hash-procedures alist.  This could be written in
1382    Scheme, but maybe it will also be used by C code during initialisation.  */
1383 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1384             (SCM chr, SCM proc),
1385             "Install the procedure @var{proc} for reading expressions\n"
1386             "starting with the character sequence @code{#} and @var{chr}.\n"
1387             "@var{proc} will be called with two arguments:  the character\n"
1388             "@var{chr} and the port to read further data from. The object\n"
1389             "returned will be the return value of @code{read}. \n"
1390             "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1391             )
1392 #define FUNC_NAME s_scm_read_hash_extend
1393 {
1394   SCM this;
1395   SCM prev;
1396
1397   SCM_VALIDATE_CHAR (1, chr);
1398   SCM_ASSERT (scm_is_false (proc)
1399               || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1400               proc, SCM_ARG2, FUNC_NAME);
1401
1402   /* Check if chr is already in the alist.  */
1403   this = *scm_read_hash_procedures;
1404   prev = SCM_BOOL_F;
1405   while (1)
1406     {
1407       if (scm_is_null (this))
1408         {
1409           /* not found, so add it to the beginning.  */
1410           if (scm_is_true (proc))
1411             {
1412               *scm_read_hash_procedures =
1413                 scm_cons (scm_cons (chr, proc), *scm_read_hash_procedures);
1414             }
1415           break;
1416         }
1417       if (scm_is_eq (chr, SCM_CAAR (this)))
1418         {
1419           /* already in the alist.  */
1420           if (scm_is_false (proc))
1421             {
1422               /* remove it.  */
1423               if (scm_is_false (prev))
1424                 {
1425                   *scm_read_hash_procedures =
1426                     SCM_CDR (*scm_read_hash_procedures);
1427                 }
1428               else
1429                 scm_set_cdr_x (prev, SCM_CDR (this));
1430             }
1431           else
1432             {
1433               /* replace it.  */
1434               scm_set_cdr_x (SCM_CAR (this), proc);
1435             }
1436           break;
1437         }
1438       prev = this;
1439       this = SCM_CDR (this);
1440     }
1441
1442   return SCM_UNSPECIFIED;
1443 }
1444 #undef FUNC_NAME
1445
1446 /* Recover the read-hash procedure corresponding to char c.  */
1447 static SCM
1448 scm_get_hash_procedure (int c)
1449 {
1450   SCM rest = *scm_read_hash_procedures;
1451
1452   while (1)
1453     {
1454       if (scm_is_null (rest))
1455         return SCM_BOOL_F;
1456
1457       if (SCM_CHAR (SCM_CAAR (rest)) == c)
1458         return SCM_CDAR (rest);
1459
1460       rest = SCM_CDR (rest);
1461     }
1462 }
1463
1464 #define SCM_ENCODING_SEARCH_SIZE (500)
1465
1466 /* Search the first few hundred characters of a file for an Emacs-like coding
1467    declaration.  Returns either NULL or a string whose storage has been
1468    allocated with `scm_gc_malloc ()'.  */
1469 char *
1470 scm_i_scan_for_encoding (SCM port)
1471 {
1472   char header[SCM_ENCODING_SEARCH_SIZE+1];
1473   size_t bytes_read;
1474   char *encoding = NULL;
1475   int utf8_bom = 0;
1476   char *pos;
1477   int i;
1478   int in_comment;
1479
1480   if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1481     /* PORT is a non-seekable file port (e.g., as created by Bash when using
1482        "guile <(echo '(display "hello")')") so bail out.  */
1483     return NULL;
1484
1485   bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1486
1487   scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1488
1489   if (bytes_read > 3
1490       && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1491     utf8_bom = 1;
1492
1493   /* search past "coding[:=]" */
1494   pos = header;
1495   while (1)
1496     {
1497       if ((pos = strstr(pos, "coding")) == NULL)
1498         return NULL;
1499
1500       pos += strlen("coding");
1501       if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1502           (*pos == ':' || *pos == '='))
1503         {
1504           pos ++;
1505           break;
1506         }
1507     }
1508
1509   /* skip spaces */
1510   while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1511          (*pos == ' ' || *pos == '\t'))
1512     pos ++;
1513
1514   /* grab the next token */
1515   i = 0;
1516   while (pos + i - header <= SCM_ENCODING_SEARCH_SIZE
1517          && pos + i - header < bytes_read
1518          && (isalnum((int) pos[i]) || pos[i] == '_' || pos[i] == '-'
1519              || pos[i] == '.'))
1520     i++;
1521
1522   if (i == 0)
1523     return NULL;
1524
1525   encoding = scm_gc_strndup (pos, i, "encoding");
1526   for (i = 0; i < strlen (encoding); i++)
1527     encoding[i] = toupper ((int) encoding[i]);
1528
1529   /* push backwards to make sure we were in a comment */
1530   in_comment = 0;
1531   while (pos - i - header > 0)
1532     {
1533       if (*(pos - i) == '\n')
1534         {
1535           /* This wasn't in a semicolon comment. Check for a
1536            hash-bang comment. */
1537           char *beg = strstr (header, "#!");
1538           char *end = strstr (header, "!#");
1539           if (beg < pos && pos < end)
1540             in_comment = 1;
1541           break;
1542         }
1543       if (*(pos - i) == ';')
1544         {
1545           in_comment = 1;
1546           break;
1547         }
1548       i ++;
1549     }
1550   if (!in_comment)
1551     /* This wasn't in a comment */
1552     return NULL;
1553
1554   if (utf8_bom && strcmp(encoding, "UTF-8"))
1555     scm_misc_error (NULL,
1556                     "the port input declares the encoding ~s but is encoded as UTF-8",
1557                     scm_list_1 (scm_from_locale_string (encoding)));
1558
1559   return encoding;
1560 }
1561
1562 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1563             (SCM port),
1564             "Scans the port for an Emacs-like character coding declaration\n"
1565             "near the top of the contents of a port with random-acessible contents.\n"
1566             "The coding declaration is of the form\n"
1567             "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1568             "\n"
1569             "Returns a string containing the character encoding of the file\n"
1570             "if a declaration was found, or @code{#f} otherwise.\n")
1571 #define FUNC_NAME s_scm_file_encoding
1572 {
1573   char *enc;
1574   SCM s_enc;
1575
1576   enc = scm_i_scan_for_encoding (port);
1577   if (enc == NULL)
1578     return SCM_BOOL_F;
1579   else
1580     {
1581       s_enc = scm_from_locale_string (enc);
1582       return s_enc;
1583     }
1584
1585   return SCM_BOOL_F;
1586 }
1587 #undef FUNC_NAME
1588
1589 void
1590 scm_init_read ()
1591 {
1592   scm_read_hash_procedures =
1593     SCM_VARIABLE_LOC (scm_c_define ("read-hash-procedures", SCM_EOL));
1594
1595   scm_init_opts (scm_read_options, scm_read_opts);
1596 #include "libguile/read.x"
1597 }
1598
1599 /*
1600   Local Variables:
1601   c-file-style: "gnu"
1602   End:
1603 */