Merge remote-tracking branch 'local-2.0/stable-2.0'

[bpt/guile.git] / libguile / read.c
diff --git a/libguile/read.c b/libguile/read.c

index ee87861..dff9d85 100644 (file)
--- a/libguile/read.c
+++ b/libguile/read.c
@@ -1,5 +1,5 @@
-/* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009, 2010 Free Software
- * Foundation, Inc.
+/* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
+ *   2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
   * 
   * This library is free software; you can redistribute it and/or
   * modify it under the terms of the GNU Lesser General Public License
@@ -29,6 +29,7 @@
  #include <string.h>
  #include <unistd.h>
  #include <unicase.h>
+#include <unictype.h>
  
  #include "libguile/_scm.h"
  #include "libguile/bytevectors.h"
@@ -60,24 +61,23 @@
  SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
  SCM_SYMBOL (scm_keyword_prefix, "prefix");
  SCM_SYMBOL (scm_keyword_postfix, "postfix");
+SCM_SYMBOL (sym_nil, "nil");
  
  scm_t_option scm_read_opts[] = {
    { SCM_OPTION_BOOLEAN, "copy", 0,
      "Copy source code expressions." },
-  { SCM_OPTION_BOOLEAN, "positions", 0,
+  { SCM_OPTION_BOOLEAN, "positions", 1,
      "Record positions of source code expressions." },
    { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
      "Convert symbols to lower case."},
-  { SCM_OPTION_SCM, "keywords", (unsigned long) SCM_BOOL_F,
+  { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
      "Style of keyword recognition: #f, 'prefix or 'postfix."},
-  { SCM_OPTION_BOOLEAN, "elisp-vectors", 0,
-    "Support Elisp vector syntax, namely `[...]'."},
-  { SCM_OPTION_BOOLEAN, "elisp-strings", 0,
-    "Support `\\(' and `\\)' in strings."},
    { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
      "Use R6RS variable-length character and string hex escapes."},
    { SCM_OPTION_BOOLEAN, "square-brackets", 1,
      "Treat `[' and `]' as parentheses, for R6RS compatibility."},
+  { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
+    "In strings, consume leading whitespace after an escaped end-of-line."},
    { 0, },
  };
  
@@ -114,7 +114,7 @@ scm_i_input_error (char const *function,
      
    string = scm_get_output_string (string_port);
    scm_close_output_port (string_port);
-  scm_error_scm (scm_from_locale_symbol ("read-error"),
+  scm_error_scm (scm_from_latin1_symbol ("read-error"),
                  function? scm_from_locale_string (function) : SCM_BOOL_F,
                  string,
                  arg,
@@ -138,9 +138,21 @@ SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
  }
  #undef FUNC_NAME
  
-/* An association list mapping extra hash characters to procedures.  */
-static SCM *scm_read_hash_procedures;
+/* A fluid referring to an association list mapping extra hash
+   characters to procedures.  */
+static SCM *scm_i_read_hash_procedures;
  
+static SCM
+scm_i_read_hash_procedures_ref (void)
+{
+  return scm_fluid_ref (*scm_i_read_hash_procedures);
+}
+
+static void
+scm_i_read_hash_procedures_set_x (SCM value)
+{
+  scm_fluid_set_x (*scm_i_read_hash_procedures, value);
+}
  
  \f
  /* Token readers.  */
@@ -185,16 +197,17 @@ static SCM *scm_read_hash_procedures;
     || ((_chr) == 'd') || ((_chr) == 'l'))
  
  /* Read an SCSH block comment.  */
-static inline SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
+static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
  static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
  static SCM scm_read_commented_expression (scm_t_wchar, SCM);
+static SCM scm_read_shebang (scm_t_wchar, SCM);
  static SCM scm_get_hash_procedure (int);
  
  /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
     result in the pre-allocated buffer BUF.  Return zero if the whole token has
     fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
     bytes actually read.  */
-static inline int
+static int
  read_token (SCM port, char *buf, const size_t buf_size, size_t *read)
   {
     *read = 0;
@@ -203,13 +216,13 @@ read_token (SCM port, char *buf, const size_t buf_size, size_t *read)
       {
         int chr;
  
-       chr = scm_get_byte_or_eof (port);
+       chr = scm_get_byte_or_eof_unlocked (port);
  
         if (chr == EOF)
          return 0;
        else if (CHAR_IS_DELIMITER (chr))
          {
-          scm_unget_byte (chr, port);
+          scm_unget_byte_unlocked (chr, port);
            return 0;
          }
        else
@@ -273,9 +286,9 @@ read_complete_token (SCM port, char *buffer, const size_t buffer_size,
  static int
  flush_ws (SCM port, const char *eoferr)
  {
-  register scm_t_wchar c;
+  scm_t_wchar c;
    while (1)
-    switch (c = scm_getc (port))
+    switch (c = scm_getc_unlocked (port))
        {
        case EOF:
        goteof:
@@ -290,7 +303,7 @@ flush_ws (SCM port, const char *eoferr)
  
        case ';':
        lp:
-       switch (c = scm_getc (port))
+       switch (c = scm_getc_unlocked (port))
           {
           case EOF:
             goto goteof;
@@ -302,13 +315,13 @@ flush_ws (SCM port, const char *eoferr)
         break;
  
        case '#':
-       switch (c = scm_getc (port))
+       switch (c = scm_getc_unlocked (port))
           {
           case EOF:
             eoferr = "read_sharp";
             goto goteof;
           case '!':
-           scm_read_scsh_block_comment (c, port);
+           scm_read_shebang (c, port);
             break;
           case ';':
             scm_read_commented_expression (c, port);
@@ -321,7 +334,7 @@ flush_ws (SCM port, const char *eoferr)
               }
             /* fall through */
           default:
-           scm_ungetc (c, port);
+           scm_ungetc_unlocked (c, port);
             return '#';
           }
         break;
@@ -343,32 +356,40 @@ flush_ws (SCM port, const char *eoferr)
  /* Token readers.  */
  
  static SCM scm_read_expression (SCM port);
-static SCM scm_read_sharp (int chr, SCM port);
-static SCM recsexpr (SCM obj, long line, int column, SCM filename);
+static SCM scm_read_sharp (int chr, SCM port, long line, int column);
+
  
+static SCM
+maybe_annotate_source (SCM x, SCM port, long line, int column)
+{
+  if (SCM_RECORD_POSITIONS_P)
+    scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
+  return x;
+}
  
  static SCM
  scm_read_sexp (scm_t_wchar chr, SCM port)
  #define FUNC_NAME "scm_i_lreadparen"
  {
-  register int c;
-  register SCM tmp;
-  register SCM tl, ans = SCM_EOL;
-  SCM tl2 = SCM_EOL, ans2 = SCM_EOL, copy = SCM_BOOL_F;
+  int c;
+  SCM tmp, tl, ans = SCM_EOL;
    const int terminating_char = ((chr == '[') ? ']' : ')');
  
    /* Need to capture line and column numbers here. */
    long line = SCM_LINUM (port);
    int column = SCM_COL (port) - 1;
  
-
    c = flush_ws (port, FUNC_NAME);
    if (terminating_char == c)
      return SCM_EOL;
  
-  scm_ungetc (c, port);
-  if (scm_is_eq (scm_sym_dot,
-                (tmp = scm_read_expression (port))))
+  scm_ungetc_unlocked (c, port);
+  tmp = scm_read_expression (port);
+
+  /* Note that it is possible for scm_read_expression to return
+     scm_sym_dot, but not as part of a dotted pair: as in #{.}#.  So
+     check that it's a real dot by checking `c'.  */
+  if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
      {
        ans = scm_read_expression (port);
        if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
@@ -380,25 +401,22 @@ scm_read_sexp (scm_t_wchar chr, SCM port)
    /* Build the head of the list structure. */
    ans = tl = scm_cons (tmp, SCM_EOL);
  
-  if (SCM_COPY_SOURCE_P)
-    ans2 = tl2 = scm_cons (scm_is_pair (tmp)
-                          ? copy
-                          : tmp,
-                          SCM_EOL);
-
    while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
      {
        SCM new_tail;
  
-      scm_ungetc (c, port);
-      if (scm_is_eq (scm_sym_dot,
-                    (tmp = scm_read_expression (port))))
-       {
-         SCM_SETCDR (tl, tmp = scm_read_expression (port));
+      if (c == ')' || (SCM_SQUARE_BRACKETS_P && c == ']'))
+        scm_i_input_error (FUNC_NAME, port,
+                           "in pair: mismatched close paren: ~A",
+                           scm_list_1 (SCM_MAKE_CHAR (c)));
  
-         if (SCM_COPY_SOURCE_P)
-           SCM_SETCDR (tl2, scm_cons (scm_is_pair (tmp) ? copy : tmp,
-                                      SCM_EOL));
+      scm_ungetc_unlocked (c, port);
+      tmp = scm_read_expression (port);
+
+      /* See above note about scm_sym_dot.  */
+      if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
+       {
+         SCM_SETCDR (tl, scm_read_expression (port));
  
           c = flush_ws (port, FUNC_NAME);
           if (terminating_char != c)
@@ -410,28 +428,10 @@ scm_read_sexp (scm_t_wchar chr, SCM port)
        new_tail = scm_cons (tmp, SCM_EOL);
        SCM_SETCDR (tl, new_tail);
        tl = new_tail;
-
-      if (SCM_COPY_SOURCE_P)
-       {
-         SCM new_tail2 = scm_cons (scm_is_pair (tmp)
-                                   ? copy
-                                   : tmp, SCM_EOL);
-         SCM_SETCDR (tl2, new_tail2);
-         tl2 = new_tail2;
-       }
      }
  
   exit:
-  if (SCM_RECORD_POSITIONS_P)
-    scm_whash_insert (scm_source_whash,
-                     ans,
-                     scm_make_srcprops (line, column,
-                                        SCM_FILENAME (port),
-                                        SCM_COPY_SOURCE_P
-                                        ? ans2
-                                        : SCM_UNDEFINED,
-                                        SCM_EOL));
-  return ans;
+  return maybe_annotate_source (ans, port, line, column);
  }
  #undef FUNC_NAME
  
@@ -447,7 +447,7 @@ scm_read_sexp (scm_t_wchar chr, SCM port)
        c = 0;                                                       \
        while (i < ndigits)                                          \
          {                                                          \
-          a = scm_getc (port);                                     \
+          a = scm_getc_unlocked (port);                                     \
            if (a == EOF)                                            \
              goto str_eof;                                          \
            if (terminator                                           \
@@ -470,6 +470,22 @@ scm_read_sexp (scm_t_wchar chr, SCM port)
          }                                                          \
      } while (0)
  
+static void
+skip_intraline_whitespace (SCM port)
+{
+  scm_t_wchar c;
+  
+  do
+    {
+      c = scm_getc_unlocked (port);
+      if (c == EOF)
+        return;
+    }
+  while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
+
+  scm_ungetc_unlocked (c, port);
+}                                         
+
  static SCM
  scm_read_string (int chr, SCM port)
  #define FUNC_NAME "scm_lreadr"
@@ -481,8 +497,12 @@ scm_read_string (int chr, SCM port)
    unsigned c_str_len = 0;
    scm_t_wchar c;
  
-  str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
-  while ('"' != (c = scm_getc (port)))
+  /* Need to capture line and column numbers here. */
+  long line = SCM_LINUM (port);
+  int column = SCM_COL (port) - 1;
+
+  str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
+  while ('"' != (c = scm_getc_unlocked (port)))
      {
        if (c == EOF)
          {
@@ -493,26 +513,23 @@ scm_read_string (int chr, SCM port)
  
        if (c_str_len + 1 >= scm_i_string_length (str))
          {
-          SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
+          SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
  
            str = scm_string_append (scm_list_2 (str, addy));
          }
  
        if (c == '\\')
          {
-          switch (c = scm_getc (port))
+          switch (c = scm_getc_unlocked (port))
              {
              case EOF:
                goto str_eof;
              case '"':
              case '\\':
                break;
-            case '(':
-            case ')':
-              if (SCM_ESCAPED_PARENS_P)
-                break;
-              goto bad_escaped;
              case '\n':
+              if (SCM_HUNGRY_EOL_ESCAPES_P)
+                skip_intraline_whitespace (port);
                continue;
              case '0':
                c = '\0';
@@ -567,13 +584,8 @@ scm_read_string (int chr, SCM port)
        scm_i_string_set_x (str, c_str_len++, c);
        scm_i_string_stop_writing ();
      }
-
-  if (c_str_len > 0)
-    {
-      return scm_i_substring_copy (str, 0, c_str_len);
-    }
-
-  return scm_nullstr;
+  return maybe_annotate_source (scm_i_substring_copy (str, 0, c_str_len),
+                                port, line, column);
  }
  #undef FUNC_NAME
  
@@ -588,7 +600,11 @@ scm_read_number (scm_t_wchar chr, SCM port)
    int overflow;
    scm_t_port *pt = SCM_PTAB_ENTRY (port);
  
-  scm_ungetc (chr, port);
+  /* Need to capture line and column numbers here. */
+  long line = SCM_LINUM (port);
+  int column = SCM_COL (port) - 1;
+
+  scm_ungetc_unlocked (chr, port);
    overflow = read_complete_token (port, buffer, sizeof (buffer),
                                    &overflow_buffer, &bytes_read);
  
@@ -599,13 +615,15 @@ scm_read_number (scm_t_wchar chr, SCM port)
                              pt->ilseq_handler);
  
    result = scm_string_to_number (str, SCM_UNDEFINED);
-  if (!scm_is_true (result))
+  if (scm_is_false (result))
      {
        /* Return a symbol instead of a number */
        if (SCM_CASE_INSENSITIVE_P)
          str = scm_string_downcase_x (str);
        result = scm_string_to_symbol (str);
      }
+  else if (SCM_NIMP (result))
+    result = maybe_annotate_source (result, port, line, column);
  
    if (overflow)
      free (overflow_buffer);
@@ -625,7 +643,7 @@ scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
    scm_t_port *pt = SCM_PTAB_ENTRY (port);
    SCM str;
  
-  scm_ungetc (chr, port);
+  scm_ungetc_unlocked (chr, port);
    overflow = read_complete_token (port, buffer, READER_BUFFER_SIZE,
                                    &overflow_buffer, &bytes_read);
    if (bytes_read > 0)
@@ -702,8 +720,8 @@ scm_read_number_and_radix (scm_t_wchar chr, SCM port)
        break;
  
      default:
-      scm_ungetc (chr, port);
-      scm_ungetc ('#', port);
+      scm_ungetc_unlocked (chr, port);
+      scm_ungetc_unlocked ('#', port);
        radix = 10;
      }
  
@@ -754,12 +772,12 @@ scm_read_quote (int chr, SCM port)
        {
         scm_t_wchar c;
  
-       c = scm_getc (port);
+       c = scm_getc_unlocked (port);
         if ('@' == c)
           p = scm_sym_uq_splicing;
         else
           {
-           scm_ungetc (c, port);
+           scm_ungetc_unlocked (c, port);
             p = scm_sym_unquote;
           }
         break;
@@ -772,19 +790,7 @@ scm_read_quote (int chr, SCM port)
      }
  
    p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
-  if (SCM_RECORD_POSITIONS_P)
-    scm_whash_insert (scm_source_whash, p,
-                     scm_make_srcprops (line, column,
-                                        SCM_FILENAME (port),
-                                        SCM_COPY_SOURCE_P
-                                        ? (scm_cons2 (SCM_CAR (p),
-                                                      SCM_CAR (SCM_CDR (p)),
-                                                      SCM_EOL))
-                                        : SCM_UNDEFINED,
-                                        SCM_EOL));
-
-
-  return p;
+  return maybe_annotate_source (p, port, line, column);
  }
  
  SCM_SYMBOL (sym_syntax, "syntax");
@@ -813,12 +819,12 @@ scm_read_syntax (int chr, SCM port)
        {
         int c;
  
-       c = scm_getc (port);
+       c = scm_getc_unlocked (port);
         if ('@' == c)
           p = sym_unsyntax_splicing;
         else
           {
-           scm_ungetc (c, port);
+           scm_ungetc_unlocked (c, port);
             p = sym_unsyntax;
           }
         break;
@@ -831,22 +837,23 @@ scm_read_syntax (int chr, SCM port)
      }
  
    p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
-  if (SCM_RECORD_POSITIONS_P)
-    scm_whash_insert (scm_source_whash, p,
-                     scm_make_srcprops (line, column,
-                                        SCM_FILENAME (port),
-                                        SCM_COPY_SOURCE_P
-                                        ? (scm_cons2 (SCM_CAR (p),
-                                                      SCM_CAR (SCM_CDR (p)),
-                                                      SCM_EOL))
-                                        : SCM_UNDEFINED,
-                                        SCM_EOL));
-
-
-  return p;
+  return maybe_annotate_source (p, port, line, column);
  }
  
-static inline SCM
+static SCM
+scm_read_nil (int chr, SCM port)
+{
+  SCM id = scm_read_mixed_case_symbol (chr, port);
+
+  if (!scm_is_eq (id, sym_nil))
+    scm_i_input_error ("scm_read_nil", port,
+                       "unexpected input while reading #nil: ~a",
+                       scm_list_1 (id));
+
+  return SCM_ELISP_NIL;
+}
+  
+static SCM
  scm_read_semicolon_comment (int chr, SCM port)
  {
    int c;
@@ -854,9 +861,9 @@ scm_read_semicolon_comment (int chr, SCM port)
    /* We use the get_byte here because there is no need to get the
       locale correct with comment input. This presumes that newline
       always represents itself no matter what the encoding is.  */
-  for (c = scm_get_byte_or_eof (port);
+  for (c = scm_get_byte_or_eof_unlocked (port);
         (c != EOF) && (c != '\n');
-       c = scm_get_byte_or_eof (port));
+       c = scm_get_byte_or_eof_unlocked (port));
  
    return SCM_UNSPECIFIED;
  }
@@ -894,11 +901,11 @@ scm_read_character (scm_t_wchar chr, SCM port)
  
    overflow = read_token (port, buffer, READER_CHAR_NAME_MAX_SIZE, &bytes_read);
    if (overflow)
-    goto char_error;
+    scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
  
    if (bytes_read == 0)
      {
-      chr = scm_getc (port);
+      chr = scm_getc_unlocked (port);
        if (chr == EOF)
         scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
                            "while reading character", SCM_EOL);
@@ -941,26 +948,26 @@ scm_read_character (scm_t_wchar chr, SCM port)
        SCM p = scm_string_to_number (charname, scm_from_uint (8));
        if (SCM_I_INUMP (p))
          {
-          scm_t_wchar c = SCM_I_INUM (p);
+          scm_t_wchar c = scm_to_uint32 (p);
            if (SCM_IS_UNICODE_CHAR (c))
              return SCM_MAKE_CHAR (c);
            else
-            scm_i_input_error (FUNC_NAME, port, 
+            scm_i_input_error (FUNC_NAME, port,
                                 "out-of-range octal character escape: ~a",
                                 scm_list_1 (charname));
          }
      }
  
-  if (cp == 'x' && (charname_len > 1) && SCM_R6RS_ESCAPES_P)
+  if (cp == 'x' && (charname_len > 1))
      {
        SCM p;
-      
+
        /* Convert from hex, skipping the initial 'x' character in CHARNAME */
        p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
                                  scm_from_uint (16));
        if (SCM_I_INUMP (p))
          {
-          scm_t_wchar c = SCM_I_INUM (p);
+          scm_t_wchar c = scm_to_uint32 (p);
            if (SCM_IS_UNICODE_CHAR (c))
              return SCM_MAKE_CHAR (c);
            else
@@ -980,7 +987,6 @@ scm_read_character (scm_t_wchar chr, SCM port)
          return ch;
      }
  
- char_error:
    scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
                      scm_list_1 (charname));
  
@@ -988,7 +994,7 @@ scm_read_character (scm_t_wchar chr, SCM port)
  }
  #undef FUNC_NAME
  
-static inline SCM
+static SCM
  scm_read_keyword (int chr, SCM port)
  {
    SCM symbol;
@@ -1007,38 +1013,51 @@ scm_read_keyword (int chr, SCM port)
    return (scm_symbol_to_keyword (symbol));
  }
  
-static inline SCM
-scm_read_vector (int chr, SCM port)
+static SCM
+scm_read_vector (int chr, SCM port, long line, int column)
  {
    /* Note: We call `scm_read_sexp ()' rather than READER here in order to
       guarantee that it's going to do what we want.  After all, this is an
       implementation detail of `scm_read_vector ()', not a desirable
       property.  */
-  return (scm_vector (scm_read_sexp (chr, port)));
+  return maybe_annotate_source (scm_vector (scm_read_sexp (chr, port)),
+                                port, line, column);
  }
  
-static inline SCM
-scm_read_srfi4_vector (int chr, SCM port)
+static SCM
+scm_read_array (int chr, SCM port, long line, int column)
  {
-  return scm_i_read_array (port, chr);
+  SCM result = scm_i_read_array (port, chr);
+  if (scm_is_false (result))
+    return result;
+  else
+    return maybe_annotate_source (result, port, line, column);
  }
  
  static SCM
-scm_read_bytevector (scm_t_wchar chr, SCM port)
+scm_read_srfi4_vector (int chr, SCM port, long line, int column)
  {
-  chr = scm_getc (port);
+  return scm_read_array (chr, port, line, column);
+}
+
+static SCM
+scm_read_bytevector (scm_t_wchar chr, SCM port, long line, int column)
+{
+  chr = scm_getc_unlocked (port);
    if (chr != 'u')
      goto syntax;
  
-  chr = scm_getc (port);
+  chr = scm_getc_unlocked (port);
    if (chr != '8')
      goto syntax;
  
-  chr = scm_getc (port);
+  chr = scm_getc_unlocked (port);
    if (chr != '(')
      goto syntax;
  
-  return scm_u8_list_to_bytevector (scm_read_sexp (chr, port));
+  return maybe_annotate_source
+    (scm_u8_list_to_bytevector (scm_read_sexp (chr, port)),
+     port, line, column);
  
   syntax:
    scm_i_input_error ("read_bytevector", port,
@@ -1048,37 +1067,35 @@ scm_read_bytevector (scm_t_wchar chr, SCM port)
  }
  
  static SCM
-scm_read_guile_bit_vector (scm_t_wchar chr, SCM port)
+scm_read_guile_bit_vector (scm_t_wchar chr, SCM port, long line, int column)
  {
    /* Read the `#*10101'-style read syntax for bit vectors in Guile.  This is
       terribly inefficient but who cares?  */
    SCM s_bits = SCM_EOL;
  
-  for (chr = scm_getc (port);
+  for (chr = scm_getc_unlocked (port);
         (chr != EOF) && ((chr == '0') || (chr == '1'));
-       chr = scm_getc (port))
+       chr = scm_getc_unlocked (port))
      {
        s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
      }
  
    if (chr != EOF)
-    scm_ungetc (chr, port);
+    scm_ungetc_unlocked (chr, port);
  
-  return scm_bitvector (scm_reverse_x (s_bits, SCM_EOL));
+  return maybe_annotate_source
+    (scm_bitvector (scm_reverse_x (s_bits, SCM_EOL)),
+     port, line, column);
  }
  
-static inline SCM
+static SCM
  scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
  {
    int bang_seen = 0;
  
-  /* We can use the get_byte here because there is no need to get the
-     locale correct when reading comments. This presumes that 
-     hash and exclamation points always represent themselves no
-     matter what the source encoding is.*/
    for (;;)
      {
-      int c = scm_get_byte_or_eof (port);
+      int c = scm_getc_unlocked (port);
  
        if (c == EOF)
         scm_i_input_error ("skip_block_comment", port,
@@ -1095,40 +1112,73 @@ scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
    return SCM_UNSPECIFIED;
  }
  
+static SCM
+scm_read_shebang (scm_t_wchar chr, SCM port)
+{
+  int c = 0;
+  if ((c = scm_get_byte_or_eof_unlocked (port)) != 'r')
+    {
+      scm_ungetc_unlocked (c, port);
+      return scm_read_scsh_block_comment (chr, port);
+    }
+  if ((c = scm_get_byte_or_eof_unlocked (port)) != '6')
+    {
+      scm_ungetc_unlocked (c, port);
+      scm_ungetc_unlocked ('r', port);
+      return scm_read_scsh_block_comment (chr, port);
+    }
+  if ((c = scm_get_byte_or_eof_unlocked (port)) != 'r')
+    {
+      scm_ungetc_unlocked (c, port);
+      scm_ungetc_unlocked ('6', port);
+      scm_ungetc_unlocked ('r', port);
+      return scm_read_scsh_block_comment (chr, port);
+    }
+  if ((c = scm_get_byte_or_eof_unlocked (port)) != 's')
+    {
+      scm_ungetc_unlocked (c, port);
+      scm_ungetc_unlocked ('r', port);
+      scm_ungetc_unlocked ('6', port);
+      scm_ungetc_unlocked ('r', port);
+      return scm_read_scsh_block_comment (chr, port);
+    }
+  
+  return SCM_UNSPECIFIED;
+}
+
  static SCM
  scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
  {
    /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
       nested.  So care must be taken.  */
    int nesting_level = 1;
-  int opening_seen = 0, closing_seen = 0;
+
+  int a = scm_getc_unlocked (port);
+
+  if (a == EOF)
+    scm_i_input_error ("scm_read_r6rs_block_comment", port,
+                       "unterminated `#| ... |#' comment", SCM_EOL);
  
    while (nesting_level > 0)
      {
-      int c = scm_getc (port);
+      int b = scm_getc_unlocked (port);
  
-      if (c == EOF)
+      if (b == EOF)
         scm_i_input_error ("scm_read_r6rs_block_comment", port,
                            "unterminated `#| ... |#' comment", SCM_EOL);
  
-      if (opening_seen)
-       {
-         if (c == '|')
-           nesting_level++;
-         opening_seen = 0;
-       }
-      else if (closing_seen)
-       {
-         if (c == '#')
-           nesting_level--;
-         closing_seen = 0;
-       }
-      else if (c == '|')
-       closing_seen = 1;
-      else if (c == '#')
-       opening_seen = 1;
-      else
-       opening_seen = closing_seen = 0;
+      if (a == '|' && b == '#')
+        {
+          nesting_level--;
+          b = EOF;
+        }
+      else if (a == '#' && b == '|')
+        {
+          nesting_level++;
+          b = EOF;
+        }
+
+      a = b;
      }
  
    return SCM_UNSPECIFIED;
@@ -1143,7 +1193,7 @@ scm_read_commented_expression (scm_t_wchar chr, SCM port)
    if (EOF == c)
      scm_i_input_error ("read_commented_expression", port,
                         "no expression after #; comment", SCM_EOL);
-  scm_ungetc (c, port);
+  scm_ungetc_unlocked (c, port);
    scm_read_expression (port);
    return SCM_UNSPECIFIED;
  }
@@ -1156,48 +1206,87 @@ scm_read_extended_symbol (scm_t_wchar chr, SCM port)
         #{This is all a symbol name}#
  
       So here, CHR is expected to be `{'.  */
-  int saw_brace = 0, finished = 0;
+  int saw_brace = 0;
    size_t len = 0;
-  SCM buf = scm_i_make_string (1024, NULL);
+  SCM buf = scm_i_make_string (1024, NULL, 0);
  
    buf = scm_i_string_start_writing (buf);
  
-  while ((chr = scm_getc (port)) != EOF)
+  while ((chr = scm_getc_unlocked (port)) != EOF)
      {
        if (saw_brace)
         {
           if (chr == '#')
             {
-             finished = 1;
               break;
             }
           else
             {
               saw_brace = 0;
               scm_i_string_set_x (buf, len++, '}');
-             scm_i_string_set_x (buf, len++, chr);
             }
         }
-      else if (chr == '}')
+
+      if (chr == '}')
         saw_brace = 1;
+      else if (chr == '\\')
+        {
+          /* It used to be that print.c would print extended-read-syntax
+             symbols with backslashes before "non-standard" chars, but
+             this routine wouldn't do anything with those escapes.
+             Bummer.  What we've done is to change print.c to output
+             R6RS hex escapes for those characters, relying on the fact
+             that the extended read syntax would never put a `\' before
+             an `x'.  For now, we just ignore other instances of
+             backslash in the string.  */
+          switch ((chr = scm_getc_unlocked (port)))
+            {
+            case EOF:
+              goto done;
+            case 'x':
+              {
+                scm_t_wchar c;
+                
+                SCM_READ_HEX_ESCAPE (10, ';');
+                scm_i_string_set_x (buf, len++, c);
+                break;
+
+              str_eof:
+                chr = EOF;
+                goto done;
+
+              bad_escaped:
+                scm_i_string_stop_writing ();
+                scm_i_input_error ("scm_read_extended_symbol", port,
+                                   "illegal character in escape sequence: ~S",
+                                   scm_list_1 (SCM_MAKE_CHAR (c)));
+                break;
+              }
+            default:
+             scm_i_string_set_x (buf, len++, chr);
+              break;
+            }
+        }
        else
-       scm_i_string_set_x (buf, len++, chr);
+        scm_i_string_set_x (buf, len++, chr);
  
        if (len >= scm_i_string_length (buf) - 2)
         {
           SCM addy;
  
           scm_i_string_stop_writing ();
-         addy = scm_i_make_string (1024, NULL);
+         addy = scm_i_make_string (1024, NULL, 0);
           buf = scm_string_append (scm_list_2 (buf, addy));
           len = 0;
           buf = scm_i_string_start_writing (buf);
         }
-
-      if (finished)
-       break;
      }
+
+ done:
    scm_i_string_stop_writing ();
+  if (chr == EOF)
+    scm_i_input_error ("scm_read_extended_symbol", port,
+                       "end of file while reading symbol", SCM_EOL);
  
    return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
  }
@@ -1219,14 +1308,11 @@ scm_read_sharp_extension (int chr, SCM port)
        SCM got;
  
        got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
-      if (!scm_is_eq (got, SCM_UNSPECIFIED))
-       {
-         if (SCM_RECORD_POSITIONS_P)
-           return (recsexpr (got, line, column,
-                             SCM_FILENAME (port)));
-         else
-           return got;
-       }
+
+      if (scm_is_pair (got) && !scm_i_has_source_properties (got))
+        scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
+      
+      return got;
      }
  
    return SCM_UNSPECIFIED;
@@ -1235,12 +1321,12 @@ scm_read_sharp_extension (int chr, SCM port)
  /* The reader for the sharp `#' character.  It basically dispatches reads
     among the above token readers.   */
  static SCM
-scm_read_sharp (scm_t_wchar chr, SCM port)
+scm_read_sharp (scm_t_wchar chr, SCM port, long line, int column)
  #define FUNC_NAME "scm_lreadr"
  {
    SCM result;
  
-  chr = scm_getc (port);
+  chr = scm_getc_unlocked (port);
  
    result = scm_read_sharp_extension (chr, port);
    if (!scm_is_eq (result, SCM_UNSPECIFIED))
@@ -1251,50 +1337,30 @@ scm_read_sharp (scm_t_wchar chr, SCM port)
      case '\\':
        return (scm_read_character (chr, port));
      case '(':
-      return (scm_read_vector (chr, port));
+      return (scm_read_vector (chr, port, line, column));
      case 's':
      case 'u':
      case 'f':
+    case 'c':
        /* This one may return either a boolean or an SRFI-4 vector.  */
-      return (scm_read_srfi4_vector (chr, port));
+      return (scm_read_srfi4_vector (chr, port, line, column));
      case 'v':
-      return (scm_read_bytevector (chr, port));
+      return (scm_read_bytevector (chr, port, line, column));
      case '*':
-      return (scm_read_guile_bit_vector (chr, port));
+      return (scm_read_guile_bit_vector (chr, port, line, column));
      case 't':
      case 'T':
      case 'F':
-      /* This one may return either a boolean or an SRFI-4 vector.  */
        return (scm_read_boolean (chr, port));
      case ':':
        return (scm_read_keyword (chr, port));
      case '0': case '1': case '2': case '3': case '4':
      case '5': case '6': case '7': case '8': case '9':
      case '@':
-#if SCM_ENABLE_DEPRECATED
-      /* See below for 'i' and 'e'. */
-    case 'a':
-    case 'c':
-    case 'y':
-    case 'h':
-    case 'l':
-#endif
-      return (scm_i_read_array (port, chr));
+        return (scm_read_array (chr, port, line, column));
  
      case 'i':
      case 'e':
-#if SCM_ENABLE_DEPRECATED
-      {
-       /* When next char is '(', it really is an old-style
-          uniform array. */
-       scm_t_wchar next_c = scm_getc (port);
-       if (next_c != EOF)
-         scm_ungetc (next_c, port);
-       if (next_c == '(')
-         return scm_i_read_array (port, chr);
-       /* Fall through. */
-      }
-#endif
      case 'b':
      case 'B':
      case 'o':
@@ -1309,13 +1375,15 @@ scm_read_sharp (scm_t_wchar chr, SCM port)
      case '{':
        return (scm_read_extended_symbol (chr, port));
      case '!':
-      return (scm_read_scsh_block_comment (chr, port));
+      return (scm_read_shebang (chr, port));
      case ';':
        return (scm_read_commented_expression (chr, port));
      case '`':
      case '\'':
      case ',':
        return (scm_read_syntax (chr, port));
+    case 'n':
+      return (scm_read_nil (chr, port));
      default:
        result = scm_read_sharp_extension (chr, port);
        if (scm_is_eq (result, SCM_UNSPECIFIED))
@@ -1346,9 +1414,9 @@ scm_read_expression (SCM port)
  {
    while (1)
      {
-      register scm_t_wchar chr;
+      scm_t_wchar chr;
  
-      chr = scm_getc (port);
+      chr = scm_getc_unlocked (port);
  
        switch (chr)
         {
@@ -1372,8 +1440,9 @@ scm_read_expression (SCM port)
           return (scm_read_quote (chr, port));
         case '#':
           {
-           SCM result;
-           result = scm_read_sharp (chr, port);
+            long line  = SCM_LINUM (port);
+            int column = SCM_COL (port) - 1;
+           SCM result = scm_read_sharp (chr, port, line, column);
             if (scm_is_eq (result, SCM_UNSPECIFIED))
               /* We read a comment or some such.  */
               break;
@@ -1383,6 +1452,10 @@ scm_read_expression (SCM port)
         case ')':
           scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
           break;
+       case ']':
+          if (SCM_SQUARE_BRACKETS_P)
+            scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
+          /* otherwise fall through */
         case EOF:
           return SCM_EOF_VAL;
         case ':':
@@ -1422,7 +1495,7 @@ SCM_DEFINE (scm_read, "read", 0, 1, 0,
    c = flush_ws (port, (char *) NULL);
    if (EOF == c)
      return SCM_EOF_VAL;
-  scm_ungetc (c, port);
+  scm_ungetc_unlocked (c, port);
  
    return (scm_read_expression (port));
  }
@@ -1431,53 +1504,6 @@ SCM_DEFINE (scm_read, "read", 0, 1, 0,
  
  \f
  
-/* Used when recording expressions constructed by `scm_read_sharp ()'.  */
-static SCM
-recsexpr (SCM obj, long line, int column, SCM filename)
-{
-  if (!scm_is_pair(obj)) {
-    return obj;
-  } else {
-    SCM tmp = obj, copy;
-    /* If this sexpr is visible in the read:sharp source, we want to
-       keep that information, so only record non-constant cons cells
-       which haven't previously been read by the reader. */
-    if (scm_is_false (scm_whash_lookup (scm_source_whash, obj)))
-      {
-       if (SCM_COPY_SOURCE_P)
-         {
-           copy = scm_cons (recsexpr (SCM_CAR (obj), line, column, filename),
-                            SCM_UNDEFINED);
-           while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
-             {
-               SCM_SETCDR (copy, scm_cons (recsexpr (SCM_CAR (tmp),
-                                                     line,
-                                                     column,
-                                                     filename),
-                                           SCM_UNDEFINED));
-               copy = SCM_CDR (copy);
-             }
-           SCM_SETCDR (copy, tmp);
-         }
-       else
-         {
-           recsexpr (SCM_CAR (obj), line, column, filename);
-           while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
-             recsexpr (SCM_CAR (tmp), line, column, filename);
-           copy = SCM_UNDEFINED;
-         }
-       scm_whash_insert (scm_source_whash,
-                         obj,
-                         scm_make_srcprops (line,
-                                            column,
-                                            filename,
-                                            copy,
-                                            SCM_EOL));
-      }
-    return obj;
-  }
-}
-
  /* Manipulate the read-hash-procedures alist.  This could be written in
     Scheme, but maybe it will also be used by C code during initialisation.  */
  SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
@@ -1500,7 +1526,7 @@ SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
               proc, SCM_ARG2, FUNC_NAME);
  
    /* Check if chr is already in the alist.  */
-  this = *scm_read_hash_procedures;
+  this = scm_i_read_hash_procedures_ref ();
    prev = SCM_BOOL_F;
    while (1)
      {
@@ -1509,8 +1535,9 @@ SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
           /* not found, so add it to the beginning.  */
           if (scm_is_true (proc))
             {
-             *scm_read_hash_procedures = 
-               scm_cons (scm_cons (chr, proc), *scm_read_hash_procedures);
+              SCM new = scm_cons (scm_cons (chr, proc),
+                                  scm_i_read_hash_procedures_ref ());
+             scm_i_read_hash_procedures_set_x (new);
             }
           break;
         }
@@ -1522,8 +1549,8 @@ SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
               /* remove it.  */
               if (scm_is_false (prev))
                 {
-                 *scm_read_hash_procedures =
-                   SCM_CDR (*scm_read_hash_procedures);
+                  SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
+                 scm_i_read_hash_procedures_set_x (rest);
                 }
               else
                 scm_set_cdr_x (prev, SCM_CDR (this));
@@ -1547,7 +1574,7 @@ SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
  static SCM
  scm_get_hash_procedure (int c)
  {
-  SCM rest = *scm_read_hash_procedures;
+  SCM rest = scm_i_read_hash_procedures_ref ();
  
    while (1)
      {
@@ -1569,22 +1596,54 @@ scm_get_hash_procedure (int c)
  char *
  scm_i_scan_for_encoding (SCM port)
  {
+  scm_t_port *pt;
    char header[SCM_ENCODING_SEARCH_SIZE+1];
-  size_t bytes_read;
+  size_t bytes_read, encoding_length, i;
    char *encoding = NULL;
    int utf8_bom = 0;
-  char *pos;
-  int i;
+  char *pos, *encoding_start;
    int in_comment;
  
-  if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
-    /* PORT is a non-seekable file port (e.g., as created by Bash when using
-       "guile <(echo '(display "hello")')") so bail out.  */
-    return NULL;
+  pt = SCM_PTAB_ENTRY (port);
+
+  if (pt->rw_active == SCM_PORT_WRITE)
+    scm_flush_unlocked (port);
  
-  bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
+  if (pt->rw_random)
+    pt->rw_active = SCM_PORT_READ;
  
-  scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
+  if (pt->read_pos == pt->read_end)
+    {
+      /* We can use the read buffer, and thus avoid a seek. */
+      if (scm_fill_input_unlocked (port) == EOF)
+        return NULL;
+
+      bytes_read = pt->read_end - pt->read_pos;
+      if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
+        bytes_read = SCM_ENCODING_SEARCH_SIZE;
+
+      if (bytes_read <= 1)
+        /* An unbuffered port -- don't scan.  */
+        return NULL;
+
+      memcpy (header, pt->read_pos, bytes_read);
+      header[bytes_read] = '\0';
+    }
+  else
+    {
+      /* Try to read some bytes and then seek back.  Not all ports
+         support seeking back; and indeed some file ports (like
+         /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
+         check performed by SCM_FPORT_FDES---but fail to seek
+         backwards.  Hence this block comes second.  We prefer to use
+         the read buffer in-place.  */
+      if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
+        return NULL;
+
+      bytes_read = scm_c_read_unlocked (port, header, SCM_ENCODING_SEARCH_SIZE);
+      header[bytes_read] = '\0';
+      scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
+    }
  
    if (bytes_read > 3 
        && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
@@ -1612,46 +1671,54 @@ scm_i_scan_for_encoding (SCM port)
      pos ++;
  
    /* grab the next token */
+  encoding_start = pos;
    i = 0;
-  while (pos + i - header <= SCM_ENCODING_SEARCH_SIZE 
-         && pos + i - header < bytes_read
-        && (isalnum ((int) pos[i]) || strchr ("_-.:/,+=()", pos[i]) != NULL))
+  while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
+         && encoding_start + i - header < bytes_read
+        && (isalnum ((int) encoding_start[i])
+            || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
      i++;
  
-  if (i == 0)
+  encoding_length = i;
+  if (encoding_length == 0)
      return NULL;
  
-  encoding = scm_gc_strndup (pos, i, "encoding");
-  for (i = 0; i < strlen (encoding); i++)
+  encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
+  for (i = 0; i < encoding_length; i++)
      encoding[i] = toupper ((int) encoding[i]);
  
    /* push backwards to make sure we were in a comment */
    in_comment = 0;
-  while (pos - i - header > 0)
+  pos = encoding_start;
+  while (pos >= header)
      {
-      if (*(pos - i) == '\n')
+      if (*pos == ';')
+       {
+         in_comment = 1;
+         break;
+       }
+      else if (*pos == '\n' || pos == header)
         {
           /* This wasn't in a semicolon comment. Check for a
            hash-bang comment. */
           char *beg = strstr (header, "#!");
           char *end = strstr (header, "!#");
-         if (beg < pos && pos < end)
+         if (beg < encoding_start && encoding_start + encoding_length <= end)
             in_comment = 1;
           break;
         }
-      if (*(pos - i) == ';')
-       {
-         in_comment = 1;
-         break;
-       }
-      i ++;
+      else
+        {
+          pos --;
+          continue;
+        }
      }
    if (!in_comment)
      /* This wasn't in a comment */
      return NULL;
  
    if (utf8_bom && strcmp(encoding, "UTF-8"))
-    scm_misc_error (NULL, 
+    scm_misc_error (NULL,
                     "the port input declares the encoding ~s but is encoded as UTF-8",
                     scm_list_1 (scm_from_locale_string (encoding)));
  
@@ -1661,7 +1728,7 @@ scm_i_scan_for_encoding (SCM port)
  SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
              (SCM port),
              "Scans the port for an Emacs-like character coding declaration\n"
-            "near the top of the contents of a port with random-acessible contents.\n"
+            "near the top of the contents of a port with random-accessible contents.\n"
              "The coding declaration is of the form\n"
              "@code{coding: XXXXX} and must appear in a scheme comment.\n"
              "\n"
@@ -1672,6 +1739,8 @@ SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
    char *enc;
    SCM s_enc;
  
+  SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
+
    enc = scm_i_scan_for_encoding (port);
    if (enc == NULL)
      return SCM_BOOL_F;
@@ -1688,8 +1757,12 @@ SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
  void
  scm_init_read ()
  {
-  scm_read_hash_procedures =
-    SCM_VARIABLE_LOC (scm_c_define ("read-hash-procedures", SCM_EOL));
+  SCM read_hash_procs;
+
+  read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
+  
+  scm_i_read_hash_procedures =
+    SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
  
    scm_init_opts (scm_read_options, scm_read_opts);
  #include "libguile/read.x"