Optimize 'string-hash'.

[bpt/guile.git] / libguile / read.c
diff --git a/libguile/read.c b/libguile/read.c

index 03a53aa..60a40d9 100644 (file)
--- a/libguile/read.c
+++ b/libguile/read.c
@@ -1,5 +1,5 @@
-/* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
- *   2007, 2008, 2009, 2010, 2011, 2012, 2014 Free Software Foundation, Inc.
+/* Copyright (C) 1995-1997, 1999-2001, 2003, 2004, 2006-2012, 2014
+ *   Free Software Foundation, Inc.
   * 
   * This library is free software; you can redistribute it and/or
   * modify it under the terms of the GNU Lesser General Public License
@@ -31,6 +31,7 @@
  #include <unicase.h>
  #include <unictype.h>
  #include <c-strcase.h>
+#include <c-ctype.h>
  
  #include "libguile/_scm.h"
  #include "libguile/bytevectors.h"
@@ -87,6 +88,8 @@ scm_t_option scm_read_opts[] =
        "In strings, consume leading whitespace after an escaped end-of-line."},
      { SCM_OPTION_BOOLEAN, "curly-infix", 0,
        "Support SRFI-105 curly infix expressions."},
+    { SCM_OPTION_BOOLEAN, "r7rs-symbols", 0,
+      "Support R7RS |...| symbol notation."},
      { 0, },
    };
   
@@ -112,6 +115,7 @@ struct t_read_opts
    unsigned int hungry_eol_escapes_p : 1;
    unsigned int curly_infix_p        : 1;
    unsigned int neoteric_p           : 1;
+  unsigned int r7rs_symbols_p       : 1;
  };
  
  typedef struct t_read_opts scm_t_read_opts;
@@ -587,8 +591,11 @@ skip_intraline_whitespace (SCM port)
    scm_ungetc (c, port);
  }                                         
  
+/* Read either a double-quoted string or an R7RS-style symbol delimited
+   by vertical lines, depending on the value of 'chr' ('"' or '|').
+   Regardless, the result is always returned as a string.  */
  static SCM
-scm_read_string (int chr, SCM port, scm_t_read_opts *opts)
+scm_read_string_like_syntax (int chr, SCM port, scm_t_read_opts *opts)
  #define FUNC_NAME "scm_lreadr"
  {
    /* For strings smaller than C_STR, this function creates only one Scheme
@@ -602,13 +609,16 @@ scm_read_string (int chr, SCM port, scm_t_read_opts *opts)
    long line = SCM_LINUM (port);
    int column = SCM_COL (port) - 1;
  
-  while ('"' != (c = scm_getc (port)))
+  while (chr != (c = scm_getc (port)))
      {
        if (c == EOF)
          {
          str_eof:
            scm_i_input_error (FUNC_NAME, port,
-                             "end of file in string constant", SCM_EOL);
+                             (chr == '|'
+                              ? "end of file in symbol"
+                              : "end of file in string constant"),
+                             SCM_EOL);
          }
  
        if (c_str_len + 1 >= READER_STRING_BUFFER_SIZE)
@@ -623,8 +633,11 @@ scm_read_string (int chr, SCM port, scm_t_read_opts *opts)
              {
              case EOF:
                goto str_eof;
-            case '"':
+            case '|':
              case '\\':
+            case '(':  /* Accept "\(" for use at the beginning of lines
+                         in multiline strings to avoid confusing emacs
+                         lisp modes.  */
                break;
              case '\n':
                if (opts->hungry_eol_escapes_p)
@@ -655,7 +668,7 @@ scm_read_string (int chr, SCM port, scm_t_read_opts *opts)
                c = '\010';
                break;
              case 'x':
-              if (opts->r6rs_escapes_p)
+              if (opts->r6rs_escapes_p || chr == '|')
                  SCM_READ_HEX_ESCAPE (10, ';');
                else
                  SCM_READ_HEX_ESCAPE (2, '\0');
@@ -673,6 +686,8 @@ scm_read_string (int chr, SCM port, scm_t_read_opts *opts)
                    break;
                  }
              default:
+              if (c == chr)
+                break;
              bad_escaped:
                scm_i_input_error (FUNC_NAME, port,
                                   "illegal character in escape sequence: ~S",
@@ -698,6 +713,17 @@ scm_read_string (int chr, SCM port, scm_t_read_opts *opts)
  }
  #undef FUNC_NAME
  
+static SCM
+scm_read_string (int chr, SCM port, scm_t_read_opts *opts)
+{
+  return scm_read_string_like_syntax (chr, port, opts);
+}
+
+static SCM
+scm_read_r7rs_symbol (int chr, SCM port, scm_t_read_opts *opts)
+{
+  return scm_string_to_symbol (scm_read_string_like_syntax (chr, port, opts));
+}
  
  static SCM
  scm_read_number (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
@@ -965,7 +991,7 @@ try_read_ci_chars (SCM port, const char *expected_chars)
        c = scm_getc (port);
        if (c == EOF)
          break;
-      else if (tolower (c) != expected_chars[num_chars_read])
+      else if (c_tolower (c) != expected_chars[num_chars_read])
          {
            scm_ungetc (c, port);
            break;
@@ -1786,6 +1812,11 @@ read_inner_expression (SCM port, scm_t_read_opts *opts)
           return (scm_read_sexp (chr, port, opts));
         case '"':
           return (scm_read_string (chr, port, opts));
+        case '|':
+          if (opts->r7rs_symbols_p)
+            return scm_read_r7rs_symbol (chr, port, opts);
+          else
+            return scm_read_mixed_case_symbol (chr, port, opts);
         case '\'':
         case '`':
         case ',':
@@ -2018,11 +2049,20 @@ scm_get_hash_procedure (int c)
      }
  }
  
-#define SCM_ENCODING_SEARCH_SIZE (500)
+/* Maximum size of an encoding name.  This is a bit more than the
+   longest name listed at
+   <http://www.iana.org/assignments/character-sets> ("ISO-2022-JP-2", 13
+   characters.)  */
+#define ENCODING_NAME_MAX_SIZE 20
  
-/* Search the first few hundred characters of a file for an Emacs-like coding
-   declaration.  Returns either NULL or a string whose storage has been
-   allocated with `scm_gc_malloc ()'.  */
+/* Number of bytes at the beginning or end of a file that are scanned
+   for a "coding:" declaration.  */
+#define SCM_ENCODING_SEARCH_SIZE (500 + ENCODING_NAME_MAX_SIZE)
+
+
+/* Search the SCM_ENCODING_SEARCH_SIZE bytes of a file for an Emacs-like
+   coding declaration.  Returns either NULL or a string whose storage
+   has been allocated with `scm_gc_malloc'.  */
  char *
  scm_i_scan_for_encoding (SCM port)
  {
@@ -2081,8 +2121,8 @@ scm_i_scan_for_encoding (SCM port)
        if ((pos = strstr(pos, "coding")) == NULL)
          return NULL;
  
-      pos += strlen("coding");
-      if (pos - header >= SCM_ENCODING_SEARCH_SIZE || 
+      pos += strlen ("coding");
+      if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
            (*pos == ':' || *pos == '='))
          {
            pos ++;
@@ -2091,10 +2131,17 @@ scm_i_scan_for_encoding (SCM port)
      }
  
    /* skip spaces */
-  while (pos - header <= SCM_ENCODING_SEARCH_SIZE && 
+  while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
          (*pos == ' ' || *pos == '\t'))
      pos ++;
  
+  if (pos - header >= SCM_ENCODING_SEARCH_SIZE - ENCODING_NAME_MAX_SIZE)
+    /* We found the "coding:" string, but there is probably not enough
+       room to store an encoding name in its entirety, so ignore it.
+       This makes sure we do not end up returning a truncated encoding
+       name.  */
+    return NULL;
+
    /* grab the next token */
    encoding_start = pos;
    i = 0;
@@ -2202,9 +2249,10 @@ SCM_SYMBOL (sym_port_read_options, "port-read-options");
  #define READ_OPTION_SQUARE_BRACKETS_P     10
  #define READ_OPTION_HUNGRY_EOL_ESCAPES_P  12
  #define READ_OPTION_CURLY_INFIX_P         14
+#define READ_OPTION_R7RS_SYMBOLS_P        16
  
  /* The total width in bits of the per-port overrides */
-#define READ_OPTIONS_NUM_BITS             16
+#define READ_OPTIONS_NUM_BITS             18
  
  #define READ_OPTIONS_INHERIT_ALL  ((1UL << READ_OPTIONS_NUM_BITS) - 1)
  #define READ_OPTIONS_MAX_VALUE    READ_OPTIONS_INHERIT_ALL
@@ -2302,6 +2350,7 @@ init_read_options (SCM port, scm_t_read_opts *opts)
    RESOLVE_BOOLEAN_OPTION (SQUARE_BRACKETS_P,    square_brackets_p);
    RESOLVE_BOOLEAN_OPTION (HUNGRY_EOL_ESCAPES_P, hungry_eol_escapes_p);
    RESOLVE_BOOLEAN_OPTION (CURLY_INFIX_P,        curly_infix_p);
+  RESOLVE_BOOLEAN_OPTION (R7RS_SYMBOLS_P,       r7rs_symbols_p);
  
  #undef RESOLVE_BOOLEAN_OPTION