nt/config.nt: Sync with autogen/config.in.

[bpt/emacs.git] / src / coding.c
diff --git a/src/coding.c b/src/coding.c

index 93da9db..5047e11 100644 (file)
--- a/src/coding.c
+++ b/src/coding.c
@@ -286,6 +286,10 @@ encode_coding_XXX (struct coding_system *coding)
  #include <config.h>
  #include <stdio.h>
  
+#ifdef HAVE_WCHAR_H
+#include <wchar.h>
+#endif /* HAVE_WCHAR_H */
+
  #include "lisp.h"
  #include "character.h"
  #include "buffer.h"
@@ -322,8 +326,7 @@ Lisp_Object Qcall_process, Qcall_process_region;
  Lisp_Object Qstart_process, Qopen_network_stream;
  static Lisp_Object Qtarget_idx;
  
-static Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
-static Lisp_Object Qinterrupted, Qinsufficient_memory;
+static Lisp_Object Qinsufficient_source, Qinvalid_source, Qinterrupted;
  
  /* If a symbol has this property, evaluate the value to define the
     symbol as a coding system.  */
@@ -820,18 +823,12 @@ record_conversion_result (struct coding_system *coding,
      case CODING_RESULT_INSUFFICIENT_SRC:
        Vlast_code_conversion_error = Qinsufficient_source;
        break;
-    case CODING_RESULT_INCONSISTENT_EOL:
-      Vlast_code_conversion_error = Qinconsistent_eol;
-      break;
      case CODING_RESULT_INVALID_SRC:
        Vlast_code_conversion_error = Qinvalid_source;
        break;
      case CODING_RESULT_INTERRUPT:
        Vlast_code_conversion_error = Qinterrupted;
        break;
-    case CODING_RESULT_INSUFFICIENT_MEM:
-      Vlast_code_conversion_error = Qinsufficient_memory;
-      break;
      case CODING_RESULT_INSUFFICIENT_DST:
        /* Don't record this error in Vlast_code_conversion_error
          because it happens just temporarily and is resolved when the
@@ -6074,6 +6071,93 @@ complement_process_encoding_system (Lisp_Object coding_system)
  #define EOL_SEEN_CR    2
  #define EOL_SEEN_CRLF  4
  
+
+static Lisp_Object adjust_coding_eol_type (struct coding_system *coding, int eol_seen);
+
+
+/* Return 1 if all the source bytes are ASCII, and return 0 otherwize.
+   By side effects, set coding->head_ascii and coding->eol_seen.  The
+   value of coding->eol_seen is "logical or" of EOL_SEEN_LF,
+   EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is reliable only when
+   all the source bytes are ASCII.  */
+
+static bool
+detect_ascii (struct coding_system *coding)
+{
+  const unsigned char *src, *end;
+  Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
+  int eol_seen;
+
+  eol_seen = (VECTORP (eol_type) ? EOL_SEEN_NONE
+             : EQ (eol_type, Qunix) ? EOL_SEEN_LF
+             : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
+             : EOL_SEEN_CR);
+  coding_set_source (coding);
+  src = coding->source;
+  end = src + coding->src_bytes;
+
+  if (inhibit_eol_conversion)
+    {
+      /* We don't have to check EOL format.  */
+      while (src < end && !( *src & 0x80)) src++;
+      eol_seen = EOL_SEEN_LF;
+      adjust_coding_eol_type (coding, eol_seen);
+    }
+  else if (eol_seen != EOL_SEEN_NONE)
+    {
+      /* We don't have to check EOL format either.  */
+      while (src < end && !(*src & 0x80)) src++;
+    }
+  else
+    {
+      end--;                   /* We look ahead one byte.  */
+      while (src < end)
+       {
+         int c = *src;
+
+         if (c & 0x80)
+           break;
+         src++;
+         if (c < 0x20)
+           {
+             if (c == '\r')
+               {
+                 if (*src == '\n')
+                   {
+                     eol_seen |= EOL_SEEN_CRLF;
+                     src++;
+                   }
+                 else
+                   eol_seen |= EOL_SEEN_CR;
+               }
+             else if (c == '\n')
+               eol_seen |= EOL_SEEN_LF;
+           }
+       }
+      if (src > end)
+       /* The last two bytes are CR LF, which means that we have
+          scanned all bytes. */
+       end++;
+      else if (src == end)
+       {
+         end++;
+         if (! (*src & 0x80))
+           {
+             if (*src == '\r')
+               eol_seen |= EOL_SEEN_CR;
+             else if (*src  == '\n')
+               eol_seen |= EOL_SEEN_LF;
+             src++;
+           }
+       }
+      adjust_coding_eol_type (coding, eol_seen);
+    }
+  coding->head_ascii = src - coding->source;
+  coding->eol_seen = eol_seen;
+  return (src == end);
+}
+
+
  /* Detect how end-of-line of a text of length SRC_BYTES pointed by
     SOURCE is encoded.  If CATEGORY is one of
     coding_category_utf_16_XXXX, assume that CR and LF are encoded by
@@ -6218,7 +6302,6 @@ detect_coding (struct coding_system *coding)
    coding_set_source (coding);
  
    src_end = coding->source + coding->src_bytes;
-  coding->head_ascii = 0;
  
    /* If we have not yet decided the text encoding type, detect it
       now.  */
@@ -6228,6 +6311,8 @@ detect_coding (struct coding_system *coding)
        struct coding_detection_info detect_info;
        bool null_byte_found = 0, eight_bit_found = 0;
  
+      coding->head_ascii = 0;
+      coding->eol_seen = EOL_SEEN_NONE;
        detect_info.checked = detect_info.found = detect_info.rejected = 0;
        for (src = coding->source; src < src_end; src++)
         {
@@ -6266,6 +6351,26 @@ detect_coding (struct coding_system *coding)
                   if (eight_bit_found)
                     break;
                 }
+             else if (! disable_ascii_optimization
+                      && ! inhibit_eol_conversion)
+               {
+                 if (c == '\r')
+                   {
+                     if (src < src_end && src[1] == '\n')
+                       {
+                         coding->eol_seen |= EOL_SEEN_CRLF;
+                         src++;
+                         coding->head_ascii++;
+                       }
+                     else
+                       coding->eol_seen |= EOL_SEEN_CR;
+                   }
+                 else if (c == '\n')
+                   {
+                     coding->eol_seen |= EOL_SEEN_LF;
+                   }
+               }
+
               if (! eight_bit_found)
                 coding->head_ascii++;
             }
@@ -6356,14 +6461,20 @@ detect_coding (struct coding_system *coding)
        coding_systems
         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
        detect_info.found = detect_info.rejected = 0;
-      coding->head_ascii = 0;
-      if (CONSP (coding_systems)
-         && detect_coding_utf_8 (coding, &detect_info))
+      if (detect_ascii (coding))
         {
-         if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
-           setup_coding_system (XCAR (coding_systems), coding);
-         else
-           setup_coding_system (XCDR (coding_systems), coding);
+         setup_coding_system (XCDR (coding_systems), coding);
+       }
+      else
+       {
+         if (CONSP (coding_systems)
+             && detect_coding_utf_8 (coding, &detect_info))
+           {
+             if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
+               setup_coding_system (XCAR (coding_systems), coding);
+             else
+               setup_coding_system (XCDR (coding_systems), coding);
+           }
         }
      }
    else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
@@ -6376,6 +6487,7 @@ detect_coding (struct coding_system *coding)
         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
        detect_info.found = detect_info.rejected = 0;
        coding->head_ascii = 0;
+      coding->eol_seen = EOL_SEEN_NONE;
        if (CONSP (coding_systems)
           && detect_coding_utf_16 (coding, &detect_info))
         {
@@ -6813,7 +6925,7 @@ produce_chars (struct coding_system *coding, Lisp_Object translation_table,
  
    produced = dst - (coding->destination + coding->produced);
    if (BUFFERP (coding->dst_object) && produced_chars > 0)
-    insert_from_gap (produced_chars, produced);
+    insert_from_gap (produced_chars, produced, 0);
    coding->produced += produced;
    coding->produced_char += produced_chars;
    return carryover;
@@ -6884,22 +6996,8 @@ produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
  
  #define ALLOC_CONVERSION_WORK_AREA(coding)                             \
    do {                                                                 \
-    int size = CHARBUF_SIZE;                                           \
-                                                                       \
-    coding->charbuf = NULL;                                            \
-    while (size > 1024)                                                        \
-      {                                                                        \
-       coding->charbuf = alloca (sizeof (int) * size);                 \
-       if (coding->charbuf)                                            \
-         break;                                                        \
-       size >>= 1;                                                     \
-      }                                                                        \
-    if (! coding->charbuf)                                             \
-      {                                                                        \
-       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
-       return;                                                         \
-      }                                                                        \
-    coding->charbuf_size = size;                                       \
+    coding->charbuf = SAFE_ALLOCA (CHARBUF_SIZE * sizeof (int));       \
+    coding->charbuf_size = CHARBUF_SIZE;                               \
    } while (0)
  
  
@@ -6968,6 +7066,8 @@ decode_coding (struct coding_system *coding)
    int carryover;
    int i;
  
+  USE_SAFE_ALLOCA;
+
    if (BUFFERP (coding->src_object)
        && coding->src_pos > 0
        && coding->src_pos < GPT
@@ -7090,6 +7190,8 @@ decode_coding (struct coding_system *coding)
        bset_undo_list (current_buffer, undo_list);
        record_insert (coding->dst_pos, coding->produced_char);
      }
+
+  SAFE_FREE ();
  }
  
  
@@ -7373,6 +7475,8 @@ encode_coding (struct coding_system *coding)
    int max_lookup;
    struct ccl_spec cclspec;
  
+  USE_SAFE_ALLOCA;
+
    attrs = CODING_ID_ATTRS (coding->id);
    if (coding->encoder == encode_coding_raw_text)
      translation_table = Qnil, max_lookup = 0;
@@ -7406,7 +7510,9 @@ encode_coding (struct coding_system *coding)
    } while (coding->consumed_char < coding->src_chars);
  
    if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
-    insert_from_gap (coding->produced_char, coding->produced);
+    insert_from_gap (coding->produced_char, coding->produced, 0);
+
+  SAFE_FREE ();
  }
  
  
@@ -7500,8 +7606,6 @@ decode_coding_gap (struct coding_system *coding,
    ptrdiff_t count = SPECPDL_INDEX ();
    Lisp_Object attrs;
  
-  code_conversion_save (0, 0);
-
    coding->src_object = Fcurrent_buffer ();
    coding->src_chars = chars;
    coding->src_bytes = bytes;
@@ -7515,13 +7619,53 @@ decode_coding_gap (struct coding_system *coding,
  
    if (CODING_REQUIRE_DETECTION (coding))
      detect_coding (coding);
+  attrs = CODING_ID_ATTRS (coding->id);
+  if (! disable_ascii_optimization)
+    {
+      if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
+         && NILP (CODING_ATTR_POST_READ (attrs))
+         && NILP (get_translation_table (attrs, 0, NULL))
+         && (coding->head_ascii >= 0 /* We've already called detect_coding */
+             ? coding->head_ascii == bytes
+             : detect_ascii (coding)))
+       {
+         if (coding->eol_seen == EOL_SEEN_CR)
+           {
+             unsigned char *src_end = GAP_END_ADDR;
+             unsigned char *src = src - coding->src_bytes;
+
+             while (src < src_end)
+               {
+                 if (*src++ == '\r')
+                   src[-1] = '\n';
+               }
+           }
+         else if (coding->eol_seen == EOL_SEEN_CRLF)
+           {
+             unsigned char *src = GAP_END_ADDR;
+             unsigned char *src_beg = src - coding->src_bytes;
+             unsigned char *dst = src;
+
+             while (src_beg < src)
+               {
+                 *--dst = *--src;
+                 if (*src == '\n')
+                   src--;
+               }
+             bytes -= dst - src;
+           }
+         coding->produced_char = coding->produced = bytes;
+         insert_from_gap (bytes, bytes, 1);
+         return;
+       }
+    }
+  code_conversion_save (0, 0);
  
    coding->mode |= CODING_MODE_LAST_BLOCK;
    current_buffer->text->inhibit_shrinking = 1;
    decode_coding (coding);
    current_buffer->text->inhibit_shrinking = 0;
  
-  attrs = CODING_ID_ATTRS (coding->id);
    if (! NILP (CODING_ATTR_POST_READ (attrs)))
      {
        ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
@@ -7695,14 +7839,8 @@ decode_coding_object (struct coding_system *coding,
        set_buffer_internal (XBUFFER (coding->dst_object));
        if (dst_bytes < coding->produced)
         {
+         eassert (coding->produced > 0);
           destination = xrealloc (destination, coding->produced);
-         if (! destination)
-           {
-             record_conversion_result (coding,
-                                       CODING_RESULT_INSUFFICIENT_MEM);
-             unbind_to (count, Qnil);
-             return;
-           }
           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
             move_gap_both (BEGV, BEGV_BYTE);
           memcpy (destination, BEGV_ADDR, coding->produced);
@@ -7985,11 +8123,21 @@ from_unicode (Lisp_Object str)
    return code_convert_string_norecord (str, Qutf_16le, 0);
  }
  
+Lisp_Object
+from_unicode_buffer (const wchar_t* wstr)
+{
+    return from_unicode (
+        make_unibyte_string (
+            (char*) wstr,
+            /* we get one of the two final 0 bytes for free. */
+            1 + sizeof (wchar_t) * wcslen (wstr)));
+}
+
  wchar_t *
  to_unicode (Lisp_Object str, Lisp_Object *buf)
  {
    *buf = code_convert_string_norecord (str, Qutf_16le, 1);
-  /* We need to make a another copy (in addition to the one made by
+  /* We need to make another copy (in addition to the one made by
       code_convert_string_norecord) to ensure that the final string is
       _doubly_ zero terminated --- that is, that the string is
       terminated by two zero bytes and one utf-16le null character.
@@ -9483,7 +9631,7 @@ make_subsidiaries (Lisp_Object base)
    int i;
  
    memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
-  subsidiaries = Fmake_vector (make_number (3), Qnil);
+  subsidiaries = make_uninit_vector (3);
    for (i = 0; i < 3; i++)
      {
        strcpy (buf + base_name_len, suffixes[i]);
@@ -9783,7 +9931,7 @@ usage: (define-coding-system-internal ...)  */)
        CHECK_VECTOR (initial);
        for (i = 0; i < 4; i++)
         {
-         val = Faref (initial, make_number (i));
+         val = AREF (initial, i);
           if (! NILP (val))
             {
               struct charset *charset;
@@ -9988,7 +10136,8 @@ usage: (define-coding-system-internal ...)  */)
           this_name = AREF (eol_type, i);
           this_aliases = Fcons (this_name, Qnil);
           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
-         this_spec = Fmake_vector (make_number (3), attrs);
+         this_spec = make_uninit_vector (3);
+         ASET (this_spec, 0, attrs);
           ASET (this_spec, 1, this_aliases);
           ASET (this_spec, 2, this_eol_type);
           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
@@ -10001,7 +10150,8 @@ usage: (define-coding-system-internal ...)  */)
         }
      }
  
-  spec_vec = Fmake_vector (make_number (3), attrs);
+  spec_vec = make_uninit_vector (3);
+  ASET (spec_vec, 0, attrs);
    ASET (spec_vec, 1, aliases);
    ASET (spec_vec, 2, eol_type);
  
@@ -10406,10 +10556,8 @@ syms_of_coding (void)
         intern_c_string ("coding-category-undecided"));
  
    DEFSYM (Qinsufficient_source, "insufficient-source");
-  DEFSYM (Qinconsistent_eol, "inconsistent-eol");
    DEFSYM (Qinvalid_source, "invalid-source");
    DEFSYM (Qinterrupted, "interrupted");
-  DEFSYM (Qinsufficient_memory, "insufficient-memory");
    DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
  
    defsubr (&Scoding_system_p);
@@ -10710,7 +10858,7 @@ reading if you suppress escape sequence detection.
  
  The other way to read escape sequences in a file without decoding is
  to explicitly specify some coding system that doesn't use ISO-2022
-escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
+escape sequence (e.g., `latin-1') on reading by \\[universal-coding-system-argument].  */);
    inhibit_iso_escape_detection = 0;
  
    DEFVAR_BOOL ("inhibit-null-byte-detection",
@@ -10726,6 +10874,11 @@ from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
  decode text as usual.  */);
    inhibit_null_byte_detection = 0;
  
+  DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
+              doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
+Internal use only.  Removed after the experimental optimizer gets stable. */);
+  disable_ascii_optimization = 0;
+
    DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
                doc: /* Char table for translating self-inserting characters.
  This is applied to the result of input methods, not their input.