(detect_coding): Preserve coding->mode.

[bpt/emacs.git] / src / coding.c
diff --git a/src/coding.c b/src/coding.c

index 81e8e4e..ec57467 100644 (file)
--- a/src/coding.c
+++ b/src/coding.c
@@ -1,8 +1,8 @@
  /* Coding system handler (conversion, detection, etc).
     Copyright (C) 2001, 2002, 2003, 2004, 2005,
-                 2006, 2007, 2008 Free Software Foundation, Inc.
+                 2006, 2007, 2008, 2009 Free Software Foundation, Inc.
     Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
-     2005, 2006, 2007, 2008
+     2005, 2006, 2007, 2008, 2009
       National Institute of Advanced Industrial Science and Technology (AIST)
       Registration Number H14PRO021
     Copyright (C) 2003
@@ -11,10 +11,10 @@
  
  This file is part of GNU Emacs.
  
-GNU Emacs is free software; you can redistribute it and/or modify
+GNU Emacs is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 3, or (at your option)
-any later version.
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
  
  GNU Emacs is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -22,9 +22,7 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
  
  You should have received a copy of the GNU General Public License
-along with GNU Emacs; see the file COPYING.  If not, write to
-the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
-Boston, MA 02110-1301, USA.  */
+along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  
  /*** TABLE OF CONTENTS ***
  
@@ -316,7 +314,7 @@ Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
  Lisp_Object Qbig, Qlittle;
  Lisp_Object Qcoding_system_history;
  Lisp_Object Qvalid_codes;
-Lisp_Object QCcategory, QCmnemonic, QCdefalut_char;
+Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
  Lisp_Object QCdecode_translation_table, QCencode_translation_table;
  Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
  Lisp_Object QCascii_compatible_p;
@@ -382,6 +380,9 @@ int inhibit_eol_conversion;
  /* Flag to inhibit ISO2022 escape sequence detection.  */
  int inhibit_iso_escape_detection;
  
+/* Flag to inhibit detection of binary files through null bytes.  */
+int inhibit_null_byte_detection;
+
  /* Flag to make buffer-file-coding-system inherit from process-coding.  */
  int inherit_process_coding_system;
  
@@ -548,6 +549,9 @@ enum iso_code_class_type
     character is prohibited by CODING_ISO_FLAG_SAFE.  */
  #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
  
+/* UTF-8 section */
+#define CODING_UTF_8_BOM(coding)       \
+  ((coding)->spec.utf_8_bom)
  
  /* UTF-16 section */
  #define CODING_UTF_16_BOM(coding)      \
@@ -578,7 +582,9 @@ enum coding_category
      coding_category_iso_8_2,
      coding_category_iso_7_else,
      coding_category_iso_8_else,
-    coding_category_utf_8,
+    coding_category_utf_8_auto,
+    coding_category_utf_8_nosig,
+    coding_category_utf_8_sig,
      coding_category_utf_16_auto,
      coding_category_utf_16_be,
      coding_category_utf_16_le,
@@ -602,7 +608,9 @@ enum coding_category
  #define CATEGORY_MASK_ISO_8_2          (1 << coding_category_iso_8_2)
  #define CATEGORY_MASK_ISO_7_ELSE       (1 << coding_category_iso_7_else)
  #define CATEGORY_MASK_ISO_8_ELSE       (1 << coding_category_iso_8_else)
-#define CATEGORY_MASK_UTF_8            (1 << coding_category_utf_8)
+#define CATEGORY_MASK_UTF_8_AUTO       (1 << coding_category_utf_8_auto)
+#define CATEGORY_MASK_UTF_8_NOSIG      (1 << coding_category_utf_8_nosig)
+#define CATEGORY_MASK_UTF_8_SIG                (1 << coding_category_utf_8_sig)
  #define CATEGORY_MASK_UTF_16_AUTO      (1 << coding_category_utf_16_auto)
  #define CATEGORY_MASK_UTF_16_BE                (1 << coding_category_utf_16_be)
  #define CATEGORY_MASK_UTF_16_LE                (1 << coding_category_utf_16_le)
@@ -624,7 +632,9 @@ enum coding_category
     | CATEGORY_MASK_ISO_8_2             \
     | CATEGORY_MASK_ISO_7_ELSE          \
     | CATEGORY_MASK_ISO_8_ELSE          \
-   | CATEGORY_MASK_UTF_8               \
+   | CATEGORY_MASK_UTF_8_AUTO          \
+   | CATEGORY_MASK_UTF_8_NOSIG         \
+   | CATEGORY_MASK_UTF_8_SIG           \
     | CATEGORY_MASK_UTF_16_AUTO         \
     | CATEGORY_MASK_UTF_16_BE           \
     | CATEGORY_MASK_UTF_16_LE           \
@@ -664,6 +674,10 @@ enum coding_category
     | CATEGORY_MASK_UTF_16_BE_NOSIG     \
     | CATEGORY_MASK_UTF_16_LE_NOSIG)
  
+#define CATEGORY_MASK_UTF_8    \
+  (CATEGORY_MASK_UTF_8_AUTO    \
+   | CATEGORY_MASK_UTF_8_NOSIG \
+   | CATEGORY_MASK_UTF_8_SIG)
  
  /* List of symbols `coding-category-xxx' ordered by priority.  This
     variable is exposed to Emacs Lisp.  */
@@ -729,6 +743,45 @@ static struct coding_system coding_categories[coding_category_max];
      consumed_chars++;                                  \
    } while (0)
  
+/* Safely get two bytes from the source text pointed to by SRC, which
+   ends at SRC_END, and set C1 and C2 to those bytes while skipping
+   the heading multibyte characters.  If there are not enough bytes in
+   the source, it jumps to `no_more_source'.  If multibytep is nonzero
+   and C2 has its high bit set, C2 becomes -1, except that 0xC0 or 0xC1
+   is combined with the next byte (as for C1).  The caller should
+   declare and set these variables appropriately in advance:
+       src, src_end, multibytep
+   It is intended that this macro is used in detect_coding_utf_16.  */
+
+#define TWO_MORE_BYTES(c1, c2)                         \
+  do {                                                 \
+    do {                                               \
+      if (src == src_end)                              \
+       goto no_more_source;                            \
+      c1 = *src++;                                     \
+      if (multibytep && (c1 & 0x80))                   \
+       {                                               \
+         if ((c1 & 0xFE) == 0xC0)                      \
+           c1 = ((c1 & 1) << 6) | *src++;              \
+         else                                          \
+           {                                           \
+             src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
+             c1 = -1;                                  \
+           }                                           \
+       }                                               \
+    } while (c1 < 0);                                  \
+    if (src == src_end)                                        \
+      goto no_more_source;                             \
+    c2 = *src++;                                       \
+    if (multibytep && (c2 & 0x80))                     \
+      {                                                        \
+       if ((c2 & 0xFE) == 0xC0)                        \
+         c2 = ((c2 & 1) << 6) | *src++;                \
+       else                                            \
+         c2 = -1;                                      \
+      }                                                        \
+  } while (0)
+
  
  #define ONE_MORE_BYTE_NO_CHECK(c)                      \
    do {                                                 \
@@ -1216,6 +1269,11 @@ alloc_destination (coding, nbytes, dst)
  #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
  #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
  
+#define UTF_BOM 0xFEFF
+#define UTF_8_BOM_1 0xEF
+#define UTF_8_BOM_2 0xBB
+#define UTF_8_BOM_3 0xBF
+
  static int
  detect_coding_utf_8 (coding, detect_info)
       struct coding_system *coding;
@@ -1225,6 +1283,7 @@ detect_coding_utf_8 (coding, detect_info)
    const unsigned char *src_end = coding->source + coding->src_bytes;
    int multibytep = coding->src_multibyte;
    int consumed_chars = 0;
+  int bom_found = 0;
    int found = 0;
  
    detect_info->checked |= CATEGORY_MASK_UTF_8;
@@ -1244,7 +1303,7 @@ detect_coding_utf_8 (coding, detect_info)
         break;
        if (UTF_8_2_OCTET_LEADING_P (c))
         {
-         found = CATEGORY_MASK_UTF_8;
+         found = 1;
           continue;
         }
        ONE_MORE_BYTE (c2);
@@ -1252,7 +1311,10 @@ detect_coding_utf_8 (coding, detect_info)
         break;
        if (UTF_8_3_OCTET_LEADING_P (c))
         {
-         found = CATEGORY_MASK_UTF_8;
+         found = 1;
+         if (src_base == coding->source
+             && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
+           bom_found = 1;
           continue;
         }
        ONE_MORE_BYTE (c3);
@@ -1260,7 +1322,7 @@ detect_coding_utf_8 (coding, detect_info)
         break;
        if (UTF_8_4_OCTET_LEADING_P (c))
         {
-         found = CATEGORY_MASK_UTF_8;
+         found = 1;
           continue;
         }
        ONE_MORE_BYTE (c4);
@@ -1268,7 +1330,7 @@ detect_coding_utf_8 (coding, detect_info)
         break;
        if (UTF_8_5_OCTET_LEADING_P (c))
         {
-         found = CATEGORY_MASK_UTF_8;
+         found = 1;
           continue;
         }
        break;
@@ -1282,7 +1344,17 @@ detect_coding_utf_8 (coding, detect_info)
        detect_info->rejected |= CATEGORY_MASK_UTF_8;
        return 0;
      }
-  detect_info->found |= found;
+  if (bom_found)
+    {
+      /* The first character 0xFEFF doesn't necessarily mean a BOM.  */
+      detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
+    }
+  else
+    {
+      detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
+      if (found)
+       detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
+    }
    return 1;
  }
  
@@ -1296,14 +1368,48 @@ decode_coding_utf_8 (coding)
    const unsigned char *src_base;
    int *charbuf = coding->charbuf + coding->charbuf_used;
    int *charbuf_end = coding->charbuf + coding->charbuf_size;
-  int consumed_chars = 0, consumed_chars_base;
+  int consumed_chars = 0, consumed_chars_base = 0;
    int multibytep = coding->src_multibyte;
+  enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
    Lisp_Object attr, charset_list;
    int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
    int byte_after_cr = -1;
  
    CODING_GET_INFO (coding, attr, charset_list);
  
+  if (bom != utf_without_bom)
+    {
+      int c1, c2, c3;
+
+      src_base = src;
+      ONE_MORE_BYTE (c1);
+      if (! UTF_8_3_OCTET_LEADING_P (c1))
+       src = src_base;
+      else
+       {
+         ONE_MORE_BYTE (c2);
+         if (! UTF_8_EXTRA_OCTET_P (c2))
+           src = src_base;
+         else
+           {
+             ONE_MORE_BYTE (c3);
+             if (! UTF_8_EXTRA_OCTET_P (c3))
+               src = src_base;
+             else
+               {
+                 if ((c1 != UTF_8_BOM_1)
+                     || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
+                   src = src_base;
+                 else
+                   CODING_UTF_8_BOM (coding) = utf_without_bom;
+               }
+           }
+       }
+    }
+  CODING_UTF_8_BOM (coding) = utf_without_bom;
+
+
+
    while (1)
      {
        int c, c1, c2, c3, c4, c5;
@@ -1312,7 +1418,11 @@ decode_coding_utf_8 (coding)
        consumed_chars_base = consumed_chars;
  
        if (charbuf >= charbuf_end)
-       break;
+       {
+         if (byte_after_cr >= 0)
+           src_base--;
+         break;
+       }
  
        if (byte_after_cr >= 0)
         c1 = byte_after_cr, byte_after_cr = -1;
@@ -1417,6 +1527,13 @@ encode_coding_utf_8 (coding)
    int produced_chars = 0;
    int c;
  
+  if (CODING_UTF_8_BOM (coding) == utf_with_bom)
+    {
+      ASSURE_DESTINATION (3);
+      EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
+      CODING_UTF_8_BOM (coding) = utf_without_bom;
+    }
+
    if (multibytep)
      {
        int safe_room = MAX_MULTIBYTE_LENGTH * 2;
@@ -1497,8 +1614,7 @@ detect_coding_utf_16 (coding, detect_info)
        return 0;
      }
  
-  ONE_MORE_BYTE (c1);
-  ONE_MORE_BYTE (c2);
+  TWO_MORE_BYTES (c1, c2);
    if ((c1 == 0xFF) && (c2 == 0xFE))
      {
        detect_info->found |= (CATEGORY_MASK_UTF_16_LE
@@ -1515,6 +1631,11 @@ detect_coding_utf_16 (coding, detect_info)
                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
      }
+  else if (c2 < 0)
+    {
+      detect_info->rejected |= CATEGORY_MASK_UTF_16;
+      return 0;
+    }
    else
      {
        /* We check the dispersion of Eth and Oth bytes where E is even and
@@ -1532,8 +1653,9 @@ detect_coding_utf_16 (coding, detect_info)
  
        while (1)
         {
-         ONE_MORE_BYTE (c1);
-         ONE_MORE_BYTE (c2);
+         TWO_MORE_BYTES (c1, c2);
+         if (c2 < 0)
+           break;
           if (! e[c1])
             {
               e[c1] = 1;
@@ -1566,9 +1688,9 @@ decode_coding_utf_16 (coding)
    const unsigned char *src_base;
    int *charbuf = coding->charbuf + coding->charbuf_used;
    int *charbuf_end = coding->charbuf + coding->charbuf_size;
-  int consumed_chars = 0, consumed_chars_base;
+  int consumed_chars = 0, consumed_chars_base = 0;
    int multibytep = coding->src_multibyte;
-  enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
+  enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
    enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
    int surrogate = CODING_UTF_16_SURROGATE (coding);
    Lisp_Object attr, charset_list;
@@ -1577,7 +1699,7 @@ decode_coding_utf_16 (coding)
  
    CODING_GET_INFO (coding, attr, charset_list);
  
-  if (bom == utf_16_with_bom)
+  if (bom == utf_with_bom)
      {
        int c, c1, c2;
  
@@ -1594,13 +1716,13 @@ decode_coding_utf_16 (coding)
           src = src_base;
           coding->errors++;
         }
-      CODING_UTF_16_BOM (coding) = utf_16_without_bom;
+      CODING_UTF_16_BOM (coding) = utf_without_bom;
      }
-  else if (bom == utf_16_detect_bom)
+  else if (bom == utf_detect_bom)
      {
        /* We have already tried to detect BOM and failed in
          detect_coding.  */
-      CODING_UTF_16_BOM (coding) = utf_16_without_bom;
+      CODING_UTF_16_BOM (coding) = utf_without_bom;
      }
  
    while (1)
@@ -1611,7 +1733,11 @@ decode_coding_utf_16 (coding)
        consumed_chars_base = consumed_chars;
  
        if (charbuf + 2 >= charbuf_end)
-       break;
+       {
+         if (byte_after_cr1 >= 0)
+           src_base -= 2;
+         break;
+       }
  
        if (byte_after_cr1 >= 0)
         c1 = byte_after_cr1, byte_after_cr1 = -1;
@@ -1690,7 +1816,7 @@ encode_coding_utf_16 (coding)
    unsigned char *dst = coding->destination + coding->produced;
    unsigned char *dst_end = coding->destination + coding->dst_bytes;
    int safe_room = 8;
-  enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
+  enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
    int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
    int produced_chars = 0;
    Lisp_Object attrs, charset_list;
@@ -1698,14 +1824,14 @@ encode_coding_utf_16 (coding)
  
    CODING_GET_INFO (coding, attrs, charset_list);
  
-  if (bom != utf_16_without_bom)
+  if (bom != utf_without_bom)
      {
        ASSURE_DESTINATION (safe_room);
        if (big_endian)
         EMIT_TWO_BYTES (0xFE, 0xFF);
        else
         EMIT_TWO_BYTES (0xFF, 0xFE);
-      CODING_UTF_16_BOM (coding) = utf_16_without_bom;
+      CODING_UTF_16_BOM (coding) = utf_without_bom;
      }
  
    while (charbuf < charbuf_end)
@@ -2215,7 +2341,11 @@ decode_coding_emacs_mule (coding)
        consumed_chars_base = consumed_chars;
  
        if (charbuf >= charbuf_end)
-       break;
+       {
+         if (byte_after_cr >= 0)
+           src_base--;
+         break;
+       }
  
        if (byte_after_cr >= 0)
         c = byte_after_cr, byte_after_cr = -1;
@@ -2378,8 +2508,10 @@ encode_coding_emacs_mule (coding)
           if (preferred_charset_id >= 0)
             {
               charset = CHARSET_FROM_ID (preferred_charset_id);
-             if (! CHAR_CHARSET_P (c, charset))
-               charset = char_charset (c, charset_list, NULL);
+             if (CHAR_CHARSET_P (c, charset))
+               code = ENCODE_CHAR (charset, c);
+             else
+               charset = char_charset (c, charset_list, &code);
             }
           else
             charset = char_charset (c, charset_list, &code);
@@ -2687,6 +2819,7 @@ detect_coding_iso_2022 (coding, detect_info)
    int i;
    int rejected = 0;
    int found = 0;
+  int composition_count = -1;
  
    detect_info->checked |= CATEGORY_MASK_ISO;
  
@@ -2695,6 +2828,8 @@ detect_coding_iso_2022 (coding, detect_info)
        struct coding_system *this = &(coding_categories[i]);
        Lisp_Object attrs, val;
  
+      if (this->id < 0)
+       continue;
        attrs = CODING_ID_ATTRS (this->id);
        if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
           && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
@@ -2753,10 +2888,20 @@ detect_coding_iso_2022 (coding, detect_info)
               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
               break;
             }
+         else if (c == '1')
+           {
+             /* End of composition.  */
+             if (composition_count < 0
+                 || composition_count > MAX_COMPOSITION_COMPONENTS)
+               /* Invalid */
+               break;
+             composition_count = -1;
+             found |= CATEGORY_MASK_ISO;
+           }
           else if (c >= '0' && c <= '4')
             {
               /* ESC <Fp> for start/end composition.  */
-             found |= CATEGORY_MASK_ISO;
+             composition_count = 0;
               break;
             }
           else
@@ -2827,6 +2972,8 @@ detect_coding_iso_2022 (coding, detect_info)
             continue;
           if (c < 0x80)
             {
+             if (composition_count >= 0)
+               composition_count++;
               single_shifting = 0;
               break;
             }
@@ -2851,9 +2998,17 @@ detect_coding_iso_2022 (coding, detect_info)
                     }
  
                   if (i & 1 && src < src_end)
-                   rejected |= CATEGORY_MASK_ISO_8_2;
+                   {
+                     rejected |= CATEGORY_MASK_ISO_8_2;
+                     if (composition_count >= 0)
+                       composition_count += i;
+                   }
                   else
-                   found |= CATEGORY_MASK_ISO_8_2;
+                   {
+                     found |= CATEGORY_MASK_ISO_8_2;
+                     if (composition_count >= 0)
+                       composition_count += i / 2;
+                   }
                 }
               break;
             }
@@ -2970,6 +3125,8 @@ detect_coding_iso_2022 (coding, detect_info)
             break;                                                      \
         if (p == src_end - 1)                                           \
           {                                                             \
+           if (coding->mode & CODING_MODE_LAST_BLOCK)                  \
+             goto invalid_code;                                        \
             /* The current composition doesn't end in the current       \
                source.  */                                              \
             record_conversion_result                                    \
@@ -3099,7 +3256,11 @@ decode_coding_iso_2022 (coding)
        consumed_chars_base = consumed_chars;
  
        if (charbuf >= charbuf_end)
-       break;
+       {
+         if (byte_after_cr >= 0)
+           src_base--;
+         break;
+       }
  
        if (byte_after_cr >= 0)
         c1 = byte_after_cr, byte_after_cr = -1;
@@ -3117,10 +3278,15 @@ decode_coding_iso_2022 (coding)
               if (composition_state == COMPOSING_RULE
                   || composition_state == COMPOSING_COMPONENT_RULE)
                 {
-                 DECODE_COMPOSITION_RULE (c1);
-                 components[component_idx++] = c1;
-                 composition_state--;
-                 continue;
+                 if (component_idx < MAX_COMPOSITION_COMPONENTS * 2 + 1)
+                   {
+                     DECODE_COMPOSITION_RULE (c1);
+                     components[component_idx++] = c1;
+                     composition_state--;
+                     continue;
+                   }
+                 /* Too long composition.  */
+                 MAYBE_FINISH_COMPOSITION ();
                 }
             }
           if (charset_id_0 < 0
@@ -3137,10 +3303,14 @@ decode_coding_iso_2022 (coding)
               if (composition_state == COMPOSING_RULE
                   || composition_state == COMPOSING_COMPONENT_RULE)
                 {
-                 DECODE_COMPOSITION_RULE (c1);
-                 components[component_idx++] = c1;
-                 composition_state--;
-                 continue;
+                 if (component_idx < MAX_COMPOSITION_COMPONENTS * 2 + 1)
+                   {
+                     DECODE_COMPOSITION_RULE (c1);
+                     components[component_idx++] = c1;
+                     composition_state--;
+                     continue;
+                   }
+                 MAYBE_FINISH_COMPOSITION ();
                 }
             }
           if (charset_id_0 < 0)
@@ -3498,11 +3668,20 @@ decode_coding_iso_2022 (coding)
         }
        else
         {
-         components[component_idx++] = c;
-         if (method == COMPOSITION_WITH_RULE
-             || (method == COMPOSITION_WITH_RULE_ALTCHARS
-                 && composition_state == COMPOSING_COMPONENT_CHAR))
-           composition_state++;
+         if (component_idx < MAX_COMPOSITION_COMPONENTS * 2 + 1)
+           {
+             components[component_idx++] = c;
+             if (method == COMPOSITION_WITH_RULE
+                 || (method == COMPOSITION_WITH_RULE_ALTCHARS
+                     && composition_state == COMPOSING_COMPONENT_CHAR))
+               composition_state++;
+           }
+         else
+           {
+             MAYBE_FINISH_COMPOSITION ();
+             *charbuf++ = c;
+             char_offset++;
+           }
         }
        continue;
  
@@ -4255,7 +4434,11 @@ decode_coding_sjis (coding)
        consumed_chars_base = consumed_chars;
  
        if (charbuf >= charbuf_end)
-       break;
+       {
+         if (byte_after_cr >= 0)
+           src_base--;
+         break;
+       }
  
        if (byte_after_cr >= 0)
         c = byte_after_cr, byte_after_cr = -1;
@@ -4363,7 +4546,11 @@ decode_coding_big5 (coding)
        consumed_chars_base = consumed_chars;
  
        if (charbuf >= charbuf_end)
-       break;
+       {
+         if (byte_after_cr >= 0)
+           src_base--;
+         break;
+       }
  
        if (byte_after_cr >= 0)
         c = byte_after_cr, byte_after_cr = -1;
@@ -4873,7 +5060,6 @@ encode_coding_raw_text (coding)
                 *dst++ = CHAR_TO_BYTE8 (c);
               else
                 CHAR_STRING_ADVANCE (c, dst);
-             produced_chars++;
             }
         }
        else
@@ -4881,8 +5067,8 @@ encode_coding_raw_text (coding)
           ASSURE_DESTINATION (charbuf_end - charbuf);
           while (charbuf < charbuf_end && dst < dst_end)
             *dst++ = *charbuf++;
-         produced_chars = dst - (coding->destination + coding->dst_bytes);
         }
+      produced_chars = dst - (coding->destination + coding->produced);
      }
    record_conversion_result (coding, CODING_RESULT_SUCCESS);
    coding->produced_char += produced_chars;
@@ -4903,16 +5089,20 @@ detect_coding_charset (coding, detect_info)
    const unsigned char *src_end = coding->source + coding->src_bytes;
    int multibytep = coding->src_multibyte;
    int consumed_chars = 0;
-  Lisp_Object attrs, valids;
+  Lisp_Object attrs, valids, name;
    int found = 0;
    int head_ascii = coding->head_ascii;
+  int check_latin_extra = 0;
  
    detect_info->checked |= CATEGORY_MASK_CHARSET;
  
    coding = &coding_categories[coding_category_charset];
    attrs = CODING_ID_ATTRS (coding->id);
    valids = AREF (attrs, coding_attr_charset_valids);
-
+  name = CODING_ID_NAME (coding->id);
+  if (VECTORP (Vlatin_extra_code_table) /* Prefix test; 9 == strlen ("iso-8859-").  */
+      && strncmp ((char *) SDATA (SYMBOL_NAME (name)), "iso-8859-", 9) == 0)
+    check_latin_extra = 1;
    if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
      src += head_ascii;
  
@@ -4931,7 +5121,13 @@ detect_coding_charset (coding, detect_info)
        if (NILP (val))
         break;
        if (c >= 0x80)
-       found = CATEGORY_MASK_CHARSET;
+       {
+         if (c < 0xA0
+             && check_latin_extra
+             && NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
+           break;
+         found = CATEGORY_MASK_CHARSET;
+       }
        if (INTEGERP (val))
         {
           charset = CHARSET_FROM_ID (XFASTINT (val));
@@ -5019,7 +5215,11 @@ decode_coding_charset (coding)
        consumed_chars_base = consumed_chars;
  
        if (charbuf >= charbuf_end)
-       break;
+       {
+         if (byte_after_cr >= 0)
+           src_base--;
+         break;
+       }
  
        if (byte_after_cr >= 0)
         {
@@ -5037,7 +5237,7 @@ decode_coding_charset (coding)
        code = c;
  
        val = AREF (valids, c);
-      if (NILP (val))
+      if (! INTEGERP (val) && ! CONSP (val))
         goto invalid_code;
        if (INTEGERP (val))
         {
@@ -5273,18 +5473,24 @@ setup_coding_system (coding_system, coding)
      }
    else if (EQ (coding_type, Qutf_8))
      {
+      val = AREF (attrs, coding_attr_utf_bom);
+      CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
+                                  : EQ (val, Qt) ? utf_with_bom
+                                  : utf_without_bom);
        coding->detector = detect_coding_utf_8;
        coding->decoder = decode_coding_utf_8;
        coding->encoder = encode_coding_utf_8;
        coding->common_flags
         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
+      if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
+       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
      }
    else if (EQ (coding_type, Qutf_16))
      {
-      val = AREF (attrs, coding_attr_utf_16_bom);
-      CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
-                                   : EQ (val, Qt) ? utf_16_with_bom
-                                   : utf_16_without_bom);
+      val = AREF (attrs, coding_attr_utf_bom);
+      CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
+                                   : EQ (val, Qt) ? utf_with_bom
+                                   : utf_without_bom);
        val = AREF (attrs, coding_attr_utf_16_endian);
        CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
                                        : utf_16_little_endian);
@@ -5294,7 +5500,7 @@ setup_coding_system (coding_system, coding)
        coding->encoder = encode_coding_utf_16;
        coding->common_flags
         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
-      if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom)
+      if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
      }
    else if (EQ (coding_type, Qccl))
@@ -5389,6 +5595,39 @@ coding_charset_list (coding)
  }
  
  
+/* Return a list of charsets supported by CODING-SYSTEM.  */
+
+Lisp_Object
+coding_system_charset_list (coding_system)
+     Lisp_Object coding_system;
+{
+  int id;
+  Lisp_Object attrs, coding_type;
+
+  /* Fetch the attributes through the coding system's id.  */
+  CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
+  attrs = CODING_ID_ATTRS (id);
+  coding_type = CODING_ATTR_TYPE (attrs);
+
+  /* Every emacs-mule coding system answers with the shared list.  */
+  if (EQ (coding_type, Qemacs_mule))
+    return Vemacs_mule_charset_list;
+
+  /* An ISO-2022 coding system with the full-support flag set answers
+     with the shared ISO-2022 list instead of its own attribute.  */
+  if (EQ (coding_type, Qiso_2022))
+    {
+      int iso_flags = XINT (AREF (attrs, coding_attr_iso_flags));
+
+      if (iso_flags & CODING_ISO_FLAG_FULL_SUPPORT)
+        return Viso_2022_charset_list;
+    }
+
+  /* Otherwise report the charset list recorded in the attributes.  */
+  return CODING_ATTR_CHARSET_LIST (attrs);
+}
+
+
  /* Return raw-text or one of its subsidiaries that has the same
     eol_type as CODING-SYSTEM.  */
  
@@ -5613,16 +5852,26 @@ detect_eol (source, src_bytes, category)
                        || src[lsb + 2] != '\n')
                 this_eol = EOL_SEEN_CR;
               else
-               this_eol = EOL_SEEN_CRLF;
+               {
+                 this_eol = EOL_SEEN_CRLF;
+                 src += 2;
+               }
  
               if (eol_seen == EOL_SEEN_NONE)
                 /* This is the first end-of-line.  */
                 eol_seen = this_eol;
               else if (eol_seen != this_eol)
                 {
-                 /* The found type is different from what found before.  */
-                 eol_seen = EOL_SEEN_LF;
-                 break;
+                 /* The found type is different from what was found before.
+                    Allow for stray ^M characters in DOS EOL files.  */
+                 if (eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF
+                     || eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR)
+                   eol_seen = EOL_SEEN_CRLF;
+                 else
+                   {
+                     eol_seen = EOL_SEEN_LF;
+                     break;
+                   }
                 }
               if (++total == MAX_EOL_CHECK_COUNT)
                 break;
@@ -5651,9 +5900,16 @@ detect_eol (source, src_bytes, category)
                 eol_seen = this_eol;
               else if (eol_seen != this_eol)
                 {
-                 /* The found type is different from what found before.  */
-                 eol_seen = EOL_SEEN_LF;
-                 break;
+                 /* The found type is different from what was found before.
+                    Allow for stray ^M characters in DOS EOL files.  */
+                 if (eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF
+                     || eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR)
+                   eol_seen = EOL_SEEN_CRLF;
+                 else
+                   {
+                     eol_seen = EOL_SEEN_LF;
+                     break;
+                   }
                 }
               if (++total == MAX_EOL_CHECK_COUNT)
                 break;
@@ -5699,12 +5955,14 @@ detect_coding (coding)
       struct coding_system *coding;
  {
    const unsigned char *src, *src_end;
+  int saved_mode = coding->mode;
  
    coding->consumed = coding->consumed_char = 0;
    coding->produced = coding->produced_char = 0;
    coding_set_source (coding);
  
    src_end = coding->source + coding->src_bytes;
+  coding->head_ascii = 0;
  
    /* If we have not yet decided the text encoding type, detect it
       now.  */
@@ -5715,15 +5973,12 @@ detect_coding (coding)
        int null_byte_found = 0, eight_bit_found = 0;
  
        detect_info.checked = detect_info.found = detect_info.rejected = 0;
-      coding->head_ascii = -1;
        for (src = coding->source; src < src_end; src++)
         {
           c = *src;
           if (c & 0x80)
             {
               eight_bit_found = 1;
-             if (coding->head_ascii < 0)
-               coding->head_ascii = src - coding->source;
               if (null_byte_found)
                 break;
             }
@@ -5733,29 +5988,34 @@ detect_coding (coding)
                   && ! inhibit_iso_escape_detection
                   && ! detect_info.checked)
                 {
-                 if (coding->head_ascii < 0)
-                   coding->head_ascii = src - coding->source;
                   if (detect_coding_iso_2022 (coding, &detect_info))
                     {
                       /* We have scanned the whole data.  */
                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
-                       /* We didn't find an 8-bit code.  We may have
-                          found a null-byte, but it's very rare that
-                          a binary file confirm to ISO-2022.  */
-                       src = src_end;
+                       {
+                         /* We didn't find an 8-bit code.  We may
+                            have found a null-byte, but it's very
+                            rare that a binary file conforms to
+                            ISO-2022.  */
+                         src = src_end;
+                         coding->head_ascii = src - coding->source;
+                       }
+                     detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
                       break;
                     }
                 }
-             else if (! c)
+             else if (! c && !inhibit_null_byte_detection)
                 {
                   null_byte_found = 1;
                   if (eight_bit_found)
                     break;
                 }
+             if (! eight_bit_found)
+               coding->head_ascii++;
             }
+         else if (! eight_bit_found)
+           coding->head_ascii++;
         }
-      if (coding->head_ascii < 0)
-       coding->head_ascii = src - coding->source;
  
        if (null_byte_found || eight_bit_found
           || coding->head_ascii < coding->src_bytes
@@ -5809,23 +6069,42 @@ detect_coding (coding)
                       break;
                     }
                 }
-
-             if (i < coding_category_raw_text)
-               setup_coding_system (CODING_ID_NAME (this->id), coding);
-             else if (null_byte_found)
-               setup_coding_system (Qno_conversion, coding);
-             else if ((detect_info.rejected & CATEGORY_MASK_ANY)
-                      == CATEGORY_MASK_ANY)
-               setup_coding_system (Qraw_text, coding);
-             else if (detect_info.rejected)
-               for (i = 0; i < coding_category_raw_text; i++)
-                 if (! (detect_info.rejected & (1 << coding_priorities[i])))
-                   {
-                     this = coding_categories + coding_priorities[i];
-                     setup_coding_system (CODING_ID_NAME (this->id), coding);
-                     break;
-                   }
             }
+
+         if (i < coding_category_raw_text)
+           setup_coding_system (CODING_ID_NAME (this->id), coding);
+         else if (null_byte_found)
+           setup_coding_system (Qno_conversion, coding);
+         else if ((detect_info.rejected & CATEGORY_MASK_ANY)
+                  == CATEGORY_MASK_ANY)
+           setup_coding_system (Qraw_text, coding);
+         else if (detect_info.rejected)
+           for (i = 0; i < coding_category_raw_text; i++)
+             if (! (detect_info.rejected & (1 << coding_priorities[i])))
+               {
+                 this = coding_categories + coding_priorities[i];
+                 setup_coding_system (CODING_ID_NAME (this->id), coding);
+                 break;
+               }
+       }
+    }
+  else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
+          == coding_category_utf_8_auto)
+    {
+      Lisp_Object coding_systems;
+      struct coding_detection_info detect_info;
+
+      coding_systems
+       = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
+      detect_info.found = detect_info.rejected = 0;
+      coding->head_ascii = 0;
+      if (CONSP (coding_systems)
+         && detect_coding_utf_8 (coding, &detect_info))
+       {
+         if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
+           setup_coding_system (XCAR (coding_systems), coding);
+         else
+           setup_coding_system (XCDR (coding_systems), coding);
         }
      }
    else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
@@ -5835,8 +6114,9 @@ detect_coding (coding)
        struct coding_detection_info detect_info;
  
        coding_systems
-       = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom);
+       = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
        detect_info.found = detect_info.rejected = 0;
+      coding->head_ascii = 0;
        if (CONSP (coding_systems)
           && detect_coding_utf_16 (coding, &detect_info))
         {
@@ -5846,6 +6126,7 @@ detect_coding (coding)
             setup_coding_system (XCDR (coding_systems), coding);
         }
      }
+  coding->mode = saved_mode;
  }
  
  
@@ -5885,7 +6166,12 @@ decode_eol (coding)
                 eol_seen |= EOL_SEEN_CR;
             }
         }
-      if (eol_seen != EOL_SEEN_NONE
+      /* Handle DOS-style EOLs in a file with stray ^M characters.  */
+      if ((eol_seen & EOL_SEEN_CRLF) != 0
+         && (eol_seen & EOL_SEEN_CR) != 0
+         && (eol_seen & EOL_SEEN_LF) == 0)
+       eol_seen = EOL_SEEN_CRLF;
+      else if (eol_seen != EOL_SEEN_NONE
           && eol_seen != EOL_SEEN_LF
           && eol_seen != EOL_SEEN_CRLF
           && eol_seen != EOL_SEEN_CR)
@@ -6178,7 +6464,7 @@ produce_chars (coding, translation_table, last_block)
           if (coding->src_multibyte)
             {
               int multibytep = 1;
-             EMACS_INT consumed_chars;
+             EMACS_INT consumed_chars = 0;
  
               while (1)
                 {
@@ -6342,7 +6628,7 @@ produce_charset (coding, charbuf, pos)
  
  #define ALLOC_CONVERSION_WORK_AREA(coding)                             \
    do {                                                                 \
-    int size = CHARBUF_SIZE;;                                          \
+    int size = CHARBUF_SIZE;                                           \
                                                                         \
      coding->charbuf = NULL;                                            \
      while (size > 1024)                                                        \
@@ -6500,6 +6786,8 @@ decode_coding (coding)
              that the number of data is less than the size of
              coding->charbuf.  */
           coding->charbuf_used = 0;
+         coding->chars_at_source = 0;
+
           while (nbytes-- > 0)
             {
               int c = *src++;
@@ -6871,13 +7159,17 @@ make_conversion_work_buffer (multibyte)
      }
    else
      {
-      name = Vcode_conversion_workbuf_name;
-      workbuf = Fget_buffer_create (name);
-      if (NILP (Vcode_conversion_reused_workbuf))
-       Vcode_conversion_reused_workbuf = workbuf;
+      if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
+       Vcode_conversion_reused_workbuf
+         = Fget_buffer_create (Vcode_conversion_workbuf_name);
+      workbuf = Vcode_conversion_reused_workbuf;
      }
    current = current_buffer;
    set_buffer_internal (XBUFFER (workbuf));
+  /* We can't allow modification hooks to run in the work buffer.  For
+     instance, directory_files_internal assumes that file decoding
+     doesn't compile new regexps.  */
+  Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
    Ferase_buffer ();
    current_buffer->undo_list = Qt;
    current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
@@ -7332,8 +7624,13 @@ encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
         }
        else
         {
-         coding->dst_pos = BUF_PT (XBUFFER (dst_object));
-         coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
+         struct buffer *current = current_buffer;
+
+         set_buffer_temp (XBUFFER (dst_object));
+         coding->dst_pos = PT;
+         coding->dst_pos_byte = PT_BYTE;
+         move_gap_both (coding->dst_pos, coding->dst_pos_byte);
+         set_buffer_temp (current);
         }
        coding->dst_multibyte
         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
@@ -7434,14 +7731,14 @@ DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
         doc: /* Return t if OBJECT is nil or a coding-system.
  See the documentation of `define-coding-system' for information
  about coding-system objects.  */)
-     (obj)
-     Lisp_Object obj;
+     (object)
+     Lisp_Object object;
  {
-  if (NILP (obj)
-      || CODING_SYSTEM_ID (obj) >= 0)
+  if (NILP (object)
+      || CODING_SYSTEM_ID (object) >= 0)
      return Qt;
-  if (! SYMBOLP (obj)
-      || NILP (Fget (obj, Qcoding_system_define_form)))
+  if (! SYMBOLP (object)
+      || NILP (Fget (object, Qcoding_system_define_form)))
      return Qnil;
    return Qt;
  }
@@ -7533,7 +7830,7 @@ detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
  {
    const unsigned char *src_end = src + src_bytes;
    Lisp_Object attrs, eol_type;
-  Lisp_Object val;
+  Lisp_Object val = Qnil;
    struct coding_system coding;
    int id;
    struct coding_detection_info detect_info;
@@ -7553,6 +7850,7 @@ detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
    coding.src_multibyte = multibytep;
    coding.consumed = 0;
    coding.mode |= CODING_MODE_LAST_BLOCK;
+  coding.head_ascii = 0;
  
    detect_info.checked = detect_info.found = detect_info.rejected = 0;
  
@@ -7564,7 +7862,6 @@ detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
        struct coding_system *this;
        int c, i;
  
-      coding.head_ascii = -1;
        /* Skip all ASCII bytes except for a few ISO2022 controls.  */
        for (; src < src_end; src++)
         {
@@ -7572,40 +7869,43 @@ detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
           if (c & 0x80)
             {
               eight_bit_found = 1;
-             if (coding.head_ascii < 0)
-               coding.head_ascii = src - coding.source;
               if (null_byte_found)
                 break;
             }
-         if (c < 0x20)
+         else if (c < 0x20)
             {
               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
                   && ! inhibit_iso_escape_detection
                   && ! detect_info.checked)
                 {
-                 if (coding.head_ascii < 0)
-                   coding.head_ascii = src - coding.source;
                   if (detect_coding_iso_2022 (&coding, &detect_info))
                     {
                       /* We have scanned the whole data.  */
                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
-                       /* We didn't find an 8-bit code.  We may have
-                          found a null-byte, but it's very rare that
-                          a binary file confirm to ISO-2022.  */
-                       src = src_end;
+                       {
+                         /* We didn't find an 8-bit code.  We may
+                            have found a null-byte, but it's very
+                            rare that a binary file conforms to
+                            ISO-2022.  */
+                         src = src_end;
+                         coding.head_ascii = src - coding.source;
+                       }
+                     detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
                       break;
                     }
                 }
-             else if (! c)
+             else if (! c && !inhibit_null_byte_detection)
                 {
                   null_byte_found = 1;
                   if (eight_bit_found)
                     break;
                 }
+             if (! eight_bit_found)
+               coding.head_ascii++;
             }
+         else if (! eight_bit_found)
+           coding.head_ascii++;
         }
-      if (coding.head_ascii < 0)
-       coding.head_ascii = src - coding.source;
  
        if (null_byte_found || eight_bit_found
           || coding.head_ascii < coding.src_bytes
@@ -7662,10 +7962,11 @@ detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
             }
         }
  
-      if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY)
+      if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
+         || null_byte_found)
         {
           detect_info.found = CATEGORY_MASK_RAW_TEXT;
-         id = coding_categories[coding_category_raw_text].id;
+         id = CODING_SYSTEM_ID (Qno_conversion);
           val = Fcons (make_number (id), Qnil);
         }
        else if (! detect_info.rejected && ! detect_info.found)
@@ -7695,7 +7996,6 @@ detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
         {
           int mask = detect_info.rejected | detect_info.found;
           int found = 0;
-         val = Qnil;
  
           for (i = coding_category_raw_text - 1; i >= 0; i--)
             {
@@ -7720,6 +8020,19 @@ detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
           detect_info.found |= found;
         }
      }
+  else if (base_category == coding_category_utf_8_auto)
+    {
+      if (detect_coding_utf_8 (&coding, &detect_info))
+       {
+         struct coding_system *this;
+
+         if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
+           this = coding_categories + coding_category_utf_8_sig;
+         else
+           this = coding_categories + coding_category_utf_8_nosig;
+         val = Fcons (make_number (this->id), Qnil);
+       }
+    }
    else if (base_category == coding_category_utf_16_auto)
      {
        if (detect_coding_utf_16 (&coding, &detect_info))
@@ -7745,7 +8058,7 @@ detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
  
    /* Then, detect eol-format if necessary.  */
    {
-    int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
+    int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
      Lisp_Object tail;
  
      if (VECTORP (eol_type))
@@ -7811,7 +8124,7 @@ detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
        }
    }
  
-  return (highest ? XCAR (val) : val);
+  return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
  }
  
  
@@ -7821,9 +8134,9 @@ DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
  Return a list of possible coding systems ordered by priority.
  
  If only ASCII characters are found (except for such ISO-2022 control
-characters ISO-2022 as ESC), it returns a list of single element
-`undecided' or its subsidiary coding system according to a detected
-end-of-line format.
+characters as ESC), it returns a list of single element `undecided'
+or its subsidiary coding system according to a detected end-of-line
+format.
  
  If optional argument HIGHEST is non-nil, return the coding system of
  highest priority.  */)
@@ -7858,9 +8171,9 @@ DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
  Return a list of possible coding systems ordered by priority.
  
  If only ASCII characters are found (except for such ISO-2022 control
-characters ISO-2022 as ESC), it returns a list of single element
-`undecided' or its subsidiary coding system according to a detected
-end-of-line format.
+characters as ESC), it returns a list of single element `undecided'
+or its subsidiary coding system according to a detected end-of-line
+format.
  
  If optional argument HIGHEST is non-nil, return the coding system of
  highest priority.  */)
@@ -8028,7 +8341,7 @@ DEFUN ("unencodable-char-position", Funencodable_char_position,
         Sunencodable_char_position, 3, 5, 0,
         doc: /*
  Return position of first un-encodable character in a region.
-START and END specfiy the region and CODING-SYSTEM specifies the
+START and END specify the region and CODING-SYSTEM specifies the
  encoding to check.  Return nil if CODING-SYSTEM does encode the region.
  
  If optional 4th argument COUNT is non-nil, it specifies at most how
@@ -8141,7 +8454,7 @@ START and END are buffer positions specifying the region.
  CODING-SYSTEM-LIST is a list of coding systems to check.
  
  The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
-CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
+CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
  whole region, POS0, POS1, ... are buffer positions where non-encodable
  characters are found.
  
@@ -8150,7 +8463,10 @@ value is nil.
  
  START may be a string.  In that case, check if the string is
  encodable, and the value contains indices to the string instead of
-buffer positions.  END is ignored.  */)
+buffer positions.  END is ignored.
+
+If the current buffer (or START if it is a string) is unibyte, the value
+is nil.  */)
       (start, end, coding_system_list)
       Lisp_Object start, end, coding_system_list;
  {
@@ -8164,7 +8480,7 @@ buffer positions.  END is ignored.  */)
    if (STRINGP (start))
      {
        if (!STRING_MULTIBYTE (start)
-         && SCHARS (start) != SBYTES (start))
+         || SCHARS (start) == SBYTES (start))
         return Qnil;
        start_byte = 0;
        end_byte = SBYTES (start);
@@ -8181,7 +8497,7 @@ buffer positions.  END is ignored.  */)
        start_byte = CHAR_TO_BYTE (XINT (start));
        end_byte = CHAR_TO_BYTE (XINT (end));
        if (XINT (end) - XINT (start) == end_byte - start_byte)
-       return Qt;
+       return Qnil;
  
        if (XINT (start) < GPT && XINT (end) > GPT)
         {
@@ -8310,13 +8626,14 @@ START and END are buffer positions.
  
  Optional 4th arguments DESTINATION specifies where the decoded text goes.
  If nil, the region between START and END is replaced by the decoded text.
-If buffer, the decoded text is inserted in the buffer.
-If t, the decoded text is returned.
+If buffer, the decoded text is inserted in that buffer after point (point
+does not move).
+In those cases, the length of the decoded text is returned.
+If DESTINATION is t, the decoded text is returned.
  
  This function sets `last-coding-system-used' to the precise coding system
  used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
-not fully specified.)
-It returns the length of the decoded text.  */)
+not fully specified.)  */)
       (start, end, coding_system, destination)
       Lisp_Object start, end, coding_system, destination;
  {
@@ -8326,18 +8643,20 @@ It returns the length of the decoded text.  */)
  DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
         3, 4, "r\nzCoding system: ",
         doc: /* Encode the current region by specified coding system.
-When called from a program, takes three arguments:
-START, END, and CODING-SYSTEM.  START and END are buffer positions.
+When called from a program, takes four arguments:
+        START, END, CODING-SYSTEM and DESTINATION.
+START and END are buffer positions.
  
  Optional 4th arguments DESTINATION specifies where the encoded text goes.
  If nil, the region between START and END is replace by the encoded text.
-If buffer, the encoded text is inserted in the buffer.
-If t, the encoded text is returned.
+If buffer, the encoded text is inserted in that buffer after point (point
+does not move).
+In those cases, the length of the encoded text is returned.
+If DESTINATION is t, the encoded text is returned.
  
  This function sets `last-coding-system-used' to the precise coding system
  used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
-not fully specified.)
-It returns the length of the encoded text.  */)
+not fully specified.)  */)
    (start, end, coding_system, destination)
       Lisp_Object start, end, coding_system, destination;
  {
@@ -8410,13 +8729,13 @@ DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
  Optional third arg NOCOPY non-nil means it is OK to return STRING itself
  if the decoding operation is trivial.
  
-Optional fourth arg BUFFER non-nil meant that the decoded text is
-inserted in BUFFER instead of returned as a string.  In this case,
-the return value is BUFFER.
+Optional fourth arg BUFFER non-nil means that the decoded text is
+inserted in that buffer after point (point does not move).  In this
+case, the return value is the length of the decoded text.
  
  This function sets `last-coding-system-used' to the precise coding system
  used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
-not fully specified.  */)
+not fully specified.)  */)
    (string, coding_system, nocopy, buffer)
       Lisp_Object string, coding_system, nocopy, buffer;
  {
@@ -8431,9 +8750,9 @@ DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
  Optional third arg NOCOPY non-nil means it is OK to return STRING
  itself if the encoding operation is trivial.
  
-Optional fourth arg BUFFER non-nil meant that the encoded text is
-inserted in BUFFER instead of returned as a string.  In this case,
-the return value is BUFFER.
+Optional fourth arg BUFFER non-nil means that the encoded text is
+inserted in that buffer after point (point does not move).  In this
+case, the return value is the length of the encoded text.
  
  This function sets `last-coding-system-used' to the precise coding system
  used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
@@ -8691,9 +9010,9 @@ whichever argument specifies the file name is TARGET.
  TARGET has a meaning which depends on OPERATION:
    For file I/O, TARGET is a file name (except for the special case below).
    For process I/O, TARGET is a process name.
-  For network I/O, TARGET is a service name or a port number
+  For network I/O, TARGET is a service name or a port number.
  
-This function looks up what specified for TARGET in,
+This function looks up what is specified for TARGET in
  `file-coding-system-alist', `process-coding-system-alist',
  or `network-coding-system-alist' depending on OPERATION.
  They may specify a coding system, a cons of coding systems,
@@ -8785,10 +9104,10 @@ usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
  DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
         Sset_coding_system_priority, 0, MANY, 0,
         doc: /* Assign higher priority to the coding systems given as arguments.
-If multiple coding systems belongs to the same category,
+If multiple coding systems belong to the same category,
  all but the first one are ignored.
  
-usage: (set-coding-system-priority ...)  */)
+usage: (set-coding-system-priority &rest coding-systems)  */)
       (nargs, args)
       int nargs;
       Lisp_Object *args;
@@ -9149,7 +9468,7 @@ usage: (define-coding-system-internal ...)  */)
           val = XCDR (bom);
           CHECK_CODING_SYSTEM (val);
         }
-      ASET (attrs, coding_attr_utf_16_bom, bom);
+      ASET (attrs, coding_attr_utf_bom, bom);
  
        endian = args[coding_arg_utf16_endian];
        CHECK_SYMBOL (endian);
@@ -9328,8 +9647,27 @@ usage: (define-coding-system-internal ...)  */)
      }
    else if (EQ (coding_type, Qutf_8))
      {
-      category = coding_category_utf_8;
+      Lisp_Object bom;
+
        CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
+
+      if (nargs < coding_arg_utf8_max)
+       goto short_args;
+
+      bom = args[coding_arg_utf8_bom];
+      if (! NILP (bom) && ! EQ (bom, Qt))
+       {
+         CHECK_CONS (bom);
+         val = XCAR (bom);
+         CHECK_CODING_SYSTEM (val);
+         val = XCDR (bom);
+         CHECK_CODING_SYSTEM (val);
+       }
+      ASET (attrs, coding_attr_utf_bom, bom);
+
+      category = (CONSP (bom) ? coding_category_utf_8_auto
+                 : NILP (bom) ? coding_category_utf_8_nosig
+                 : coding_category_utf_8_sig);
      }
    else if (EQ (coding_type, Qundecided))
      category = coding_category_undecided;
@@ -9421,7 +9759,7 @@ DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
         CHECK_CHARACTER (val);
        CODING_ATTR_MNEMONIC (attrs) = val;
      }
-  else if (EQ (prop, QCdefalut_char))
+  else if (EQ (prop, QCdefault_char))
      {
        if (NILP (val))
         val = make_number (' ');
@@ -9473,7 +9811,7 @@ DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
    CHECK_SYMBOL (alias);
    CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
    aliases = AREF (spec, 1);
-  /* ALISES should be a list of length more than zero, and the first
+  /* ALIASES should be a list of length more than zero, and the first
       element is a base coding system.  Append ALIAS at the tail of the
       list.  */
    while (!NILP (XCDR (aliases)))
@@ -9551,7 +9889,7 @@ DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
  DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
         Scoding_system_eol_type, 1, 1, 0,
         doc: /* Return eol-type of CODING-SYSTEM.
-An eol-type is integer 0, 1, 2, or a vector of coding systems.
+An eol-type is an integer 0, 1, 2, or a vector of coding systems.
  
  Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
  and CR respectively.
@@ -9727,7 +10065,7 @@ syms_of_coding ()
  
    DEFSYM (QCcategory, ":category");
    DEFSYM (QCmnemonic, ":mnemonic");
-  DEFSYM (QCdefalut_char, ":default-char");
+  DEFSYM (QCdefault_char, ":default-char");
    DEFSYM (QCdecode_translation_table, ":decode-translation-table");
    DEFSYM (QCencode_translation_table, ":encode-translation-table");
    DEFSYM (QCpost_read_conversion, ":post-read-conversion");
@@ -9750,8 +10088,12 @@ syms_of_coding ()
         intern ("coding-category-iso-7-else"));
    ASET (Vcoding_category_table, coding_category_iso_8_else,
         intern ("coding-category-iso-8-else"));
-  ASET (Vcoding_category_table, coding_category_utf_8,
+  ASET (Vcoding_category_table, coding_category_utf_8_auto,
+       intern ("coding-category-utf-8-auto"));
+  ASET (Vcoding_category_table, coding_category_utf_8_nosig,
         intern ("coding-category-utf-8"));
+  ASET (Vcoding_category_table, coding_category_utf_8_sig,
+       intern ("coding-category-utf-8-sig"));
    ASET (Vcoding_category_table, coding_category_utf_16_be,
         intern ("coding-category-utf-16-be"));
    ASET (Vcoding_category_table, coding_category_utf_16_auto,
@@ -9829,7 +10171,7 @@ updated by the functions `define-coding-system' and
    DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
                doc: /* Alist of coding system names.
  Each element is one element list of coding system name.
-This variable is given to `completing-read' as TABLE argument.
+This variable is given to `completing-read' as COLLECTION argument.
  
  Do not alter the value of this variable manually.  This variable should be
  updated by the functions `make-coding-system' and
@@ -9859,8 +10201,8 @@ Don't modify this variable directly, but use `set-coding-priority'.  */);
                doc: /* Specify the coding system for read operations.
  It is useful to bind this variable with `let', but do not set it globally.
  If the value is a coding system, it is used for decoding on read operation.
-If not, an appropriate element is used from one of the coding system alists:
-There are three such tables, `file-coding-system-alist',
+If not, an appropriate element is used from one of the coding system alists.
+There are three such tables: `file-coding-system-alist',
  `process-coding-system-alist', and `network-coding-system-alist'.  */);
    Vcoding_system_for_read = Qnil;
  
@@ -9871,8 +10213,8 @@ If the value is a coding system, it is used for encoding of output,
  when writing it to a file and when sending it to a file or subprocess.
  
  If this does not specify a coding system, an appropriate element
-is used from one of the coding system alists:
-There are three such tables, `file-coding-system-alist',
+is used from one of the coding system alists.
+There are three such tables: `file-coding-system-alist',
  `process-coding-system-alist', and `network-coding-system-alist'.
  For output to files, if the above procedure does not specify a coding system,
  the value of `buffer-file-coding-system' is used.  */);
@@ -10032,7 +10374,7 @@ If Nth element is non-nil, the existence of code N in a file
  a coding system of ISO 2022 variant which has a flag
  `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
  or reading output of a subprocess.
-Only 128th through 159th elements has a meaning.  */);
+Only 128th through 159th elements have a meaning.  */);
    Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
  
    DEFVAR_LISP ("select-safe-coding-system-function",
@@ -10061,18 +10403,18 @@ called even if `coding-system-for-write' is non-nil.  The command
    DEFVAR_BOOL ("inhibit-iso-escape-detection",
                &inhibit_iso_escape_detection,
                doc: /*
-If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
+If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
  
-By default, on reading a file, Emacs tries to detect how the text is
-encoded.  This code detection is sensitive to escape sequences.  If
-the sequence is valid as ISO2022, the code is determined as one of
-the ISO2022 encodings, and the file is decoded by the corresponding
-coding system (e.g. `iso-2022-7bit').
+When Emacs reads text, it tries to detect how the text is encoded.
+This code detection is sensitive to escape sequences.  If Emacs sees
+a valid ISO-2022 escape sequence, it assumes the text is encoded in one
+of the ISO-2022 encodings, and decodes text by the corresponding coding
+system (e.g. `iso-2022-7bit').
  
  However, there may be a case that you want to read escape sequences in
  a file as is.  In such a case, you can set this variable to non-nil.
-Then, as the code detection ignores any escape sequences, no file is
-detected as encoded in some ISO2022 encoding.  The result is that all
+Then the code detection will ignore any escape sequences, and no text is
+detected as encoded in some ISO-2022 encoding.  The result is that all
  escape sequences become visible in a buffer.
  
  The default value is nil, and it is strongly recommended not to change
@@ -10082,14 +10424,31 @@ in Emacs's distribution, and they won't be decoded correctly on
  reading if you suppress escape sequence detection.
  
  The other way to read escape sequences in a file without decoding is
-to explicitly specify some coding system that doesn't use ISO2022's
+to explicitly specify some coding system that doesn't use ISO-2022
  escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
    inhibit_iso_escape_detection = 0;
  
+  DEFVAR_BOOL ("inhibit-null-byte-detection",
+              &inhibit_null_byte_detection,
+              doc: /* If non-nil, Emacs ignores null bytes on code detection.
+By default, Emacs treats text containing null bytes as binary data
+and does not attempt to decode it.  The effect is as if you specified
+`no-conversion' for reading that text.
+
+Set this to non-nil when a regular text happens to include null bytes.
+Examples are Index nodes of Info files and null-byte delimited output
+from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
+decode text as usual.  */);
+  inhibit_null_byte_detection = 0;
+
    DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
                doc: /* Char table for translating self-inserting characters.
-This is applied to the result of input methods, not their input.  See also
-`keyboard-translate-table'.  */);
+This is applied to the result of input methods, not their input.
+See also `keyboard-translate-table'.
+
+Use of this variable for character code unification was rendered
+obsolete in Emacs 23.1 and later, since Unicode is now the basis of
+internal character representation.  */);
      Vtranslation_table_for_input = Qnil;
  
    {