(define-charset): New args :min-code and :max-code.

[bpt/emacs.git] / src / coding.c
diff --git a/src/coding.c b/src/coding.c

index 8067df7..1fc59a0 100644 (file)
--- a/src/coding.c
+++ b/src/coding.c
@@ -46,8 +46,8 @@ Boston, MA 02111-1307, USA.  */
  
  CODING SYSTEM
  
-  Coding system is an object for a encoding mechanism that contains
-  information about how to convert byte sequence to character
+  A coding system is an object for an encoding mechanism that contains
+  information about how to convert byte sequences to character
    sequences and vice versa.  When we say "decode", it means converting
    a byte sequence of a specific coding system into a character
    sequence that is represented by Emacs' internal coding system
@@ -57,12 +57,12 @@ CODING SYSTEM
  
    In Emacs Lisp, a coding system is represented by a Lisp symbol.  In
    C level, a coding system is represented by a vector of attributes
-  stored in the hash table Vcharset_hash_table.  The conversion from a
+  stored in the hash table Vcharset_hash_table.  The conversion from
    coding system symbol to attributes vector is done by looking up
    Vcharset_hash_table by the symbol.
  
    Coding systems are classified into the following types depending on
-  the mechanism of encoding.  Here's a brief descrition about type.
+  the encoding mechanism.  Here's a brief description of the types.
  
    o UTF-8
  
@@ -71,12 +71,12 @@ CODING SYSTEM
    o Charset-base coding system
  
    A coding system defined by one or more (coded) character sets.
-  Decoding and encoding are done by code converter defined for each
+  Decoding and encoding are done by a code converter defined for each
    character set.
  
-  o Old Emacs' internal format (emacs-mule)
+  o Old Emacs internal format (emacs-mule)
  
-  The coding system adopted by an old versions of Emacs (20 and 21).
+  The coding system adopted by old versions of Emacs (20 and 21).
  
    o ISO2022-base coding system
  
@@ -101,7 +101,7 @@ CODING SYSTEM
  
    o CCL
  
-  If a user wants to decode/encode a text encoded in a coding system
+  If a user wants to decode/encode text encoded in a coding system
    not listed above, he can supply a decoder and an encoder for it in
    CCL (Code Conversion Language) programs.  Emacs executes the CCL
    program while decoding/encoding.
@@ -109,7 +109,7 @@ CODING SYSTEM
    o Raw-text
  
    A coding system for a text containing raw eight-bit data.  Emacs
-  treat each byte of source text as a character (except for
+  treats each byte of source text as a character (except for
    end-of-line conversion).
  
    o No-conversion
@@ -119,13 +119,13 @@ CODING SYSTEM
  
  END-OF-LINE FORMAT
  
-  How end-of-line of a text is encoded depends on a system.  For
+  How text end-of-line is encoded depends on the operating system.  For
    instance, Unix's format is just one byte of LF (line-feed) code,
    whereas DOS's format is two-byte sequence of `carriage-return' and
    `line-feed' codes.  MacOS's format is usually one byte of
    `carriage-return'.
  
-  Since text characters encoding and end-of-line encoding are
+  Since text character encoding and end-of-line encoding are
    independent, any coding system described above can take any format
    of end-of-line (except for no-conversion).
  
@@ -134,7 +134,7 @@ STRUCT CODING_SYSTEM
    Before using a coding system for code conversion (i.e. decoding and
    encoding), we setup a structure of type `struct coding_system'.
    This structure keeps various information about a specific code
-  conversion (e.g.  the location of source and destination data).
+  conversion (e.g. the location of source and destination data).
  
  */
  
@@ -303,7 +303,8 @@ encode_coding_XXX (coding)
  Lisp_Object Vcoding_system_hash_table;
  
  Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
-Lisp_Object Qunix, Qdos, Qmac;
+Lisp_Object Qunix, Qdos;
+extern Lisp_Object Qmac;       /* frame.c */
  Lisp_Object Qbuffer_file_coding_system;
  Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
  Lisp_Object Qdefault_char;
@@ -402,10 +403,6 @@ Lisp_Object Vdefault_process_coding_system;
     to avoid infinite recursive call.  */
  static int inhibit_pre_post_conversion;
  
-/* Char-table containing safe coding systems of each character.  */
-Lisp_Object Vchar_coding_system_table;
-Lisp_Object Qchar_coding_system;
-
  /* Two special coding systems.  */
  Lisp_Object Vsjis_coding_system;
  Lisp_Object Vbig5_coding_system;
@@ -768,6 +765,7 @@ static int detected_mask[coding_category_raw_text] =
           error ("Undecodable char found");     \
         c = ((c & 1) << 6) | *src++;            \
        }                                                \
+    consumed_chars++;                          \
    } while (0)
  
  
@@ -897,7 +895,6 @@ coding_set_source (coding)
        else
         {
           struct buffer *buf = XBUFFER (coding->src_object);
-         EMACS_INT beg_byte = BUF_BEG_BYTE (buf);
           EMACS_INT gpt_byte = BUF_GPT_BYTE (buf);
           unsigned char *beg_addr = BUF_BEG_ADDR (buf);
  
@@ -1081,6 +1078,7 @@ detect_coding_utf_8 (coding, mask)
  }
  
  
+/* Fixme: deal with surrogates?  */
  static void
  decode_coding_utf_8 (coding)
       struct coding_system *coding;
@@ -1129,23 +1127,38 @@ decode_coding_utf_8 (coding)
           if (! UTF_8_EXTRA_OCTET_P (c2))
             goto invalid_code;
           if (UTF_8_2_OCTET_LEADING_P (c1))
-           c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
+           {
+             c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
+             /* Reject overlong sequences here and below.  Encoders
+                that produce them are incorrect; such sequences can be
+                misleading and break read/write invariance.  */
+             if (c < 128)
+               goto invalid_code;
+           }
           else
             {
               ONE_MORE_BYTE (c3);
               if (! UTF_8_EXTRA_OCTET_P (c3))
                 goto invalid_code;
               if (UTF_8_3_OCTET_LEADING_P (c1))
-               c = (((c1 & 0xF) << 12)
-                    | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
+               {
+                 c = (((c1 & 0xF) << 12)
+                      | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
+                 if (c < 0x800)
+                   goto invalid_code;
+               }
               else
                 {
                   ONE_MORE_BYTE (c4);
                   if (! UTF_8_EXTRA_OCTET_P (c4))
                     goto invalid_code;
                   if (UTF_8_4_OCTET_LEADING_P (c1))
+                   {
                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
+                   if (c < 0x10000)
+                     goto invalid_code;
+                   }
                   else
                     {
                       ONE_MORE_BYTE (c5);
@@ -1156,7 +1169,7 @@ decode_coding_utf_8 (coding)
                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
                                | (c5 & 0x3F));
-                         if (c > MAX_CHAR)
+                         if ((c > MAX_CHAR) || (c < 0x200000))
                             goto invalid_code;
                         }
                       else
@@ -1528,12 +1541,11 @@ char emacs_mule_bytes[256];
  
  
  int
-emacs_mule_char (coding, composition, nbytes, nchars)
+emacs_mule_char (coding, src, nbytes, nchars)
       struct coding_system *coding;
-     int composition;
+     unsigned char *src;
       int *nbytes, *nchars;
  {
-  unsigned char *src = coding->source + coding->consumed;
    unsigned char *src_end = coding->source + coding->src_bytes;
    int multibytep = coding->src_multibyte;
    unsigned char *src_base = src;
@@ -1543,20 +1555,6 @@ emacs_mule_char (coding, composition, nbytes, nchars)
    int consumed_chars = 0;
  
    ONE_MORE_BYTE (c);
-  if (composition)
-    {
-      c -= 0x20;
-      if (c == 0x80)
-       {
-         ONE_MORE_BYTE (c);
-         if (c < 0xA0)
-           goto invalid_code;
-         *nbytes = src - src_base;
-         *nchars = consumed_chars;
-         return (c - 0x80);
-       }
-    }
-
    switch (emacs_mule_bytes[c])
      {
      case 2:
@@ -1581,17 +1579,18 @@ emacs_mule_char (coding, composition, nbytes, nchars)
           if (! (charset = emacs_mule_charset[c]))
             goto invalid_code;
           ONE_MORE_BYTE (c);
-         code = (c & 0x7F) << 7;
+         code = (c & 0x7F) << 8;
           ONE_MORE_BYTE (c);
           code |= c & 0x7F;
         }
        break;
  
      case 4:
+      ONE_MORE_BYTE (c);
        if (! (charset = emacs_mule_charset[c]))
         goto invalid_code;
        ONE_MORE_BYTE (c);
-      code = (c & 0x7F) << 7;
+      code = (c & 0x7F) << 8;
        ONE_MORE_BYTE (c);
        code |= c & 0x7F;
        break;
@@ -1714,7 +1713,7 @@ detect_coding_emacs_mule (coding, mask)
                                                                 \
        if (src == src_end)                                      \
         break;                                                  \
-      c = emacs_mule_char (coding, 1, &nbytes, &nchars);       \
+      c = emacs_mule_char (coding, src, &nbytes, &nchars);     \
        if (c < 0)                                               \
         {                                                       \
           if (c == -2)                                          \
@@ -1729,17 +1728,18 @@ detect_coding_emacs_mule (coding, mask)
  
  
  /* Decode a composition rule represented as a component of composition
-   sequence of Emacs 20 style at SRC.  Set C to the rule.  If SRC
-   points an invalid byte sequence, set C to -1.  */
+   sequence of Emacs 20 style at SRC.  Store the decoded rule in *BUF,
+   and increment BUF.  If SRC points to an invalid byte sequence, jump
+   to the label invalid_code.  */
  
-#define DECODE_EMACS_MULE_COMPOSITION_RULE(buf)                \
+#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf)     \
    do {                                                 \
      int c, gref, nref;                                 \
                                                         \
-    if (src < src_end)                                 \
+    if (src >= src_end)                                        \
        goto invalid_code;                               \
      ONE_MORE_BYTE_NO_CHECK (c);                                \
-    c -= 0xA0;                                         \
+    c -= 0x20;                                         \
      if (c < 0 || c >= 81)                              \
        goto invalid_code;                               \
                                                         \
@@ -1748,6 +1748,28 @@ detect_coding_emacs_mule (coding, mask)
    } while (0)
  
  
+/* Decode a composition rule represented as a component of a composition
+   sequence of Emacs 21 style at SRC.  Store the decoded rule in *BUF,
+   and increment BUF.  If SRC points to an invalid byte sequence, jump
+   to the label invalid_code.  */
+
+#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf)     \
+  do {                                                 \
+    int gref, nref;                                    \
+                                                       \
+    if (src + 1>= src_end)                             \
+      goto invalid_code;                               \
+    ONE_MORE_BYTE_NO_CHECK (gref);                     \
+    gref -= 0x20;                                      \
+    ONE_MORE_BYTE_NO_CHECK (nref);                     \
+    nref -= 0x20;                                      \
+    if (gref < 0 || gref >= 81                         \
+       || nref < 0 || nref >= 81)                      \
+      goto invalid_code;                               \
+    *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);     \
+  } while (0)
+
+
  #define ADD_COMPOSITION_DATA(buf, method, nchars)      \
    do {                                                 \
      *buf++ = -5;                                       \
@@ -1761,10 +1783,11 @@ detect_coding_emacs_mule (coding, mask)
  #define DECODE_EMACS_MULE_21_COMPOSITION(c)                            \
    do {                                                                 \
      /* Emacs 21 style format.  The first three bytes at SRC are                \
-       (METHOD - 0xF0), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
+       (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
         the byte length of this composition information, CHARS is the   \
         number of characters composed by this composition.  */          \
-    enum composition_method method = c - 0xF0;                         \
+    enum composition_method method = c - 0xF2;                         \
+    int *charbuf_base = charbuf;                                       \
      int consumed_chars_limit;                                          \
      int nbytes, nchars;                                                        \
                                                                         \
@@ -1782,12 +1805,14 @@ detect_coding_emacs_mule (coding, mask)
         while (consumed_chars < consumed_chars_limit)                   \
           {                                                             \
             if (i % 2 && method != COMPOSITION_WITH_ALTCHARS)           \
-             DECODE_EMACS_MULE_COMPOSITION_RULE (charbuf);             \
+             DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf);          \
             else                                                        \
               DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf);             \
+           i++;                                                        \
           }                                                             \
         if (consumed_chars < consumed_chars_limit)                      \
           goto invalid_code;                                            \
+       charbuf_base[0] -= i;                                           \
        }                                                                        \
    } while (0)
  
@@ -1823,7 +1848,7 @@ detect_coding_emacs_mule (coding, mask)
      DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                  \
      for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++)           \
        {                                                                \
-       DECODE_EMACS_MULE_COMPOSITION_RULE (buf);               \
+       DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf);            \
         DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);               \
        }                                                                \
      if (i < 1 || (buf - components) % 2 == 0)                  \
@@ -1888,8 +1913,8 @@ decode_coding_emacs_mule (coding)
           if (charbuf + 5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 > charbuf_end)
             break;
           ONE_MORE_BYTE (c);
-         if (c - 0xF0 >= COMPOSITION_RELATIVE
-             && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS)
+         if (c - 0xF2 >= COMPOSITION_RELATIVE
+             && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
             DECODE_EMACS_MULE_21_COMPOSITION (c);
           else if (c < 0xC0)
             DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
@@ -1897,12 +1922,14 @@ decode_coding_emacs_mule (coding)
             DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
           else
             goto invalid_code;
+         coding->annotated = 1;
         }
        else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
         {
           int nbytes, nchars;
-         src--;
-         c = emacs_mule_char (coding, 0, &nbytes, &nchars);
+         src = src_base;
+         consumed_chars = consumed_chars_base;
+         c = emacs_mule_char (coding, src, &nbytes, &nchars);
           if (c < 0)
             {
               if (c == -2)
@@ -1910,6 +1937,8 @@ decode_coding_emacs_mule (coding)
               goto invalid_code;
             }
           *charbuf++ = c;
+         src += nbytes;
+         consumed_chars += nchars;
           char_offset++;
         }
        continue;
@@ -2193,7 +2222,8 @@ enum iso_code_class_type iso_code_class[256];
    (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
  
  static void
-setup_iso_safe_charsets (Lisp_Object attrs)
+setup_iso_safe_charsets (attrs)
+     Lisp_Object attrs;
  {
    Lisp_Object charset_list, safe_charsets;
    Lisp_Object request;
@@ -2577,7 +2607,7 @@ detect_coding_iso_2022 (coding, mask)
  #define DECODE_COMPOSITION_START(c1)                                   \
    do {                                                                 \
      if (c1 == '0'                                                      \
-       && composition_state == COMPOSING_COMPONENT_CHAR)               \
+       && composition_state == COMPOSING_COMPONENT_RULE)               \
        {                                                                        \
         component_len = component_idx;                                  \
         composition_state = COMPOSING_CHAR;                             \
@@ -2730,27 +2760,26 @@ decode_coding_iso_2022 (coding)
                   composition_state--;
                   continue;
                 }
-             else if (method == COMPOSITION_WITH_RULE)
-               composition_state = COMPOSING_RULE;
-             else if (method == COMPOSITION_WITH_RULE_ALTCHARS
-                      && composition_state == COMPOSING_COMPONENT_CHAR)
-               composition_state = COMPOSING_COMPONENT_CHAR;
             }
           if (charset_id_0 < 0
               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
-           {
-             /* This is SPACE or DEL.  */
-             charset = CHARSET_FROM_ID (charset_ascii);
-             break;
-           }
-         /* This is a graphic character, we fall down ...  */
+           /* This is SPACE or DEL.  */
+           charset = CHARSET_FROM_ID (charset_ascii);
+         else
+           charset = CHARSET_FROM_ID (charset_id_0);
+         break;
  
         case ISO_graphic_plane_0:
-         if (composition_state == COMPOSING_RULE)
+         if (composition_state != COMPOSING_NO)
             {
-             DECODE_COMPOSITION_RULE (c1);
-             components[component_idx++] = c1;
-             composition_state = COMPOSING_CHAR;
+             if (composition_state == COMPOSING_RULE
+                 || composition_state == COMPOSING_COMPONENT_RULE)
+               {
+                 DECODE_COMPOSITION_RULE (c1);
+                 components[component_idx++] = c1;
+                 composition_state--;
+                 continue;
+               }
             }
           charset = CHARSET_FROM_ID (charset_id_0);
           break;
@@ -3014,7 +3043,13 @@ decode_coding_iso_2022 (coding)
           char_offset++;
         }
        else
-       components[component_idx++] = c;
+       {
+         components[component_idx++] = c;
+         if (method == COMPOSITION_WITH_RULE
+             || (method == COMPOSITION_WITH_RULE_ALTCHARS
+                 && composition_state == COMPOSING_COMPONENT_CHAR))
+           composition_state++;
+       }
        continue;
  
      invalid_code:
@@ -3434,6 +3469,9 @@ encode_coding_iso_2022 (coding)
    int c;
  
    CODING_GET_INFO (coding, attrs, eol_type, charset_list);
+  setup_iso_safe_charsets (attrs);
+  coding->safe_charsets
+    = (char *) XSTRING (CODING_ATTR_SAFE_CHARSETS(attrs))->data;
  
    ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
  
@@ -3490,8 +3528,16 @@ encode_coding_iso_2022 (coding)
  
           if (!charset)
             {
-             c = coding->default_char;
-             charset = char_charset (c, charset_list, NULL);
+             if (coding->mode & CODING_MODE_SAFE_ENCODING)
+               {
+                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
+                 charset = CHARSET_FROM_ID (charset_ascii);
+               }
+             else
+               {
+                 c = coding->default_char;
+                 charset = char_charset (c, charset_list, NULL);
+               }
             }
           ENCODE_ISO_CHARACTER (charset, c);
         }
@@ -3853,8 +3899,16 @@ encode_coding_sjis (coding)
  
           if (!charset)
             {
-             c = coding->default_char;
-             charset = char_charset (c, charset_list, &code);
+             if (coding->mode & CODING_MODE_SAFE_ENCODING)
+               {
+                 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
+                 charset = CHARSET_FROM_ID (charset_ascii);
+               }
+             else
+               {
+                 c = coding->default_char;
+                 charset = char_charset (c, charset_list, &code);
+               }
             }
           if (code == CHARSET_INVALID_CODE (charset))
             abort ();
@@ -3913,8 +3967,16 @@ encode_coding_big5 (coding)
  
           if (! charset)
             {
-             c = coding->default_char;
-             charset = char_charset (c, charset_list, &code);
+             if (coding->mode & CODING_MODE_SAFE_ENCODING)
+               {
+                 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
+                 charset = CHARSET_FROM_ID (charset_ascii);
+               }
+             else
+               {
+                 c = coding->default_char;
+                 charset = char_charset (c, charset_list, &code);
+               }
             }
           if (code == CHARSET_INVALID_CODE (charset))
             abort ();
@@ -4258,15 +4320,14 @@ decode_coding_charset (coding)
    int *charbuf_end = charbuf + coding->charbuf_size;
    int consumed_chars = 0, consumed_chars_base;
    int multibytep = coding->src_multibyte;
-  struct charset *charset;
-  Lisp_Object attrs, eol_type, charset_list;
+  Lisp_Object attrs, eol_type, charset_list, valids;
  
    CODING_GET_INFO (coding, attrs, eol_type, charset_list);
-  charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
+  valids = AREF (attrs, coding_attr_charset_valids);
  
    while (1)
      {
-      int c, c1;
+      int c;
  
        src_base = src;
        consumed_chars_base = consumed_chars;
@@ -4274,14 +4335,15 @@ decode_coding_charset (coding)
        if (charbuf >= charbuf_end)
         break;
  
-      ONE_MORE_BYTE (c1);
+      ONE_MORE_BYTE (c);
        if (c == '\r')
         {
+         /* Here we assume that no charset maps '\r' to something
+            else.  */
           if (EQ (eol_type, Qdos))
             {
-             if (src == src_end)
-               goto no_more_source;
-             if (*src == '\n')
+             if (src < src_end
+                 && *src == '\n')
                 ONE_MORE_BYTE (c);
             }
           else if (EQ (eol_type, Qmac))
@@ -4289,7 +4351,50 @@ decode_coding_charset (coding)
         }
        else
         {
-         CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
+         Lisp_Object val;
+         struct charset *charset;
+         int dim;
+         int len = 1;
+         unsigned code = c;
+
+         val = AREF (valids, c);
+         if (NILP (val))
+           goto invalid_code;
+         if (INTEGERP (val))
+           {
+             charset = CHARSET_FROM_ID (XFASTINT (val));
+             dim = CHARSET_DIMENSION (charset);
+             while (len < dim)
+               {
+                 ONE_MORE_BYTE (c);
+                 code = (code << 8) | c;
+                 len++;
+               }
+             CODING_DECODE_CHAR (coding, src, src_base, src_end,
+                                 charset, code, c);
+           }
+         else
+           {
+             /* VAL is a list of charset IDs.  It is assured that the
+                list is sorted by charset dimensions (smaller one
+                comes first).  */
+             while (CONSP (val))
+               {
+                 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
+                 dim = CHARSET_DIMENSION (charset);
+                 while (len < dim)
+                   {
+                     ONE_MORE_BYTE (c);
+                     code = (code << 8) | c;
+                     len++;
+                   }
+                 CODING_DECODE_CHAR (coding, src, src_base,
+                                     src_end, charset, code, c);
+                 if (c >= 0)
+                   break;
+                 val = XCDR (val);
+               }
+           }
           if (c < 0)
             goto invalid_code;
         }
@@ -4321,28 +4426,46 @@ encode_coding_charset (coding)
    unsigned char *dst_end = coding->destination + coding->dst_bytes;
    int safe_room = MAX_MULTIBYTE_LENGTH;
    int produced_chars = 0;
-  struct charset *charset;
    Lisp_Object attrs, eol_type, charset_list;
    int ascii_compatible;
    int c;
  
    CODING_GET_INFO (coding, attrs, eol_type, charset_list);
-  charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
    ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
  
    while (charbuf < charbuf_end)
      {
+      struct charset *charset;
        unsigned code;
        
        ASSURE_DESTINATION (safe_room);
        c = *charbuf++;
        if (ascii_compatible && ASCII_CHAR_P (c))
         EMIT_ONE_ASCII_BYTE (c);
-      else if ((code = ENCODE_CHAR (charset, c))
-              != CHARSET_INVALID_CODE (charset))
-       EMIT_ONE_BYTE (code);
        else
-       EMIT_ONE_BYTE (coding->default_char);
+       {
+         charset = char_charset (c, charset_list, &code);
+         if (charset)
+           {
+             if (CHARSET_DIMENSION (charset) == 1)
+               EMIT_ONE_BYTE (code);
+             else if (CHARSET_DIMENSION (charset) == 2)
+               EMIT_TWO_BYTES (code >> 8, code & 0xFF);
+             else if (CHARSET_DIMENSION (charset) == 3)
+               EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
+             else
+               EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
+                                (code >> 8) & 0xFF, code & 0xFF);
+           }
+         else
+           {
+             if (coding->mode & CODING_MODE_SAFE_ENCODING)
+               c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
+             else
+               c = coding->default_char;
+             EMIT_ONE_BYTE (c);
+           }
+       }
      }
  
    coding->result = CODING_RESULT_SUCCESS;
@@ -4617,6 +4740,7 @@ raw_text_coding_system (coding_system)
  
  Lisp_Object
  coding_inherit_eol_type (coding_system, parent)
+     Lisp_Object coding_system, parent;
  {
    Lisp_Object spec, attrs, eol_type;
  
@@ -5880,6 +6004,7 @@ decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
        EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
        Lisp_Object val;
  
+      TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
        GCPRO2 (coding->src_object, coding->dst_object);
        val = call1 (CODING_ATTR_POST_READ (attrs),
                    make_number (coding->produced_char));
@@ -5942,8 +6067,6 @@ encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
  
    if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
      {
-      Lisp_Object val;
-
        coding->src_object = make_conversion_work_buffer (coding->src_multibyte);
        set_buffer_internal (XBUFFER (coding->src_object));
        if (STRINGP (src_object))
@@ -5960,9 +6083,9 @@ encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
           set_buffer_internal (XBUFFER (coding->src_object));
         }
  
-      val = call2 (CODING_ATTR_PRE_WRITE (attrs),
-                  make_number (1), make_number (chars));
-      CHECK_NATNUM (val);
+      call2 (CODING_ATTR_PRE_WRITE (attrs),
+            make_number (BEG), make_number (Z));
+      coding->src_object = Fcurrent_buffer ();
        if (BEG != GPT)
         move_gap_both (BEG, BEG_BYTE);
        coding->src_chars = Z - BEG;
@@ -6005,8 +6128,10 @@ encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
    else if (EQ (dst_object, Qt))
      {
        coding->dst_object = Qnil;
-      coding->destination = (unsigned char *) xmalloc (coding->src_chars);
        coding->dst_bytes = coding->src_chars;
+      if (coding->dst_bytes == 0)
+       coding->dst_bytes = 1;
+      coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
        coding->dst_multibyte = 0;
      }
    else
@@ -6778,8 +6903,8 @@ Return the corresponding character.  */)
  
    val = CODING_ATTR_CHARSET_LIST (attrs);
    charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
-  charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
-  charset_kana = CHARSET_FROM_ID (XINT (XCAR (val)));
+  charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
+  charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
  
    if (c <= 0x7F)
      charset = charset_roman;
@@ -6790,7 +6915,7 @@ Return the corresponding character.  */)
      }
    else
      {
-      int s1 = c >> 8, s2 = c & 0x7F;
+      int s1 = c >> 8, s2 = c & 0xFF;
  
        if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
           || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
@@ -6906,6 +7031,7 @@ DEFUN ("set-terminal-coding-system-internal",
         Sset_terminal_coding_system_internal, 1, 1, 0,
         doc: /* Internal use only.  */)
       (coding_system)
+     Lisp_Object coding_system;
  {
    CHECK_SYMBOL (coding_system);
    setup_coding_system (Fcheck_coding_system (coding_system),
@@ -6925,6 +7051,7 @@ DEFUN ("set-safe-terminal-coding-system-internal",
         Sset_safe_terminal_coding_system_internal, 1, 1, 0,
         doc: /* Internal use only.  */)
       (coding_system)
+     Lisp_Object coding_system;
  {
    CHECK_SYMBOL (coding_system);
    setup_coding_system (Fcheck_coding_system (coding_system),
@@ -7066,7 +7193,8 @@ usage: (find-operation-coding-system OPERATION ARGUMENTS ...)  */)
  
  DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
         Sset_coding_system_priority, 1, MANY, 0,
-       doc: /* Put higher priority to coding systems of the arguments.  */)
+       doc: /* Assign higher priority to the coding systems given as arguments.
+usage: (set-coding-system-priority CODING-SYSTEM ...)  */)
       (nargs, args)
       int nargs;
       Lisp_Object *args;
@@ -7115,7 +7243,8 @@ DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
  
  DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
         Scoding_system_priority_list, 0, 1, 0,
-       doc: /* Return a list of coding systems ordered by their priorities.  */)
+       doc: /* Return a list of coding systems ordered by their priorities.
+HIGHESTP non-nil means just return the highest priority one.  */)
       (highestp)
       Lisp_Object highestp;
  {
@@ -7138,12 +7267,13 @@ DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
    return Fnreverse (val);
  }
  
+static char *suffixes[] = { "-unix", "-dos", "-mac" };
+
  static Lisp_Object
  make_subsidiaries (base)
       Lisp_Object base;
  {
    Lisp_Object subsidiaries;
-  char *suffixes[] = { "-unix", "-dos", "-mac" };
    int base_name_len = STRING_BYTES (XSYMBOL (base)->name);
    char *buf = (char *) alloca (base_name_len + 6);
    int i;
@@ -7161,7 +7291,8 @@ make_subsidiaries (base)
  
  DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
         Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
-       doc: /* For internal use only.  */)
+       doc: /* For internal use only.
+usage: (define-coding-system-internal ...)  */)
       (nargs, args)
       int nargs;
       Lisp_Object *args;
@@ -7277,15 +7408,62 @@ DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
  
    if (EQ (coding_type, Qcharset))
      {
+      /* Generate a Lisp vector of 256 elements.  Each element is nil,
+        an integer, or a list of charset IDs.
+
+        If Nth element is nil, the byte code N is invalid in this
+        coding system.
+
+        If Nth element is a number NUM, N is the first byte of a
+        charset whose ID is NUM.
+
+        If Nth element is a list of charset IDs, N is the first byte
+        of one of them.  The list is sorted by dimensions of the
+        charsets.  A charset of smaller dimension comes first.
+      */
        val = Fmake_vector (make_number (256), Qnil);
  
        for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
         {
-         struct charset *charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
+         struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
+         int dim = CHARSET_DIMENSION (charset);
+         int idx = (dim - 1) * 4;
+         
+         for (i = charset->code_space[idx];
+              i <= charset->code_space[idx + 1]; i++)
+           {
+             Lisp_Object tmp, tmp2;
+             int dim2;
  
-         for (i = charset->code_space[0]; i <= charset->code_space[1]; i++)
-           if (NILP (AREF (val, i)))
-             ASET (val, i, XCAR (tail));
+             tmp = AREF (val, i);
+             if (NILP (tmp))
+               tmp = XCAR (tail);
+             else if (NUMBERP (tmp))
+               {
+                 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
+                 if (dim < dim2)
+                   tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
+                 else
+                   tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
+               }
+             else
+               {
+                 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
+                   {
+                     dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
+                     if (dim < dim2)
+                       break;
+                   }
+                 if (NILP (tmp2))
+                   tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
+                 else
+                   {
+                     XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
+                     XSETCAR (tmp2, XCAR (tail));
+                   }
+               }
+             ASET (val, i, tmp);
+           }
         }
        ASET (attrs, coding_attr_charset_valids, val);
        category = coding_category_charset;
@@ -7365,7 +7543,6 @@ DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
    else if (EQ (coding_type, Qiso_2022))
      {
        Lisp_Object initial, reg_usage, request, flags;
-      struct charset *charset;
        int i, id;
  
        if (nargs < coding_arg_iso2022_max)
@@ -7558,6 +7735,8 @@ DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
                          make_number (nargs)));
  }
  
+/* Fixme: should this record the alias relationships for
+   diagnostics?  */
  DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
         Sdefine_coding_system_alias, 2, 2, 0,
         doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
@@ -7588,7 +7767,8 @@ DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
      }
  
    Fputhash (alias, spec, Vcoding_system_hash_table);
-  Vcoding_system_alist = Fcons (Fcons (alias, Qnil), Vcoding_system_alist);
+  Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
+                               Vcoding_system_alist);
  
    return Qnil;
  }
@@ -7596,7 +7776,7 @@ DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
  DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
         1, 1, 0,
         doc: /* Return the base of CODING-SYSTEM.
-Any alias or subsidiary coding systems are not base coding system.  */)
+Any alias or subsidiary coding system is not a base coding system.  */)
    (coding_system)
       Lisp_Object coding_system;
  {
@@ -7627,9 +7807,7 @@ DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
  
  DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
         1, 1, 0,
-       doc: /* Return the list of aliases of CODING-SYSTEM.
-A base coding system is what made by `define-coding-system'.
-Any alias nor subsidiary coding systems are not base coding system.  */)
+       doc: /* Return the list of aliases of CODING-SYSTEM.  */)
       (coding_system)
       Lisp_Object coding_system;
  {
@@ -7638,7 +7816,7 @@ Any alias nor subsidiary coding systems are not base coding system.  */)
    if (NILP (coding_system))
      coding_system = Qno_conversion;
    CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
-  return AREF (spec, 2);
+  return AREF (spec, 1);
  }
  
  DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
@@ -7712,6 +7890,10 @@ init_coding_once ()
      {
        emacs_mule_bytes[i] = 1;
      }
+  emacs_mule_bytes[LEADING_CODE_PRIVATE_11] = 3;
+  emacs_mule_bytes[LEADING_CODE_PRIVATE_12] = 3;
+  emacs_mule_bytes[LEADING_CODE_PRIVATE_21] = 4;
+  emacs_mule_bytes[LEADING_CODE_PRIVATE_22] = 4;
  }
  
  #ifdef emacs
@@ -7766,7 +7948,6 @@ syms_of_coding ()
    DEFSYM (Qeol_type, "eol-type");
    DEFSYM (Qunix, "unix");
    DEFSYM (Qdos, "dos");
-  DEFSYM (Qmac, "mac");
  
    DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
    DEFSYM (Qpost_read_conversion, "post-read-conversion");
@@ -7812,10 +7993,6 @@ syms_of_coding ()
    DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
    DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
  
-  DEFSYM (Qchar_coding_system, "char-coding-system");
-
-  Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (2));
-
    DEFSYM (Qvalid_codes, "valid-codes");
  
    DEFSYM (Qemacs_mule, "emacs-mule");
@@ -8100,13 +8277,6 @@ coding system used in each operation can't encode the text.
  The default value is `select-safe-coding-system' (which see).  */);
    Vselect_safe_coding_system_function = Qnil;
  
-  DEFVAR_LISP ("char-coding-system-table", &Vchar_coding_system_table,
-              doc: /*
-Char-table containing safe coding systems of each characters.
-Each element doesn't include such generic coding systems that can
-encode any characters.   They are in the first extra slot.  */);
-  Vchar_coding_system_table = Fmake_char_table (Qchar_coding_system, Qnil);
-
    DEFVAR_BOOL ("inhibit-iso-escape-detection",
                &inhibit_iso_escape_detection,
                doc: /*