Revision: emacs@sv.gnu.org/emacs--unicode--0--patch-11

author Miles Bader <miles@gnu.org>

Sat, 4 Feb 2006 01:01:38 +0000 (01:01 +0000)

committer Miles Bader <miles@gnu.org>

Sat, 4 Feb 2006 01:01:38 +0000 (01:01 +0000)
author Miles Bader <miles@gnu.org>
Sat, 4 Feb 2006 01:01:38 +0000 (01:01 +0000)
committer Miles Bader <miles@gnu.org>
Sat, 4 Feb 2006 01:01:38 +0000 (01:01 +0000)
diff --cc etc/NEWS
Simple merge
diff --cc etc/TODO
Simple merge
diff --cc lisp/ChangeLog
Simple merge
diff --cc lisp/arc-mode.el
Simple merge
diff --cc lisp/gnus/mml.el
Simple merge
diff --cc lisp/international/mule-cmds.el
Simple merge
diff --cc lisp/simple.el
Simple merge
diff --cc src/ChangeLog
Simple merge
diff --cc src/coding.c

index 6ec0804,385481d..f10a10e
--- 1/src/coding.c
--- 2/src/coding.c
+++ b/src/coding.c
@@@ -818,1480 -783,463 +818,1482 @@@ static struct coding_system coding_cate
     } while (0)
   
   
- -/* Decode composition sequence encoded by `emacs-mule' at the source
- -   pointed by SRC.  SRC_END is the end of source.  Store information
- -   of the composition in CODING->cmp_data.
+ +#define EMIT_FOUR_BYTES(c1, c2, c3, c4)               \
+ +  do {                                                \
+ +    EMIT_TWO_BYTES (c1, c2);                  \
+ +    EMIT_TWO_BYTES (c3, c4);                  \
+ +  } while (0)
   
- -   For backward compatibility, decode also a composition sequence of
- -   Emacs 20 style.  In that case, the composition sequence contains
- -   characters that should be extracted into a buffer or string.  Store
- -   those characters at *DESTINATION in multibyte form.
   
- -   If we encounter an invalid byte sequence, return 0.
- -   If we encounter an insufficient source or destination, or
- -   insufficient space in CODING->cmp_data, return 1.
- -   Otherwise, return consumed bytes in the source.
+ +/* Prototypes for static functions.  */
+ +static void record_conversion_result P_ ((struct coding_system *coding,
+ +                                        enum coding_result_code result));
+ +static int detect_coding_utf_8 P_ ((struct coding_system *,
+ +                                  struct coding_detection_info *info));
+ +static void decode_coding_utf_8 P_ ((struct coding_system *));
+ +static int encode_coding_utf_8 P_ ((struct coding_system *));
+ +
+ +static int detect_coding_utf_16 P_ ((struct coding_system *,
+ +                                   struct coding_detection_info *info));
+ +static void decode_coding_utf_16 P_ ((struct coding_system *));
+ +static int encode_coding_utf_16 P_ ((struct coding_system *));
+ +
+ +static int detect_coding_iso_2022 P_ ((struct coding_system *,
+ +                                     struct coding_detection_info *info));
+ +static void decode_coding_iso_2022 P_ ((struct coding_system *));
+ +static int encode_coding_iso_2022 P_ ((struct coding_system *));
+ +
+ +static int detect_coding_emacs_mule P_ ((struct coding_system *,
+ +                                       struct coding_detection_info *info));
+ +static void decode_coding_emacs_mule P_ ((struct coding_system *));
+ +static int encode_coding_emacs_mule P_ ((struct coding_system *));
+ +
+ +static int detect_coding_sjis P_ ((struct coding_system *,
+ +                                 struct coding_detection_info *info));
+ +static void decode_coding_sjis P_ ((struct coding_system *));
+ +static int encode_coding_sjis P_ ((struct coding_system *));
+ +
+ +static int detect_coding_big5 P_ ((struct coding_system *,
+ +                                 struct coding_detection_info *info));
+ +static void decode_coding_big5 P_ ((struct coding_system *));
+ +static int encode_coding_big5 P_ ((struct coding_system *));
+ +
+ +static int detect_coding_ccl P_ ((struct coding_system *,
+ +                                struct coding_detection_info *info));
+ +static void decode_coding_ccl P_ ((struct coding_system *));
+ +static int encode_coding_ccl P_ ((struct coding_system *));
+ +
+ +static void decode_coding_raw_text P_ ((struct coding_system *));
+ +static int encode_coding_raw_text P_ ((struct coding_system *));
+ +
+ +static void coding_set_source P_ ((struct coding_system *));
+ +static void coding_set_destination P_ ((struct coding_system *));
+ +static void coding_alloc_by_realloc P_ ((struct coding_system *, EMACS_INT));
+ +static void coding_alloc_by_making_gap P_ ((struct coding_system *,
+ +                                          EMACS_INT));
+ +static unsigned char *alloc_destination P_ ((struct coding_system *,
+ +                                           EMACS_INT, unsigned char *));
+ +static void setup_iso_safe_charsets P_ ((Lisp_Object));
+ +static unsigned char *encode_designation_at_bol P_ ((struct coding_system *,
+ +                                                   int *, int *,
+ +                                                   unsigned char *));
+ +static int detect_eol P_ ((const unsigned char *,
+ +                         EMACS_INT, enum coding_category));
+ +static Lisp_Object adjust_coding_eol_type P_ ((struct coding_system *, int));
+ +static void decode_eol P_ ((struct coding_system *));
+ +static Lisp_Object get_translation_table P_ ((Lisp_Object, int, int *));
+ +static Lisp_Object get_translation P_ ((Lisp_Object, int *, int *,
+ +                                      int, int *, int *));
+ +static int produce_chars P_ ((struct coding_system *, Lisp_Object, int));
+ +static INLINE void produce_composition P_ ((struct coding_system *, int *,
+ +                                          EMACS_INT));
+ +static INLINE void produce_charset P_ ((struct coding_system *, int *,
+ +                                      EMACS_INT));
+ +static void produce_annotation P_ ((struct coding_system *, EMACS_INT));
+ +static int decode_coding P_ ((struct coding_system *));
+ +static INLINE int *handle_composition_annotation P_ ((EMACS_INT, EMACS_INT,
+ +                                                    struct coding_system *, 
+ +                                                    int *, EMACS_INT *));
+ +static INLINE int *handle_charset_annotation P_ ((EMACS_INT, EMACS_INT,
+ +                                                struct coding_system *,
+ +                                                int *, EMACS_INT *));
+ +static void consume_chars P_ ((struct coding_system *, Lisp_Object, int));
+ +static int encode_coding P_ ((struct coding_system *));
+ +static Lisp_Object make_conversion_work_buffer P_ ((int));
+ +static Lisp_Object code_conversion_restore P_ ((Lisp_Object));
+ +static INLINE int char_encodable_p P_ ((int, Lisp_Object));
+ +static Lisp_Object make_subsidiaries P_ ((Lisp_Object));
   
- -*/
- -static INLINE int
- -decode_composition_emacs_mule (coding, src, src_end,
- -                             destination, dst_end, dst_bytes)
- -     struct coding_system *coding;
- -     const unsigned char *src, *src_end;
- -     unsigned char **destination, *dst_end;
- -     int dst_bytes;
+ +/* Record RESULT as the outcome of the current conversion: store it
+ +   in CODING->result and, for the failure codes handled below, set
+ +   Vlast_code_conversion_error to the corresponding symbol.
+ +   CODING_RESULT_SUCCESS leaves Vlast_code_conversion_error alone
+ +   rather than falling into the "Unknown error" default.  */
+ +static void
+ +record_conversion_result (struct coding_system *coding,
+ +                        enum coding_result_code result)
   {
- -  unsigned char *dst = *destination;
- -  int method, data_len, nchars;
- -  const unsigned char *src_base = src++;
- -  /* Store components of composition.  */
- -  int component[COMPOSITION_DATA_MAX_BUNCH_LENGTH];
- -  int ncomponent;
- -  /* Store multibyte form of characters to be composed.  This is for
- -     Emacs 20 style composition sequence.  */
- -  unsigned char buf[MAX_COMPOSITION_COMPONENTS * MAX_MULTIBYTE_LENGTH];
- -  unsigned char *bufp = buf;
- -  int c, i, gref, nref;
- -
- -  if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
- -      >= COMPOSITION_DATA_SIZE)
+ +  coding->result = result;
+ +  switch (result)
       {
- -      coding->result = CODING_FINISH_INSUFFICIENT_CMP;
- -      return -1;
+ +    case CODING_RESULT_INSUFFICIENT_SRC:
+ +      Vlast_code_conversion_error = Qinsufficient_source;
+ +      break;
+ +    case CODING_RESULT_INCONSISTENT_EOL:
+ +      Vlast_code_conversion_error = Qinconsistent_eol;
+ +      break;
+ +    case CODING_RESULT_INVALID_SRC:
+ +      Vlast_code_conversion_error = Qinvalid_source;
+ +      break;
+ +    case CODING_RESULT_INTERRUPT:
+ +      Vlast_code_conversion_error = Qinterrupted;
+ +      break;
+ +    case CODING_RESULT_INSUFFICIENT_MEM:
+ +      Vlast_code_conversion_error = Qinsufficient_memory;
+ +      break;
+ +    case CODING_RESULT_SUCCESS:
+ +      /* Not an error.  alloc_destination reports success after each
+ +         reallocation; that must not clobber the last real error.  */
+ +      break;
+ +    default:
+ +      Vlast_code_conversion_error = intern ("Unknown error");
+ +      break;
       }
+ +}
   
- -  ONE_MORE_BYTE (c);
- -  if (c - 0xF0 >= COMPOSITION_RELATIVE
- -         && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS)
- -    {
- -      int with_rule;
+ +#define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
+ +  do {                                                                             \
+ +    charset_map_loaded = 0;                                                \
+ +    c = DECODE_CHAR (charset, code);                                       \
+ +    if (charset_map_loaded)                                                \
+ +      {                                                                            \
+ +      const unsigned char *orig = coding->source;                          \
+ +      EMACS_INT offset;                                                    \
+ +                                                                           \
+ +      coding_set_source (coding);                                          \
+ +      offset = coding->source - orig;                                      \
+ +      src += offset;                                                       \
+ +      src_base += offset;                                                  \
+ +      src_end += offset;                                                   \
+ +      }                                                                            \
+ +  } while (0)
   
- -      method = c - 0xF0;
- -      with_rule = (method == COMPOSITION_WITH_RULE
- -                 || method == COMPOSITION_WITH_RULE_ALTCHARS);
- -      ONE_MORE_BYTE (c);
- -      data_len = c - 0xA0;
- -      if (data_len < 4
- -        || src_base + data_len > src_end)
- -      return 0;
- -      ONE_MORE_BYTE (c);
- -      nchars = c - 0xA0;
- -      if (c < 1)
- -      return 0;
- -      for (ncomponent = 0; src < src_base + data_len; ncomponent++)
- -      {
- -        /* If it is longer than this, it can't be valid.  */
- -        if (ncomponent >= COMPOSITION_DATA_MAX_BUNCH_LENGTH)
- -          return 0;
   
- -        if (ncomponent % 2 && with_rule)
- -          {
- -            ONE_MORE_BYTE (gref);
- -            gref -= 32;
- -            ONE_MORE_BYTE (nref);
- -            nref -= 32;
- -            c = COMPOSITION_ENCODE_RULE (gref, nref);
- -          }
- -        else
- -          {
- -            int bytes;
- -            if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
- -                || (coding->flags /* We are recovering a file.  */
- -                    && src[0] == LEADING_CODE_8_BIT_CONTROL
- -                    && ! CHAR_HEAD_P (src[1])))
- -              c = STRING_CHAR (src, bytes);
- -            else
- -              c = *src, bytes = 1;
- -            src += bytes;
- -          }
- -        component[ncomponent] = c;
- -      }
+ +#define ASSURE_DESTINATION(bytes)                             \
+ +  do {                                                                \
+ +    if (dst + (bytes) >= dst_end)                             \
+ +      {                                                               \
+ +      int more_bytes = charbuf_end - charbuf + (bytes);       \
+ +                                                              \
+ +      dst = alloc_destination (coding, more_bytes, dst);      \
+ +      dst_end = coding->destination + coding->dst_bytes;      \
+ +      }                                                               \
+ +  } while (0)
+ +
+ +
+ +/* Refresh coding->source from coding->src_object and
+ +   coding->src_pos_byte.  For a buffer source the address is taken
+ +   relative to BUF_GAP_END_ADDR when coding->src_pos is negative, and
+ +   through BUF_BYTE_ADDRESS otherwise.  For a string source it is
+ +   SDATA plus coding->src_pos_byte.  Any other source is left
+ +   unchanged.  */
+ +static void
+ +coding_set_source (coding)
+ +     struct coding_system *coding;
+ +{
+ +  if (BUFFERP (coding->src_object))
+ +    {
+ +      struct buffer *buf = XBUFFER (coding->src_object);
+ +
+ +      if (coding->src_pos < 0)
+ +      coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
+ +      else
+ +      coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
       }
- -  else if (c >= 0x80)
+ +  else if (STRINGP (coding->src_object))
       {
- -      /* This may be an old Emacs 20 style format.  See the comment at
- -       the section 2 of this file.  */
- -      while (src < src_end && !CHAR_HEAD_P (*src)) src++;
- -      if (src == src_end
- -        && !(coding->mode & CODING_MODE_LAST_BLOCK))
- -      goto label_end_of_loop;
+ +      coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
+ +    }
+ +  else
+ +    /* Otherwise, the source is C string and is never relocated
+ +       automatically.  Thus we don't have to update anything.  */
+ +    ;
+ +}
   
- -      src_end = src;
- -      src = src_base + 1;
- -      if (c < 0xC0)
+ +static void  /* Recompute coding->destination and coding->dst_bytes
+ +                from coding->dst_object when it is a buffer; any
+ +                other destination is left untouched.  */
+ +coding_set_destination (coding)
+ +     struct coding_system *coding;
+ +{
+ +  if (BUFFERP (coding->dst_object))
+ +    {
+ +      if (coding->src_pos < 0)  /* NOTE(review): this destination
+ +                                   helper branches on src_pos, not
+ +                                   dst_pos -- confirm intended.  */
         {
- -        method = COMPOSITION_RELATIVE;
- -        for (ncomponent = 0; ncomponent < MAX_COMPOSITION_COMPONENTS;)
- -          {
- -            DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
- -            if (c < 0)
- -              break;
- -            component[ncomponent++] = c;
- -          }
- -        if (ncomponent < 2)
- -          return 0;
- -        nchars = ncomponent;
+ +        coding->destination = BEG_ADDR + coding->dst_pos_byte - 1;
+ +        coding->dst_bytes = (GAP_END_ADDR
+ +                             - (coding->src_bytes - coding->consumed)
+ +                             - coding->destination);
         }
- -      else if (c == 0xFF)
+ +      else
         {
- -        method = COMPOSITION_WITH_RULE;
- -        src++;
- -        DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
- -        if (c < 0)
- -          return 0;
- -        component[0] = c;
- -        for (ncomponent = 1;
- -             ncomponent < MAX_COMPOSITION_COMPONENTS * 2 - 1;)
- -          {
- -            DECODE_EMACS_MULE_COMPOSITION_RULE (c);
- -            if (c < 0)
- -              break;
- -            component[ncomponent++] = c;
- -            DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
- -            if (c < 0)
- -              break;
- -            component[ncomponent++] = c;
- -          }
- -        if (ncomponent < 3)
- -          return 0;
- -        nchars = (ncomponent + 1) / 2;
+ +        /* We are sure that coding->dst_pos_byte is before the gap
+ +           of the buffer. */
+ +        coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
+ +                               + coding->dst_pos_byte - 1);
+ +        coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
+ +                             - coding->destination);
         }
- -      else
- -      return 0;
       }
     else
- -    return 0;
+ +    /* Otherwise, the destination is C string and is never relocated
+ +       automatically.  Thus we don't have to update anything.  */
+ +    ;
+ +}
+ +
+ +/* Enlarge the destination area of CODING by BYTES bytes: reallocate
+ +   coding->destination with xrealloc to coding->dst_bytes + BYTES
+ +   and add BYTES to coding->dst_bytes.  */
+ +static void
+ +coding_alloc_by_realloc (coding, bytes)
+ +     struct coding_system *coding;
+ +     EMACS_INT bytes;
+ +{
+ +  coding->destination = (unsigned char *) xrealloc (coding->destination,
+ +                                                  coding->dst_bytes + bytes);
+ +  coding->dst_bytes += bytes;
+ +}
   
- -  if (buf == bufp || dst + (bufp - buf) <= (dst_bytes ? dst_end : src))
+ +/* Make room for BYTES more bytes by calling make_gap on the
+ +   destination buffer.
+ +
+ +   When the source and destination are the same buffer, the
+ +   unconsumed source size (coding->src_bytes - coding->consumed) is
+ +   temporarily subtracted from GAP_SIZE and added to ZV, Z, ZV_BYTE
+ +   and Z_BYTE around the make_gap call, then restored; presumably
+ +   this keeps the not-yet-consumed source from being treated as
+ +   free gap -- confirm against the callers.
+ +
+ +   In every other case the destination buffer is made current for
+ +   the duration of make_gap and the previous buffer is restored.
+ +   This branch is an unconditional `else': the function has no
+ +   variable `c' to test.  */
+ +static void
+ +coding_alloc_by_making_gap (coding, bytes)
+ +     struct coding_system *coding;
+ +     EMACS_INT bytes;
+ +{
+ +  if (BUFFERP (coding->dst_object)
+ +      && EQ (coding->src_object, coding->dst_object))
       {
- -      CODING_ADD_COMPOSITION_START (coding, coding->produced_char, method);
- -      for (i = 0; i < ncomponent; i++)
- -      CODING_ADD_COMPOSITION_COMPONENT (coding, component[i]);
- -      CODING_ADD_COMPOSITION_END (coding, coding->produced_char + nchars);
- -      if (buf < bufp)
- -      {
- -        unsigned char *p = buf;
- -        EMIT_BYTES (p, bufp);
- -        *destination += bufp - buf;
- -        coding->produced_char += nchars;
- -      }
- -      return (src - src_base);
+ +      EMACS_INT add = coding->src_bytes - coding->consumed;
+ +
+ +      GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
+ +      make_gap (bytes);
+ +      GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
+ +    }
+ +  else
+ +    {
+ +      Lisp_Object this_buffer;
+ +
+ +      this_buffer = Fcurrent_buffer ();
+ +      set_buffer_internal (XBUFFER (coding->dst_object));
+ +      make_gap (bytes);
+ +      set_buffer_internal (XBUFFER (this_buffer));
       }
- - label_end_of_loop:
- -  return -1;
   }
   
- -/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
   
- -static void
- -decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
+ +static unsigned char *  /* Grow the destination of CODING by NBYTES
+ +                           (via the buffer gap when dst_object is a
+ +                           buffer, via realloc otherwise), refresh
+ +                           the destination pointers, and return DST
+ +                           moved to the same offset from the new
+ +                           coding->destination.  */
+ +alloc_destination (coding, nbytes, dst)
        struct coding_system *coding;
- -     const unsigned char *source;
- -     unsigned char *destination;
- -     int src_bytes, dst_bytes;
+ +     EMACS_INT nbytes;
+ +     unsigned char *dst;
   {
- -  const unsigned char *src = source;
- -  const unsigned char *src_end = source + src_bytes;
- -  unsigned char *dst = destination;
- -  unsigned char *dst_end = destination + dst_bytes;
- -  /* SRC_BASE remembers the start position in source in each loop.
- -     The loop will be exited when there's not enough source code, or
- -     when there's not enough destination area to produce a
- -     character.  */
- -  const unsigned char *src_base;
+ +  EMACS_INT offset = dst - coding->destination;
+ +
+ +  if (BUFFERP (coding->dst_object))
+ +    coding_alloc_by_making_gap (coding, nbytes);
+ +  else
+ +    coding_alloc_by_realloc (coding, nbytes);
+ +  record_conversion_result (coding, CODING_RESULT_SUCCESS);
+ +  coding_set_destination (coding);
+ +  dst = coding->destination + offset;
+ +  return dst;
+ +}
+ +
+ +/** Macros for annotations.  */
+ +
+ +/* Maximum length of annotation data (sum of annotations for
+ +   composition and charset).  */
+ +#define MAX_ANNOTATION_LENGTH (4 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 4)
+ +
+ +/* Annotation data is stored in the array coding->charbuf in this
+ +   format:
+ +     [ -LENGTH ANNOTATION_MASK NCHARS ... ]
+ +   LENGTH is the number of elements in the annotation.
+ +   ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
+ +   NCHARS is the number of characters in the text annotated.
+ +
+ +   The format of the following elements depends on ANNOTATION_MASK.
+ +
+ +   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
+ +   follow:
+ +     ... METHOD [ COMPOSITION-COMPONENTS ... ]
+ +   METHOD is one of enum composition_method.
+ +   Optional COMPOSITION-COMPONENTS are characters and composition
+ +   rules.
+ +
+ +   In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
+ +   follows.  */
+ +
+ +#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)   \
+ +  do {                                                        \
+ +    *(buf)++ = -(len);                                        \
+ +    *(buf)++ = (mask);                                        \
+ +    *(buf)++ = (nchars);                              \
+ +    coding->annotated = 1;                            \
+ +  } while (0);
+ +
+ +#define ADD_COMPOSITION_DATA(buf, nchars, method)                         \
+ +  do {                                                                            \
+ +    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
+ +    *buf++ = method;                                                      \
+ +  } while (0)
+ +
+ +
+ +#define ADD_CHARSET_DATA(buf, nchars, id)                             \
+ +  do {                                                                        \
+ +    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars);       \
+ +    *buf++ = id;                                                      \
+ +  } while (0)
+ +
+ +\f
+ +/*** 2. Emacs' internal format (emacs-utf-8) ***/
+ +
+ +
+ +
+ +\f
+ +/*** 3. UTF-8 ***/
+ +
+ +/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
+ +   Check if a text is encoded in UTF-8.  If it is, return 1, else
+ +   return 0.  */
+ +/* Predicates classifying a byte C by its bit pattern: a one-octet
+ +   value (below 0x80), an extra (continuation) octet 10xxxxxx, or
+ +   the leading octet of a 2-, 3-, 4- or 5-octet sequence (110xxxxx,
+ +   1110xxxx, 11110xxx, 111110xx).  */
+ +#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
+ +#define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
+ +#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
+ +#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
+ +#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
+ +#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
+ +/* Scan the source of CODING, starting coding->head_ascii bytes in,
+ +   and decide whether it can be UTF-8.  Marks CATEGORY_MASK_UTF_8 as
+ +   checked in DETECT_INFO.  Every byte that is not a one-octet value
+ +   must be a leading octet followed by the matching number of extra
+ +   octets (2- to 5-octet forms); anything else leaves the loop, sets
+ +   the category in DETECT_INFO->rejected and returns 0.  FOUND is
+ +   set only once a complete multi-octet sequence has been seen.  The
+ +   no_more_source label is presumably the target of ONE_MORE_BYTE
+ +   when input runs out (the macro is not shown here): there, a
+ +   partial sequence in the last block is rejected, otherwise FOUND
+ +   is merged into DETECT_INFO->found and 1 is returned.  */
+ +static int
+ +detect_coding_utf_8 (coding, detect_info)
+ +     struct coding_system *coding;
+ +     struct coding_detection_info *detect_info;
+ +{
+ +  const unsigned char *src = coding->source, *src_base;
+ +  const unsigned char *src_end = coding->source + coding->src_bytes;
+ +  int multibytep = coding->src_multibyte;
+ +  int consumed_chars = 0;
+ +  int found = 0;
+ +
+ +  detect_info->checked |= CATEGORY_MASK_UTF_8;
+ +  /* A coding system of this category is always ASCII compatible.  */
+ +  src += coding->head_ascii;
   
- -  coding->produced_char = 0;
- -  while ((src_base = src) < src_end)
+ +  while (1)
       {
- -      unsigned char tmp[MAX_MULTIBYTE_LENGTH];
- -      const unsigned char *p;
- -      int bytes;
+ +      int c, c1, c2, c3, c4;
   
- -      if (*src == '\r')
+ +      src_base = src;
+ +      ONE_MORE_BYTE (c);
+ +      if (c < 0 || UTF_8_1_OCTET_P (c))
+ +      continue;
+ +      ONE_MORE_BYTE (c1);
+ +      if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
+ +      break;
+ +      if (UTF_8_2_OCTET_LEADING_P (c))
         {
- -        int c = *src++;
- -
- -        if (coding->eol_type == CODING_EOL_CR)
- -          c = '\n';
- -        else if (coding->eol_type == CODING_EOL_CRLF)
- -          {
- -            ONE_MORE_BYTE (c);
- -            if (c != '\n')
- -              {
- -                src--;
- -                c = '\r';
- -              }
- -          }
- -        *dst++ = c;
- -        coding->produced_char++;
+ +        found = CATEGORY_MASK_UTF_8;
           continue;
         }
- -      else if (*src == '\n')
+ +      ONE_MORE_BYTE (c2);
+ +      if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
+ +      break;
+ +      if (UTF_8_3_OCTET_LEADING_P (c))
         {
- -        if ((coding->eol_type == CODING_EOL_CR
- -             || coding->eol_type == CODING_EOL_CRLF)
- -            && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
- -          {
- -            coding->result = CODING_FINISH_INCONSISTENT_EOL;
- -            goto label_end_of_loop;
- -          }
- -        *dst++ = *src++;
- -        coding->produced_char++;
+ +        found = CATEGORY_MASK_UTF_8;
           continue;
         }
- -      else if (*src == 0x80 && coding->cmp_data)
+ +      ONE_MORE_BYTE (c3);
+ +      if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
+ +      break;
+ +      if (UTF_8_4_OCTET_LEADING_P (c))
         {
- -        /* Start of composition data.  */
- -        int consumed  = decode_composition_emacs_mule (coding, src, src_end,
- -                                                       &dst, dst_end,
- -                                                       dst_bytes);
- -        if (consumed < 0)
- -          goto label_end_of_loop;
- -        else if (consumed > 0)
- -          {
- -            src += consumed;
- -            continue;
- -          }
- -        bytes = CHAR_STRING (*src, tmp);
- -        p = tmp;
- -        src++;
+ +        found = CATEGORY_MASK_UTF_8;
+ +        continue;
         }
- -      else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
- -             || (coding->flags /* We are recovering a file.  */
- -                 && src[0] == LEADING_CODE_8_BIT_CONTROL
- -                 && ! CHAR_HEAD_P (src[1])))
+ +      ONE_MORE_BYTE (c4);
+ +      if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
+ +      break;
+ +      if (UTF_8_5_OCTET_LEADING_P (c))
         {
- -        p = src;
- -        src += bytes;
+ +        found = CATEGORY_MASK_UTF_8;
+ +        continue;
         }
- -      else
- -      {
- -        int i, c;
+ +      break;
+ +    }
+ +  detect_info->rejected |= CATEGORY_MASK_UTF_8;  /* Loop left by `break': malformed sequence.  */
+ +  return 0;
   
- -        bytes = BYTES_BY_CHAR_HEAD (*src);
- -        src++;
- -        for (i = 1; i < bytes; i++)
- -          {
- -            ONE_MORE_BYTE (c);
- -            if (CHAR_HEAD_P (c))
- -              break;
- -          }
- -        if (i < bytes)
- -          {
- -            bytes = CHAR_STRING (*src_base, tmp);
- -            p = tmp;
- -            src = src_base + 1;
- -          }
- -        else
- -          {
- -            p = src_base;
- -          }
- -      }
- -      if (dst + bytes >= (dst_bytes ? dst_end : src))
- -      {
- -        coding->result = CODING_FINISH_INSUFFICIENT_DST;
- -        break;
- -      }
- -      while (bytes--) *dst++ = *p++;
- -      coding->produced_char++;
+ + no_more_source:
+ +  if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)  /* Sequence cut short in the last block.  */
+ +    {
+ +      detect_info->rejected |= CATEGORY_MASK_UTF_8;
+ +      return 0;
       }
- - label_end_of_loop:
- -  coding->consumed = coding->consumed_char = src_base - source;
- -  coding->produced = dst - destination;
+ +  detect_info->found |= found;
+ +  return 1;
   }
   
   
- -/* Encode composition data stored at DATA into a special byte sequence
- -   starting by 0x80.  Update CODING->cmp_data_start and maybe
- -   CODING->cmp_data for the next call.  */
- -
- -#define ENCODE_COMPOSITION_EMACS_MULE(coding, data)                   \
- -  do {                                                                        \
- -    unsigned char buf[1024], *p0 = buf, *p;                           \
- -    int len = data[0];                                                        \
- -    int i;                                                            \
- -                                                                      \
- -    buf[0] = 0x80;                                                    \
- -    buf[1] = 0xF0 + data[3];  /* METHOD */                            \
- -    buf[3] = 0xA0 + (data[2] - data[1]); /* COMPOSED-CHARS */         \
- -    p = buf + 4;                                                      \
- -    if (data[3] == COMPOSITION_WITH_RULE                              \
- -      || data[3] == COMPOSITION_WITH_RULE_ALTCHARS)                   \
- -      {                                                                       \
- -      p += CHAR_STRING (data[4], p);                                  \
- -      for (i = 5; i < len; i += 2)                                    \
- -        {                                                             \
- -          int gref, nref;                                             \
- -           COMPOSITION_DECODE_RULE (data[i], gref, nref);             \
- -          *p++ = 0x20 + gref;                                         \
- -          *p++ = 0x20 + nref;                                         \
- -          p += CHAR_STRING (data[i + 1], p);                          \
- -        }                                                             \
- -      }                                                                       \
- -    else                                                              \
- -      {                                                                       \
- -      for (i = 4; i < len; i++)                                       \
- -        p += CHAR_STRING (data[i], p);                                \
- -      }                                                                       \
- -    buf[2] = 0xA0 + (p - buf);        /* COMPONENTS-BYTES */                  \
- -                                                                      \
- -    if (dst + (p - buf) + 4 > (dst_bytes ? dst_end : src))            \
- -      {                                                                       \
- -      coding->result = CODING_FINISH_INSUFFICIENT_DST;                \
- -      goto label_end_of_loop;                                         \
- -      }                                                                       \
- -    while (p0 < p)                                                    \
- -      *dst++ = *p0++;                                                 \
- -    coding->cmp_data_start += data[0];                                        \
- -    if (coding->cmp_data_start == coding->cmp_data->used              \
- -      && coding->cmp_data->next)                                      \
- -      {                                                                       \
- -      coding->cmp_data = coding->cmp_data->next;                      \
- -      coding->cmp_data_start = 0;                                     \
- -      }                                                                       \
- -  } while (0)
- -
- -
- -static void encode_eol P_ ((struct coding_system *, const unsigned char *,
- -                          unsigned char *, int, int));
- -
   static void
- -encode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
+ +decode_coding_utf_8 (coding)
        struct coding_system *coding;
- -     const unsigned char *source;
- -     unsigned char *destination;
- -     int src_bytes, dst_bytes;
   {
- -  const unsigned char *src = source;
- -  const unsigned char *src_end = source + src_bytes;
- -  unsigned char *dst = destination;
- -  unsigned char *dst_end = destination + dst_bytes;
+ +  const unsigned char *src = coding->source + coding->consumed;
+ +  const unsigned char *src_end = coding->source + coding->src_bytes;
     const unsigned char *src_base;
- -  int c;
- -  int char_offset;
- -  int *data;
- -
- -  Lisp_Object translation_table;
+ +  int *charbuf = coding->charbuf + coding->charbuf_used;
+ +  int *charbuf_end = coding->charbuf + coding->charbuf_size;
+ +  int consumed_chars = 0, consumed_chars_base;
+ +  int multibytep = coding->src_multibyte;
+ +  Lisp_Object attr, charset_list;
   
- -  translation_table = Qnil;
+ +  CODING_GET_INFO (coding, attr, charset_list);
   
- -  /* Optimization for the case that there's no composition.  */
- -  if (!coding->cmp_data || coding->cmp_data->used == 0)
- -    {
- -      encode_eol (coding, source, destination, src_bytes, dst_bytes);
- -      return;
- -    }
- -
- -  char_offset = coding->cmp_data->char_offset;
- -  data = coding->cmp_data->data + coding->cmp_data_start;
     while (1)
       {
+ +      int c, c1, c2, c3, c4, c5;
+ +
         src_base = src;
+ +      consumed_chars_base = consumed_chars;
   
- -      /* If SRC starts a composition, encode the information about the
- -       composition in advance.  */
- -      if (coding->cmp_data_start < coding->cmp_data->used
- -        && char_offset + coding->consumed_char == data[1])
+ +      if (charbuf >= charbuf_end)
+ +      break;
+ +
+ +      ONE_MORE_BYTE (c1);
+ +      if (c1 < 0)
         {
- -        ENCODE_COMPOSITION_EMACS_MULE (coding, data);
- -        char_offset = coding->cmp_data->char_offset;
- -        data = coding->cmp_data->data + coding->cmp_data_start;
+ +        c = - c1;
         }
- -
- -      ONE_MORE_CHAR (c);
- -      if (c == '\n' && (coding->eol_type == CODING_EOL_CRLF
- -                      || coding->eol_type == CODING_EOL_CR))
+ +      else if (UTF_8_1_OCTET_P(c1))
         {
- -        if (coding->eol_type == CODING_EOL_CRLF)
- -          EMIT_TWO_BYTES ('\r', c);
- -        else
- -          EMIT_ONE_BYTE ('\r');
+ +        c = c1;
         }
- -      else if (SINGLE_BYTE_CHAR_P (c))
+ +      else
         {
- -        if (coding->flags && ! ASCII_BYTE_P (c))
+ +        ONE_MORE_BYTE (c2);
+ +        if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
+ +          goto invalid_code;
+ +        if (UTF_8_2_OCTET_LEADING_P (c1))
             {
- -            /* As we are auto saving, retain the multibyte form for
- -               8-bit chars.  */
- -            unsigned char buf[MAX_MULTIBYTE_LENGTH];
- -            int bytes = CHAR_STRING (c, buf);
- -
- -            if (bytes == 1)
- -              EMIT_ONE_BYTE (buf[0]);
- -            else
- -              EMIT_TWO_BYTES (buf[0], buf[1]);
+ +            c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
+ +            /* Reject overlong sequences here and below.  Encoders
+ +               producing them are incorrect, they can be misleading,
+ +               and they mess up read/write invariance.  */
+ +            if (c < 128)
+ +              goto invalid_code;
             }
           else
- -          EMIT_ONE_BYTE (c);
+ +          {
+ +            ONE_MORE_BYTE (c3);
+ +            if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
+ +              goto invalid_code;
+ +            if (UTF_8_3_OCTET_LEADING_P (c1))
+ +              {
+ +                c = (((c1 & 0xF) << 12)
+ +                     | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
+ +                if (c < 0x800
+ +                    || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
+ +                  goto invalid_code;
+ +              }
+ +            else
+ +              {
+ +                ONE_MORE_BYTE (c4);
+ +                if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
+ +                  goto invalid_code;
+ +                if (UTF_8_4_OCTET_LEADING_P (c1))
+ +                  {
+ +                  c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
+ +                       | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
+ +                  if (c < 0x10000)
+ +                    goto invalid_code;
+ +                  }
+ +                else
+ +                  {
+ +                    ONE_MORE_BYTE (c5);
+ +                    if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
+ +                      goto invalid_code;
+ +                    if (UTF_8_5_OCTET_LEADING_P (c1))
+ +                      {
+ +                        c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
+ +                             | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
+ +                             | (c5 & 0x3F));
+ +                        if ((c > MAX_CHAR) || (c < 0x200000))
+ +                          goto invalid_code;
+ +                      }
+ +                    else
+ +                      goto invalid_code;
+ +                  }
+ +              }
+ +          }
         }
- -      else
- -      EMIT_BYTES (src_base, src);
- -      coding->consumed_char++;
- -    }
- - label_end_of_loop:
- -  coding->consumed = src_base - source;
- -  coding->produced = coding->produced_char = dst - destination;
- -  return;
- -}
   
- -\f
- -/*** 3. ISO2022 handlers ***/
+ +      *charbuf++ = c;
+ +      continue;
   
- -/* The following note describes the coding system ISO2022 briefly.
- -   Since the intention of this note is to help understand the
- -   functions in this file, some parts are NOT ACCURATE or are OVERLY
- -   SIMPLIFIED.  For thorough understanding, please refer to the
- -   original document of ISO2022.  This is equivalent to the standard
- -   ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
+ +    invalid_code:
+ +      src = src_base;
+ +      consumed_chars = consumed_chars_base;
+ +      ONE_MORE_BYTE (c);
+ +      *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
+ +      coding->errors++;
+ +    }
++  else
++    return 0;
   
- -   ISO2022 provides many mechanisms to encode several character sets
- -   in 7-bit and 8-bit environments.  For 7-bit environments, all text
- -   is encoded using bytes less than 128.  This may make the encoded
- -   text a little bit longer, but the text passes more easily through
- -   several types of gateway, some of which strip off the MSB (Most
- -   Significant Bit).
+ + no_more_source:
+ +  coding->consumed_char += consumed_chars_base;
+ +  coding->consumed = src_base - coding->source;
+ +  coding->charbuf_used = charbuf - coding->charbuf;
+ +}
   
- -   There are two kinds of character sets: control character sets and
- -   graphic character sets.  The former contain control characters such
- -   as `newline' and `escape' to provide control functions (control
- -   functions are also provided by escape sequences).  The latter
- -   contain graphic characters such as 'A' and '-'.  Emacs recognizes
- -   two control character sets and many graphic character sets.
   
- -   Graphic character sets are classified into one of the following
- -   four classes, according to the number of bytes (DIMENSION) and
- -   number of characters in one dimension (CHARS) of the set:
- -   - DIMENSION1_CHARS94
- -   - DIMENSION1_CHARS96
- -   - DIMENSION2_CHARS94
- -   - DIMENSION2_CHARS96
+ +static int
+ +encode_coding_utf_8 (coding)
+ +     struct coding_system *coding;
+ +{
+ +  int multibytep = coding->dst_multibyte;  /* Selects one of the two loops below.  */
+ +  int *charbuf = coding->charbuf;
+ +  int *charbuf_end = charbuf + coding->charbuf_used;
+ +  unsigned char *dst = coding->destination + coding->produced;
+ +  unsigned char *dst_end = coding->destination + coding->dst_bytes;
+ +  int produced_chars = 0;
+ +  int c;
     /* Overview: encode the characters CODING->charbuf[0 .. charbuf_used)
        and store the bytes at CODING->destination, starting at offset
        CODING->produced.  A character for which CHAR_BYTE8_P holds is
        written as the single byte CHAR_TO_BYTE8 yields; any other
        character is written in the form produced by CHAR_STRING /
        CHAR_STRING_ADVANCE.  On exit the result is recorded as
        CODING_RESULT_SUCCESS, CODING->produced and CODING->produced_char
        are updated, and 0 is returned.
        NOTE(review): dst_end is not referenced by name in this function;
        presumably ASSURE_DESTINATION / EMIT_ONE_BYTE use it (and dst and
        produced_chars) -- confirm against the macro definitions.  */
- -   In addition, each character set is assigned an identification tag,
- -   unique for each set, called the "final character" (denoted as <F>
- -   hereafter).  The <F> of each character set is decided by ECMA(*)
- -   when it is registered in ISO.  The code range of <F> is 0x30..0x7F
- -   (0x30..0x3F are for private use only).
+ +  if (multibytep)
+ +    {
+ +      int safe_room = MAX_MULTIBYTE_LENGTH * 2;
   
- -   Note (*): ECMA = European Computer Manufacturers Association
+ +      while (charbuf < charbuf_end)
+ +      {
+ +        unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
           /* Each character is first built in STR, then emitted byte by
              byte through EMIT_ONE_BYTE.  */
- -   Here are examples of graphic character sets [NAME(<F>)]:
+ +        ASSURE_DESTINATION (safe_room);
+ +        c = *charbuf++;
+ +        if (CHAR_BYTE8_P (c))
+ +          {
+ +            c = CHAR_TO_BYTE8 (c);
+ +            EMIT_ONE_BYTE (c);
+ +          }
+ +        else
+ +          {
+ +            CHAR_STRING_ADVANCE (c, pend);
+ +            for (p = str; p < pend; p++)
+ +              EMIT_ONE_BYTE (*p);
+ +          }
+ +      }
+ +    }
+ +  else
+ +    {
+ +      int safe_room = MAX_MULTIBYTE_LENGTH;
+ +      /* Here the bytes are stored directly at DST and PRODUCED_CHARS
+ +         is advanced once per source character.  */
+ +      while (charbuf < charbuf_end)
+ +      {
+ +        ASSURE_DESTINATION (safe_room);
+ +        c = *charbuf++;
+ +        if (CHAR_BYTE8_P (c))
+ +          *dst++ = CHAR_TO_BYTE8 (c);
+ +        else
+ +          dst += CHAR_STRING (c, dst);
+ +        produced_chars++;
+ +      }
+ +    }
+ +  record_conversion_result (coding, CODING_RESULT_SUCCESS);
+ +  coding->produced_char += produced_chars;
+ +  coding->produced = dst - coding->destination;
+ +  return 0;
+ +}
+ +
+ +
+ +/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
+ +   Check if a text is encoded in one of UTF-16 based coding systems.
+ +   If it is, return 1, else return 0.  */
+ +/* Nonzero if bits 10..15 of VAL match those of 0xD800, i.e. a 16-bit
+ +   VAL is a high (leading) surrogate in the range 0xD800..0xDBFF.  */
+ +#define UTF_16_HIGH_SURROGATE_P(val) \
+ +  (((val) & 0xFC00) == 0xD800)
+ +/* Nonzero if bits 10..15 of VAL match those of 0xDC00, i.e. a 16-bit
+ +   VAL is a low (trailing) surrogate in the range 0xDC00..0xDFFF.  */
+ +#define UTF_16_LOW_SURROGATE_P(val) \
+ +  (((val) & 0xFC00) == 0xDC00)
+ +/* Nonzero if VAL is 0xFFFE, 0xFFFF, or a low surrogate.  VAL is
+ +   evaluated more than once.  */
+ +#define UTF_16_INVALID_P(val) \
+ +  (((val) == 0xFFFE)          \
+ +   || ((val) == 0xFFFF)               \
+ +   || UTF_16_LOW_SURROGATE_P (val))
+ +
+ +/* Look for a UTF-16 byte order mark in the first two bytes of
+ +   CODING->source and record the outcome in DETECT_INFO.  FF FE sets
+ +   the found bits of the LE and AUTO categories, FE FF those of the
+ +   BE and AUTO categories; in both cases the opposite signature
+ +   category and both NOSIG categories are rejected.  Return 0 only
+ +   when the whole UTF-16 category is rejected up front; every other
+ +   path, including the one through the no_more_source label, returns
+ +   1.  */
+ +static int
+ +detect_coding_utf_16 (coding, detect_info)
+ +     struct coding_system *coding;
+ +     struct coding_detection_info *detect_info;
+ +{
+ +  const unsigned char *src = coding->source, *src_base = src;
+ +  const unsigned char *src_end = coding->source + coding->src_bytes;
+ +  int multibytep = coding->src_multibyte;
+ +  int consumed_chars = 0;
+ +  int c1, c2;
+ +  /* NOTE(review): src_base, src_end, multibytep and consumed_chars are
+ +     not referenced by name below; presumably ONE_MORE_BYTE uses them
+ +     and jumps to no_more_source when the source is exhausted --
+ +     confirm against the macro definition.  */
+ +  detect_info->checked |= CATEGORY_MASK_UTF_16;
+ +  if (coding->mode & CODING_MODE_LAST_BLOCK
+ +      && (coding->src_chars & 1))
+ +    {
+ +      detect_info->rejected |= CATEGORY_MASK_UTF_16;
+ +      return 0;
+ +    }
+ +  /* Read the two bytes that would form a byte order mark.  */
+ +  ONE_MORE_BYTE (c1);
+ +  ONE_MORE_BYTE (c2);
+ +  if ((c1 == 0xFF) && (c2 == 0xFE))
+ +    {
+ +      detect_info->found |= (CATEGORY_MASK_UTF_16_LE
+ +                           | CATEGORY_MASK_UTF_16_AUTO);
+ +      detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
+ +                              | CATEGORY_MASK_UTF_16_BE_NOSIG
+ +                              | CATEGORY_MASK_UTF_16_LE_NOSIG);
+ +    }
+ +  else if ((c1 == 0xFE) && (c2 == 0xFF))
+ +    {
+ +      detect_info->found |= (CATEGORY_MASK_UTF_16_BE
+ +                           | CATEGORY_MASK_UTF_16_AUTO);
+ +      detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
+ +                              | CATEGORY_MASK_UTF_16_BE_NOSIG
+ +                              | CATEGORY_MASK_UTF_16_LE_NOSIG);
+ +    }
+ +  else if (c1 >= 0 && c2 >= 0)
+ +    {  /* Two ordinary bytes that are not a BOM: only the BE and LE
+ +          categories are rejected; the NOSIG ones are left alone.  */
+ +      detect_info->rejected
+ +      |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE);
+ +    }
+ + no_more_source:
+ +  return 1;
+ +}
+ +
+ +/* Decode the UTF-16 bytes starting at CODING->source + CODING->consumed
+ +   and append the resulting character codes to CODING->charbuf.
+ +
+ +   If the coding system says a byte order mark is present, the first
+ +   two bytes are compared with the mark expected for the configured
+ +   endianness; a mismatch is counted in CODING->errors and the bytes
+ +   are then decoded as ordinary text.  In either case the coding
+ +   system is switched to utf_16_without_bom, so the check runs once.
+ +
+ +   A high surrogate is kept in CODING_UTF_16_SURROGATE (coding) until
+ +   the following code unit arrives, so a pair may span two calls.  A
+ +   high surrogate not followed by a low one is flushed as its two raw
+ +   byte values and counted as an error.
+ +
+ +   On exit CODING->consumed, CODING->consumed_char and
+ +   CODING->charbuf_used describe the last completely decoded unit.
+ +   NOTE(review): ONE_MORE_BYTE is assumed to advance SRC and
+ +   CONSUMED_CHARS and to jump to no_more_source when the source is
+ +   exhausted -- confirm against its definition.  */
+ +static void
+ +decode_coding_utf_16 (coding)
+ +     struct coding_system *coding;
+ +{
+ +  const unsigned char *src = coding->source + coding->consumed;
+ +  const unsigned char *src_end = coding->source + coding->src_bytes;
+ +  const unsigned char *src_base;
+ +  int *charbuf = coding->charbuf + coding->charbuf_used;
+ +  int *charbuf_end = coding->charbuf + coding->charbuf_size;
+ +  /* CONSUMED_CHARS_BASE is read at no_more_source, which can be
+ +     reached from the BOM check before the main loop has assigned it,
+ +     so it must start out defined.  */
+ +  int consumed_chars = 0, consumed_chars_base = 0;
+ +  int multibytep = coding->src_multibyte;
+ +  enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
+ +  enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
+ +  int surrogate = CODING_UTF_16_SURROGATE (coding);
+ +  Lisp_Object attr, charset_list;
+ +
+ +  CODING_GET_INFO (coding, attr, charset_list);
+ +
+ +  if (bom == utf_16_with_bom)
+ +    {
+ +      int c, c1, c2;
+ +
+ +      src_base = src;
+ +      consumed_chars_base = consumed_chars;
+ +      ONE_MORE_BYTE (c1);
+ +      ONE_MORE_BYTE (c2);
+ +      c = (c1 << 8) | c2;
+ +
+ +      if (endian == utf_16_big_endian
+ +          ? c != 0xFEFF : c != 0xFFFE)
+ +        {
+ +          /* The first two bytes are not BOM.  Treat them as bytes
+ +             for a normal character.  Rewind the character count
+ +             together with SRC so the bytes are not counted twice.  */
+ +          src = src_base;
+ +          consumed_chars = consumed_chars_base;
+ +          coding->errors++;
+ +        }
+ +      CODING_UTF_16_BOM (coding) = utf_16_without_bom;
+ +    }
+ +  else if (bom == utf_16_detect_bom)
+ +    {
+ +      /* We have already tried to detect BOM and failed in
+ +         detect_coding.  */
+ +      CODING_UTF_16_BOM (coding) = utf_16_without_bom;
+ +    }
+ +
+ +  while (1)
+ +    {
+ +      int c, c1, c2;
+ +
+ +      src_base = src;
+ +      consumed_chars_base = consumed_chars;
+ +
+ +      /* One iteration stores at most three elements (two bytes of a
+ +         stray surrogate plus one character).  */
+ +      if (charbuf + 2 >= charbuf_end)
+ +        break;
+ +
+ +      ONE_MORE_BYTE (c1);
+ +      if (c1 < 0)
+ +        {
+ +          *charbuf++ = -c1;
+ +          continue;
+ +        }
+ +      ONE_MORE_BYTE (c2);
+ +      if (c2 < 0)
+ +        {
+ +          *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
+ +          *charbuf++ = -c2;
+ +          continue;
+ +        }
+ +      c = (endian == utf_16_big_endian
+ +           ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
+ +      if (surrogate)
+ +        {
+ +          if (! UTF_16_LOW_SURROGATE_P (c))
+ +            {
+ +              /* The pending high surrogate has no partner: flush its
+ +                 two bytes in source order and count an error.  */
+ +              if (endian == utf_16_big_endian)
+ +                c1 = surrogate >> 8, c2 = surrogate & 0xFF;
+ +              else
+ +                c1 = surrogate & 0xFF, c2 = surrogate >> 8;
+ +              *charbuf++ = c1;
+ +              *charbuf++ = c2;
+ +              coding->errors++;
+ +              if (UTF_16_HIGH_SURROGATE_P (c))
+ +                CODING_UTF_16_SURROGATE (coding) = surrogate = c;
+ +              else
+ +                {
+ +                  /* The stray surrogate has been flushed; forget it,
+ +                     otherwise the next code unit would flush it again
+ +                     or be paired with it.  */
+ +                  CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
+ +                  *charbuf++ = c;
+ +                }
+ +            }
+ +          else
+ +            {
+ +              c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
+ +              CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
+ +              *charbuf++ = 0x10000 + c;
+ +            }
+ +        }
+ +      else
+ +        {
+ +          if (UTF_16_HIGH_SURROGATE_P (c))
+ +            CODING_UTF_16_SURROGATE (coding) = surrogate = c;
+ +          else
+ +            *charbuf++ = c;
+ +        }
+ +    }
+ +
+ + no_more_source:
+ +  coding->consumed_char += consumed_chars_base;
+ +  coding->consumed = src_base - coding->source;
+ +  coding->charbuf_used = charbuf - coding->charbuf;
+ +}
+ +
+ +/* Encode the characters CODING->charbuf[0 .. charbuf_used) as UTF-16
+ +   and store the bytes at CODING->destination, starting at offset
+ +   CODING->produced.
+ +
+ +   Unless the coding system is utf_16_without_bom, a byte order mark
+ +   for the configured endianness is written first and the coding
+ +   system is switched to utf_16_without_bom, so the mark is written
+ +   once.  A character above MAX_UNICODE_CHAR is replaced by
+ +   CODING->default_char.  Characters below 0x10000 take one 16-bit
+ +   unit, the others a surrogate pair.
+ +
+ +   The result is recorded as CODING_RESULT_SUCCESS and 0 is returned.
+ +   NOTE(review): dst_end, multibytep and produced_chars are assumed to
+ +   be used by ASSURE_DESTINATION / EMIT_TWO_BYTES -- confirm against
+ +   the macro definitions.  */
+ +static int
+ +encode_coding_utf_16 (coding)
+ +     struct coding_system *coding;
+ +{
+ +  int multibytep = coding->dst_multibyte;
+ +  int *charbuf = coding->charbuf;
+ +  int *charbuf_end = charbuf + coding->charbuf_used;
+ +  unsigned char *dst = coding->destination + coding->produced;
+ +  unsigned char *dst_end = coding->destination + coding->dst_bytes;
+ +  int safe_room = 8;
+ +  enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
+ +  int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
+ +  int produced_chars = 0;
+ +  Lisp_Object attrs, charset_list;
+ +  int c;
+ +
+ +  CODING_GET_INFO (coding, attrs, charset_list);
+ +
+ +  if (bom != utf_16_without_bom)
+ +    {
+ +      ASSURE_DESTINATION (safe_room);
+ +      if (big_endian)
+ +        EMIT_TWO_BYTES (0xFE, 0xFF);
+ +      else
+ +        EMIT_TWO_BYTES (0xFF, 0xFE);
+ +      CODING_UTF_16_BOM (coding) = utf_16_without_bom;
+ +    }
+ +
+ +  while (charbuf < charbuf_end)
+ +    {
+ +      ASSURE_DESTINATION (safe_room);
+ +      c = *charbuf++;
+ +      /* MAX_UNICODE_CHAR itself is still encodable (this assumes it is
+ +         the largest valid code, inclusive); only codes beyond it are
+ +         replaced.  */
+ +      if (c > MAX_UNICODE_CHAR)
+ +        c = coding->default_char;
+ +
+ +      if (c < 0x10000)
+ +        {
+ +          if (big_endian)
+ +            EMIT_TWO_BYTES (c >> 8, c & 0xFF);
+ +          else
+ +            EMIT_TWO_BYTES (c & 0xFF, c >> 8);
+ +        }
+ +      else
+ +        {
+ +          int c1, c2;
+ +
+ +          /* Split into a high (C1) and a low (C2) surrogate.  */
+ +          c -= 0x10000;
+ +          c1 = (c >> 10) + 0xD800;
+ +          c2 = (c & 0x3FF) + 0xDC00;
+ +          if (big_endian)
+ +            EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
+ +          else
+ +            EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
+ +        }
+ +    }
+ +  record_conversion_result (coding, CODING_RESULT_SUCCESS);
+ +  coding->produced = dst - coding->destination;
+ +  coding->produced_char += produced_chars;
+ +  return 0;
+ +}
+ +
+ +\f
+ +/*** 6. Old Emacs' internal format (emacs-mule) ***/
+ +
+ +/* Emacs' internal format for representation of multiple character
+ +   sets is a kind of multi-byte encoding, i.e. characters are
+ +   represented by variable-length sequences of one-byte codes.
+ +
+ +   ASCII characters and control characters (e.g. `tab', `newline') are
+ +   represented by one-byte sequences which are their ASCII codes, in
+ +   the range 0x00 through 0x7F.
+ +
+ +   8-bit characters of the range 0x80..0x9F are represented by
+ +   two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
+ +   code + 0x20).
+ +
+ +   8-bit characters of the range 0xA0..0xFF are represented by
+ +   one-byte sequences which are their 8-bit code.
+ +
+ +   The other characters are represented by a sequence of `base
+ +   leading-code', optional `extended leading-code', and one or two
+ +   `position-code's.  The length of the sequence is determined by the
+ +   base leading-code.  Leading-code takes the range 0x81 through 0x9D,
+ +   whereas extended leading-code and position-code take the range 0xA0
+ +   through 0xFF.  See `charset.h' for more details about leading-code
+ +   and position-code.
+ +
+ +   --- CODE RANGE of Emacs' internal format ---
+ +   character set      range
+ +   -------------      -----
+ +   ascii              0x00..0x7F
+ +   eight-bit-control  LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
+ +   eight-bit-graphic  0xA0..0xFF
+ +   ELSE                       0x81..0x9D + [0xA0..0xFF]+
+ +   ---------------------------------------------
+ +
+ +   As this is the internal character representation, the format is
+ +   usually not used externally (i.e. in a file or in a data sent to a
+ +   process).  But, it is possible to have a text externally in this
+ +   format (i.e. by encoding by the coding system `emacs-mule').
+ +
+ +   In that case, a sequence of one-byte codes has a slightly different
+ +   form.
+ +
+ +   At first, all characters in eight-bit-control are represented by
+ +   one-byte sequences which are their 8-bit code.
+ +
+ +   Next, character composition data are represented by the byte
+ +   sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
+ +   where,
+ +      METHOD is 0xF2 plus one of the composition methods (enum
+ +      composition_method),
+ +
+ +      BYTES is 0xA0 plus a byte length of this composition data,
+ +
+ +      CHARS is 0xA0 plus the number of characters composed by this
+ +      data,
+ +
+ +      COMPONENTs are characters in multibyte form or composition
+ +      rules encoded as two bytes of ASCII codes.
+ +
+ +   In addition, for backward compatibility, the following formats are
+ +   also recognized as composition data on decoding.
+ +
+ +   0x80 MSEQ ...
+ +   0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
+ +
+ +   Here,
+ +      MSEQ is a multibyte form but in this special format:
+ +        ASCII: 0xA0 ASCII_CODE+0x80,
+ +        other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
+ +      RULE is a one byte code of the range 0xA0..0xF0 that
+ +      represents a composition rule.
+ +  */
+ +/* Table indexed by a byte value.  emacs_mule_char switches on the
+ +   entry of a sequence's first byte (cases 1 to 4, anything else
+ +   aborts), and detect_coding_emacs_mule takes the entry minus one as
+ +   the number of bytes that must follow.  It is not initialized
+ +   here.  */
+ +char emacs_mule_bytes[256];
+ +/* Decode the one character whose emacs-mule byte sequence starts at
+ +   SRC and return its character code.  The source is taken to end at
+ +   CODING->source + CODING->src_bytes.  On success store the number
+ +   of bytes read in *NBYTES, the value of the local consumed_chars
+ +   counter in *NCHARS and, if ID is non-NULL, the id of the charset
+ +   in *ID.  Return -1 if the sequence is invalid and -2 via the
+ +   no_more_source label; the output arguments are not written in
+ +   those cases.  */
+ +int
+ +emacs_mule_char (coding, src, nbytes, nchars, id)
+ +     struct coding_system *coding;
+ +     const unsigned char *src;
+ +     int *nbytes, *nchars, *id;
+ +{
+ +  const unsigned char *src_end = coding->source + coding->src_bytes;
+ +  const unsigned char *src_base = src;
+ +  int multibytep = coding->src_multibyte;
+ +  struct charset *charset;
+ +  unsigned code;
+ +  int c;
+ +  int consumed_chars = 0;
+ +  /* NOTE(review): src_end, multibytep and consumed_chars are presumably
+ +     used by ONE_MORE_BYTE, which is also what would jump to
+ +     no_more_source -- confirm against the macro definition.  */
+ +  ONE_MORE_BYTE (c);
+ +  if (c < 0)
+ +    {  /* A negative value is negated and returned as the character.  */
+ +      c = -c;
+ +      charset = emacs_mule_charset[0];
+ +    }
+ +  else
+ +    {
+ +      switch (emacs_mule_bytes[c])
+ +      {
+ +      case 2:  /* Leading byte selecting the charset + one code byte.  */
+ +        if (! (charset = emacs_mule_charset[c]))
+ +          goto invalid_code;
+ +        ONE_MORE_BYTE (c);
+ +        if (c < 0xA0)
+ +          goto invalid_code;
+ +        code = c & 0x7F;
+ +        break;
+ +      /* Either a private leading code, a charset byte and one code
+ +         byte, or a leading byte selecting the charset and two code
+ +         bytes.  */
+ +      case 3:
+ +        if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
+ +            || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
+ +          {
+ +            ONE_MORE_BYTE (c);
+ +            if (c < 0xA0 || ! (charset = emacs_mule_charset[c]))
+ +              goto invalid_code;
+ +            ONE_MORE_BYTE (c);
+ +            if (c < 0xA0)
+ +              goto invalid_code;
+ +            code = c & 0x7F;
+ +          }
+ +        else
+ +          {
+ +            if (! (charset = emacs_mule_charset[c]))
+ +              goto invalid_code;
+ +            ONE_MORE_BYTE (c);
+ +            if (c < 0xA0)
+ +              goto invalid_code;
+ +            code = (c & 0x7F) << 8;
+ +            ONE_MORE_BYTE (c);
+ +            if (c < 0xA0)
+ +              goto invalid_code;
+ +            code |= c & 0x7F;
+ +          }
+ +        break;
+ +      /* The second byte selects the charset; two code bytes follow.  */
+ +      case 4:
+ +        ONE_MORE_BYTE (c);
+ +        if (c < 0 || ! (charset = emacs_mule_charset[c]))
+ +          goto invalid_code;
+ +        ONE_MORE_BYTE (c);
+ +        if (c < 0xA0)
+ +          goto invalid_code;
+ +        code = (c & 0x7F) << 8;
+ +        ONE_MORE_BYTE (c);
+ +        if (c < 0xA0)
+ +          goto invalid_code;
+ +        code |= c & 0x7F;
+ +        break;
+ +      /* The byte is the code itself, in ASCII or in eight-bit.  */
+ +      case 1:
+ +        code = c;
+ +        charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
+ +                                   ? charset_ascii : charset_eight_bit);
+ +        break;
+ +
+ +      default:
+ +        abort ();
+ +      }
+ +      c = DECODE_CHAR (charset, code);
+ +      if (c < 0)
+ +      goto invalid_code;
+ +    }
+ +  *nbytes = src - src_base;
+ +  *nchars = consumed_chars;
+ +  if (id)
+ +    *id = charset->id;
+ +  return c;
+ +
+ + no_more_source:
+ +  return -2;
+ +
+ + invalid_code:
+ +  return -1;
+ +}
+ +
+ +
+ +/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
+ +   Check if a text is encoded in `emacs-mule'.  If it is, return 1,
+ +   else return 0.
+ +
+ +   Scanning starts after the CODING->head_ascii leading bytes.  The
+ +   category is rejected, and 0 returned, on ESC, SI or SO, on a 0x80
+ +   that is not followed by a long enough run of bytes >= 0xA0, and on
+ +   a leading byte that is not followed by as many bytes >= 0xA0 as
+ +   emacs_mule_bytes requires for it.  It is also rejected when the
+ +   source ends in the middle of a sequence and this is the last
+ +   block.  Otherwise 1 is returned, and CATEGORY_MASK_EMACS_MULE is
+ +   added to DETECT_INFO->found if at least one composition or
+ +   multibyte sequence was seen.  */
+ +
+ +static int
+ +detect_coding_emacs_mule (coding, detect_info)
+ +     struct coding_system *coding;
+ +     struct coding_detection_info *detect_info;
+ +{
+ +  const unsigned char *src = coding->source, *src_base;
+ +  const unsigned char *src_end = coding->source + coding->src_bytes;
+ +  int multibytep = coding->src_multibyte;
+ +  int consumed_chars = 0;
+ +  int c;
+ +  int found = 0;
+ +
+ +  detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
+ +  /* A coding system of this category is always ASCII compatible.  */
+ +  src += coding->head_ascii;
+ +
+ +  while (1)
+ +    {
+ +      src_base = src;
+ +      ONE_MORE_BYTE (c);
+ +      if (c < 0)
+ +        continue;
+ +      if (c == 0x80)
+ +        {
+ +          /* Perhaps the start of composite character.  We simply skip
+ +             it because analyzing it is too heavy for detecting.  But,
+ +             at least, we check that the composite character
+ +             consists of more than 4 bytes.  */
+ +          /* Named differently from SRC_BASE so that the latter keeps
+ +             pointing at the 0x80 for the check at no_more_source.  */
+ +          const unsigned char *src_start;
+ +
+ +        repeat:
+ +          src_start = src;
+ +          do
+ +            {
+ +              ONE_MORE_BYTE (c);
+ +            }
+ +          while (c >= 0xA0);
+ +
+ +          if (src - src_start <= 4)
+ +            break;
+ +          found = CATEGORY_MASK_EMACS_MULE;
+ +          if (c == 0x80)
+ +            goto repeat;
+ +        }
+ +
+ +      if (c < 0x80)
+ +        {
+ +          if (c < 0x20
+ +              && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
+ +            break;
+ +        }
+ +      else
+ +        {
+ +          /* Index by the byte just read.  After a composition has
+ +             been skipped, SRC_BASE still points at its 0x80, not at
+ +             this leading byte.  */
+ +          int more_bytes = emacs_mule_bytes[c] - 1;
+ +
+ +          while (more_bytes > 0)
+ +            {
+ +              ONE_MORE_BYTE (c);
+ +              if (c < 0xA0)
+ +                {
+ +                  src--;        /* Unread the last byte.  */
+ +                  break;
+ +                }
+ +              more_bytes--;
+ +            }
+ +          if (more_bytes != 0)
+ +            break;
+ +          found = CATEGORY_MASK_EMACS_MULE;
+ +        }
+ +    }
+ +  detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
+ +  return 0;
+ +
+ + no_more_source:
+ +  if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
+ +    {
+ +      detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
+ +      return 0;
+ +    }
+ +  detect_info->found |= found;
+ +  return 1;
+ +}
+ +
+ +
+ +/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
+ +
+ +/* Decode a character represented as a component of composition
+ +   sequence of Emacs 20/21 style at SRC, using emacs_mule_char.  Store
+ +   the character in *BUF, increment BUF, and advance SRC and
+ +   CONSUMED_CHARS past it.  If SRC is already at SRC_END, or
+ +   emacs_mule_char returns -2, execute `break', which applies to the
+ +   construct enclosing the place where the macro is used.  If
+ +   emacs_mule_char returns any other negative value, jump to the
+ +   label invalid_code.  The body is an `if (1) ... else' so that the
+ +   semicolon written after the macro call ends the statement.  */
+ +
+ +#define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf)                       \
+ +  if (1)                                                      \
+ +    {                                                         \
+ +      int c;                                                  \
+ +      int nbytes, nchars;                                     \
+ +                                                              \
+ +      if (src == src_end)                                     \
+ +      break;                                                  \
+ +      c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
+ +      if (c < 0)                                              \
+ +      {                                                       \
+ +        if (c == -2)                                          \
+ +          break;                                              \
+ +        goto invalid_code;                                    \
+ +      }                                                       \
+ +      *buf++ = c;                                             \
+ +      src += nbytes;                                          \
+ +      consumed_chars += nchars;                                       \
+ +    }                                                         \
+ +  else
+ +
+ +
+ +/* Decode a composition rule represented as a component of composition
+ +   sequence of Emacs 20 style at SRC.  Store the decoded rule in *BUF,
+ +   and increment BUF.  If SRC points an invalid byte sequence, set C
+ +   to -1.  */
+ +
+ +#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf)    \
+ +  do {                                                        \
+ +    int c, gref, nref;                                        \
+ +                                                      \
+ +    if (src >= src_end)                                       \
+ +      goto invalid_code;                              \
+ +    ONE_MORE_BYTE_NO_CHECK (c);                               \
+ +    c -= 0x20;                                                \
+ +    if (c < 0 || c >= 81)                             \
+ +      goto invalid_code;                              \
+ +                                                      \
+ +    gref = c / 9, nref = c % 9;                               \
+ +    *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);    \
+ +  } while (0)
+ +
+ +
+ +/* Decode a composition rule represented as a component of composition
+ +   sequence of Emacs 21 style at SRC.  The rule takes two bytes, GREF
+ +   and NREF, each stored with 0x20 added.  Store the decoded rule in
+ +   *BUF, and increment BUF.  If fewer than two bytes remain before
+ +   SRC_END, or either value is outside 0..80, jump to the label
+ +   invalid_code.  */
+ +
+ +#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf)    \
+ +  do {                                                        \
+ +    int gref, nref;                                   \
+ +                                                      \
+ +    if (src + 1>= src_end)                            \
+ +      goto invalid_code;                              \
+ +    ONE_MORE_BYTE_NO_CHECK (gref);                    \
+ +    gref -= 0x20;                                     \
+ +    ONE_MORE_BYTE_NO_CHECK (nref);                    \
+ +    nref -= 0x20;                                     \
+ +    if (gref < 0 || gref >= 81                                \
+ +      || nref < 0 || nref >= 81)                      \
+ +      goto invalid_code;                              \
+ +    *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);    \
+ +  } while (0)
+ +
+ +/* Decode the header of an Emacs 21 style composition.  C holds the
+ +   method byte on entry; the BYTES and CHARS bytes are read here, and
+ +   a negative byte or a BYTES value below 3 jumps to invalid_code.
+ +   The composition is then registered with ADD_COMPOSITION_DATA.  For
+ +   every method except COMPOSITION_RELATIVE the components are read
+ +   until CONSUMED_CHARS reaches CONSUMED_CHARS_BASE + BYTES; entries
+ +   at odd positions are rules unless the method is
+ +   COMPOSITION_WITH_ALTCHARS.  If a component decoder breaks out of
+ +   the loop early, jump to invalid_code.  Finally the number of
+ +   components is subtracted from the first element stored at CHARBUF
+ +   on entry.  NOTE(review): the layout ADD_COMPOSITION_DATA writes is
+ +   not visible here -- confirm what that first element holds.  */
+ +#define DECODE_EMACS_MULE_21_COMPOSITION(c)                           \
+ +  do {                                                                        \
+ +    /* Emacs 21 style format.  The first three bytes at SRC are               \
+ +       (METHOD + 0xF2), (BYTES + 0xA0), (CHARS + 0xA0), where BYTES is        \
+ +       the byte length of this composition information, CHARS is the  \
+ +       number of characters composed by this composition.  */         \
+ +    enum composition_method method = c - 0xF2;                                \
+ +    int *charbuf_base = charbuf;                                      \
+ +    int consumed_chars_limit;                                         \
+ +    int nbytes, nchars;                                                       \
+ +                                                                      \
+ +    ONE_MORE_BYTE (c);                                                        \
+ +    if (c < 0)                                                                \
+ +      goto invalid_code;                                              \
+ +    nbytes = c - 0xA0;                                                        \
+ +    if (nbytes < 3)                                                   \
+ +      goto invalid_code;                                              \
+ +    ONE_MORE_BYTE (c);                                                        \
+ +    if (c < 0)                                                                \
+ +      goto invalid_code;                                              \
+ +    nchars = c - 0xA0;                                                        \
+ +    ADD_COMPOSITION_DATA (charbuf, nchars, method);                   \
+ +    consumed_chars_limit = consumed_chars_base + nbytes;              \
+ +    if (method != COMPOSITION_RELATIVE)                                       \
+ +      {                                                                       \
+ +      int i = 0;                                                      \
+ +      while (consumed_chars < consumed_chars_limit)                   \
+ +        {                                                             \
+ +          if (i % 2 && method != COMPOSITION_WITH_ALTCHARS)           \
+ +            DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf);          \
+ +          else                                                        \
+ +            DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf);             \
+ +          i++;                                                        \
+ +        }                                                             \
+ +      if (consumed_chars < consumed_chars_limit)                      \
+ +        goto invalid_code;                                            \
+ +      charbuf_base[0] -= i;                                           \
+ +      }                                                                       \
+ +  } while (0)
+ +
+ +
+ +#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c)          \
+ +  do {                                                                \
+ +    /* Emacs 20 style format for relative composition.  */    \
+ +    /* Store multibyte form of characters to be composed.  */ \
+ +    enum composition_method method = COMPOSITION_RELATIVE;    \
+ +    int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];               \
+ +    int *buf = components;                                    \
+ +    int i, j;                                                 \
+ +                                                              \
+ +    src = src_base;                                           \
+ +    ONE_MORE_BYTE (c);                /* skip 0x80 */                 \
+ +    for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++)          \
+ +      DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                       \
+ +    if (i < 2)                                                        \
+ +      goto invalid_code;                                      \
+ +    ADD_COMPOSITION_DATA (charbuf, i, method);                        \
+ +    for (j = 0; j < i; j++)                                   \
+ +      *charbuf++ = components[j];                             \
+ +  } while (0)
+ +
+ +
+ +/* Decode an Emacs 20 style rule-base composition sequence: a
+ +   character followed by (rule, character) pairs, collected in a
+ +   local COMPONENTS array.  Jump to invalid_code if the collected
+ +   sequence is malformed, or to no_more_source if the charbuf lacks
+ +   room for the result.  Otherwise add a COMPOSITION_WITH_RULE
+ +   annotation to the charbuf, followed by the first I collected
+ +   components and then by every second one of them starting with the
+ +   first.  The argument C is unused.  */
+ +#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c)          \
+ +  do {                                                                \
+ +    /* Emacs 20 style format for rule-base composition.  */   \
+ +    /* Store multibyte form of characters to be composed.  */ \
+ +    enum composition_method method = COMPOSITION_WITH_RULE;   \
+ +    int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];               \
+ +    int *buf = components;                                    \
+ +    int i, j;                                                 \
+ +                                                              \
+ +    /* NOTE(review): the loop below may run                   \
+ +       MAX_COMPOSITION_COMPONENTS times after the initial     \
+ +       character; if each helper invocation stores one        \
+ +       element, BUF can pass the end of COMPONENTS -- confirm \
+ +       against the helper macros.  */                         \
+ +    DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                 \
+ +    for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++)          \
+ +      {                                                               \
+ +      DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf);            \
+ +      DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);               \
+ +      }                                                               \
+ +    if (i < 1 || (buf - components) % 2 == 0)                 \
+ +      goto invalid_code;                                      \
+ +    /* Give up when the charbuf CANNOT hold the result.  The  \
+ +       test used to be `<', which bailed out exactly when     \
+ +       there was enough room.  */                             \
+ +    if (charbuf + i + (i / 2) + 1 >= charbuf_end)             \
+ +      goto no_more_source;                                    \
+ +    /* The annotation belongs in the charbuf, not in the      \
+ +       local scratch array that BUF points into.  */          \
+ +    ADD_COMPOSITION_DATA (charbuf, i, method);                \
+ +    for (j = 0; j < i; j++)                                   \
+ +      *charbuf++ = components[j];                             \
+ +    for (j = 0; j < i; j += 2)                                        \
+ +      *charbuf++ = components[j];                             \
+ +  } while (0)
+ +
+ +
+ +/* Decode the emacs-mule byte sequence at CODING->source (starting
+ +   CODING->consumed bytes in) into character codes appended to
+ +   CODING->charbuf.
+ +
+ +   Decoding stops when the charbuf is within MAX_ANNOTATION_LENGTH
+ +   elements of its end, or when control reaches the no_more_source
+ +   label (the byte-fetching and composition macros used below are
+ +   defined outside this function; presumably they jump there when
+ +   the source is exhausted).  On exit CODING->consumed,
+ +   CODING->consumed_char and CODING->charbuf_used are updated; the
+ +   consumed counts stop at the start of the unit that was being
+ +   decoded when the loop was left.
+ +
+ +   A sequence that cannot be decoded is not fatal: its first byte is
+ +   emitted as is (ASCII) or as an eight-bit character, CODING->errors
+ +   is incremented, and decoding resumes after that byte.  */
+ +static void
+ +decode_coding_emacs_mule (coding)
+ +     struct coding_system *coding;
+ +{
+ +  const unsigned char *src = coding->source + coding->consumed;
+ +  const unsigned char *src_end = coding->source + coding->src_bytes;
+ +  const unsigned char *src_base;
+ +  int *charbuf = coding->charbuf + coding->charbuf_used;
+ +  /* Stop MAX_ANNOTATION_LENGTH elements short of the real end;
+ +     presumably this keeps room for the annotation that may still be
+ +     added after the loop.  */
+ +  int *charbuf_end
+ +    = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
+ +  int consumed_chars = 0, consumed_chars_base;
+ +  int multibytep = coding->src_multibyte;
+ +  Lisp_Object attrs, charset_list;
+ +  /* LAST_ID is the charset of the current run of characters and
+ +     LAST_OFFSET the character offset at which that run began.  */
+ +  int char_offset = coding->produced_char;
+ +  int last_offset = char_offset;
+ +  int last_id = charset_ascii;
+ +
+ +  CODING_GET_INFO (coding, attrs, charset_list);
+ +
+ +  while (1)
+ +    {
+ +      int c;
+ +
+ +      /* Remember where this unit starts so that it can be rewound.  */
+ +      src_base = src;
+ +      consumed_chars_base = consumed_chars;
+ +
+ +      if (charbuf >= charbuf_end)
+ +      break;
+ +
+ +      ONE_MORE_BYTE (c);
+ +      if (c < 0)
+ +      {
+ +        /* NOTE(review): a negative value presumably marks a byte
+ +           that ONE_MORE_BYTE already resolved to a character; the
+ +           macro is defined elsewhere -- confirm.  */
+ +        *charbuf++ = -c;
+ +        char_offset++;
+ +      }
+ +      else if (c < 0x80)
+ +      {
+ +        *charbuf++ = c;
+ +        char_offset++;
+ +      }
+ +      else if (c == 0x80)
+ +      {
+ +        /* 0x80 introduces a composition; the following byte selects
+ +           among the Emacs 21 format and the two Emacs 20 formats.  */
+ +        ONE_MORE_BYTE (c);
+ +        if (c < 0)
+ +          goto invalid_code;
+ +        if (c - 0xF2 >= COMPOSITION_RELATIVE
+ +            && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
+ +          DECODE_EMACS_MULE_21_COMPOSITION (c);
+ +        else if (c < 0xC0)
+ +          DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
+ +        else if (c == 0xFF)
+ +          DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
+ +        else
+ +          goto invalid_code;
+ +      }
+ +      else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
+ +      {
+ +        int nbytes, nchars;
+ +        int id;
+ +
+ +        /* Rewind to the start of the sequence and let
+ +           emacs_mule_char decode it as a whole.  */
+ +        src = src_base;
+ +        consumed_chars = consumed_chars_base;
+ +        c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
+ +        if (c < 0)
+ +          {
+ +            /* NOTE(review): -2 presumably means the source ends
+ +               inside the sequence; stop and leave it unconsumed.  */
+ +            if (c == -2)
+ +              break;
+ +            goto invalid_code;
+ +          }
+ +        if (last_id != id)
+ +          {
+ +            /* The charset changed: close the previous non-ASCII run
+ +               with a charset annotation and start a new run.  */
+ +            if (last_id != charset_ascii)
+ +              ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
+ +            last_id = id;
+ +            last_offset = char_offset;
+ +          }
+ +        *charbuf++ = c;
+ +        src += nbytes;
+ +        consumed_chars += nchars;
+ +        char_offset++;
+ +      }
+ +      else
+ +        /* A byte that starts none of the sequences handled above.
+ +           Without this branch it was consumed and silently dropped;
+ +           treat it like any other undecodable input.  */
+ +        goto invalid_code;
+ +      continue;
+ +
+ +    invalid_code:
+ +      /* Rewind, emit the first byte of the bad sequence by itself and
+ +         count the error.  */
+ +      src = src_base;
+ +      consumed_chars = consumed_chars_base;
+ +      ONE_MORE_BYTE (c);
+ +      *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
+ +      char_offset++;
+ +      coding->errors++;
+ +    }
+ +
+ + no_more_source:
+ +  /* Close a pending non-ASCII run before reporting the results.  */
+ +  if (last_id != charset_ascii)
+ +    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
+ +  coding->consumed_char += consumed_chars_base;
+ +  coding->consumed = src_base - coding->source;
+ +  coding->charbuf_used = charbuf - coding->charbuf;
+ +}
+ +
+ +
+ +/* Store in CODES the emacs-mule leading byte(s) for the charset
+ +   whose emacs-mule id is ID.  An ID below 0xA0 is its own single
+ +   leading byte and CODES[1] is set to 0.  Otherwise CODES[0] is a
+ +   prefix byte chosen by the range of ID (0x9A for 0xA0..0xDF, 0x9B
+ +   for 0xE0..0xEF, 0x9C for 0xF0..0xF4, 0x9D from 0xF5 up) and
+ +   CODES[1] is ID itself.  ID and CODES are evaluated more than
+ +   once.  The expansion has no trailing semicolon, so the macro can
+ +   be used as a single statement, e.g. in an unbraced if/else.  */
+ +#define EMACS_MULE_LEADING_CODES(id, codes)           \
+ +  do {                                                        \
+ +    if ((id) < 0xA0)                                  \
+ +      (codes)[0] = (id), (codes)[1] = 0;              \
+ +    else if ((id) < 0xE0)                             \
+ +      (codes)[0] = 0x9A, (codes)[1] = (id);           \
+ +    else if ((id) < 0xF0)                             \
+ +      (codes)[0] = 0x9B, (codes)[1] = (id);           \
+ +    else if ((id) < 0xF5)                             \
+ +      (codes)[0] = 0x9C, (codes)[1] = (id);           \
+ +    else                                              \
+ +      (codes)[0] = 0x9D, (codes)[1] = (id);           \
+ +  } while (0)
+ +
+ +
+ +/* Encode the character codes in CODING->charbuf (CODING->charbuf_used
+ +   elements) into emacs-mule byte sequences appended to
+ +   CODING->destination.
+ +
+ +   A negative element starts an annotation: a charset annotation may
+ +   select a preferred charset for the following characters, and a
+ +   composition annotation is skipped because it is not yet
+ +   implemented.  ASCII and eight-bit characters are emitted as a
+ +   single byte.  Any other character is emitted as the leading
+ +   code(s) of its charset followed by one or two code bytes with the
+ +   high bit set; a character for which no charset is found is
+ +   replaced by CODING->default_char.
+ +
+ +   When the loop runs to completion, record CODING_RESULT_SUCCESS,
+ +   update CODING->produced_char and CODING->produced, and return 0.  */
+ +static int
+ +encode_coding_emacs_mule (coding)
+ +     struct coding_system *coding;
+ +{
+ +  int multibytep = coding->dst_multibyte;
+ +  int *charbuf = coding->charbuf;
+ +  int *charbuf_end = charbuf + coding->charbuf_used;
+ +  unsigned char *dst = coding->destination + coding->produced;
+ +  unsigned char *dst_end = coding->destination + coding->dst_bytes;
+ +  int safe_room = 8;
+ +  int produced_chars = 0;
+ +  Lisp_Object attrs, charset_list;
+ +  int c;
+ +  int preferred_charset_id = -1;
+ +
+ +  CODING_GET_INFO (coding, attrs, charset_list);
+ +  /* Always encode against Vemacs_mule_charset_list, and store that
+ +     list back into the coding system's attributes.  */
+ +  if (! EQ (charset_list, Vemacs_mule_charset_list))
+ +    {
+ +      CODING_ATTR_CHARSET_LIST (attrs)
+ +      = charset_list = Vemacs_mule_charset_list;
+ +    }
+ +
+ +  while (charbuf < charbuf_end)
+ +    {
+ +      ASSURE_DESTINATION (safe_room);
+ +      c = *charbuf++;
+ +
+ +      if (c < 0)
+ +      {
+ +        /* Handle an annotation.  */
+ +        switch (*charbuf)
+ +          {
+ +          case CODING_ANNOTATE_COMPOSITION_MASK:
+ +            /* Not yet implemented.  */
+ +            break;
+ +          case CODING_ANNOTATE_CHARSET_MASK:
+ +            /* Honor the requested charset only if it is a member of
+ +               the charset list in use.  */
+ +            preferred_charset_id = charbuf[3];
+ +            if (preferred_charset_id >= 0
+ +                && NILP (Fmemq (make_number (preferred_charset_id),
+ +                                charset_list)))
+ +              preferred_charset_id = -1;
+ +            break;
+ +          default:
+ +            abort ();
+ +          }
+ +        /* Skip the remaining -C - 1 elements of the annotation.  */
+ +        charbuf += -c - 1;
+ +        continue;
+ +      }
+ +
+ +      if (ASCII_CHAR_P (c))
+ +      EMIT_ONE_ASCII_BYTE (c);
+ +      else if (CHAR_BYTE8_P (c))
+ +      {
+ +        c = CHAR_TO_BYTE8 (c);
+ +        EMIT_ONE_BYTE (c);
+ +      }
+ +      else
+ +      {
+ +        struct charset *charset;
+ +        unsigned code;
+ +        int dimension;
+ +        int emacs_mule_id;
+ +        unsigned char leading_codes[2];
+ +
+ +        if (preferred_charset_id >= 0)
+ +          {
+ +            charset = CHARSET_FROM_ID (preferred_charset_id);
+ +            /* NOTE(review): when C does belong to the preferred
+ +               charset, CODE is not computed on this path although
+ +               it is emitted below -- confirm, and encode C in
+ +               CHARSET here.  */
+ +            if (! CHAR_CHARSET_P (c, charset))
+ +              /* Fall back to the charset list.  CODE must be
+ +                 requested here as well; passing NULL left it
+ +                 uninitialized for the emission below.  */
+ +              charset = char_charset (c, charset_list, &code);
+ +          }
+ +        else
+ +          charset = char_charset (c, charset_list, &code);
+ +        if (! charset)
+ +          {
+ +            /* No charset was found for C; substitute the default
+ +               character.  */
+ +            c = coding->default_char;
+ +            if (ASCII_CHAR_P (c))
+ +              {
+ +                EMIT_ONE_ASCII_BYTE (c);
+ +                continue;
+ +              }
+ +            charset = char_charset (c, charset_list, &code);
+ +          }
+ +        dimension = CHARSET_DIMENSION (charset);
+ +        emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
+ +        EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
+ +        EMIT_ONE_BYTE (leading_codes[0]);
+ +        if (leading_codes[1])
+ +          EMIT_ONE_BYTE (leading_codes[1]);
+ +        /* Emit the code point with the high bit of each byte set.  */
+ +        if (dimension == 1)
+ +          EMIT_ONE_BYTE (code | 0x80);
+ +        else
+ +          {
+ +            code |= 0x8080;
+ +            EMIT_ONE_BYTE (code >> 8);
+ +            EMIT_ONE_BYTE (code & 0xFF);
+ +          }
+ +      }
+ +    }
+ +  record_conversion_result (coding, CODING_RESULT_SUCCESS);
+ +  coding->produced_char += produced_chars;
+ +  coding->produced = dst - coding->destination;
+ +  return 0;
+ +}
+ +
+ +\f
+ +/*** 7. ISO2022 handlers ***/
+ +
+ +/* The following note describes the coding system ISO2022 briefly.
+ +   Since the intention of this note is to help understand the
+ +   functions in this file, some parts are NOT ACCURATE or are OVERLY
+ +   SIMPLIFIED.  For thorough understanding, please refer to the
+ +   original document of ISO2022.  This is equivalent to the standard
+ +   ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
+ +
+ +   ISO2022 provides many mechanisms to encode several character sets
+ +   in 7-bit and 8-bit environments.  For 7-bit environments, all text
+ +   is encoded using bytes less than 128.  This may make the encoded
+ +   text a little bit longer, but the text passes more easily through
+ +   several types of gateway, some of which strip off the MSB (Most
+ +   Significant Bit).
+ +
+ +   There are two kinds of character sets: control character sets and
+ +   graphic character sets.  The former contain control characters such
+ +   as `newline' and `escape' to provide control functions (control
+ +   functions are also provided by escape sequences).  The latter
+ +   contain graphic characters such as 'A' and '-'.  Emacs recognizes
+ +   two control character sets and many graphic character sets.
+ +
+ +   Graphic character sets are classified into one of the following
+ +   four classes, according to the number of bytes (DIMENSION) and
+ +   number of characters in one dimension (CHARS) of the set:
+ +   - DIMENSION1_CHARS94
+ +   - DIMENSION1_CHARS96
+ +   - DIMENSION2_CHARS94
+ +   - DIMENSION2_CHARS96
+ +
+ +   In addition, each character set is assigned an identification tag,
+ +   unique for each set, called the "final character" (denoted as <F>
+ +   hereafter).  The <F> of each character set is decided by ECMA(*)
+ +   when it is registered in ISO.  The code range of <F> is 0x30..0x7E
+ +   (0x30..0x3F are for private use only).
+ +
+ +   Note (*): ECMA = European Computer Manufacturers Association
+ +
+ +   Here are examples of graphic character sets [NAME(<F>)]:
         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
diff --cc src/xdisp.c
Simple merge
author	Miles Bader <miles@gnu.org>
	Sat, 4 Feb 2006 01:01:38 +0000 (01:01 +0000)
committer	Miles Bader <miles@gnu.org>
	Sat, 4 Feb 2006 01:01:38 +0000 (01:01 +0000)
		1	2
etc/NEWS	patch \|	diff1 \|	diff2 \|	blob \| history
etc/TODO	patch \|	diff1 \|	diff2 \|	blob \| history
lisp/ChangeLog	patch \|	diff1 \|	diff2 \|	blob \| history
lisp/arc-mode.el	patch \|	diff1 \|	diff2 \|	blob \| history
lisp/gnus/mml.el	patch \|	diff1 \|	diff2 \|	blob \| history
lisp/international/mule-cmds.el	patch \|	diff1 \|	diff2 \|	blob \| history
lisp/simple.el	patch \|	diff1 \|	diff2 \|	blob \| history
src/ChangeLog	patch \|	diff1 \|	diff2 \|	blob \| history
src/coding.c	patch \|	diff1 \|	diff2 \|	blob \| history
src/xdisp.c	patch \|	diff1 \|	diff2 \|	blob \| history