(SAFE_ONE_MORE_BYTE): New macro.
authorKenichi Handa <handa@m17n.org>
Thu, 28 Dec 2000 01:05:02 +0000 (01:05 +0000)
committerKenichi Handa <handa@m17n.org>
Thu, 28 Dec 2000 01:05:02 +0000 (01:05 +0000)
(DECODE_EMACS_MULE_COMPOSITION_CHAR): New macro.
(DECODE_EMACS_MULE_COMPOSITION_RULE): New macro.
(decode_composition_emacs_mule): New function.
(decode_coding_emacs_mule): Decode composition sequence by calling
decode_composition_emacs_mule.
(ENCODE_COMPOSITION_EMACS_MULE): New macro.
(encode_coding_emacs_mule): Changed from macro to function.  If
a text contains compositions, encode them correctly.
(setup_coding_system): Set coding->common_flags for emacs-mule so
that decoding and encoding are required.

src/coding.c

index 10be961..ade2014 100644 (file)
@@ -513,9 +513,9 @@ coding_safe_chars (coding)
 \f
 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 
-/* Emacs' internal format for encoding multiple character sets is a
-   kind of multi-byte encoding, i.e. characters are encoded by
-   variable-length sequences of one-byte codes.
+/* Emacs' internal format for representation of multiple character
+   sets is a kind of multi-byte encoding, i.e. characters are
+   represented by variable-length sequences of one-byte codes.
 
    ASCII characters and control characters (e.g. `tab', `newline') are
    represented by one-byte sequences which are their ASCII codes, in
@@ -531,7 +531,7 @@ coding_safe_chars (coding)
    The other characters are represented by a sequence of `base
    leading-code', optional `extended leading-code', and one or two
    `position-code's.  The length of the sequence is determined by the
-   base leading-code.  Leading-code takes the range 0x80 through 0x9F,
+   base leading-code.  Leading-code takes the range 0x81 through 0x9D,
    whereas extended leading-code and position-code take the range 0xA0
    through 0xFF.  See `charset.h' for more details about leading-code
    and position-code.
@@ -542,9 +542,46 @@ coding_safe_chars (coding)
    ascii               0x00..0x7F
    eight-bit-control   LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
    eight-bit-graphic   0xA0..0xBF
-   ELSE                        0x81..0x9F + [0xA0..0xFF]+
+   ELSE                        0x81..0x9D + [0xA0..0xFF]+
    ---------------------------------------------
 
+   As this is the internal character representation, the format is
+   usually not used externally (i.e. in a file or in data sent to a
+   process).  But it is possible to have text externally in this
+   format (i.e. by encoding with the coding system `emacs-mule').
+
+   In that case, a sequence of one-byte codes has a slightly different
+   form.
+
+   At first, all characters in eight-bit-control are represented by
+   one-byte sequences which are their 8-bit code.
+
+   Next, character composition data are represented by the byte
+   sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
+   where,
+       METHOD is 0xF0 plus one of the composition methods (enum
+       composition_method),
+
+       BYTES is 0xA0 plus the byte length of this composition data,
+
+       CHARS is 0xA0 plus the number of characters composed by this
+       data,
+
+       COMPONENTs are characters in multibyte form or composition
+       rules encoded by two bytes of ASCII codes.
+
+   In addition, for backward compatibility, the following formats are
+   also recognized as composition data on decoding.
+
+   0x80 MSEQ ...
+   0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
+
+   Here,
+       MSEQ is a multibyte form, but in one of these special formats:
+         ASCII: 0xA0 ASCII_CODE+0x80,
+         other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
+       RULE is a one byte code of the range 0xA0..0xF0 that
+       represents a composition rule.
   */
 
 enum emacs_code_class_type emacs_code_class[256];
@@ -608,6 +645,261 @@ detect_coding_emacs_mule (src, src_end, multibytep)
 }
 
 
+/* Record the starting position START and METHOD of one composition.
+   Four slots are taken at CODING->cmp_data->data + used: slot 0 is
+   set to -1 as a placeholder, slot 1 to char_offset + START, and slot
+   3 to METHOD.  Slot 2 is not written here;
+   CODING_ADD_COMPOSITION_END fills slots 0 and 2.
+   CODING->cmp_data_start is set to the index where this record
+   begins.  The arguments are not parenthesized in the expansion, so
+   pass simple expressions.  */
+
+#define CODING_ADD_COMPOSITION_START(coding, start, method)    \
+  do {                                                         \
+    struct composition_data *cmp_data = coding->cmp_data;      \
+    int *data = cmp_data->data + cmp_data->used;               \
+    coding->cmp_data_start = cmp_data->used;                   \
+    data[0] = -1;                                              \
+    data[1] = cmp_data->char_offset + start;                   \
+    data[3] = (int) method;                                    \
+    cmp_data->used += 4;                                       \
+  } while (0)
+
+/* Record the ending position END of the current composition.  The
+   current composition is the record that begins at index
+   CODING->cmp_data_start.  Its slot 0 receives the record length
+   (used - cmp_data_start) and its slot 2 receives char_offset + END.  */
+
+#define CODING_ADD_COMPOSITION_END(coding, end)                        \
+  do {                                                         \
+    struct composition_data *cmp_data = coding->cmp_data;      \
+    int *data = cmp_data->data + coding->cmp_data_start;       \
+    data[0] = cmp_data->used - coding->cmp_data_start;         \
+    data[2] = cmp_data->char_offset + end;                     \
+  } while (0)
+
+/* Record one COMPONENT (alternate character or composition rule).
+   The value is stored at CODING->cmp_data->data[used] and `used' is
+   incremented.  No bounds check is done here; the caller must have
+   made sure that there is room.  */
+
+#define CODING_ADD_COMPOSITION_COMPONENT(coding, component)    \
+  (coding->cmp_data->data[coding->cmp_data->used++] = component)
+
+
+/* Get one byte from the data pointed to by SRC and increment SRC.  If
+   SRC is not less than SRC_END, return -1 without incrementing SRC.
+   SRC and SRC_END are variables of the scope where the macro is
+   expanded.  */
+
+#define SAFE_ONE_MORE_BYTE() (src >= src_end ? -1 : *src++)
+
+
+/* Decode a character represented as a component of composition
+   sequence of Emacs 20 style at SRC.  Set C to that character, store
+   its multibyte form sequence at P, and set P to the end of that
+   sequence.  If no valid character is found, set C to -1.
+
+   Two forms are accepted: the byte 0xA0 followed by a byte of 0xA0 or
+   above (the character is that second byte minus 0xA0), and a byte
+   that is a base leading-code plus 0x20 followed by the remaining
+   bytes of the multibyte form.  A first byte for which CHAR_HEAD_P
+   holds is rejected.  SRC and SRC_END come from the expanding scope
+   via SAFE_ONE_MORE_BYTE.
+
+   NOTE(review): the format description in section 2 gives the ASCII
+   form as `0xA0 ASCII_CODE+0x80', while the code here subtracts 0xA0
+   from the second byte; confirm which one is intended.  */
+
+#define DECODE_EMACS_MULE_COMPOSITION_CHAR(c, p)               \
+  do {                                                         \
+    int bytes;                                                 \
+                                                               \
+    c = SAFE_ONE_MORE_BYTE ();                                 \
+    if (c < 0)                                                 \
+      break;                                                   \
+    if (CHAR_HEAD_P (c))                                       \
+      c = -1;                                                  \
+    else if (c == 0xA0)                                                \
+      {                                                                \
+       c = SAFE_ONE_MORE_BYTE ();                              \
+       if (c < 0xA0)                                           \
+         c = -1;                                               \
+       else                                                    \
+         {                                                     \
+           c -= 0xA0;                                          \
+           *p++ = c;                                           \
+         }                                                     \
+      }                                                                \
+    else if (BASE_LEADING_CODE_P (c - 0x20))                   \
+      {                                                                \
+       unsigned char *p0 = p;                                  \
+                                                               \
+       c -= 0x20;                                              \
+       *p++ = c;                                               \
+       bytes = BYTES_BY_CHAR_HEAD (c);                         \
+       while (--bytes)                                         \
+         {                                                     \
+           c = SAFE_ONE_MORE_BYTE ();                          \
+           if (c < 0)                                          \
+             break;                                            \
+           *p++ = c;                                           \
+         }                                                     \
+       if (UNIBYTE_STR_AS_MULTIBYTE_P (p0, p - p0, bytes))     \
+         c = STRING_CHAR (p0, bytes);                          \
+       else                                                    \
+         c = -1;                                               \
+      }                                                                \
+    else                                                       \
+      c = -1;                                                  \
+  } while (0)
+
+
+/* Decode a composition rule represented as a component of composition
+   sequence of Emacs 20 style at SRC.  Set C to the rule.  If no
+   valid rule is found, set C to -1.
+
+   One byte is read and 0xA0 is subtracted from it.  A result in the
+   range 0..80 is split into GREF (result / 9) and NREF (result % 9),
+   which are then combined by COMPOSITION_ENCODE_RULE.  Anything else,
+   including running out of source (SAFE_ONE_MORE_BYTE yields -1),
+   gives -1.  GREF and NREF must be int variables in the scope where
+   the macro is expanded.  */
+
+#define DECODE_EMACS_MULE_COMPOSITION_RULE(c)          \
+  do {                                                 \
+    c = SAFE_ONE_MORE_BYTE ();                         \
+    c -= 0xA0;                                         \
+    if (c < 0 || c >= 81)                              \
+      c = -1;                                          \
+    else                                               \
+      {                                                        \
+       gref = c / 9, nref = c % 9;                     \
+       c = COMPOSITION_ENCODE_RULE (gref, nref);       \
+      }                                                        \
+  } while (0)
+
+
+/* Decode composition sequence encoded by `emacs-mule' at the source
+   pointed by SRC.  SRC_END is the end of source.  Store information
+   of the composition in CODING->cmp_data.
+
+   For backward compatibility, decode also a composition sequence of
+   Emacs 20 style.  In that case, the composition sequence contains
+   characters that should be extracted into a buffer or string.  Store
+   those characters at *DESTINATION in multibyte form and advance
+   *DESTINATION past them.  DST_END is the end of the destination;
+   when DST_BYTES is zero the current source position is used as the
+   destination limit instead.
+
+   If we encounter an invalid byte sequence, return 0.
+   If we encounter an insufficient source or destination, or
+   insufficient space in CODING->cmp_data, return -1 (in the last
+   case CODING->result is set to CODING_FINISH_INSUFFICIENT_CMP).
+   Otherwise, return consumed bytes in the source.  */
+
+static INLINE int
+decode_composition_emacs_mule (coding, src, src_end,
+                               destination, dst_end, dst_bytes)
+     struct coding_system *coding;
+     unsigned char *src, *src_end, **destination, *dst_end;
+     int dst_bytes;
+{
+  unsigned char *dst = *destination;
+  int method, data_len, nchars;
+  /* SRC_BASE points at the leading 0x80; SRC is moved past it.  */
+  unsigned char *src_base = src++;
+  /* Store components of composition.  */
+  int component[COMPOSITION_DATA_MAX_BUNCH_LENGTH];
+  int ncomponent;
+  /* Store multibyte form of characters to be composed.  This is for
+     Emacs 20 style composition sequence.  */
+  unsigned char buf[MAX_COMPOSITION_COMPONENTS * MAX_MULTIBYTE_LENGTH];
+  unsigned char *bufp = buf;
+  int c, i, gref, nref;
+
+  /* Make sure one maximum-length record still fits in the current
+     composition data chunk before decoding anything.  */
+  if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
+      >= COMPOSITION_DATA_SIZE)
+    {
+      coding->result = CODING_FINISH_INSUFFICIENT_CMP;
+      return -1;
+    }
+
+  ONE_MORE_BYTE (c);
+  if (c - 0xF0 >= COMPOSITION_RELATIVE
+      && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS)
+    {
+      /* New format: 0x80 METHOD BYTES CHARS COMPONENT ...  */
+      int with_rule;
+
+      method = c - 0xF0;
+      with_rule = (method == COMPOSITION_WITH_RULE
+                   || method == COMPOSITION_WITH_RULE_ALTCHARS);
+      ONE_MORE_BYTE (c);
+      data_len = c - 0xA0;
+      if (data_len < 4
+          || src_base + data_len > src_end)
+        return 0;
+      ONE_MORE_BYTE (c);
+      nchars = c - 0xA0;
+      /* Test the decoded count, not the raw byte: any byte below 0xA1
+         would give a composition covering no character.  */
+      if (nchars < 1)
+        return 0;
+      for (ncomponent = 0; src < src_base + data_len; ncomponent++)
+        {
+          /* A record occupies four header slots plus its components,
+             and only COMPOSITION_DATA_MAX_BUNCH_LENGTH slots were
+             checked to be free above.  Reject malformed input that
+             would overflow COMPONENT or CODING->cmp_data.  */
+          if (ncomponent + 4 >= COMPOSITION_DATA_MAX_BUNCH_LENGTH)
+            return 0;
+          if (ncomponent % 2 && with_rule)
+            {
+              /* A rule is two ASCII bytes, each offset by 32.  */
+              ONE_MORE_BYTE (gref);
+              gref -= 32;
+              ONE_MORE_BYTE (nref);
+              nref -= 32;
+              c = COMPOSITION_ENCODE_RULE (gref, nref);
+            }
+          else
+            {
+              int bytes;
+              if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes))
+                c = STRING_CHAR (src, bytes);
+              else
+                c = *src, bytes = 1;
+              src += bytes;
+            }
+          component[ncomponent] = c;
+        }
+    }
+  else
+    {
+      /* This may be an old Emacs 20 style format.  See the comment at
+         the section 2 of this file.  */
+      while (src < src_end && !CHAR_HEAD_P (*src)) src++;
+      /* The sequence ends at the next character head.  If none was
+         found and more source may follow, wait for it.  */
+      if (src == src_end
+          && !(coding->mode & CODING_MODE_LAST_BLOCK))
+        goto label_end_of_loop;
+
+      /* Restrict decoding to the sequence just delimited and restart
+         from the byte following 0x80.  */
+      src_end = src;
+      src = src_base + 1;
+      if (c < 0xC0)
+        {
+          /* 0x80 MSEQ ...  */
+          method = COMPOSITION_RELATIVE;
+          for (ncomponent = 0; ncomponent < MAX_COMPOSITION_COMPONENTS;)
+            {
+              DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
+              if (c < 0)
+                break;
+              component[ncomponent++] = c;
+            }
+          if (ncomponent < 2)
+            return 0;
+          nchars = ncomponent;
+        }
+      else if (c == 0xFF)
+        {
+          /* 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ  */
+          method = COMPOSITION_WITH_RULE;
+          src++;
+          DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
+          if (c < 0)
+            return 0;
+          component[0] = c;
+          for (ncomponent = 1;
+               ncomponent < MAX_COMPOSITION_COMPONENTS * 2 - 1;)
+            {
+              DECODE_EMACS_MULE_COMPOSITION_RULE (c);
+              if (c < 0)
+                break;
+              component[ncomponent++] = c;
+              DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
+              if (c < 0)
+                break;
+              component[ncomponent++] = c;
+            }
+          if (ncomponent < 3)
+            return 0;
+          nchars = (ncomponent + 1) / 2;
+        }
+      else
+        return 0;
+    }
+
+  /* BUF is non-empty only for the Emacs 20 style; in that case the
+     extracted characters must fit in the destination.  */
+  if (buf == bufp || dst + (bufp - buf) <= (dst_bytes ? dst_end : src))
+    {
+      CODING_ADD_COMPOSITION_START (coding, coding->produced_char, method);
+      for (i = 0; i < ncomponent; i++)
+        CODING_ADD_COMPOSITION_COMPONENT (coding, component[i]);
+      CODING_ADD_COMPOSITION_END (coding, coding->produced_char + nchars);
+      if (buf < bufp)
+        {
+          unsigned char *p = buf;
+          EMIT_BYTES (p, bufp);
+          *destination += bufp - buf;
+          coding->produced_char += nchars;
+        }
+      return (src - src_base);
+    }
+ label_end_of_loop:
+  return -1;
+}
+
+
 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
 
 static void
@@ -669,6 +961,23 @@ decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
          coding->produced_char++;
          continue;
        }
+      else if (*src == 0x80)
+       {
+         /* Start of composition data.  */
+         int consumed  = decode_composition_emacs_mule (coding, src, src_end,
+                                                        &dst, dst_end,
+                                                        dst_bytes);
+         if (consumed < 0)
+           goto label_end_of_loop;
+         else if (consumed > 0)
+           {
+             src += consumed;
+             continue;
+           }
+         bytes = CHAR_STRING (*src, tmp);
+         p = tmp;
+         src++;
+       }
       else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes))
        {
          p = src;
@@ -693,9 +1002,123 @@ decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
   coding->produced = dst - destination;
 }
 
-#define encode_coding_emacs_mule(coding, source, destination, src_bytes, dst_bytes) \
-  encode_eol (coding, source, destination, src_bytes, dst_bytes)
 
+/* Encode composition data stored at DATA into a special byte sequence
+   starting by 0x80.  Update CODING->cmp_data_start and maybe
+   CODING->cmp_data for the next call.
+
+   The sequence is built in a local buffer as 0x80, 0xF0 + DATA[3]
+   (method), 0xA0 + total byte length, 0xA0 + (DATA[2] - DATA[1])
+   (number of composed characters), followed by the components
+   DATA[4] .. DATA[DATA[0] - 1].  For the two with-rule methods every
+   second component is a rule, written as two bytes 0x20 + GREF and
+   0x20 + NREF; all other components are written by CHAR_STRING.
+
+   The macro uses DST, DST_END, DST_BYTES and SRC of the expanding
+   scope.  If the destination is too small it sets CODING->result to
+   CODING_FINISH_INSUFFICIENT_DST and jumps to label_end_of_loop.
+   Otherwise the bytes are copied to DST, CODING->cmp_data_start is
+   advanced by DATA[0], and when that reaches the end of the current
+   chunk and a next chunk exists, CODING->cmp_data moves to it.
+
+   NOTE(review): the local buffer is a fixed 1024 bytes and is filled
+   without a bounds check, and the length is stored in a single byte
+   as 0xA0 + length; presumably the limit on components per
+   composition keeps both in range -- confirm.  The space check adds
+   4 to (p - buf) although that length already counts the four header
+   bytes, so it is more conservative than necessary.  */
+
+#define ENCODE_COMPOSITION_EMACS_MULE(coding, data)                    \
+  do {                                                                 \
+    unsigned char buf[1024], *p0 = buf, *p;                            \
+    int len = data[0];                                                 \
+    int i;                                                             \
+                                                                       \
+    buf[0] = 0x80;                                                     \
+    buf[1] = 0xF0 + data[3];   /* METHOD */                            \
+    buf[3] = 0xA0 + (data[2] - data[1]); /* COMPOSED-CHARS */          \
+    p = buf + 4;                                                       \
+    if (data[3] == COMPOSITION_WITH_RULE                               \
+       || data[3] == COMPOSITION_WITH_RULE_ALTCHARS)                   \
+      {                                                                        \
+       p += CHAR_STRING (data[4], p);                                  \
+       for (i = 5; i < len; i += 2)                                    \
+         {                                                             \
+           int gref, nref;                                             \
+            COMPOSITION_DECODE_RULE (data[i], gref, nref);             \
+           *p++ = 0x20 + gref;                                         \
+           *p++ = 0x20 + nref;                                         \
+           p += CHAR_STRING (data[i + 1], p);                          \
+         }                                                             \
+      }                                                                        \
+    else                                                               \
+      {                                                                        \
+       for (i = 4; i < len; i++)                                       \
+         p += CHAR_STRING (data[i], p);                                \
+      }                                                                        \
+    buf[2] = 0xA0 + (p - buf); /* COMPONENTS-BYTES */                  \
+                                                                       \
+    if (dst + (p - buf) + 4 > (dst_bytes ? dst_end : src))             \
+      {                                                                        \
+       coding->result = CODING_FINISH_INSUFFICIENT_DST;                \
+       goto label_end_of_loop;                                         \
+      }                                                                        \
+    while (p0 < p)                                                     \
+      *dst++ = *p0++;                                                  \
+    coding->cmp_data_start += data[0];                                 \
+    if (coding->cmp_data_start == coding->cmp_data->used               \
+       && coding->cmp_data->next)                                      \
+      {                                                                        \
+       coding->cmp_data = coding->cmp_data->next;                      \
+       coding->cmp_data_start = 0;                                     \
+      }                                                                        \
+  } while (0)
+  
+
+static void encode_eol P_ ((struct coding_system *, unsigned char *,
+                           unsigned char *, int, int));
+
+static void
+encode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
+     struct coding_system *coding;
+     unsigned char *source, *destination;
+     int src_bytes, dst_bytes;
+{
+  unsigned char *src = source;
+  unsigned char *src_end = source + src_bytes;
+  unsigned char *dst = destination;
+  unsigned char *dst_end = destination + dst_bytes;
+  unsigned char *src_base;
+  int c;
+  int char_offset;
+  int *data;
+
+  Lisp_Object translation_table;
+
+  translation_table = Qnil;
+
+  /* Optimization for the case that there's no composition.  With no
+     composition data recorded for CODING, the text needs nothing but
+     end-of-line conversion, so the whole job is handed to encode_eol
+     and this function returns.  */
+  if (!coding->cmp_data || coding->cmp_data->used == 0)
+    {
+      encode_eol (coding, source, destination, src_bytes, dst_bytes);
+      return;
+    }
+
+  char_offset = coding->cmp_data->char_offset;
+  data = coding->cmp_data->data + coding->cmp_data_start;
+  while (1)
+    {
+      src_base = src;
+
+      /* If SRC starts a composition, encode the information about the
+        composition in advance.  A composition starts here when the
+        record at DATA has its start position (DATA[1]) equal to
+        CHAR_OFFSET plus the number of characters consumed so far.
+        The macro may move CODING->cmp_data to the next chunk, so
+        CHAR_OFFSET and DATA are reloaded afterwards.  It jumps to
+        label_end_of_loop when the destination is too small;
+        presumably ONE_MORE_CHAR and the EMIT_* macros below leave
+        the loop the same way (their definitions are not shown
+        here).  SRC_BASE is what is reported as consumed, so a
+        character that was read but could not be emitted is not
+        counted.  */
+      if (coding->cmp_data_start < coding->cmp_data->used
+         && char_offset + coding->consumed_char == data[1])
+       {
+         ENCODE_COMPOSITION_EMACS_MULE (coding, data);
+         char_offset = coding->cmp_data->char_offset;
+         data = coding->cmp_data->data + coding->cmp_data_start;
+       }
+
+      ONE_MORE_CHAR (c);
+      if (c == '\n' && (coding->eol_type == CODING_EOL_CRLF
+                       || coding->eol_type == CODING_EOL_CR))
+       {
+         if (coding->eol_type == CODING_EOL_CRLF)
+           EMIT_TWO_BYTES ('\r', c);
+         else
+           EMIT_ONE_BYTE ('\r');
+       }
+      else if (SINGLE_BYTE_CHAR_P (c))
+       EMIT_ONE_BYTE (c);
+      else
+       EMIT_BYTES (src_base, src);
+      coding->consumed_char++;
+    }
+ label_end_of_loop:
+  coding->consumed = src_base - source;
+  coding->produced = coding->produced_char = dst - destination;
+  return;
+}
 
 \f
 /*** 3. ISO2022 handlers ***/
@@ -1180,35 +1603,12 @@ coding_allocate_composition_data (coding, char_offset)
   coding->cmp_data_start = 0;
 }
 
-/* Record the starting position START and METHOD of one composition.  */
-
-#define CODING_ADD_COMPOSITION_START(coding, start, method)    \
-  do {                                                         \
-    struct composition_data *cmp_data = coding->cmp_data;      \
-    int *data = cmp_data->data + cmp_data->used;               \
-    coding->cmp_data_start = cmp_data->used;                   \
-    data[0] = -1;                                              \
-    data[1] = cmp_data->char_offset + start;                   \
-    data[3] = (int) method;                                    \
-    cmp_data->used += 4;                                       \
-  } while (0)
-
-/* Record the ending position END of the current composition.  */
-
-#define CODING_ADD_COMPOSITION_END(coding, end)                        \
-  do {                                                         \
-    struct composition_data *cmp_data = coding->cmp_data;      \
-    int *data = cmp_data->data + coding->cmp_data_start;       \
-    data[0] = cmp_data->used - coding->cmp_data_start;         \
-    data[2] = cmp_data->char_offset + end;                     \
-  } while (0)
-
-/* Record one COMPONENT (alternate character or composition rule).  */
-
-#define CODING_ADD_COMPOSITION_COMPONENT(coding, component)    \
-  (coding->cmp_data->data[coding->cmp_data->used++] = component)
-
-/* Handle compositoin start sequence ESC 0, ESC 2, ESC 3, or ESC 4.  */
+/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
+   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
+   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
+   ESC 3 : altchar composition :  ESC 3 ALT ... ESC 0 CHAR ... ESC 1
+   ESC 4 : alt&rule composition : ESC 4 ALT RULE .. ALT ESC 0 CHAR ... ESC 1
+  */
 
 #define DECODE_COMPOSITION_START(c1)                                      \
   do {                                                                    \
@@ -3088,6 +3488,9 @@ setup_coding_system (coding_system, coding)
     {
     case 0:
       coding->type = coding_type_emacs_mule;
+      coding->common_flags
+       |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
+      coding->composing = COMPOSITION_NO;
       if (!NILP (coding->post_read_conversion))
        coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
       if (!NILP (coding->pre_write_conversion))