} while (0)
-/* Decode composition sequence encoded by `emacs-mule' at the source
- pointed by SRC. SRC_END is the end of source. Store information
- of the composition in CODING->cmp_data.
+/* Expand to two consecutive EMIT_TWO_BYTES invocations: first for
+   (C1, C2), then for (C3, C4).  Each argument is passed through to
+   EMIT_TWO_BYTES exactly once.  */
+#define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
+ do { \
+ EMIT_TWO_BYTES (c1, c2); \
+ EMIT_TWO_BYTES (c3, c4); \
+ } while (0)
- For backward compatibility, decode also a composition sequence of
- Emacs 20 style. In that case, the composition sequence contains
- characters that should be extracted into a buffer or string. Store
- those characters at *DESTINATION in multibyte form.
- If we encounter an invalid byte sequence, return 0.
- If we encounter an insufficient source or destination, or
- insufficient space in CODING->cmp_data, return 1.
- Otherwise, return consumed bytes in the source.
+/* Prototypes for static functions. */
+static void record_conversion_result P_ ((struct coding_system *coding,
+ enum coding_result_code result));
+static int detect_coding_utf_8 P_ ((struct coding_system *,
+ struct coding_detection_info *info));
+static void decode_coding_utf_8 P_ ((struct coding_system *));
+static int encode_coding_utf_8 P_ ((struct coding_system *));
+
+static int detect_coding_utf_16 P_ ((struct coding_system *,
+ struct coding_detection_info *info));
+static void decode_coding_utf_16 P_ ((struct coding_system *));
+static int encode_coding_utf_16 P_ ((struct coding_system *));
+
+static int detect_coding_iso_2022 P_ ((struct coding_system *,
+ struct coding_detection_info *info));
+static void decode_coding_iso_2022 P_ ((struct coding_system *));
+static int encode_coding_iso_2022 P_ ((struct coding_system *));
+
+static int detect_coding_emacs_mule P_ ((struct coding_system *,
+ struct coding_detection_info *info));
+static void decode_coding_emacs_mule P_ ((struct coding_system *));
+static int encode_coding_emacs_mule P_ ((struct coding_system *));
+
+static int detect_coding_sjis P_ ((struct coding_system *,
+ struct coding_detection_info *info));
+static void decode_coding_sjis P_ ((struct coding_system *));
+static int encode_coding_sjis P_ ((struct coding_system *));
+
+static int detect_coding_big5 P_ ((struct coding_system *,
+ struct coding_detection_info *info));
+static void decode_coding_big5 P_ ((struct coding_system *));
+static int encode_coding_big5 P_ ((struct coding_system *));
+
+static int detect_coding_ccl P_ ((struct coding_system *,
+ struct coding_detection_info *info));
+static void decode_coding_ccl P_ ((struct coding_system *));
+static int encode_coding_ccl P_ ((struct coding_system *));
+
+static void decode_coding_raw_text P_ ((struct coding_system *));
+static int encode_coding_raw_text P_ ((struct coding_system *));
+
+static void coding_set_source P_ ((struct coding_system *));
+static void coding_set_destination P_ ((struct coding_system *));
+static void coding_alloc_by_realloc P_ ((struct coding_system *, EMACS_INT));
+static void coding_alloc_by_making_gap P_ ((struct coding_system *,
+ EMACS_INT));
+static unsigned char *alloc_destination P_ ((struct coding_system *,
+ EMACS_INT, unsigned char *));
+static void setup_iso_safe_charsets P_ ((Lisp_Object));
+static unsigned char *encode_designation_at_bol P_ ((struct coding_system *,
+ int *, int *,
+ unsigned char *));
+static int detect_eol P_ ((const unsigned char *,
+ EMACS_INT, enum coding_category));
+static Lisp_Object adjust_coding_eol_type P_ ((struct coding_system *, int));
+static void decode_eol P_ ((struct coding_system *));
+static Lisp_Object get_translation_table P_ ((Lisp_Object, int, int *));
+static Lisp_Object get_translation P_ ((Lisp_Object, int *, int *,
+ int, int *, int *));
+static int produce_chars P_ ((struct coding_system *, Lisp_Object, int));
+static INLINE void produce_composition P_ ((struct coding_system *, int *,
+ EMACS_INT));
+static INLINE void produce_charset P_ ((struct coding_system *, int *,
+ EMACS_INT));
+static void produce_annotation P_ ((struct coding_system *, EMACS_INT));
+static int decode_coding P_ ((struct coding_system *));
+static INLINE int *handle_composition_annotation P_ ((EMACS_INT, EMACS_INT,
+ struct coding_system *,
+ int *, EMACS_INT *));
+static INLINE int *handle_charset_annotation P_ ((EMACS_INT, EMACS_INT,
+ struct coding_system *,
+ int *, EMACS_INT *));
+static void consume_chars P_ ((struct coding_system *, Lisp_Object, int));
+static int encode_coding P_ ((struct coding_system *));
+static Lisp_Object make_conversion_work_buffer P_ ((int));
+static Lisp_Object code_conversion_restore P_ ((Lisp_Object));
+static INLINE int char_encodable_p P_ ((int, Lisp_Object));
+static Lisp_Object make_subsidiaries P_ ((Lisp_Object));
-*/
-static INLINE int
-decode_composition_emacs_mule (coding, src, src_end,
- destination, dst_end, dst_bytes)
- struct coding_system *coding;
- const unsigned char *src, *src_end;
- unsigned char **destination, *dst_end;
- int dst_bytes;
+/* Record RESULT as the outcome of the current conversion step.
+
+   RESULT is always stored in CODING->result.  For the failure codes
+   listed below, Vlast_code_conversion_error is set to the matching
+   symbol.  CODING_RESULT_SUCCESS leaves Vlast_code_conversion_error
+   untouched, so callers that report success (e.g. alloc_destination
+   and the encoders) do not overwrite it with a bogus error.  Any
+   other code is recorded as the symbol named "Unknown error".  */
+static void
+record_conversion_result (struct coding_system *coding,
+ enum coding_result_code result)
{
- unsigned char *dst = *destination;
- int method, data_len, nchars;
- const unsigned char *src_base = src++;
- /* Store components of composition. */
- int component[COMPOSITION_DATA_MAX_BUNCH_LENGTH];
- int ncomponent;
- /* Store multibyte form of characters to be composed. This is for
- Emacs 20 style composition sequence. */
- unsigned char buf[MAX_COMPOSITION_COMPONENTS * MAX_MULTIBYTE_LENGTH];
- unsigned char *bufp = buf;
- int c, i, gref, nref;
-
- if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
- >= COMPOSITION_DATA_SIZE)
+ coding->result = result;
+ switch (result)
{
- coding->result = CODING_FINISH_INSUFFICIENT_CMP;
- return -1;
+ case CODING_RESULT_INSUFFICIENT_SRC:
+ Vlast_code_conversion_error = Qinsufficient_source;
+ break;
+ case CODING_RESULT_INCONSISTENT_EOL:
+ Vlast_code_conversion_error = Qinconsistent_eol;
+ break;
+ case CODING_RESULT_INVALID_SRC:
+ Vlast_code_conversion_error = Qinvalid_source;
+ break;
+ case CODING_RESULT_INTERRUPT:
+ Vlast_code_conversion_error = Qinterrupted;
+ break;
+ case CODING_RESULT_INSUFFICIENT_MEM:
+ Vlast_code_conversion_error = Qinsufficient_memory;
+ break;
+ case CODING_RESULT_SUCCESS:
+ /* Success is not an error; do not record one. */
+ break;
+ default:
+ Vlast_code_conversion_error = intern ("Unknown error");
}
+}
- ONE_MORE_BYTE (c);
- if (c - 0xF0 >= COMPOSITION_RELATIVE
- && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS)
- {
- int with_rule;
+/* Decode CODE of CHARSET into C with DECODE_CHAR.
+
+   charset_map_loaded is cleared before the call.  If it is nonzero
+   afterwards, CODING->source is recomputed with coding_set_source and
+   SRC, SRC_BASE and SRC_END are shifted by the distance the source
+   address moved.  (Presumably loading a charset map can relocate the
+   source text -- confirm against DECODE_CHAR.)
+
+   SRC, SRC_BASE and SRC_END are assigned to, so they must be
+   lvalues; CODING is used unparenthesized.  */
+#define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
+ do { \
+ charset_map_loaded = 0; \
+ c = DECODE_CHAR (charset, code); \
+ if (charset_map_loaded) \
+ { \
+ const unsigned char *orig = coding->source; \
+ EMACS_INT offset; \
+ \
+ coding_set_source (coding); \
+ offset = coding->source - orig; \
+ src += offset; \
+ src_base += offset; \
+ src_end += offset; \
+ } \
+ } while (0)
- method = c - 0xF0;
- with_rule = (method == COMPOSITION_WITH_RULE
- || method == COMPOSITION_WITH_RULE_ALTCHARS);
- ONE_MORE_BYTE (c);
- data_len = c - 0xA0;
- if (data_len < 4
- || src_base + data_len > src_end)
- return 0;
- ONE_MORE_BYTE (c);
- nchars = c - 0xA0;
- if (c < 1)
- return 0;
- for (ncomponent = 0; src < src_base + data_len; ncomponent++)
- {
- /* If it is longer than this, it can't be valid. */
- if (ncomponent >= COMPOSITION_DATA_MAX_BUNCH_LENGTH)
- return 0;
- if (ncomponent % 2 && with_rule)
- {
- ONE_MORE_BYTE (gref);
- gref -= 32;
- ONE_MORE_BYTE (nref);
- nref -= 32;
- c = COMPOSITION_ENCODE_RULE (gref, nref);
- }
- else
- {
- int bytes;
- if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
- || (coding->flags /* We are recovering a file. */
- && src[0] == LEADING_CODE_8_BIT_CONTROL
- && ! CHAR_HEAD_P (src[1])))
- c = STRING_CHAR (src, bytes);
- else
- c = *src, bytes = 1;
- src += bytes;
- }
- component[ncomponent] = c;
- }
+/* Ensure the destination can take BYTES more bytes.
+
+   When DST + BYTES reaches or passes DST_END, the destination is
+   grown through alloc_destination by BYTES plus the number of
+   charbuf elements still pending (CHARBUF_END - CHARBUF), and DST
+   and DST_END are refreshed from CODING.
+
+   This macro uses the caller's local variables `coding', `dst',
+   `dst_end', `charbuf' and `charbuf_end' implicitly.  */
+#define ASSURE_DESTINATION(bytes) \
+ do { \
+ if (dst + (bytes) >= dst_end) \
+ { \
+ int more_bytes = charbuf_end - charbuf + (bytes); \
+ \
+ dst = alloc_destination (coding, more_bytes, dst); \
+ dst_end = coding->destination + coding->dst_bytes; \
+ } \
+ } while (0)
+
+
+
+static void
+coding_set_source (coding)
+ struct coding_system *coding;
+{
+ if (BUFFERP (coding->src_object))
+ {
+ struct buffer *buf = XBUFFER (coding->src_object);
+
+ if (coding->src_pos < 0)
+ coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
+ else
+ coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
}
- else if (c >= 0x80)
+ else if (STRINGP (coding->src_object))
{
- /* This may be an old Emacs 20 style format. See the comment at
- the section 2 of this file. */
- while (src < src_end && !CHAR_HEAD_P (*src)) src++;
- if (src == src_end
- && !(coding->mode & CODING_MODE_LAST_BLOCK))
- goto label_end_of_loop;
+ coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
+ }
+ else
+ /* Otherwise, the source is C string and is never relocated
+ automatically. Thus we don't have to update anything. */
+ ;
+}
- src_end = src;
- src = src_base + 1;
- if (c < 0xC0)
+static void
+coding_set_destination (coding)
+ struct coding_system *coding;
+{
+ if (BUFFERP (coding->dst_object))
+ {
+ if (coding->src_pos < 0)
{
- method = COMPOSITION_RELATIVE;
- for (ncomponent = 0; ncomponent < MAX_COMPOSITION_COMPONENTS;)
- {
- DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
- if (c < 0)
- break;
- component[ncomponent++] = c;
- }
- if (ncomponent < 2)
- return 0;
- nchars = ncomponent;
+ coding->destination = BEG_ADDR + coding->dst_pos_byte - 1;
+ coding->dst_bytes = (GAP_END_ADDR
+ - (coding->src_bytes - coding->consumed)
+ - coding->destination);
}
- else if (c == 0xFF)
+ else
{
- method = COMPOSITION_WITH_RULE;
- src++;
- DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
- if (c < 0)
- return 0;
- component[0] = c;
- for (ncomponent = 1;
- ncomponent < MAX_COMPOSITION_COMPONENTS * 2 - 1;)
- {
- DECODE_EMACS_MULE_COMPOSITION_RULE (c);
- if (c < 0)
- break;
- component[ncomponent++] = c;
- DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
- if (c < 0)
- break;
- component[ncomponent++] = c;
- }
- if (ncomponent < 3)
- return 0;
- nchars = (ncomponent + 1) / 2;
+ /* We are sure that coding->dst_pos_byte is before the gap
+ of the buffer. */
+ coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
+ + coding->dst_pos_byte - 1);
+ coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
+ - coding->destination);
}
- else
- return 0;
}
else
- return 0;
+ /* Otherwise, the destination is C string and is never relocated
+ automatically. Thus we don't have to update anything. */
+ ;
+}
+
+
+/* Grow the heap-allocated destination of CODING by BYTES bytes.
+   CODING->destination is reallocated with xrealloc and
+   CODING->dst_bytes is updated to the new capacity.  */
+static void
+coding_alloc_by_realloc (coding, bytes)
+     struct coding_system *coding;
+     EMACS_INT bytes;
+{
+  EMACS_INT grown_size = coding->dst_bytes + bytes;
+
+  coding->destination
+    = (unsigned char *) xrealloc (coding->destination, grown_size);
+  coding->dst_bytes = grown_size;
+}
- if (buf == bufp || dst + (bufp - buf) <= (dst_bytes ? dst_end : src))
+/* Make room for BYTES more bytes in the destination buffer of CODING
+   by enlarging that buffer's gap with make_gap.
+
+   If the destination is a buffer and is the same object as the
+   source, GAP_SIZE, ZV, Z, ZV_BYTE and Z_BYTE are temporarily
+   adjusted by the number of not-yet-consumed source bytes around the
+   make_gap call and restored afterwards.  (Presumably this keeps
+   make_gap from discarding the unconsumed source text -- confirm.)
+
+   Otherwise the destination buffer is made current for the duration
+   of the make_gap call and the previous buffer is restored.  This
+   branch must be unconditional: there is no other place where the
+   gap of a distinct destination buffer gets enlarged.  */
+static void
+coding_alloc_by_making_gap (coding, bytes)
+ struct coding_system *coding;
+ EMACS_INT bytes;
+{
+ if (BUFFERP (coding->dst_object)
+ && EQ (coding->src_object, coding->dst_object))
{
- CODING_ADD_COMPOSITION_START (coding, coding->produced_char, method);
- for (i = 0; i < ncomponent; i++)
- CODING_ADD_COMPOSITION_COMPONENT (coding, component[i]);
- CODING_ADD_COMPOSITION_END (coding, coding->produced_char + nchars);
- if (buf < bufp)
- {
- unsigned char *p = buf;
- EMIT_BYTES (p, bufp);
- *destination += bufp - buf;
- coding->produced_char += nchars;
- }
- return (src - src_base);
+ EMACS_INT add = coding->src_bytes - coding->consumed;
+
+ GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
+ make_gap (bytes);
+ GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
+ }
- else
+ else
+ {
+ Lisp_Object this_buffer;
+
+ this_buffer = Fcurrent_buffer ();
+ set_buffer_internal (XBUFFER (coding->dst_object));
+ make_gap (bytes);
+ set_buffer_internal (XBUFFER (this_buffer));
}
- label_end_of_loop:
- return -1;
}
-/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
-static void
-decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
+static unsigned char *
+alloc_destination (coding, nbytes, dst)
struct coding_system *coding;
- const unsigned char *source;
- unsigned char *destination;
- int src_bytes, dst_bytes;
+ EMACS_INT nbytes;
+ unsigned char *dst;
{
- const unsigned char *src = source;
- const unsigned char *src_end = source + src_bytes;
- unsigned char *dst = destination;
- unsigned char *dst_end = destination + dst_bytes;
- /* SRC_BASE remembers the start position in source in each loop.
- The loop will be exited when there's not enough source code, or
- when there's not enough destination area to produce a
- character. */
- const unsigned char *src_base;
+ EMACS_INT offset = dst - coding->destination;
+
+ if (BUFFERP (coding->dst_object))
+ coding_alloc_by_making_gap (coding, nbytes);
+ else
+ coding_alloc_by_realloc (coding, nbytes);
+ record_conversion_result (coding, CODING_RESULT_SUCCESS);
+ coding_set_destination (coding);
+ dst = coding->destination + offset;
+ return dst;
+}
+
+/** Macros for annotations. */
+
+/* Maximum length of annotation data (sum of annotations for
+ composition and charset). */
+#define MAX_ANNOTATION_LENGTH (4 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 4)
+
+/* An annotation data is stored in the array coding->charbuf in this
+ format:
+ [ -LENGTH ANNOTATION_MASK NCHARS ... ]
+ LENGTH is the number of elements in the annotation.
+ ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
+ NCHARS is the number of characters in the text annotated.
+
+ The format of the following elements depends on ANNOTATION_MASK.
+
+ In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
+ follow:
+ ... METHOD [ COMPOSITION-COMPONENTS ... ]
+ METHOD is one of enum composition_method.
+ Optional COMPOSITION-COMPONENTS are characters and composition
+ rules.
+
+ In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
+ follows. */
+
+/* Store an annotation header at BUF and advance BUF past it.  The
+   header is three elements: -LEN, MASK and NCHARS.  Also set
+   coding->annotated, so a variable named `coding' must be in scope
+   at the point of use.  BUF must be a modifiable lvalue.
+
+   The expansion deliberately has no trailing semicolon, so that the
+   macro behaves as a single statement (e.g. inside if/else).  */
+#define ADD_ANNOTATION_DATA(buf, len, mask, nchars) \
+  do { \
+    *(buf)++ = -(len); \
+    *(buf)++ = (mask); \
+    *(buf)++ = (nchars); \
+    coding->annotated = 1; \
+  } while (0)
+
+/* Store a composition annotation at BUF: the three-element header
+   (with LENGTH 4 and CODING_ANNOTATE_COMPOSITION_MASK) followed by
+   METHOD.  BUF is advanced by four elements.  */
+#define ADD_COMPOSITION_DATA(buf, nchars, method) \
+  do { \
+    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
+    *(buf)++ = (method); \
+  } while (0)
+
+
+/* Store a charset annotation at BUF: the three-element header (with
+   LENGTH 4 and CODING_ANNOTATE_CHARSET_MASK) followed by the charset
+   ID.  BUF is advanced by four elements.  */
+#define ADD_CHARSET_DATA(buf, nchars, id) \
+  do { \
+    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
+    *(buf)++ = (id); \
+  } while (0)
+
+\f
+/*** 2. Emacs' internal format (emacs-utf-8) ***/
+
+
+
+\f
+/*** 3. UTF-8 ***/
+
+/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
+ Check if a text is encoded in UTF-8. If it is, return 1, else
+ return 0. */
+
+/* Predicates classifying a UTF-8 byte C by its high bits:
+   UTF_8_1_OCTET_P:         0xxxxxxx  (C below 0x80)
+   UTF_8_EXTRA_OCTET_P:     10xxxxxx  (continuation byte)
+   UTF_8_2_OCTET_LEADING_P: 110xxxxx
+   UTF_8_3_OCTET_LEADING_P: 1110xxxx
+   UTF_8_4_OCTET_LEADING_P: 11110xxx
+   UTF_8_5_OCTET_LEADING_P: 111110xx
+   UTF_8_1_OCTET_P only compares against 0x80, so it is also true for
+   negative values; callers test for C < 0 separately.  */
+#define UTF_8_1_OCTET_P(c) ((c) < 0x80)
+#define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
+#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
+#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
+#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
+#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
+
+static int
+detect_coding_utf_8 (coding, detect_info)
+ struct coding_system *coding;
+ struct coding_detection_info *detect_info;
+{
+ const unsigned char *src = coding->source, *src_base;
+ const unsigned char *src_end = coding->source + coding->src_bytes;
+ int multibytep = coding->src_multibyte;
+ int consumed_chars = 0;
+ int found = 0;
+
+ detect_info->checked |= CATEGORY_MASK_UTF_8;
+ /* A coding system of this category is always ASCII compatible. */
+ src += coding->head_ascii;
- coding->produced_char = 0;
- while ((src_base = src) < src_end)
+ while (1)
{
- unsigned char tmp[MAX_MULTIBYTE_LENGTH];
- const unsigned char *p;
- int bytes;
+ int c, c1, c2, c3, c4;
- if (*src == '\r')
+ src_base = src;
+ ONE_MORE_BYTE (c);
+ if (c < 0 || UTF_8_1_OCTET_P (c))
+ continue;
+ ONE_MORE_BYTE (c1);
+ if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
+ break;
+ if (UTF_8_2_OCTET_LEADING_P (c))
{
- int c = *src++;
-
- if (coding->eol_type == CODING_EOL_CR)
- c = '\n';
- else if (coding->eol_type == CODING_EOL_CRLF)
- {
- ONE_MORE_BYTE (c);
- if (c != '\n')
- {
- src--;
- c = '\r';
- }
- }
- *dst++ = c;
- coding->produced_char++;
+ found = CATEGORY_MASK_UTF_8;
continue;
}
- else if (*src == '\n')
+ ONE_MORE_BYTE (c2);
+ if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
+ break;
+ if (UTF_8_3_OCTET_LEADING_P (c))
{
- if ((coding->eol_type == CODING_EOL_CR
- || coding->eol_type == CODING_EOL_CRLF)
- && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
- {
- coding->result = CODING_FINISH_INCONSISTENT_EOL;
- goto label_end_of_loop;
- }
- *dst++ = *src++;
- coding->produced_char++;
+ found = CATEGORY_MASK_UTF_8;
continue;
}
- else if (*src == 0x80 && coding->cmp_data)
+ ONE_MORE_BYTE (c3);
+ if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
+ break;
+ if (UTF_8_4_OCTET_LEADING_P (c))
{
- /* Start of composition data. */
- int consumed = decode_composition_emacs_mule (coding, src, src_end,
- &dst, dst_end,
- dst_bytes);
- if (consumed < 0)
- goto label_end_of_loop;
- else if (consumed > 0)
- {
- src += consumed;
- continue;
- }
- bytes = CHAR_STRING (*src, tmp);
- p = tmp;
- src++;
+ found = CATEGORY_MASK_UTF_8;
+ continue;
}
- else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
- || (coding->flags /* We are recovering a file. */
- && src[0] == LEADING_CODE_8_BIT_CONTROL
- && ! CHAR_HEAD_P (src[1])))
+ ONE_MORE_BYTE (c4);
+ if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
+ break;
+ if (UTF_8_5_OCTET_LEADING_P (c))
{
- p = src;
- src += bytes;
+ found = CATEGORY_MASK_UTF_8;
+ continue;
}
- else
- {
- int i, c;
+ break;
+ }
+ detect_info->rejected |= CATEGORY_MASK_UTF_8;
+ return 0;
- bytes = BYTES_BY_CHAR_HEAD (*src);
- src++;
- for (i = 1; i < bytes; i++)
- {
- ONE_MORE_BYTE (c);
- if (CHAR_HEAD_P (c))
- break;
- }
- if (i < bytes)
- {
- bytes = CHAR_STRING (*src_base, tmp);
- p = tmp;
- src = src_base + 1;
- }
- else
- {
- p = src_base;
- }
- }
- if (dst + bytes >= (dst_bytes ? dst_end : src))
- {
- coding->result = CODING_FINISH_INSUFFICIENT_DST;
- break;
- }
- while (bytes--) *dst++ = *p++;
- coding->produced_char++;
+ no_more_source:
+ if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
+ {
+ detect_info->rejected |= CATEGORY_MASK_UTF_8;
+ return 0;
}
- label_end_of_loop:
- coding->consumed = coding->consumed_char = src_base - source;
- coding->produced = dst - destination;
+ detect_info->found |= found;
+ return 1;
}
-/* Encode composition data stored at DATA into a special byte sequence
- starting by 0x80. Update CODING->cmp_data_start and maybe
- CODING->cmp_data for the next call. */
-
-#define ENCODE_COMPOSITION_EMACS_MULE(coding, data) \
- do { \
- unsigned char buf[1024], *p0 = buf, *p; \
- int len = data[0]; \
- int i; \
- \
- buf[0] = 0x80; \
- buf[1] = 0xF0 + data[3]; /* METHOD */ \
- buf[3] = 0xA0 + (data[2] - data[1]); /* COMPOSED-CHARS */ \
- p = buf + 4; \
- if (data[3] == COMPOSITION_WITH_RULE \
- || data[3] == COMPOSITION_WITH_RULE_ALTCHARS) \
- { \
- p += CHAR_STRING (data[4], p); \
- for (i = 5; i < len; i += 2) \
- { \
- int gref, nref; \
- COMPOSITION_DECODE_RULE (data[i], gref, nref); \
- *p++ = 0x20 + gref; \
- *p++ = 0x20 + nref; \
- p += CHAR_STRING (data[i + 1], p); \
- } \
- } \
- else \
- { \
- for (i = 4; i < len; i++) \
- p += CHAR_STRING (data[i], p); \
- } \
- buf[2] = 0xA0 + (p - buf); /* COMPONENTS-BYTES */ \
- \
- if (dst + (p - buf) + 4 > (dst_bytes ? dst_end : src)) \
- { \
- coding->result = CODING_FINISH_INSUFFICIENT_DST; \
- goto label_end_of_loop; \
- } \
- while (p0 < p) \
- *dst++ = *p0++; \
- coding->cmp_data_start += data[0]; \
- if (coding->cmp_data_start == coding->cmp_data->used \
- && coding->cmp_data->next) \
- { \
- coding->cmp_data = coding->cmp_data->next; \
- coding->cmp_data_start = 0; \
- } \
- } while (0)
-
-
-static void encode_eol P_ ((struct coding_system *, const unsigned char *,
- unsigned char *, int, int));
-
+/* Decode the UTF-8 bytes of CODING->source, starting at offset
+   CODING->consumed, into code points appended to CODING->charbuf.
+
+   Sequences of one to five octets are accepted.  Overlong forms,
+   surrogate code points (0xD800..0xDFFF) and five-octet values above
+   MAX_CHAR are treated as invalid.  For an invalid sequence only its
+   first byte is consumed; it is stored as itself when ASCII and as
+   BYTE8_TO_CHAR otherwise, and CODING->errors is incremented.  A
+   negative value delivered by ONE_MORE_BYTE is stored negated.
+
+   Decoding stops when the charbuf is full or the source is
+   exhausted (ONE_MORE_BYTE presumably jumps to `no_more_source';
+   its definition is not in this part of the file).  On exit
+   CODING->consumed_char, CODING->consumed and CODING->charbuf_used
+   describe the input consumed up to the last complete character.  */
static void
-encode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
+decode_coding_utf_8 (coding)
struct coding_system *coding;
- const unsigned char *source;
- unsigned char *destination;
- int src_bytes, dst_bytes;
{
- const unsigned char *src = source;
- const unsigned char *src_end = source + src_bytes;
- unsigned char *dst = destination;
- unsigned char *dst_end = destination + dst_bytes;
+ const unsigned char *src = coding->source + coding->consumed;
+ const unsigned char *src_end = coding->source + coding->src_bytes;
const unsigned char *src_base;
- int c;
- int char_offset;
- int *data;
-
- Lisp_Object translation_table;
+ int *charbuf = coding->charbuf + coding->charbuf_used;
+ int *charbuf_end = coding->charbuf + coding->charbuf_size;
+ int consumed_chars = 0, consumed_chars_base;
+ int multibytep = coding->src_multibyte;
+ Lisp_Object attr, charset_list;
- translation_table = Qnil;
+ CODING_GET_INFO (coding, attr, charset_list);
- /* Optimization for the case that there's no composition. */
- if (!coding->cmp_data || coding->cmp_data->used == 0)
- {
- encode_eol (coding, source, destination, src_bytes, dst_bytes);
- return;
- }
-
- char_offset = coding->cmp_data->char_offset;
- data = coding->cmp_data->data + coding->cmp_data_start;
while (1)
{
+ int c, c1, c2, c3, c4, c5;
+
src_base = src;
+ consumed_chars_base = consumed_chars;
- /* If SRC starts a composition, encode the information about the
- composition in advance. */
- if (coding->cmp_data_start < coding->cmp_data->used
- && char_offset + coding->consumed_char == data[1])
+ if (charbuf >= charbuf_end)
+ break;
+
+ ONE_MORE_BYTE (c1);
+ if (c1 < 0)
{
- ENCODE_COMPOSITION_EMACS_MULE (coding, data);
- char_offset = coding->cmp_data->char_offset;
- data = coding->cmp_data->data + coding->cmp_data_start;
+ c = - c1;
}
-
- ONE_MORE_CHAR (c);
- if (c == '\n' && (coding->eol_type == CODING_EOL_CRLF
- || coding->eol_type == CODING_EOL_CR))
+ else if (UTF_8_1_OCTET_P(c1))
{
- if (coding->eol_type == CODING_EOL_CRLF)
- EMIT_TWO_BYTES ('\r', c);
- else
- EMIT_ONE_BYTE ('\r');
+ c = c1;
}
- else if (SINGLE_BYTE_CHAR_P (c))
+ else
{
- if (coding->flags && ! ASCII_BYTE_P (c))
+ ONE_MORE_BYTE (c2);
+ if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
+ goto invalid_code;
+ if (UTF_8_2_OCTET_LEADING_P (c1))
{
- /* As we are auto saving, retain the multibyte form for
- 8-bit chars. */
- unsigned char buf[MAX_MULTIBYTE_LENGTH];
- int bytes = CHAR_STRING (c, buf);
-
- if (bytes == 1)
- EMIT_ONE_BYTE (buf[0]);
- else
- EMIT_TWO_BYTES (buf[0], buf[1]);
+ c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
+ /* Reject overlong sequences here and below. Encoders
+ producing them are incorrect, they can be misleading,
+ and they mess up read/write invariance. */
+ if (c < 128)
+ goto invalid_code;
}
else
- EMIT_ONE_BYTE (c);
+ {
+ ONE_MORE_BYTE (c3);
+ if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
+ goto invalid_code;
+ if (UTF_8_3_OCTET_LEADING_P (c1))
+ {
+ c = (((c1 & 0xF) << 12)
+ | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
+ if (c < 0x800
+ || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
+ goto invalid_code;
+ }
+ else
+ {
+ ONE_MORE_BYTE (c4);
+ if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
+ goto invalid_code;
+ if (UTF_8_4_OCTET_LEADING_P (c1))
+ {
+ c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
+ | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
+ if (c < 0x10000)
+ goto invalid_code;
+ }
+ else
+ {
+ ONE_MORE_BYTE (c5);
+ if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
+ goto invalid_code;
+ if (UTF_8_5_OCTET_LEADING_P (c1))
+ {
+ c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
+ | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
+ | (c5 & 0x3F));
+ if ((c > MAX_CHAR) || (c < 0x200000))
+ goto invalid_code;
+ }
+ else
+ goto invalid_code;
+ }
+ }
+ }
}
- else
- EMIT_BYTES (src_base, src);
- coding->consumed_char++;
- }
- label_end_of_loop:
- coding->consumed = src_base - source;
- coding->produced = coding->produced_char = dst - destination;
- return;
-}
-\f
-/*** 3. ISO2022 handlers ***/
+ *charbuf++ = c;
+ continue;
-/* The following note describes the coding system ISO2022 briefly.
- Since the intention of this note is to help understand the
- functions in this file, some parts are NOT ACCURATE or are OVERLY
- SIMPLIFIED. For thorough understanding, please refer to the
- original document of ISO2022. This is equivalent to the standard
- ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
+ invalid_code:
+ src = src_base;
+ consumed_chars = consumed_chars_base;
+ ONE_MORE_BYTE (c);
+ *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
+ coding->errors++;
+ }
- ISO2022 provides many mechanisms to encode several character sets
- in 7-bit and 8-bit environments. For 7-bit environments, all text
- is encoded using bytes less than 128. This may make the encoded
- text a little bit longer, but the text passes more easily through
- several types of gateway, some of which strip off the MSB (Most
- Significant Bit).
+ no_more_source:
+ coding->consumed_char += consumed_chars_base;
+ coding->consumed = src_base - coding->source;
+ coding->charbuf_used = charbuf - coding->charbuf;
+}
- There are two kinds of character sets: control character sets and
- graphic character sets. The former contain control characters such
- as `newline' and `escape' to provide control functions (control
- functions are also provided by escape sequences). The latter
- contain graphic characters such as 'A' and '-'. Emacs recognizes
- two control character sets and many graphic character sets.
- Graphic character sets are classified into one of the following
- four classes, according to the number of bytes (DIMENSION) and
- number of characters in one dimension (CHARS) of the set:
- - DIMENSION1_CHARS94
- - DIMENSION1_CHARS96
- - DIMENSION2_CHARS94
- - DIMENSION2_CHARS96
+static int
+encode_coding_utf_8 (coding)
+ struct coding_system *coding;
+{
+ int multibytep = coding->dst_multibyte;
+ int *charbuf = coding->charbuf;
+ int *charbuf_end = charbuf + coding->charbuf_used;
+ unsigned char *dst = coding->destination + coding->produced;
+ unsigned char *dst_end = coding->destination + coding->dst_bytes;
+ int produced_chars = 0;
+ int c;
- In addition, each character set is assigned an identification tag,
- unique for each set, called the "final character" (denoted as <F>
- hereafter). The <F> of each character set is decided by ECMA(*)
- when it is registered in ISO. The code range of <F> is 0x30..0x7F
- (0x30..0x3F are for private use only).
+ if (multibytep)
+ {
+ int safe_room = MAX_MULTIBYTE_LENGTH * 2;
- Note (*): ECMA = European Computer Manufacturers Association
+ while (charbuf < charbuf_end)
+ {
+ unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
- Here are examples of graphic character sets [NAME(<F>)]:
+ ASSURE_DESTINATION (safe_room);
+ c = *charbuf++;
+ if (CHAR_BYTE8_P (c))
+ {
+ c = CHAR_TO_BYTE8 (c);
+ EMIT_ONE_BYTE (c);
+ }
+ else
+ {
+ CHAR_STRING_ADVANCE (c, pend);
+ for (p = str; p < pend; p++)
+ EMIT_ONE_BYTE (*p);
+ }
+ }
+ }
+ else
+ {
+ int safe_room = MAX_MULTIBYTE_LENGTH;
+
+ while (charbuf < charbuf_end)
+ {
+ ASSURE_DESTINATION (safe_room);
+ c = *charbuf++;
+ if (CHAR_BYTE8_P (c))
+ *dst++ = CHAR_TO_BYTE8 (c);
+ else
+ dst += CHAR_STRING (c, dst);
+ produced_chars++;
+ }
+ }
+ record_conversion_result (coding, CODING_RESULT_SUCCESS);
+ coding->produced_char += produced_chars;
+ coding->produced = dst - coding->destination;
+ return 0;
+}
+
+
+/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
+ Check if a text is encoded in one of UTF-16 based coding systems.
+ If it is, return 1, else return 0. */
+
+/* Nonzero if the 16-bit code unit VAL is a high (leading) surrogate,
+   i.e. in the range 0xD800..0xDBFF.  */
+#define UTF_16_HIGH_SURROGATE_P(val) \
+ (((val) & 0xFC00) == 0xD800)
+
+/* Nonzero if the 16-bit code unit VAL is a low (trailing) surrogate,
+   i.e. in the range 0xDC00..0xDFFF.  */
+#define UTF_16_LOW_SURROGATE_P(val) \
+ (((val) & 0xFC00) == 0xDC00)
+
+/* Nonzero if VAL is 0xFFFE, 0xFFFF or a low surrogate.  VAL is
+   evaluated more than once.  */
+#define UTF_16_INVALID_P(val) \
+ (((val) == 0xFFFE) \
+ || ((val) == 0xFFFF) \
+ || UTF_16_LOW_SURROGATE_P (val))
+
+
+/* Check whether the source of CODING may be UTF-16, recording the
+   verdict in DETECT_INFO.
+
+   CATEGORY_MASK_UTF_16 is always added to DETECT_INFO->checked.  If
+   this is the last block and CODING->src_chars is odd, the whole
+   UTF-16 category is rejected and 0 is returned.  Otherwise the
+   first two bytes are examined for a byte order mark and 1 is
+   returned, also when fewer than two bytes are available
+   (ONE_MORE_BYTE presumably jumps to `no_more_source' in that case;
+   its definition is not in this part of the file).  */
+static int
+detect_coding_utf_16 (coding, detect_info)
+ struct coding_system *coding;
+ struct coding_detection_info *detect_info;
+{
+ const unsigned char *src = coding->source, *src_base = src;
+ const unsigned char *src_end = coding->source + coding->src_bytes;
+ int multibytep = coding->src_multibyte;
+ int consumed_chars = 0;
+ int c1, c2;
+
+ detect_info->checked |= CATEGORY_MASK_UTF_16;
+ if (coding->mode & CODING_MODE_LAST_BLOCK
+ && (coding->src_chars & 1))
+ {
+ detect_info->rejected |= CATEGORY_MASK_UTF_16;
+ return 0;
+ }
+
+ ONE_MORE_BYTE (c1);
+ ONE_MORE_BYTE (c2);
+ /* Bytes FF FE: little-endian BOM. */
+ if ((c1 == 0xFF) && (c2 == 0xFE))
+ {
+ detect_info->found |= (CATEGORY_MASK_UTF_16_LE
+ | CATEGORY_MASK_UTF_16_AUTO);
+ detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
+ | CATEGORY_MASK_UTF_16_BE_NOSIG
+ | CATEGORY_MASK_UTF_16_LE_NOSIG);
+ }
+ /* Bytes FE FF: big-endian BOM. */
+ else if ((c1 == 0xFE) && (c2 == 0xFF))
+ {
+ detect_info->found |= (CATEGORY_MASK_UTF_16_BE
+ | CATEGORY_MASK_UTF_16_AUTO);
+ detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
+ | CATEGORY_MASK_UTF_16_BE_NOSIG
+ | CATEGORY_MASK_UTF_16_LE_NOSIG);
+ }
+ /* Two ordinary bytes that are not a BOM: only the two BOM-signed
+ categories are rejected. */
+ else if (c1 >= 0 && c2 >= 0)
+ {
+ detect_info->rejected
+ |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE);
+ }
+ no_more_source:
+ return 1;
+}
+
+/* Decode the UTF-16 bytes of CODING->source, starting at offset
+   CODING->consumed, into code points appended to CODING->charbuf.
+
+   If CODING expects a byte order mark, the first two bytes are
+   checked against the BOM for the configured endianness; on mismatch
+   they are decoded as ordinary text and CODING->errors is
+   incremented.  A high surrogate is remembered in CODING (so that it
+   survives across calls) until the next code unit arrives.  If that
+   unit is not a low surrogate, the two bytes of the unpaired
+   surrogate are stored as separate elements, CODING->errors is
+   incremented, and the pending state is replaced (another high
+   surrogate) or cleared (any other unit).
+
+   A negative value delivered by ONE_MORE_BYTE is stored negated.
+   Decoding stops when the charbuf is nearly full or the source is
+   exhausted (ONE_MORE_BYTE presumably jumps to `no_more_source';
+   its definition is not in this part of the file).  On exit
+   CODING->consumed_char, CODING->consumed and CODING->charbuf_used
+   describe the input consumed up to the last complete code unit.  */
+static void
+decode_coding_utf_16 (coding)
+     struct coding_system *coding;
+{
+  const unsigned char *src = coding->source + coding->consumed;
+  const unsigned char *src_end = coding->source + coding->src_bytes;
+  const unsigned char *src_base;
+  int *charbuf = coding->charbuf + coding->charbuf_used;
+  int *charbuf_end = coding->charbuf + coding->charbuf_size;
+  /* CONSUMED_CHARS_BASE is read at `no_more_source', which can be
+     reached from the BOM check before the main loop has assigned it,
+     so it must start out as 0.  */
+  int consumed_chars = 0, consumed_chars_base = 0;
+  int multibytep = coding->src_multibyte;
+  enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
+  enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
+  int surrogate = CODING_UTF_16_SURROGATE (coding);
+  Lisp_Object attr, charset_list;
+
+  CODING_GET_INFO (coding, attr, charset_list);
+
+  if (bom == utf_16_with_bom)
+    {
+      int c, c1, c2;
+
+      src_base = src;
+      ONE_MORE_BYTE (c1);
+      ONE_MORE_BYTE (c2);
+      /* C is always assembled high byte first, so a little-endian
+         BOM (bytes FF FE) reads as 0xFFFE here.  */
+      c = (c1 << 8) | c2;
+
+      if (endian == utf_16_big_endian
+          ? c != 0xFEFF : c != 0xFFFE)
+        {
+          /* The first two bytes are not BOM.  Treat them as bytes
+             for a normal character.  */
+          src = src_base;
+          coding->errors++;
+        }
+      /* The BOM is looked for only once per conversion.  */
+      CODING_UTF_16_BOM (coding) = utf_16_without_bom;
+    }
+  else if (bom == utf_16_detect_bom)
+    {
+      /* We have already tried to detect BOM and failed in
+         detect_coding.  */
+      CODING_UTF_16_BOM (coding) = utf_16_without_bom;
+    }
+
+  while (1)
+    {
+      int c, c1, c2;
+
+      src_base = src;
+      consumed_chars_base = consumed_chars;
+
+      /* One iteration may store up to three elements (the two bytes
+         of an unpaired surrogate plus the current character).  */
+      if (charbuf + 2 >= charbuf_end)
+        break;
+
+      ONE_MORE_BYTE (c1);
+      if (c1 < 0)
+        {
+          *charbuf++ = -c1;
+          continue;
+        }
+      ONE_MORE_BYTE (c2);
+      if (c2 < 0)
+        {
+          *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
+          *charbuf++ = -c2;
+          continue;
+        }
+      c = (endian == utf_16_big_endian
+           ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
+      if (surrogate)
+        {
+          if (! UTF_16_LOW_SURROGATE_P (c))
+            {
+              /* Unpaired high surrogate: store its two bytes in
+                 source byte order and count an error.  */
+              if (endian == utf_16_big_endian)
+                c1 = surrogate >> 8, c2 = surrogate & 0xFF;
+              else
+                c1 = surrogate & 0xFF, c2 = surrogate >> 8;
+              *charbuf++ = c1;
+              *charbuf++ = c2;
+              coding->errors++;
+              if (UTF_16_HIGH_SURROGATE_P (c))
+                CODING_UTF_16_SURROGATE (coding) = surrogate = c;
+              else
+                {
+                  /* The pending surrogate has been dealt with; forget
+                     it, otherwise its bytes would be stored again on
+                     every following iteration.  */
+                  CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
+                  *charbuf++ = c;
+                }
+            }
+          else
+            {
+              c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
+              CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
+              *charbuf++ = 0x10000 + c;
+            }
+        }
+      else
+        {
+          if (UTF_16_HIGH_SURROGATE_P (c))
+            CODING_UTF_16_SURROGATE (coding) = surrogate = c;
+          else
+            *charbuf++ = c;
+        }
+    }
+
+ no_more_source:
+  coding->consumed_char += consumed_chars_base;
+  coding->consumed = src_base - coding->source;
+  coding->charbuf_used = charbuf - coding->charbuf;
+}
+
+/* Encode the code points in CODING->charbuf as UTF-16 and append the
+   bytes to CODING->destination, starting at offset CODING->produced.
+
+   Unless CODING is set up as BOM-less, a byte order mark for the
+   configured endianness is written first, and CODING is then
+   switched to BOM-less so that the mark is written only once.  Code
+   points below 0x10000 are written as one 16-bit unit, larger ones
+   as a surrogate pair.  Code points above MAX_UNICODE_CHAR are
+   replaced by CODING->default_char.
+
+   The result is recorded as CODING_RESULT_SUCCESS, CODING->produced
+   and CODING->produced_char are updated, and 0 is returned.  */
+static int
+encode_coding_utf_16 (coding)
+     struct coding_system *coding;
+{
+  /* MULTIBYTEP and PRODUCED_CHARS are not referenced directly below;
+     presumably the EMIT_* macros use them (their definitions are not
+     in this part of the file), so they are kept.  */
+  int multibytep = coding->dst_multibyte;
+  int *charbuf = coding->charbuf;
+  int *charbuf_end = charbuf + coding->charbuf_used;
+  unsigned char *dst = coding->destination + coding->produced;
+  unsigned char *dst_end = coding->destination + coding->dst_bytes;
+  /* Room requested before each step; a step emits at most four byte
+     values (a surrogate pair).  */
+  int safe_room = 8;
+  enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
+  int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
+  int produced_chars = 0;
+  Lisp_Object attrs, charset_list;
+  int c;
+
+  CODING_GET_INFO (coding, attrs, charset_list);
+
+  if (bom != utf_16_without_bom)
+    {
+      ASSURE_DESTINATION (safe_room);
+      if (big_endian)
+        EMIT_TWO_BYTES (0xFE, 0xFF);
+      else
+        EMIT_TWO_BYTES (0xFF, 0xFE);
+      CODING_UTF_16_BOM (coding) = utf_16_without_bom;
+    }
+
+  while (charbuf < charbuf_end)
+    {
+      ASSURE_DESTINATION (safe_room);
+      c = *charbuf++;
+      /* MAX_UNICODE_CHAR is taken to be the largest Unicode code
+         point and hence itself encodable (NOTE(review): confirm its
+         definition is 0x10FFFF); only values above it fall back to
+         the default character.  */
+      if (c > MAX_UNICODE_CHAR)
+        c = coding->default_char;
+
+      if (c < 0x10000)
+        {
+          if (big_endian)
+            EMIT_TWO_BYTES (c >> 8, c & 0xFF);
+          else
+            EMIT_TWO_BYTES (c & 0xFF, c >> 8);
+        }
+      else
+        {
+          /* Split into a high (C1) and low (C2) surrogate.  */
+          int c1, c2;
+
+          c -= 0x10000;
+          c1 = (c >> 10) + 0xD800;
+          c2 = (c & 0x3FF) + 0xDC00;
+          if (big_endian)
+            EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
+          else
+            EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
+        }
+    }
+  record_conversion_result (coding, CODING_RESULT_SUCCESS);
+  coding->produced = dst - coding->destination;
+  coding->produced_char += produced_chars;
+  return 0;
+}
+
+\f
+/*** 6. Old Emacs' internal format (emacs-mule) ***/
+
+/* Emacs' internal format for representation of multiple character
+ sets is a kind of multi-byte encoding, i.e. characters are
+ represented by variable-length sequences of one-byte codes.
+
+ ASCII characters and control characters (e.g. `tab', `newline') are
+ represented by one-byte sequences which are their ASCII codes, in
+ the range 0x00 through 0x7F.
+
+ 8-bit characters of the range 0x80..0x9F are represented by
+ two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
+ code + 0x20).
+
+ 8-bit characters of the range 0xA0..0xFF are represented by
+ one-byte sequences which are their 8-bit code.
+
+ The other characters are represented by a sequence of `base
+ leading-code', optional `extended leading-code', and one or two
+ `position-code's. The length of the sequence is determined by the
+ base leading-code. Leading-code takes the range 0x81 through 0x9D,
+ whereas extended leading-code and position-code take the range 0xA0
+ through 0xFF. See `charset.h' for more details about leading-code
+ and position-code.
+
+ --- CODE RANGE of Emacs' internal format ---
+ character set range
+ ------------- -----
+ ascii 0x00..0x7F
+ eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
+ eight-bit-graphic 0xA0..0xFF
+ ELSE 0x81..0x9D + [0xA0..0xFF]+
+ ---------------------------------------------
+
+ As this is the internal character representation, the format is
+ usually not used externally (i.e. in a file or in a data sent to a
+ process). But, it is possible to have a text externally in this
+ format (i.e. by encoding by the coding system `emacs-mule').
+
+ In that case, a sequence of one-byte codes has a slightly different
+ form.
+
+ At first, all characters in eight-bit-control are represented by
+ one-byte sequences which are their 8-bit code.
+
+ Next, character composition data are represented by the byte
+ sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
+ where,
+ METHOD is 0xF2 plus one of the composition methods (enum
+ composition_method),
+
+ BYTES is 0xA0 plus a byte length of this composition data,
+
+ CHARS is 0xA0 plus the number of characters composed by this
+ data,
+
+ COMPONENTs are characters in multibyte form or composition
+ rules encoded by two bytes of ASCII codes.
+
+ In addition, for backward compatibility, the following formats are
+ also recognized as composition data on decoding.
+
+ 0x80 MSEQ ...
+ 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
+
+ Here,
+ MSEQ is a multibyte form but in this special format:
+ ASCII: 0xA0 ASCII_CODE+0x80,
+ other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
+ RULE is a one byte code of the range 0xA0..0xF0 that
+ represents a composition rule.
+ */
+
+/* Table indexed by a byte value. emacs_mule_char dispatches on the
+ entry for the first byte of a sequence (values 1 through 4), and
+ detect_coding_emacs_mule uses the entry minus one as the number of
+ bytes expected to follow. No initializer is given here; the table
+ is presumably filled in elsewhere. */
+char emacs_mule_bytes[256];
+
+/* Decode one emacs-mule byte sequence starting at SRC and return the
+ resulting character code.
+
+ On success store the number of source bytes used in *NBYTES, the
+ number of source characters used in *NCHARS and, when ID is not
+ NULL, the id of the character's charset in *ID.
+
+ Return -1 if the sequence is invalid and -2 if the source ends
+ before the sequence is complete; the output parameters are not
+ written in those cases.
+
+ The end of the source is taken from CODING. ONE_MORE_BYTE is
+ defined elsewhere; it evidently jumps to `no_more_source' when no
+ byte is left and can yield a negative value (handled below). */
+int
+emacs_mule_char (coding, src, nbytes, nchars, id)
+ struct coding_system *coding;
+ const unsigned char *src;
+ int *nbytes, *nchars, *id;
+{
+ const unsigned char *src_end = coding->source + coding->src_bytes;
+ const unsigned char *src_base = src;
+ int multibytep = coding->src_multibyte;
+ struct charset *charset;
+ unsigned code;
+ int c;
+ int consumed_chars = 0;
+
+ ONE_MORE_BYTE (c);
+ if (c < 0)
+ {
+ /* A negative value carries the negated character code; it is
+ attributed to the charset in slot 0. */
+ c = -c;
+ charset = emacs_mule_charset[0];
+ }
+ else
+ {
+ /* The table entry for the first byte selects the sequence length. */
+ switch (emacs_mule_bytes[c])
+ {
+ case 2:
+ /* Leading code + one position code. */
+ if (! (charset = emacs_mule_charset[c]))
+ goto invalid_code;
+ ONE_MORE_BYTE (c);
+ if (c < 0xA0)
+ goto invalid_code;
+ code = c & 0x7F;
+ break;
+
+ case 3:
+ if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
+ || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
+ {
+ /* Private leading code + charset byte + one position code. */
+ ONE_MORE_BYTE (c);
+ if (c < 0xA0 || ! (charset = emacs_mule_charset[c]))
+ goto invalid_code;
+ ONE_MORE_BYTE (c);
+ if (c < 0xA0)
+ goto invalid_code;
+ code = c & 0x7F;
+ }
+ else
+ {
+ /* Leading code + two position codes. */
+ if (! (charset = emacs_mule_charset[c]))
+ goto invalid_code;
+ ONE_MORE_BYTE (c);
+ if (c < 0xA0)
+ goto invalid_code;
+ code = (c & 0x7F) << 8;
+ ONE_MORE_BYTE (c);
+ if (c < 0xA0)
+ goto invalid_code;
+ code |= c & 0x7F;
+ }
+ break;
+
+ case 4:
+ /* First byte + charset byte + two position codes. */
+ ONE_MORE_BYTE (c);
+ if (c < 0 || ! (charset = emacs_mule_charset[c]))
+ goto invalid_code;
+ ONE_MORE_BYTE (c);
+ if (c < 0xA0)
+ goto invalid_code;
+ code = (c & 0x7F) << 8;
+ ONE_MORE_BYTE (c);
+ if (c < 0xA0)
+ goto invalid_code;
+ code |= c & 0x7F;
+ break;
+
+ case 1:
+ /* A single byte: ASCII, or else an eight-bit byte. */
+ code = c;
+ charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
+ ? charset_ascii : charset_eight_bit);
+ break;
+
+ default:
+ abort ();
+ }
+ c = DECODE_CHAR (charset, code);
+ if (c < 0)
+ goto invalid_code;
+ }
+ *nbytes = src - src_base;
+ *nchars = consumed_chars;
+ if (id)
+ *id = charset->id;
+ return c;
+
+ no_more_source:
+ return -2;
+
+ invalid_code:
+ return -1;
+}
+
+
+/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
+ Check if a text is encoded in `emacs-mule'. If it is, return 1,
+ else return 0.
+
+ DETECT_INFO->checked always gets CATEGORY_MASK_EMACS_MULE. When 0
+ is returned the same mask is added to DETECT_INFO->rejected. When
+ 1 is returned, DETECT_INFO->found gets the mask only if at least
+ one multi-byte sequence or composition was actually seen. */
+
+static int
+detect_coding_emacs_mule (coding, detect_info)
+ struct coding_system *coding;
+ struct coding_detection_info *detect_info;
+{
+ const unsigned char *src = coding->source, *src_base;
+ const unsigned char *src_end = coding->source + coding->src_bytes;
+ int multibytep = coding->src_multibyte;
+ int consumed_chars = 0;
+ int c;
+ int found = 0;
+
+ detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
+ /* A coding system of this category is always ASCII compatible. */
+ src += coding->head_ascii;
+
+ while (1)
+ {
+ src_base = src;
+ ONE_MORE_BYTE (c);
+ if (c < 0)
+ continue;
+ if (c == 0x80)
+ {
+ /* Perhaps the start of composite character. We simply skip
+ it because analyzing it is too heavy for detecting. But,
+ at least, we check that the composite character
+ consists of more than 4 bytes. */
+ const unsigned char *src_base;
+
+ repeat:
+ src_base = src;
+ do
+ {
+ ONE_MORE_BYTE (c);
+ }
+ while (c >= 0xA0);
+
+ if (src - src_base <= 4)
+ break;
+ found = CATEGORY_MASK_EMACS_MULE;
+ if (c == 0x80)
+ goto repeat;
+ }
+
+ if (c < 0x80)
+ {
+ if (c < 0x20
+ && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
+ break;
+ }
+ else
+ {
+ /* Look the length up by C, the byte just read. After a
+ composition has been skipped above, the outer SRC_BASE
+ still points at the 0x80 that introduced it, not at this
+ byte. */
+ int more_bytes = emacs_mule_bytes[c] - 1;
+
+ while (more_bytes > 0)
+ {
+ ONE_MORE_BYTE (c);
+ if (c < 0xA0)
+ {
+ src--; /* Unread the last byte. */
+ break;
+ }
+ more_bytes--;
+ }
+ if (more_bytes != 0)
+ break;
+ found = CATEGORY_MASK_EMACS_MULE;
+ }
+ }
+ detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
+ return 0;
+
+ no_more_source:
+ /* The source ended in the middle of a sequence; that is an error
+ only when no further block will follow. */
+ if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
+ {
+ detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
+ return 0;
+ }
+ detect_info->found |= found;
+ return 1;
+}
+
+
+/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
+
+/* Decode one character of a composition sequence of Emacs 20/21
+ style at SRC by calling emacs_mule_char. Store the character in
+ *BUF and increment BUF; advance SRC and CONSUMED_CHARS by the byte
+ and character counts that emacs_mule_char reports.
+
+ The macro expands to `if (1) { ... } else', so the semicolon
+ written after an invocation supplies the empty else-branch. Its
+ `break' statements therefore act on whatever loop (or do-while)
+ encloses the invocation: the macro breaks out when SRC has reached
+ SRC_END or when emacs_mule_char returns -2. Any other negative
+ result jumps to the label `invalid_code', which the expanding
+ function must provide. */
+
+#define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
+ if (1) \
+ { \
+ int c; \
+ int nbytes, nchars; \
+ \
+ if (src == src_end) \
+ break; \
+ c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
+ if (c < 0) \
+ { \
+ if (c == -2) \
+ break; \
+ goto invalid_code; \
+ } \
+ *buf++ = c; \
+ src += nbytes; \
+ consumed_chars += nchars; \
+ } \
+ else
+
+
+/* Decode a composition rule represented as a component of composition
+ sequence of Emacs 20 style at SRC. The rule is a single byte in
+ the range 0xA0..0xF0, i.e. 0xA0 plus a value below 81 that encodes
+ GREF * 9 + NREF. Store the decoded rule in *BUF and increment
+ BUF. If no byte is left at SRC or the byte is outside that range,
+ jump to the label `invalid_code'. */
+
+#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf) \
+ do { \
+ int c, gref, nref; \
+ \
+ if (src >= src_end) \
+ goto invalid_code; \
+ ONE_MORE_BYTE_NO_CHECK (c); \
+ c -= 0xA0; \
+ if (c < 0 || c >= 81) \
+ goto invalid_code; \
+ \
+ gref = c / 9, nref = c % 9; \
+ *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
+ } while (0)
+
+
+/* Decode a composition rule represented as a component of composition
+ sequence of Emacs 21 style at SRC. The rule occupies two bytes:
+ 0x20 plus GREF followed by 0x20 plus NREF. Store the decoded rule
+ in *BUF and increment BUF. If fewer than two bytes are left at SRC
+ or either value is outside 0..80, jump to the label
+ `invalid_code'. */
+
+#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf) \
+ do { \
+ int gref, nref; \
+ \
+ if (src + 1>= src_end) \
+ goto invalid_code; \
+ ONE_MORE_BYTE_NO_CHECK (gref); \
+ gref -= 0x20; \
+ ONE_MORE_BYTE_NO_CHECK (nref); \
+ nref -= 0x20; \
+ if (gref < 0 || gref >= 81 \
+ || nref < 0 || nref >= 81) \
+ goto invalid_code; \
+ *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
+ } while (0)
+
+
+/* Decode the header of an Emacs 21 style composition whose METHOD
+ byte is already in C, and add the composition data to CHARBUF.
+
+ The next two bytes give BYTES and CHARS, each stored as 0xA0 plus
+ the value; a negative byte or BYTES below 3 jumps to
+ `invalid_code'. For every method except COMPOSITION_RELATIVE the
+ components that follow are decoded directly into CHARBUF until
+ CONSUMED_CHARS reaches CONSUMED_CHARS_BASE + BYTES: components at
+ odd positions are rules unless the method is
+ COMPOSITION_WITH_ALTCHARS, all others are characters. Afterwards
+ the first element written by ADD_COMPOSITION_DATA is decreased by
+ the number of components decoded.
+
+ The expanding function must provide CHARBUF, CONSUMED_CHARS,
+ CONSUMED_CHARS_BASE and the label `invalid_code'.
+
+ NOTE(review): the loop limit adds a byte length to a character
+ count; confirm that this is intended for multibyte sources. */
+#define DECODE_EMACS_MULE_21_COMPOSITION(c) \
+ do { \
+ /* Emacs 21 style format. The first three bytes at SRC are \
+ (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
+ the byte length of this composition information, CHARS is the \
+ number of characters composed by this composition. */ \
+ enum composition_method method = c - 0xF2; \
+ int *charbuf_base = charbuf; \
+ int consumed_chars_limit; \
+ int nbytes, nchars; \
+ \
+ ONE_MORE_BYTE (c); \
+ if (c < 0) \
+ goto invalid_code; \
+ nbytes = c - 0xA0; \
+ if (nbytes < 3) \
+ goto invalid_code; \
+ ONE_MORE_BYTE (c); \
+ if (c < 0) \
+ goto invalid_code; \
+ nchars = c - 0xA0; \
+ ADD_COMPOSITION_DATA (charbuf, nchars, method); \
+ consumed_chars_limit = consumed_chars_base + nbytes; \
+ if (method != COMPOSITION_RELATIVE) \
+ { \
+ int i = 0; \
+ while (consumed_chars < consumed_chars_limit) \
+ { \
+ if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
+ DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf); \
+ else \
+ DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
+ i++; \
+ } \
+ if (consumed_chars < consumed_chars_limit) \
+ goto invalid_code; \
+ charbuf_base[0] -= i; \
+ } \
+ } while (0)
+
+
+/* Decode an Emacs 20 style relative composition (0x80 MSEQ ...).
+
+ SRC is reset to SRC_BASE and the leading 0x80 is read again, then
+ up to MAX_COMPOSITION_COMPONENTS characters are decoded into a
+ local array; the loop ends early when
+ DECODE_EMACS_MULE_COMPOSITION_CHAR breaks out of it. Fewer than
+ two characters jump to `invalid_code'. Otherwise composition data
+ for the decoded characters is added to CHARBUF, followed by the
+ characters themselves.
+
+ NOTE(review): SRC is rewound here but CONSUMED_CHARS is not;
+ confirm that the character count stays correct. */
+#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
+ do { \
+ /* Emacs 20 style format for relative composition. */ \
+ /* Store multibyte form of characters to be composed. */ \
+ enum composition_method method = COMPOSITION_RELATIVE; \
+ int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
+ int *buf = components; \
+ int i, j; \
+ \
+ src = src_base; \
+ ONE_MORE_BYTE (c); /* skip 0x80 */ \
+ for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
+ DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
+ if (i < 2) \
+ goto invalid_code; \
+ ADD_COMPOSITION_DATA (charbuf, i, method); \
+ for (j = 0; j < i; j++) \
+ *charbuf++ = components[j]; \
+ } while (0)
+
+
+/* Decode an Emacs 20 style rule-base composition
+ (0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ).
+
+ One character is decoded first, then (rule, character) pairs, all
+ into the local array COMPONENTS. The pair loop is bounded so that
+ at most MAX_COMPOSITION_COMPONENTS * 2 - 1 elements are stored,
+ which is exactly the size of COMPONENTS. An even element count or
+ no complete pair jumps to `invalid_code'; lack of room in CHARBUF
+ jumps to `no_more_source'. Otherwise composition data is added to
+ CHARBUF, followed by elements copied from COMPONENTS.
+
+ NOTE(review): I counts (rule, character) pairs, but the room check
+ and the two copy loops below read as if it were the total number
+ of elements in COMPONENTS -- confirm the intended count. */
+#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
+ do { \
+ /* Emacs 20 style format for rule-base composition. */ \
+ /* Store multibyte form of characters to be composed. */ \
+ enum composition_method method = COMPOSITION_WITH_RULE; \
+ int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
+ int *buf = components; \
+ int i, j; \
+ \
+ DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
+ for (i = 0; i < MAX_COMPOSITION_COMPONENTS - 1; i++) \
+ { \
+ DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \
+ DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
+ } \
+ if (i < 1 || (buf - components) % 2 == 0) \
+ goto invalid_code; \
+ /* Stop only when CHARBUF can NOT hold what is written below. */ \
+ if (charbuf + i + (i / 2) + 1 > charbuf_end) \
+ goto no_more_source; \
+ ADD_COMPOSITION_DATA (charbuf, i, method); \
+ for (j = 0; j < i; j++) \
+ *charbuf++ = components[j]; \
+ for (j = 0; j < i; j += 2) \
+ *charbuf++ = components[j]; \
+ } while (0)
+
+
+/* Decode the emacs-mule encoded bytes of CODING->source, starting at
+ CODING->consumed, into character codes appended to
+ CODING->charbuf.
+
+ Each pass of the loop handles one unit: a single byte below 0x80,
+ a composition sequence introduced by 0x80, or a multi-byte
+ character decoded by emacs_mule_char. Whenever the charset of a
+ multi-byte character differs from the previous one, charset data
+ for the preceding run is added to the buffer (unless that run was
+ ASCII). A unit that cannot be decoded is recovered at
+ `invalid_code': its first byte is stored as a single character and
+ CODING->errors is incremented.
+
+ On return CODING->consumed, CODING->consumed_char and
+ CODING->charbuf_used describe the input used up to the last
+ completely decoded unit. */
+static void
+decode_coding_emacs_mule (coding)
+ struct coding_system *coding;
+{
+ const unsigned char *src = coding->source + coding->consumed;
+ const unsigned char *src_end = coding->source + coding->src_bytes;
+ const unsigned char *src_base;
+ int *charbuf = coding->charbuf + coding->charbuf_used;
+ int *charbuf_end
+ = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
+ int consumed_chars = 0, consumed_chars_base;
+ int multibytep = coding->src_multibyte;
+ Lisp_Object attrs, charset_list;
+ int char_offset = coding->produced_char;
+ int last_offset = char_offset;
+ int last_id = charset_ascii;
+
+ CODING_GET_INFO (coding, attrs, charset_list);
+
+ while (1)
+ {
+ int c;
+
+ /* Remember where this unit starts so it can be re-read or left
+ unconsumed. */
+ src_base = src;
+ consumed_chars_base = consumed_chars;
+
+ if (charbuf >= charbuf_end)
+ break;
+
+ ONE_MORE_BYTE (c);
+ if (c < 0)
+ {
+ *charbuf++ = -c;
+ char_offset++;
+ }
+ else if (c < 0x80)
+ {
+ *charbuf++ = c;
+ char_offset++;
+ }
+ else if (c == 0x80)
+ {
+ /* The byte after 0x80 selects the composition format. */
+ ONE_MORE_BYTE (c);
+ if (c < 0)
+ goto invalid_code;
+ if (c - 0xF2 >= COMPOSITION_RELATIVE
+ && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
+ DECODE_EMACS_MULE_21_COMPOSITION (c);
+ else if (c < 0xC0)
+ DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
+ else if (c == 0xFF)
+ DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
+ else
+ goto invalid_code;
+ }
+ else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
+ {
+ int nbytes, nchars;
+ int id;
+
+ /* Let emacs_mule_char decode the whole sequence from its
+ first byte. */
+ src = src_base;
+ consumed_chars = consumed_chars_base;
+ c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
+ if (c < 0)
+ {
+ if (c == -2)
+ break;
+ goto invalid_code;
+ }
+ if (last_id != id)
+ {
+ if (last_id != charset_ascii)
+ ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
+ last_id = id;
+ last_offset = char_offset;
+ }
+ *charbuf++ = c;
+ src += nbytes;
+ consumed_chars += nchars;
+ char_offset++;
+ }
+ else
+ /* Any other byte starts no known sequence. Keep it as a
+ single character via the recovery path rather than dropping
+ it silently. */
+ goto invalid_code;
+ continue;
+
+ invalid_code:
+ src = src_base;
+ consumed_chars = consumed_chars_base;
+ ONE_MORE_BYTE (c);
+ *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
+ char_offset++;
+ coding->errors++;
+ }
+
+ no_more_source:
+ if (last_id != charset_ascii)
+ ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
+ coding->consumed_char += consumed_chars_base;
+ coding->consumed = src_base - coding->source;
+ coding->charbuf_used = charbuf - coding->charbuf;
+}
+
+
+/* Set CODES[0] and CODES[1] to the leading codes for the emacs-mule
+ charset id ID. An id below 0xA0 is its own leading code and
+ CODES[1] is 0. A larger id is stored in CODES[1] and preceded by
+ 0x9A (id < 0xE0), 0x9B (id < 0xF0), 0x9C (id < 0xF5) or 0x9D.
+
+ The expansion does not end in a semicolon, so an invocation
+ followed by `;' is exactly one statement and can be used in an
+ unbraced if/else. ID is evaluated more than once. */
+#define EMACS_MULE_LEADING_CODES(id, codes) \
+ do { \
+ if ((id) < 0xA0) \
+ (codes)[0] = (id), (codes)[1] = 0; \
+ else if ((id) < 0xE0) \
+ (codes)[0] = 0x9A, (codes)[1] = (id); \
+ else if ((id) < 0xF0) \
+ (codes)[0] = 0x9B, (codes)[1] = (id); \
+ else if ((id) < 0xF5) \
+ (codes)[0] = 0x9C, (codes)[1] = (id); \
+ else \
+ (codes)[0] = 0x9D, (codes)[1] = (id); \
+ } while (0)
+
+
+/* Encode the characters in CODING->charbuf in the emacs-mule format
+ and store the resulting bytes at CODING->destination +
+ CODING->produced.
+
+ A negative element of the buffer starts an annotation whose length
+ is the negated value. A charset annotation selects a preferred
+ charset for the characters that follow, provided that charset is
+ in the coding system's charset list; composition annotations are
+ skipped (not yet implemented).
+
+ ASCII characters and eight-bit bytes are emitted as single bytes.
+ Any other character is emitted as one or two leading codes followed
+ by one or two position codes with the high bit set. A character
+ that belongs to no charset of the list is replaced by
+ CODING->default_char.
+
+ Record CODING_RESULT_SUCCESS, update CODING->produced and
+ CODING->produced_char, and return 0. */
+static int
+encode_coding_emacs_mule (coding)
+ struct coding_system *coding;
+{
+ int multibytep = coding->dst_multibyte;
+ int *charbuf = coding->charbuf;
+ int *charbuf_end = charbuf + coding->charbuf_used;
+ unsigned char *dst = coding->destination + coding->produced;
+ unsigned char *dst_end = coding->destination + coding->dst_bytes;
+ int safe_room = 8;
+ int produced_chars = 0;
+ Lisp_Object attrs, charset_list;
+ int c;
+ int preferred_charset_id = -1;
+
+ CODING_GET_INFO (coding, attrs, charset_list);
+ if (! EQ (charset_list, Vemacs_mule_charset_list))
+ {
+ CODING_ATTR_CHARSET_LIST (attrs)
+ = charset_list = Vemacs_mule_charset_list;
+ }
+
+ while (charbuf < charbuf_end)
+ {
+ ASSURE_DESTINATION (safe_room);
+ c = *charbuf++;
+
+ if (c < 0)
+ {
+ /* Handle an annotation. */
+ switch (*charbuf)
+ {
+ case CODING_ANNOTATE_COMPOSITION_MASK:
+ /* Not yet implemented. */
+ break;
+ case CODING_ANNOTATE_CHARSET_MASK:
+ preferred_charset_id = charbuf[3];
+ if (preferred_charset_id >= 0
+ && NILP (Fmemq (make_number (preferred_charset_id),
+ charset_list)))
+ preferred_charset_id = -1;
+ break;
+ default:
+ abort ();
+ }
+ /* Skip the rest of the annotation. */
+ charbuf += -c - 1;
+ continue;
+ }
+
+ if (ASCII_CHAR_P (c))
+ EMIT_ONE_ASCII_BYTE (c);
+ else if (CHAR_BYTE8_P (c))
+ {
+ c = CHAR_TO_BYTE8 (c);
+ EMIT_ONE_BYTE (c);
+ }
+ else
+ {
+ struct charset *charset;
+ unsigned code;
+ int dimension;
+ int emacs_mule_id;
+ unsigned char leading_codes[2];
+
+ if (preferred_charset_id >= 0)
+ {
+ charset = CHARSET_FROM_ID (preferred_charset_id);
+ /* CODE is emitted below, so it must be set on both paths. */
+ if (CHAR_CHARSET_P (c, charset))
+ code = ENCODE_CHAR (charset, c);
+ else
+ charset = char_charset (c, charset_list, &code);
+ }
+ else
+ charset = char_charset (c, charset_list, &code);
+ if (! charset)
+ {
+ c = coding->default_char;
+ if (ASCII_CHAR_P (c))
+ {
+ EMIT_ONE_ASCII_BYTE (c);
+ continue;
+ }
+ charset = char_charset (c, charset_list, &code);
+ }
+ dimension = CHARSET_DIMENSION (charset);
+ emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
+ EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
+ EMIT_ONE_BYTE (leading_codes[0]);
+ if (leading_codes[1])
+ EMIT_ONE_BYTE (leading_codes[1]);
+ if (dimension == 1)
+ EMIT_ONE_BYTE (code | 0x80);
+ else
+ {
+ code |= 0x8080;
+ EMIT_ONE_BYTE (code >> 8);
+ EMIT_ONE_BYTE (code & 0xFF);
+ }
+ }
+ }
+ record_conversion_result (coding, CODING_RESULT_SUCCESS);
+ coding->produced_char += produced_chars;
+ coding->produced = dst - coding->destination;
+ return 0;
+}
+
+\f
+/*** 7. ISO2022 handlers ***/
+
+/* The following note describes the coding system ISO2022 briefly.
+ Since the intention of this note is to help understand the
+ functions in this file, some parts are NOT ACCURATE or are OVERLY
+ SIMPLIFIED. For thorough understanding, please refer to the
+ original document of ISO2022. This is equivalent to the standard
+ ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
+
+ ISO2022 provides many mechanisms to encode several character sets
+ in 7-bit and 8-bit environments. For 7-bit environments, all text
+ is encoded using bytes less than 128. This may make the encoded
+ text a little bit longer, but the text passes more easily through
+ several types of gateway, some of which strip off the MSB (Most
+ Significant Bit).
+
+ There are two kinds of character sets: control character sets and
+ graphic character sets. The former contain control characters such
+ as `newline' and `escape' to provide control functions (control
+ functions are also provided by escape sequences). The latter
+ contain graphic characters such as 'A' and '-'. Emacs recognizes
+ two control character sets and many graphic character sets.
+
+ Graphic character sets are classified into one of the following
+ four classes, according to the number of bytes (DIMENSION) and
+ number of characters in one dimension (CHARS) of the set:
+ - DIMENSION1_CHARS94
+ - DIMENSION1_CHARS96
+ - DIMENSION2_CHARS94
+ - DIMENSION2_CHARS96
+
+ In addition, each character set is assigned an identification tag,
+ unique for each set, called the "final character" (denoted as <F>
+ hereafter). The <F> of each character set is decided by ECMA(*)
+ when it is registered in ISO. The code range of <F> is 0x30..0x7F
+ (0x30..0x3F are for private use only).
+
+ Note (*): ECMA = European Computer Manufacturers Association
+
+ Here are examples of graphic character sets [NAME(<F>)]:
o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...