X-Git-Url: http://git.hcoop.net/bpt/emacs.git/blobdiff_plain/dfcf069d565c347abf3cb7cec80e6ed8432037ba..0968d4ee86ed782247c55849bd6dcecc446dfc37:/src/coding.c diff --git a/src/coding.c b/src/coding.c index a98ca65eb9..d865bd139b 100644 --- a/src/coding.c +++ b/src/coding.c @@ -315,12 +315,15 @@ Lisp_Object Vcoding_system_for_write; Lisp_Object Vlast_coding_system_used; /* A vector of length 256 which contains information about special - Latin codes (espepcially for dealing with Microsoft code). */ + Latin codes (especially for dealing with Microsoft codes). */ Lisp_Object Vlatin_extra_code_table; /* Flag to inhibit code conversion of end-of-line format. */ int inhibit_eol_conversion; +/* Flag to make buffer-file-coding-system inherit from process-coding. */ +int inherit_process_coding_system; + /* Coding system to be used to encode text for terminal display. */ struct coding_system terminal_coding; @@ -360,21 +363,27 @@ char *coding_category_name[CODING_CATEGORY_IDX_MAX] = { "coding-category-binary" }; -/* Table pointers to coding systems corresponding to each coding +/* Table of pointers to coding systems corresponding to each coding categories. */ struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX]; -/* Flag to tell if we look up unification table on character code +/* Table of coding category masks. Nth element is a mask for a coding + cateogry of which priority is Nth. */ +static +int coding_priorities[CODING_CATEGORY_IDX_MAX]; + +/* Flag to tell if we look up translation table on character code conversion. */ -Lisp_Object Venable_character_unification; -/* Standard unification table to look up on decoding (reading). */ -Lisp_Object Vstandard_character_unification_table_for_decode; -/* Standard unification table to look up on encoding (writing). */ -Lisp_Object Vstandard_character_unification_table_for_encode; +Lisp_Object Venable_character_translation; +/* Standard translation table to look up on decoding (reading). */ +Lisp_Object Vstandard_translation_table_for_decode; +/* Standard translation table to look up on encoding (writing). */ +Lisp_Object Vstandard_translation_table_for_encode; -Lisp_Object Qcharacter_unification_table; -Lisp_Object Qcharacter_unification_table_for_decode; -Lisp_Object Qcharacter_unification_table_for_encode; +Lisp_Object Qtranslation_table; +Lisp_Object Qtranslation_table_id; +Lisp_Object Qtranslation_table_for_decode; +Lisp_Object Qtranslation_table_for_encode; /* Alist of charsets vs revision number. */ Lisp_Object Vcharset_revision_alist; @@ -875,21 +884,21 @@ detect_coding_iso2022 (src, src_end) *dst++ = 0xFF; \ coding->composing += 2; \ } \ - if ((charset) >= 0) \ + if (charset_alt >= 0) \ { \ - if (CHARSET_DIMENSION (charset) == 2) \ + if (CHARSET_DIMENSION (charset_alt) == 2) \ { \ ONE_MORE_BYTE (c2); \ if (iso_code_class[(c2) & 0x7F] != ISO_0x20_or_0x7F \ && iso_code_class[(c2) & 0x7F] != ISO_graphic_plane_0) \ { \ src--; \ - c2 = ' '; \ + charset_alt = CHARSET_ASCII; \ } \ } \ - if (!NILP (unification_table) \ - && ((c_alt = unify_char (unification_table, \ - -1, (charset), c1, c2)) >= 0)) \ + if (!NILP (translation_table) \ + && ((c_alt = translate_char (translation_table, \ + -1, charset_alt, c1, c2)) >= 0)) \ SPLIT_CHAR (c_alt, charset_alt, c1, c2); \ } \ if (charset_alt == CHARSET_ASCII || charset_alt < 0) \ @@ -940,7 +949,8 @@ detect_coding_iso2022 (src, src_end) Else, if it contains only valid codes, return 0. Else return the length of the composing sequence. */ -int check_composing_code (coding, src, src_end) +int +check_composing_code (coding, src, src_end) struct coding_system *coding; unsigned char *src, *src_end; { @@ -979,7 +989,9 @@ int check_composing_code (coding, src, src_end) invalid_code_found = 1; } } - return ((coding->mode & CODING_MODE_LAST_BLOCK) ? src_end - src_start : -1); + return (invalid_code_found + ? src - src_start + : (coding->mode & CODING_MODE_LAST_BLOCK ? 0 : -1)); } /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */ @@ -1002,12 +1014,12 @@ decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes) /* Charsets invoked to graphic plane 0 and 1 respectively. */ int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0); int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1); - Lisp_Object unification_table - = coding->character_unification_table_for_decode; + Lisp_Object translation_table + = coding->translation_table_for_decode; int result = CODING_FINISH_NORMAL; - if (!NILP (Venable_character_unification) && NILP (unification_table)) - unification_table = Vstandard_character_unification_table_for_decode; + if (!NILP (Venable_character_translation) && NILP (translation_table)) + translation_table = Vstandard_translation_table_for_decode; coding->produced_char = 0; coding->fake_multibyte = 0; @@ -1219,9 +1231,12 @@ decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes) result1 = check_composing_code (coding, src, src_end); if (result1 == 0) - coding->composing = (c1 == '0' - ? COMPOSING_NO_RULE_HEAD - : COMPOSING_WITH_RULE_HEAD); + { + coding->composing = (c1 == '0' + ? COMPOSING_NO_RULE_HEAD + : COMPOSING_WITH_RULE_HEAD); + coding->produced_char++; + } else if (result1 > 0) { if (result1 + 2 < (dst_bytes ? dst_end : src_base) - dst) @@ -1244,7 +1259,6 @@ decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes) case '1': /* end composing */ coding->composing = COMPOSING_NO; - coding->produced_char++; break; case '[': /* specification of direction */ @@ -1549,32 +1563,33 @@ decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes) dst = encode_invocation_designation (charset, coding, dst); \ } while (1) -#define ENCODE_ISO_CHARACTER(charset, c1, c2) \ - do { \ - int c_alt, charset_alt; \ - if (!NILP (unification_table) \ - && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \ - >= 0)) \ - SPLIT_CHAR (c_alt, charset_alt, c1, c2); \ - else \ - charset_alt = charset; \ - if (CHARSET_DIMENSION (charset_alt) == 1) \ - { \ - if (charset == CHARSET_ASCII \ - && coding->flags & CODING_FLAG_ISO_USE_ROMAN) \ - charset_alt = charset_latin_jisx0201; \ - ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1); \ - } \ - else \ - { \ - if (charset == charset_jisx0208 \ - && coding->flags & CODING_FLAG_ISO_USE_OLDJIS) \ - charset_alt = charset_jisx0208_1978; \ - ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \ - } \ - if (! COMPOSING_P (coding->composing)) \ - coding->consumed_char++; \ - } while (0) +#define ENCODE_ISO_CHARACTER(charset, c1, c2) \ + do { \ + int c_alt, charset_alt; \ + if (!NILP (translation_table) \ + && ((c_alt = translate_char (translation_table, -1, \ + charset, c1, c2)) \ + >= 0)) \ + SPLIT_CHAR (c_alt, charset_alt, c1, c2); \ + else \ + charset_alt = charset; \ + if (CHARSET_DIMENSION (charset_alt) == 1) \ + { \ + if (charset == CHARSET_ASCII \ + && coding->flags & CODING_FLAG_ISO_USE_ROMAN) \ + charset_alt = charset_latin_jisx0201; \ + ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1); \ + } \ + else \ + { \ + if (charset == charset_jisx0208 \ + && coding->flags & CODING_FLAG_ISO_USE_OLDJIS) \ + charset_alt = charset_jisx0208_1978; \ + ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \ + } \ + if (! COMPOSING_P (coding->composing)) \ + coding->consumed_char++; \ + } while (0) /* Produce designation and invocation codes at a place pointed by DST to use CHARSET. The element `spec.iso2022' of *CODING is updated. @@ -1707,7 +1722,7 @@ encode_designation_at_bol (coding, table, src, src_end, dstp) unsigned char c1, c2; SPLIT_STRING(src, bytes, charset, c1, c2); - if ((c_alt = unify_char (table, -1, charset, c1, c2)) >= 0) + if ((c_alt = translate_char (table, -1, charset, c1, c2)) >= 0) charset = CHAR_CHARSET (c_alt); } @@ -1747,12 +1762,12 @@ encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes) from DST_END to assure overflow checking is necessary only at the head of loop. */ unsigned char *adjusted_dst_end = dst_end - 19; - Lisp_Object unification_table - = coding->character_unification_table_for_encode; + Lisp_Object translation_table + = coding->translation_table_for_encode; int result = CODING_FINISH_NORMAL; - if (!NILP (Venable_character_unification) && NILP (unification_table)) - unification_table = Vstandard_character_unification_table_for_encode; + if (!NILP (Venable_character_translation) && NILP (translation_table)) + translation_table = Vstandard_translation_table_for_encode; coding->consumed_char = 0; coding->fake_multibyte = 0; @@ -1772,7 +1787,7 @@ encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes) && CODING_SPEC_ISO_BOL (coding)) { /* We have to produce designation sequences if any now. */ - encode_designation_at_bol (coding, unification_table, + encode_designation_at_bol (coding, translation_table, src, src_end, &dst); CODING_SPEC_ISO_BOL (coding) = 0; } @@ -1865,8 +1880,8 @@ encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes) { /* invalid sequence */ *dst++ = c1; - *dst++ = c2; - coding->consumed_char += 2; + src--; + coding->consumed_char++; } else ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3); @@ -1878,9 +1893,8 @@ encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes) { /* invalid sequence */ *dst++ = c1; - *dst++ = c2; - *dst++ = c3; - coding->consumed_char += 3; + src -= 2; + coding->consumed_char++; } else if (c1 < LEADING_CODE_PRIVATE_11) ENCODE_ISO_CHARACTER (c1, c2, c3); @@ -1894,10 +1908,8 @@ encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes) { /* invalid sequence */ *dst++ = c1; - *dst++ = c2; - *dst++ = c3; - *dst++ = c4; - coding->consumed_char += 4; + src -= 3; + coding->consumed_char++; } else ENCODE_ISO_CHARACTER (c2, c3, c4); @@ -1909,8 +1921,8 @@ encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes) { /* invalid sequence */ *dst++ = c1; - *dst++ = c2; - coding->consumed_char += 2; + src--; + coding->consumed_char++; } else if (c2 == 0xFF) { @@ -1943,18 +1955,18 @@ encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes) break; } - if (src < src_end) + if (src < src_end && result == CODING_FINISH_NORMAL) + result = CODING_FINISH_INSUFFICIENT_DST; + + /* If this is the last block of the text to be encoded, we must + reset graphic planes and registers to the initial state, and + flush out the carryover if any. */ + if (coding->mode & CODING_MODE_LAST_BLOCK) { - if (result == CODING_FINISH_NORMAL) - result = CODING_FINISH_INSUFFICIENT_DST; - else - /* If this is the last block of the text to be encoded, we - must reset graphic planes and registers to the initial - state, and flush out the carryover if any. */ - if (coding->mode & CODING_MODE_LAST_BLOCK) - ENCODE_RESET_PLANE_AND_REGISTER; + ENCODE_RESET_PLANE_AND_REGISTER; + if (COMPOSING_P (coding->composing)) + ENCODE_COMPOSITION_END; } - coding->consumed = src - source; coding->produced = coding->produced_char = dst - destination; return result; @@ -1978,7 +1990,7 @@ encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes) (character set) (range) ASCII 0x00 .. 0x7F KATAKANA-JISX0201 0xA0 .. 0xDF - JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xFF + JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF (2nd byte) 0x40 .. 0xFF ------------------------------- @@ -2038,9 +2050,9 @@ encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes) #define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \ do { \ int c_alt, charset_alt = (charset); \ - if (!NILP (unification_table) \ - && ((c_alt = unify_char (unification_table, \ - -1, (charset), c1, c2)) >= 0)) \ + if (!NILP (translation_table) \ + && ((c_alt = translate_char (translation_table, \ + -1, (charset), c1, c2)) >= 0)) \ SPLIT_CHAR (c_alt, charset_alt, c1, c2); \ if (charset_alt == CHARSET_ASCII || charset_alt < 0) \ DECODE_CHARACTER_ASCII (c1); \ @@ -2050,54 +2062,55 @@ encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes) DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \ } while (0) -#define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \ - do { \ - int c_alt, charset_alt; \ - if (!NILP (unification_table) \ - && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \ - >= 0)) \ - SPLIT_CHAR (c_alt, charset_alt, c1, c2); \ - else \ - charset_alt = charset; \ - if (charset_alt == charset_ascii) \ - *dst++ = c1; \ - else if (CHARSET_DIMENSION (charset_alt) == 1) \ - { \ - if (sjis_p && charset_alt == charset_katakana_jisx0201) \ - *dst++ = c1; \ - else \ - { \ - *dst++ = charset_alt, *dst++ = c1; \ - coding->fake_multibyte = 1; \ - } \ - } \ - else \ - { \ - c1 &= 0x7F, c2 &= 0x7F; \ - if (sjis_p && charset_alt == charset_jisx0208) \ - { \ - unsigned char s1, s2; \ - \ - ENCODE_SJIS (c1, c2, s1, s2); \ - *dst++ = s1, *dst++ = s2; \ - coding->fake_multibyte = 1; \ - } \ - else if (!sjis_p \ - && (charset_alt == charset_big5_1 \ - || charset_alt == charset_big5_2)) \ - { \ - unsigned char b1, b2; \ - \ - ENCODE_BIG5 (charset_alt, c1, c2, b1, b2); \ - *dst++ = b1, *dst++ = b2; \ - } \ - else \ - { \ - *dst++ = charset_alt, *dst++ = c1, *dst++ = c2; \ - coding->fake_multibyte = 1; \ - } \ - } \ - coding->consumed_char++; \ +#define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \ + do { \ + int c_alt, charset_alt; \ + if (!NILP (translation_table) \ + && ((c_alt = translate_char (translation_table, -1, \ + charset, c1, c2)) \ + >= 0)) \ + SPLIT_CHAR (c_alt, charset_alt, c1, c2); \ + else \ + charset_alt = charset; \ + if (charset_alt == charset_ascii) \ + *dst++ = c1; \ + else if (CHARSET_DIMENSION (charset_alt) == 1) \ + { \ + if (sjis_p && charset_alt == charset_katakana_jisx0201) \ + *dst++ = c1; \ + else \ + { \ + *dst++ = charset_alt, *dst++ = c1; \ + coding->fake_multibyte = 1; \ + } \ + } \ + else \ + { \ + c1 &= 0x7F, c2 &= 0x7F; \ + if (sjis_p && charset_alt == charset_jisx0208) \ + { \ + unsigned char s1, s2; \ + \ + ENCODE_SJIS (c1, c2, s1, s2); \ + *dst++ = s1, *dst++ = s2; \ + coding->fake_multibyte = 1; \ + } \ + else if (!sjis_p \ + && (charset_alt == charset_big5_1 \ + || charset_alt == charset_big5_2)) \ + { \ + unsigned char b1, b2; \ + \ + ENCODE_BIG5 (charset_alt, c1, c2, b1, b2); \ + *dst++ = b1, *dst++ = b2; \ + } \ + else \ + { \ + *dst++ = charset_alt, *dst++ = c1, *dst++ = c2; \ + coding->fake_multibyte = 1; \ + } \ + } \ + coding->consumed_char++; \ } while (0); /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". @@ -2166,12 +2179,12 @@ decode_coding_sjis_big5 (coding, source, destination, from DST_END to assure overflow checking is necessary only at the head of loop. */ unsigned char *adjusted_dst_end = dst_end - 3; - Lisp_Object unification_table - = coding->character_unification_table_for_decode; + Lisp_Object translation_table + = coding->translation_table_for_decode; int result = CODING_FINISH_NORMAL; - if (!NILP (Venable_character_unification) && NILP (unification_table)) - unification_table = Vstandard_character_unification_table_for_decode; + if (!NILP (Venable_character_translation) && NILP (translation_table)) + translation_table = Vstandard_translation_table_for_decode; coding->produced_char = 0; coding->fake_multibyte = 0; @@ -2223,69 +2236,47 @@ decode_coding_sjis_big5 (coding, source, destination, } else if (c1 < 0x80) DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2); - else if (c1 < 0xA0) + else { - /* SJIS -> JISX0208 */ if (sjis_p) { - ONE_MORE_BYTE (c2); - if (c2 >= 0x40) + if (c1 < 0xA0 || (c1 >= 0xE0 && c1 < 0xF0)) { - DECODE_SJIS (c1, c2, c3, c4); - DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4); + /* SJIS -> JISX0208 */ + ONE_MORE_BYTE (c2); + if (c2 >= 0x40) + { + DECODE_SJIS (c1, c2, c3, c4); + DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4); + } + else + goto label_invalid_code_2; } + else if (c1 < 0xE0) + /* SJIS -> JISX0201-Kana */ + DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1, + /* dummy */ c2); else - goto label_invalid_code_2; + goto label_invalid_code_1; } else - goto label_invalid_code_1; - } - else if (c1 < 0xE0) - { - /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */ - if (sjis_p) - DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1, - /* dummy */ c2); - else - { - int charset; - - ONE_MORE_BYTE (c2); - if ((c2 >= 0x40 && c2 <= 0x7E) || (c2 >= 0xA1 && c2 <= 0xFE)) - { - DECODE_BIG5 (c1, c2, charset, c3, c4); - DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4); - } - else - goto label_invalid_code_2; - } - } - else /* C1 >= 0xE0 */ - { - /* SJIS -> JISX0208, BIG5 -> Big5 */ - if (sjis_p) { - ONE_MORE_BYTE (c2); - if (c2 >= 0x40) + /* BIG5 -> Big5 */ + if (c1 >= 0xA1 && c1 <= 0xFE) { - DECODE_SJIS (c1, c2, c3, c4); - DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4); - } - else - goto label_invalid_code_2; - } - else - { - int charset; + ONE_MORE_BYTE (c2); + if ((c2 >= 0x40 && c2 <= 0x7E) || (c2 >= 0xA1 && c2 <= 0xFE)) + { + int charset; - ONE_MORE_BYTE (c2); - if ((c2 >= 0x40 && c2 <= 0x7E) || (c2 >= 0xA1 && c2 <= 0xFE)) - { - DECODE_BIG5 (c1, c2, charset, c3, c4); - DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4); + DECODE_BIG5 (c1, c2, charset, c3, c4); + DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4); + } + else + goto label_invalid_code_2; } else - goto label_invalid_code_2; + goto label_invalid_code_1; } } continue; @@ -2355,12 +2346,12 @@ encode_coding_sjis_big5 (coding, source, destination, from DST_END to assure overflow checking is necessary only at the head of loop. */ unsigned char *adjusted_dst_end = dst_end - 1; - Lisp_Object unification_table - = coding->character_unification_table_for_encode; + Lisp_Object translation_table + = coding->translation_table_for_encode; int result = CODING_FINISH_NORMAL; - if (!NILP (Venable_character_unification) && NILP (unification_table)) - unification_table = Vstandard_character_unification_table_for_encode; + if (!NILP (Venable_character_translation) && NILP (translation_table)) + translation_table = Vstandard_translation_table_for_encode; coding->consumed_char = 0; coding->fake_multibyte = 0; @@ -2582,7 +2573,7 @@ decode_eol (coding, source, destination, src_bytes, dst_bytes) else safe_bcopy (source, destination, src_bytes); src += src_bytes; - dst += dst_bytes; + dst += src_bytes; coding->fake_multibyte = 1; break; } @@ -2649,18 +2640,16 @@ encode_eol (coding, source, destination, src_bytes, dst_bytes) if (dst_bytes) bcopy (source, destination, src_bytes); else - { - safe_bcopy (source, destination, src_bytes); - dst_bytes = src_bytes; - } - if (coding->eol_type == CODING_EOL_CRLF) + safe_bcopy (source, destination, src_bytes); + dst_bytes = src_bytes; + if (coding->eol_type == CODING_EOL_CR) { while (src_bytes--) { if ((c = *dst++) == '\n') dst[-1] = '\r'; else if (BASE_LEADING_CODE_P (c)) - coding->fake_multibyte = 1; + coding->fake_multibyte = 1; } } else @@ -2808,26 +2797,23 @@ setup_coding_system (coding_system, coding) /* Initialize remaining fields. */ coding->composing = 0; - coding->character_unification_table_for_decode = Qnil; - coding->character_unification_table_for_encode = Qnil; + coding->translation_table_for_decode = Qnil; + coding->translation_table_for_encode = Qnil; /* Get values of coding system properties: `post-read-conversion', `pre-write-conversion', - `character-unification-table-for-decode', - `character-unification-table-for-encode'. */ + `translation-table-for-decode', `translation-table-for-encode'. */ plist = XVECTOR (coding_spec)->contents[3]; coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion); coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion); - val = Fplist_get (plist, Qcharacter_unification_table_for_decode); + val = Fplist_get (plist, Qtranslation_table_for_decode); if (SYMBOLP (val)) - val = Fget (val, Qcharacter_unification_table_for_decode); - coding->character_unification_table_for_decode - = CHAR_TABLE_P (val) ? val : Qnil; - val = Fplist_get (plist, Qcharacter_unification_table_for_encode); + val = Fget (val, Qtranslation_table_for_decode); + coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil; + val = Fplist_get (plist, Qtranslation_table_for_encode); if (SYMBOLP (val)) - val = Fget (val, Qcharacter_unification_table_for_encode); - coding->character_unification_table_for_encode - = CHAR_TABLE_P (val) ? val : Qnil; + val = Fget (val, Qtranslation_table_for_encode); + coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil; val = Fplist_get (plist, Qcoding_category); if (!NILP (val)) { @@ -3042,12 +3028,18 @@ setup_coding_system (coding_system, coding) |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK; { Lisp_Object val = XVECTOR (coding_spec)->contents[4]; + Lisp_Object decoder, encoder; + if (CONSP (val) - && VECTORP (XCONS (val)->car) - && VECTORP (XCONS (val)->cdr)) + && SYMBOLP (XCONS (val)->car) + && !NILP (decoder = Fget (XCONS (val)->car, Qccl_program_idx)) + && !NILP (decoder = Fcdr (Faref (Vccl_program_table, decoder))) + && SYMBOLP (XCONS (val)->cdr) + && !NILP (encoder = Fget (XCONS (val)->cdr, Qccl_program_idx)) + && !NILP (encoder = Fcdr (Faref (Vccl_program_table, encoder)))) { - setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car); - setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr); + setup_ccl_program (&(coding->spec.ccl.decoder), decoder); + setup_ccl_program (&(coding->spec.ccl.encoder), encoder); } else goto label_invalid_coding_system; @@ -3073,6 +3065,32 @@ setup_coding_system (coding_system, coding) return -1; } +/* Setup raw-text or one of its subsidiaries in the structure + coding_system CODING according to the already setup value eol_type + in CODING. CODING should be setup for some coding system in + advance. */ + +void +setup_raw_text_coding_system (coding) + struct coding_system *coding; +{ + if (coding->type != coding_type_raw_text) + { + coding->symbol = Qraw_text; + coding->type = coding_type_raw_text; + if (coding->eol_type != CODING_EOL_UNDECIDED) + { + Lisp_Object subsidiaries = Fget (Qraw_text, Qeol_type); + + if (VECTORP (subsidiaries) + && XVECTOR (subsidiaries)->size == 3) + coding->symbol + = XVECTOR (subsidiaries)->contents[coding->eol_type]; + } + } + return; +} + /* Emacs has a mechanism to automatically detect a coding system if it is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But, it's impossible to distinguish some coding systems accurately @@ -3156,6 +3174,9 @@ setup_coding_system (coding_system, coding) */ +static +int ascii_skip_code[256]; + /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded. If it detects possible coding systems, return an integer in which appropriate flag bits are set. Flag bits are defined by macros @@ -3170,30 +3191,24 @@ detect_coding_mask (source, src_bytes, priorities, skip) { register unsigned char c; unsigned char *src = source, *src_end = source + src_bytes; - unsigned int mask = (CODING_CATEGORY_MASK_ISO_7BIT - | CODING_CATEGORY_MASK_ISO_SHIFT); + unsigned int mask; int i; /* At first, skip all ASCII characters and control characters except for three ISO2022 specific control characters. */ + ascii_skip_code[ISO_CODE_SO] = 0; + ascii_skip_code[ISO_CODE_SI] = 0; + ascii_skip_code[ISO_CODE_ESC] = 0; + label_loop_detect_coding: - while (src < src_end) - { - c = *src; - if (c >= 0x80 - || ((mask & CODING_CATEGORY_MASK_ISO_7BIT) - && c == ISO_CODE_ESC) - || ((mask & CODING_CATEGORY_MASK_ISO_SHIFT) - && (c == ISO_CODE_SI || c == ISO_CODE_SO))) - break; - src++; - } + while (src < src_end && ascii_skip_code[*src]) src++; *skip = src - source; if (src >= src_end) /* We found nothing other than ASCII. There's nothing to do. */ return 0; + c = *src; /* The text seems to be encoded in some multilingual coding system. Now, try to find in which coding system the text is encoded. */ if (c < 0x80) @@ -3205,9 +3220,10 @@ detect_coding_mask (source, src_bytes, priorities, skip) { /* No valid ISO2022 code follows C. Try again. */ src++; - mask = (c != ISO_CODE_ESC - ? CODING_CATEGORY_MASK_ISO_7BIT - : CODING_CATEGORY_MASK_ISO_SHIFT); + if (c == ISO_CODE_ESC) + ascii_skip_code[ISO_CODE_ESC] = 1; + else + ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1; goto label_loop_detect_coding; } if (priorities) @@ -3253,15 +3269,18 @@ detect_coding_mask (source, src_bytes, priorities, skip) { for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++) { - priorities[i] &= try; - if (priorities[i] & CODING_CATEGORY_MASK_ISO) + if (priorities[i] & try & CODING_CATEGORY_MASK_ISO) mask = detect_coding_iso2022 (src, src_end); - else if (priorities[i] & CODING_CATEGORY_MASK_SJIS) + else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS) mask = detect_coding_sjis (src, src_end); - else if (priorities[i] & CODING_CATEGORY_MASK_BIG5) + else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5) mask = detect_coding_big5 (src, src_end); - else if (priorities[i] & CODING_CATEGORY_MASK_EMACS_MULE) + else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE) mask = detect_coding_emacs_mule (src, src_end); + else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT) + mask = CODING_CATEGORY_MASK_RAW_TEXT; + else if (priorities[i] & CODING_CATEGORY_MASK_BINARY) + mask = CODING_CATEGORY_MASK_BINARY; if (mask) goto label_return_highest_only; } @@ -3276,7 +3295,7 @@ detect_coding_mask (source, src_bytes, priorities, skip) if (try & CODING_CATEGORY_MASK_EMACS_MULE) mask |= detect_coding_emacs_mule (src, src_end); } - return (mask | CODING_CATEGORY_MASK_RAW_TEXT); + return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY); label_return_highest_only: for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++) @@ -3298,27 +3317,9 @@ detect_coding (coding, src, src_bytes) { unsigned int idx; int skip, mask, i; - int priorities[CODING_CATEGORY_IDX_MAX]; Lisp_Object val = Vcoding_category_list; - i = 0; - while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX) - { - if (! SYMBOLP (XCONS (val)->car)) - break; - idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index)); - if (idx >= CODING_CATEGORY_IDX_MAX) - break; - priorities[i++] = (1 << idx); - val = XCONS (val)->cdr; - } - /* If coding-category-list is valid and contains all coding - categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not, - the following code saves Emacs from craching. */ - while (i < CODING_CATEGORY_IDX_MAX) - priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT; - - mask = detect_coding_mask (src, src_bytes, priorities, &skip); + mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip); coding->heading_ascii = skip; if (!mask) return; @@ -3747,7 +3748,9 @@ shrink_decoding_region (beg, end, coding, str) return; } - if (coding->heading_ascii >= 0) + eol_conversion = (coding->eol_type != CODING_EOL_LF); + + if ((! eol_conversion) && (coding->heading_ascii >= 0)) /* Detection routine has already found how much we can skip at the head. */ *beg += coding->heading_ascii; @@ -3763,8 +3766,6 @@ shrink_decoding_region (beg, end, coding, str) endp_orig = endp = begp + *end - *beg; } - eol_conversion = (coding->eol_type != CODING_EOL_LF); - switch (coding->type) { case coding_type_emacs_mule: @@ -3773,8 +3774,12 @@ shrink_decoding_region (beg, end, coding, str) { if (coding->heading_ascii < 0) while (begp < endp && *begp != '\r' && *begp < 0x80) begp++; - while (begp < endp && *(endp - 1) != '\r' && *(endp - 1) < 0x80) + while (begp < endp && endp[-1] != '\r' && endp[-1] < 0x80) endp--; + /* Do not consider LF as ascii if preceded by CR, since that + confuses eol decoding. */ + if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n') + endp++; } else begp = endp; @@ -3796,6 +3801,10 @@ shrink_decoding_region (beg, end, coding, str) while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--; else while (begp < endp && endp[-1] < 0x80) endp--; + /* Do not consider LF as ascii if preceded by CR, since that + confuses eol decoding. */ + if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n') + endp++; if (begp < endp && endp < endp_orig && endp[-1] >= 0x80) endp++; break; @@ -3820,6 +3829,10 @@ shrink_decoding_region (beg, end, coding, str) while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--; else while (begp < endp && endp[-1] < 0x80) endp--; + /* Do not consider LF as ascii if preceded by CR, since that + confuses eol decoding. */ + if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n') + endp++; break; case CODING_CATEGORY_IDX_ISO_7: @@ -3834,6 +3847,10 @@ shrink_decoding_region (beg, end, coding, str) while (begp < endp && (c = endp[-1]) < 0x80 && c != ISO_CODE_ESC) endp--; + /* Do not consider LF as ascii if preceded by CR, since that + confuses eol decoding. */ + if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n') + endp++; if (begp < endp && endp[-1] == ISO_CODE_ESC) { if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B') @@ -3964,6 +3981,9 @@ code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace) unsigned char *src, *dst; Lisp_Object deletion = Qnil; + if (from < PT && PT < to) + SET_PT_BOTH (from, from_byte); + if (replace) { int saved_from = from; @@ -4049,7 +4069,8 @@ code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace) new buffer. */ struct buffer *prev = current_buffer, *new; - call2 (coding->pre_write_conversion, from, to); + call2 (coding->pre_write_conversion, + make_number (from), make_number (to)); if (current_buffer != prev) { len = ZV - BEGV; @@ -4209,7 +4230,7 @@ code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace) inserted += len_byte; inserted_byte += len_byte; while (len_byte--) - *src++ = *dst++; + *dst++ = *src++; fake_multibyte = 1; break; } @@ -4412,8 +4433,13 @@ code_convert_string (str, coding, encodep, nocopy) if (encodep) str = make_unibyte_string (buf, len + coding->produced); else - str = make_string_from_bytes (buf, len + coding->produced_char, - len + coding->produced); + { + int chars= (coding->fake_multibyte + ? multibyte_chars_in_text (buf + from, coding->produced) + : coding->produced_char); + str = make_multibyte_string (buf, len + chars, len + coding->produced); + } + return str; } @@ -4509,7 +4535,7 @@ detect_coding_system (src, src_bytes, highest) if (VECTORP (val2)) val = XVECTOR (val2)->contents[eol_type]; } - return val; + return (highest ? val : Fcons (val, Qnil)); } /* At first, gather possible coding systems in VAL. */ @@ -4528,10 +4554,11 @@ detect_coding_system (src, src_bytes, highest) if (!highest) val = Fnreverse (val); - /* Then, substitute the elements by subsidiary coding systems. */ + /* Then, replace the elements with subsidiary coding systems. */ for (tmp = val; !NILP (tmp); tmp = XCONS (tmp)->cdr) { - if (eol_type != CODING_EOL_UNDECIDED) + if (eol_type != CODING_EOL_UNDECIDED + && eol_type != CODING_EOL_INCONSISTENT) { Lisp_Object eol; eol = Fget (XCONS (tmp)->car, Qeol_type); @@ -4547,8 +4574,9 @@ DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region, "Detect coding system of the text in the region between START and END.\n\ Return a list of possible coding systems ordered by priority.\n\ \n\ -If only ASCII characters are found, it returns `undecided'\n\ -or its subsidiary coding system according to a detected end-of-line format.\n\ +If only ASCII characters are found, it returns a list of single element\n\ +`undecided' or its subsidiary coding system according to a detected\n\ +end-of-line format.\n\ \n\ If optional argument HIGHEST is non-nil, return the coding system of\n\ highest priority.") @@ -4579,8 +4607,9 @@ DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string, "Detect coding system of the text in STRING.\n\ Return a list of possible coding systems ordered by priority.\n\ \n\ -If only ASCII characters are found, it returns `undecided'\n\ -or its subsidiary coding system according to a detected end-of-line format.\n\ +If only ASCII characters are found, it returns a list of single element\n\ +`undecided' or its subsidiary coding system according to a detected\n\ +end-of-line format.\n\ \n\ If optional argument HIGHEST is non-nil, return the coding system of\n\ highest priority.") @@ -4619,6 +4648,7 @@ code_convert_region1 (start, end, coding_system, encodep) coding.mode |= CODING_MODE_LAST_BLOCK; code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to), &coding, encodep, 1); + Vlast_coding_system_used = coding.symbol; return make_number (coding.produced_char); } @@ -4627,7 +4657,10 @@ DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region, "Decode the current region by specified coding system.\n\ When called from a program, takes three arguments:\n\ START, END, and CODING-SYSTEM. START and END are buffer positions.\n\ -Return length of decoded text.") +This function sets `last-coding-system-used' to the precise coding system\n\ +used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\ +not fully specified.)\n\ +It returns the length of the decoded text.") (start, end, coding_system) Lisp_Object start, end, coding_system; { @@ -4639,7 +4672,10 @@ DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region, "Encode the current region by specified coding system.\n\ When called from a program, takes three arguments:\n\ START, END, and CODING-SYSTEM. START and END are buffer positions.\n\ -Return length of encoded text.") +This function sets `last-coding-system-used' to the precise coding system\n\ +used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\ +not fully specified.)\n\ +It returns the length of the encoded text.") (start, end, coding_system) Lisp_Object start, end, coding_system; { @@ -4663,6 +4699,7 @@ code_convert_string1 (string, coding_system, nocopy, encodep) error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data); coding.mode |= CODING_MODE_LAST_BLOCK; + Vlast_coding_system_used = coding.symbol; return code_convert_string (string, &coding, encodep, !NILP (nocopy)); } @@ -4670,24 +4707,52 @@ DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string, 2, 3, 0, "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\ Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\ -if the decoding operation is trivial.") +if the decoding operation is trivial.\n\ +This function sets `last-coding-system-used' to the precise coding system\n\ +used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\ +not fully specified.)") (string, coding_system, nocopy) Lisp_Object string, coding_system, nocopy; { - return code_convert_string1(string, coding_system, nocopy, 0); + return code_convert_string1 (string, coding_system, nocopy, 0); } DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string, 2, 3, 0, "Encode STRING to CODING-SYSTEM, and return the result.\n\ Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\ -if the encoding operation is trivial.") +if the encoding operation is trivial.\n\ +This function sets `last-coding-system-used' to the precise coding system\n\ +used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\ +not fully specified.)") (string, coding_system, nocopy) Lisp_Object string, coding_system, nocopy; { - return code_convert_string1(string, coding_system, nocopy, 1); + return code_convert_string1 (string, coding_system, nocopy, 1); } +/* Encode or decode STRING according to CODING_SYSTEM. + Do not set Vlast_coding_system_used. */ + +Lisp_Object +code_convert_string_norecord (string, coding_system, encodep) + Lisp_Object string, coding_system; + int encodep; +{ + struct coding_system coding; + + CHECK_STRING (string, 0); + CHECK_SYMBOL (coding_system, 1); + + if (NILP (coding_system)) + return string; + + if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0) + error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data); + + coding.mode |= CODING_MODE_LAST_BLOCK; + return code_convert_string (string, &coding, encodep, Qt); +} DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0, "Decode a JISX0208 character of shift-jis encoding.\n\ @@ -4939,6 +5004,34 @@ call this function:\n\ return Qnil; } +DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal, + Sset_coding_priority_internal, 0, 0, 0, + "Update internal database for the current value of `coding-category-list'.\n\ +This function is internal use only.") + () +{ + int i = 0, idx; + Lisp_Object val = Vcoding_category_list; + + while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX) + { + if (! SYMBOLP (XCONS (val)->car)) + break; + idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index)); + if (idx >= CODING_CATEGORY_IDX_MAX) + break; + coding_priorities[i++] = (1 << idx); + val = XCONS (val)->cdr; + } + /* If coding-category-list is valid and contains all coding + categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not, + the following code saves Emacs from craching. */ + while (i < CODING_CATEGORY_IDX_MAX) + coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT; + + return Qnil; +} + #endif /* emacs */ @@ -4994,6 +5087,10 @@ init_coding_once () bzero (coding_system_table, sizeof coding_system_table); + bzero (ascii_skip_code, sizeof ascii_skip_code); + for (i = 0; i < 128; i++) + ascii_skip_code[i] = 1; + #if defined (MSDOS) || defined (WINDOWSNT) system_eol_type = CODING_EOL_CRLF; #else @@ -5089,18 +5186,18 @@ syms_of_coding () } } - Qcharacter_unification_table = intern ("character-unification-table"); - staticpro (&Qcharacter_unification_table); - Fput (Qcharacter_unification_table, Qchar_table_extra_slots, - make_number (0)); + Qtranslation_table = intern ("translation-table"); + staticpro (&Qtranslation_table); + Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (0)); - Qcharacter_unification_table_for_decode - = intern ("character-unification-table-for-decode"); - staticpro (&Qcharacter_unification_table_for_decode); + Qtranslation_table_id = intern ("translation-table-id"); + staticpro (&Qtranslation_table_id); - Qcharacter_unification_table_for_encode - = intern ("character-unification-table-for-encode"); - staticpro (&Qcharacter_unification_table_for_encode); + Qtranslation_table_for_decode = intern ("translation-table-for-decode"); + staticpro (&Qtranslation_table_for_decode); + + Qtranslation_table_for_encode = intern ("translation-table-for-encode"); + staticpro (&Qtranslation_table_for_encode); Qsafe_charsets = intern ("safe-charsets"); staticpro (&Qsafe_charsets); @@ -5132,6 +5229,7 @@ syms_of_coding () defsubr (&Skeyboard_coding_system); defsubr (&Sfind_operation_coding_system); defsubr (&Supdate_iso_coding_systems); + defsubr (&Sset_coding_priority_internal); DEFVAR_LISP ("coding-system-list", &Vcoding_system_list, "List of coding systems.\n\ @@ -5189,6 +5287,12 @@ There are three such tables, `file-coding-system-alist',\n\ "*Non-nil inhibit code conversion of end-of-line format in any cases."); inhibit_eol_conversion = 0; + DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system, + "Non-nil means process buffer inherits coding system of process output.\n\ +Bind it to t if the process output is to be treated as if it were a file\n\ +read from some filesystem."); + inherit_process_coding_system = 0; + DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist, "Alist to decide a coding system to use for a file I/O operation.\n\ The format is ((PATTERN . VAL) ...),\n\ @@ -5251,19 +5355,19 @@ See also the function `find-operation-coding-system'."); "Mnemonic character indicating end-of-line format is not yet decided."); eol_mnemonic_undecided = ':'; - DEFVAR_LISP ("enable-character-unification", &Venable_character_unification, - "Non-nil means ISO 2022 encoder/decoder do character unification."); - Venable_character_unification = Qt; + DEFVAR_LISP ("enable-character-translation", &Venable_character_translation, + "*Non-nil enables character translation while encoding and decoding."); + Venable_character_translation = Qt; - DEFVAR_LISP ("standard-character-unification-table-for-decode", - &Vstandard_character_unification_table_for_decode, - "Table for unifying characters when reading."); - Vstandard_character_unification_table_for_decode = Qnil; + DEFVAR_LISP ("standard-translation-table-for-decode", + &Vstandard_translation_table_for_decode, + "Table for translating characters while decoding."); + Vstandard_translation_table_for_decode = Qnil; - DEFVAR_LISP ("standard-character-unification-table-for-encode", - &Vstandard_character_unification_table_for_encode, - "Table for unifying characters when writing."); - Vstandard_character_unification_table_for_encode = Qnil; + DEFVAR_LISP ("standard-translation-table-for-encode", + &Vstandard_translation_table_for_encode, + "Table for translationg characters while encoding."); + Vstandard_translation_table_for_encode = Qnil; DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist, "Alist of charsets vs revision numbers.\n\