X-Git-Url: https://git.hcoop.net/bpt/emacs.git/blobdiff_plain/2846c6e3607995ce250435e5998ea6a08f60dd89..c95a500868e8e6df777e479e8aa9f195f31bcde3:/src/coding.c diff --git a/src/coding.c b/src/coding.c index 65754b4b1b..8d5304dc11 100644 --- a/src/coding.c +++ b/src/coding.c @@ -1,8 +1,8 @@ /* Coding system handler (conversion, detection, etc). Copyright (C) 2001, 2002, 2003, 2004, 2005, - 2006, 2007, 2008 Free Software Foundation, Inc. + 2006, 2007, 2008, 2009 Free Software Foundation, Inc. Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, - 2005, 2006, 2007, 2008 + 2005, 2006, 2007, 2008, 2009 National Institute of Advanced Industrial Science and Technology (AIST) Registration Number H14PRO021 Copyright (C) 2003 @@ -314,7 +314,7 @@ Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5; Lisp_Object Qbig, Qlittle; Lisp_Object Qcoding_system_history; Lisp_Object Qvalid_codes; -Lisp_Object QCcategory, QCmnemonic, QCdefalut_char; +Lisp_Object QCcategory, QCmnemonic, QCdefault_char; Lisp_Object QCdecode_translation_table, QCencode_translation_table; Lisp_Object QCpost_read_conversion, QCpre_write_conversion; Lisp_Object QCascii_compatible_p; @@ -380,6 +380,9 @@ int inhibit_eol_conversion; /* Flag to inhibit ISO2022 escape sequence detection. */ int inhibit_iso_escape_detection; +/* Flag to inhibit detection of binary files through null bytes. */ +int inhibit_null_byte_detection; + /* Flag to make buffer-file-coding-system inherit from process-coding. */ int inherit_process_coding_system; @@ -429,9 +432,11 @@ Lisp_Object Vbig5_coding_system; reg))) -#define CODING_ISO_REQUEST(coding, charset_id) \ - ((charset_id <= (coding)->max_charset_id \ - ? (coding)->safe_charsets[charset_id] \ +#define CODING_ISO_REQUEST(coding, charset_id) \ + (((charset_id) <= (coding)->max_charset_id \ + ? ((coding)->safe_charsets[charset_id] != 255 \ + ? (coding)->safe_charsets[charset_id] \ + : -1) \ : -1)) @@ -447,6 +452,12 @@ Lisp_Object Vbig5_coding_system; ((coding)->spec.iso_2022.bol) #define CODING_ISO_INVOKED_CHARSET(coding, plane) \ CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane))) +#define CODING_ISO_CMP_STATUS(coding) \ + (&(coding)->spec.iso_2022.cmp_status) +#define CODING_ISO_EXTSEGMENT_LEN(coding) \ + ((coding)->spec.iso_2022.ctext_extended_segment_len) +#define CODING_ISO_EMBEDDED_UTF_8(coding) \ + ((coding)->spec.iso_2022.embedded_utf_8) /* Control characters of ISO2022. */ /* code */ /* function */ @@ -740,6 +751,45 @@ static struct coding_system coding_categories[coding_category_max]; consumed_chars++; \ } while (0) +/* Safely get two bytes from the source text pointed by SRC which ends + at SRC_END, and set C1 and C2 to those bytes while skipping the + heading multibyte characters. If there are not enough bytes in the + source, it jumps to `no_more_source'. If multibytep is nonzero and + a multibyte character is found for C2, set C2 to the negative value + of the character code. The caller should declare and set these + variables appropriately in advance: + src, src_end, multibytep + It is intended that this macro is used in detect_coding_utf_16. */ + +#define TWO_MORE_BYTES(c1, c2) \ + do { \ + do { \ + if (src == src_end) \ + goto no_more_source; \ + c1 = *src++; \ + if (multibytep && (c1 & 0x80)) \ + { \ + if ((c1 & 0xFE) == 0xC0) \ + c1 = ((c1 & 1) << 6) | *src++; \ + else \ + { \ + src += BYTES_BY_CHAR_HEAD (c1) - 1; \ + c1 = -1; \ + } \ + } \ + } while (c1 < 0); \ + if (src == src_end) \ + goto no_more_source; \ + c2 = *src++; \ + if (multibytep && (c2 & 0x80)) \ + { \ + if ((c2 & 0xFE) == 0xC0) \ + c2 = ((c2 & 1) << 6) | *src++; \ + else \ + c2 = -1; \ + } \ + } while (0) + #define ONE_MORE_BYTE_NO_CHECK(c) \ do { \ @@ -901,11 +951,8 @@ static int detect_eol P_ ((const unsigned char *, static Lisp_Object adjust_coding_eol_type P_ ((struct coding_system *, int)); static void decode_eol P_ ((struct coding_system *)); static Lisp_Object get_translation_table P_ ((Lisp_Object, int, int *)); -static Lisp_Object get_translation P_ ((Lisp_Object, int *, int *, - int, int *, int *)); +static Lisp_Object get_translation P_ ((Lisp_Object, int *, int *)); static int produce_chars P_ ((struct coding_system *, Lisp_Object, int)); -static INLINE void produce_composition P_ ((struct coding_system *, int *, - EMACS_INT)); static INLINE void produce_charset P_ ((struct coding_system *, int *, EMACS_INT)); static void produce_annotation P_ ((struct coding_system *, EMACS_INT)); @@ -1164,10 +1211,6 @@ alloc_destination (coding, nbytes, dst) /** Macros for annotations. */ -/* Maximum length of annotation data (sum of annotations for - composition and charset). */ -#define MAX_ANNOTATION_LENGTH (4 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 4) - /* An annotation data is stored in the array coding->charbuf in this format: [ -LENGTH ANNOTATION_MASK NCHARS ... ] @@ -1179,13 +1222,26 @@ alloc_destination (coding, nbytes, dst) In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements follows: - ... METHOD [ COMPOSITION-COMPONENTS ... ] + ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ] + + NBYTES is the number of bytes specified in the header part of + old-style emacs-mule encoding, or 0 for the other kind of + composition. + METHOD is one of enum composition_method. + Optionnal COMPOSITION-COMPONENTS are characters and composition rules. In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID - follows. */ + follows. + + If ANNOTATION_MASK is 0, this annotation is just a space holder to + recover from an invalid annotation, and should be skipped by + produce_annotation. */ + +/* Maximum length of the header of annotation data. */ +#define MAX_ANNOTATION_LENGTH 5 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars) \ do { \ @@ -1195,9 +1251,10 @@ alloc_destination (coding, nbytes, dst) coding->annotated = 1; \ } while (0); -#define ADD_COMPOSITION_DATA(buf, nchars, method) \ +#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method) \ do { \ - ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \ + ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \ + *buf++ = nbytes; \ *buf++ = method; \ } while (0) @@ -1326,11 +1383,12 @@ decode_coding_utf_8 (coding) const unsigned char *src_base; int *charbuf = coding->charbuf + coding->charbuf_used; int *charbuf_end = coding->charbuf + coding->charbuf_size; - int consumed_chars = 0, consumed_chars_base; + int consumed_chars = 0, consumed_chars_base = 0; int multibytep = coding->src_multibyte; enum utf_bom_type bom = CODING_UTF_8_BOM (coding); Lisp_Object attr, charset_list; - int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos); + int eol_crlf = + !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos); int byte_after_cr = -1; CODING_GET_INFO (coding, attr, charset_list); @@ -1345,12 +1403,12 @@ decode_coding_utf_8 (coding) src = src_base; else { - ONE_MORE_BYTE (c2); + ONE_MORE_BYTE (c2); if (! UTF_8_EXTRA_OCTET_P (c2)) src = src_base; else { - ONE_MORE_BYTE (c3); + ONE_MORE_BYTE (c3); if (! UTF_8_EXTRA_OCTET_P (c3)) src = src_base; else @@ -1376,7 +1434,11 @@ decode_coding_utf_8 (coding) consumed_chars_base = consumed_chars; if (charbuf >= charbuf_end) - break; + { + if (byte_after_cr >= 0) + src_base--; + break; + } if (byte_after_cr >= 0) c1 = byte_after_cr, byte_after_cr = -1; @@ -1568,8 +1630,7 @@ detect_coding_utf_16 (coding, detect_info) return 0; } - ONE_MORE_BYTE (c1); - ONE_MORE_BYTE (c2); + TWO_MORE_BYTES (c1, c2); if ((c1 == 0xFF) && (c2 == 0xFE)) { detect_info->found |= (CATEGORY_MASK_UTF_16_LE @@ -1586,6 +1647,11 @@ detect_coding_utf_16 (coding, detect_info) | CATEGORY_MASK_UTF_16_BE_NOSIG | CATEGORY_MASK_UTF_16_LE_NOSIG); } + else if (c2 < 0) + { + detect_info->rejected |= CATEGORY_MASK_UTF_16; + return 0; + } else { /* We check the dispersion of Eth and Oth bytes where E is even and @@ -1603,8 +1669,9 @@ detect_coding_utf_16 (coding, detect_info) while (1) { - ONE_MORE_BYTE (c1); - ONE_MORE_BYTE (c2); + TWO_MORE_BYTES (c1, c2); + if (c2 < 0) + break; if (! e[c1]) { e[c1] = 1; @@ -1637,13 +1704,14 @@ decode_coding_utf_16 (coding) const unsigned char *src_base; int *charbuf = coding->charbuf + coding->charbuf_used; int *charbuf_end = coding->charbuf + coding->charbuf_size; - int consumed_chars = 0, consumed_chars_base; + int consumed_chars = 0, consumed_chars_base = 0; int multibytep = coding->src_multibyte; enum utf_bom_type bom = CODING_UTF_16_BOM (coding); enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding); int surrogate = CODING_UTF_16_SURROGATE (coding); Lisp_Object attr, charset_list; - int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos); + int eol_crlf = + !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos); int byte_after_cr1 = -1, byte_after_cr2 = -1; CODING_GET_INFO (coding, attr, charset_list); @@ -1682,7 +1750,11 @@ decode_coding_utf_16 (coding) consumed_chars_base = consumed_chars; if (charbuf + 2 >= charbuf_end) - break; + { + if (byte_after_cr1 >= 0) + src_base -= 2; + break; + } if (byte_after_cr1 >= 0) c1 = byte_after_cr1, byte_after_cr1 = -1; @@ -1861,12 +1933,12 @@ encode_coding_utf_16 (coding) Next, character composition data are represented by the byte sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ..., where, - METHOD is 0xF0 plus one of composition method (enum + METHOD is 0xF2 plus one of composition method (enum composition_method), BYTES is 0xA0 plus a byte length of this composition data, - CHARS is 0x20 plus a number of characters composed by this + CHARS is 0xA0 plus a number of characters composed by this data, COMPONENTs are characters of multibye form or composition @@ -1888,11 +1960,107 @@ encode_coding_utf_16 (coding) char emacs_mule_bytes[256]; + +/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". + Check if a text is encoded in `emacs-mule'. If it is, return 1, + else return 0. */ + +static int +detect_coding_emacs_mule (coding, detect_info) + struct coding_system *coding; + struct coding_detection_info *detect_info; +{ + const unsigned char *src = coding->source, *src_base; + const unsigned char *src_end = coding->source + coding->src_bytes; + int multibytep = coding->src_multibyte; + int consumed_chars = 0; + int c; + int found = 0; + + detect_info->checked |= CATEGORY_MASK_EMACS_MULE; + /* A coding system of this category is always ASCII compatible. */ + src += coding->head_ascii; + + while (1) + { + src_base = src; + ONE_MORE_BYTE (c); + if (c < 0) + continue; + if (c == 0x80) + { + /* Perhaps the start of composite character. We simply skip + it because analyzing it is too heavy for detecting. But, + at least, we check that the composite character + constitutes of more than 4 bytes. */ + const unsigned char *src_base; + + repeat: + src_base = src; + do + { + ONE_MORE_BYTE (c); + } + while (c >= 0xA0); + + if (src - src_base <= 4) + break; + found = CATEGORY_MASK_EMACS_MULE; + if (c == 0x80) + goto repeat; + } + + if (c < 0x80) + { + if (c < 0x20 + && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)) + break; + } + else + { + int more_bytes = emacs_mule_bytes[*src_base] - 1; + + while (more_bytes > 0) + { + ONE_MORE_BYTE (c); + if (c < 0xA0) + { + src--; /* Unread the last byte. */ + break; + } + more_bytes--; + } + if (more_bytes != 0) + break; + found = CATEGORY_MASK_EMACS_MULE; + } + } + detect_info->rejected |= CATEGORY_MASK_EMACS_MULE; + return 0; + + no_more_source: + if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK) + { + detect_info->rejected |= CATEGORY_MASK_EMACS_MULE; + return 0; + } + detect_info->found |= found; + return 1; +} + + +/* Parse emacs-mule multibyte sequence at SRC and return the decoded + character. If CMP_STATUS indicates that we must expect MSEQ or + RULE described above, decode it and return the negative value of + the deocded character or rule. If an invalid byte is found, return + -1. If SRC is too short, return -2. */ + int -emacs_mule_char (coding, src, nbytes, nchars, id) +emacs_mule_char (coding, src, nbytes, nchars, id, cmp_status) struct coding_system *coding; const unsigned char *src; int *nbytes, *nchars, *id; + struct composition_status *cmp_status; { const unsigned char *src_end = coding->source + coding->src_bytes; const unsigned char *src_base = src; @@ -1901,6 +2069,7 @@ emacs_mule_char (coding, src, nbytes, nchars, id) unsigned code; int c; int consumed_chars = 0; + int mseq_found = 0; ONE_MORE_BYTE (c); if (c < 0) @@ -1912,14 +2081,31 @@ emacs_mule_char (coding, src, nbytes, nchars, id) { if (c >= 0xA0) { - /* Old style component character of a composition. */ - if (c == 0xA0) + if (cmp_status->state != COMPOSING_NO + && cmp_status->old_form) { - ONE_MORE_BYTE (c); - c -= 0x80; + if (cmp_status->state == COMPOSING_CHAR) + { + if (c == 0xA0) + { + ONE_MORE_BYTE (c); + c -= 0x80; + if (c < 0) + goto invalid_code; + } + else + c -= 0x20; + mseq_found = 1; + } + else + { + *nbytes = src - src_base; + *nchars = consumed_chars; + return -c; + } } else - c -= 0x20; + goto invalid_code; } switch (emacs_mule_bytes[c]) @@ -1991,7 +2177,7 @@ emacs_mule_char (coding, src, nbytes, nchars, id) *nchars = consumed_chars; if (id) *id = charset->id; - return c; + return (mseq_found ? -c : c); no_more_source: return -2; @@ -2001,259 +2187,250 @@ emacs_mule_char (coding, src, nbytes, nchars, id) } -/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". - Check if a text is encoded in `emacs-mule'. If it is, return 1, - else return 0. */ +/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */ -static int -detect_coding_emacs_mule (coding, detect_info) - struct coding_system *coding; - struct coding_detection_info *detect_info; -{ - const unsigned char *src = coding->source, *src_base; - const unsigned char *src_end = coding->source + coding->src_bytes; - int multibytep = coding->src_multibyte; - int consumed_chars = 0; - int c; - int found = 0; +/* Handle these composition sequence ('|': the end of header elements, + BYTES and CHARS >= 0xA0): - detect_info->checked |= CATEGORY_MASK_EMACS_MULE; - /* A coding system of this category is always ASCII compatible. */ - src += coding->head_ascii; + (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ... + (2) altchar composition: 0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ... + (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ... - while (1) - { - src_base = src; - ONE_MORE_BYTE (c); - if (c < 0) - continue; - if (c == 0x80) - { - /* Perhaps the start of composite character. We simple skip - it because analyzing it is too heavy for detecting. But, - at least, we check that the composite character - constitutes of more than 4 bytes. */ - const unsigned char *src_base; + and these old form: + + (4) relative composition: 0x80 | MSEQ ... MSEQ + (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ - repeat: - src_base = src; - do - { - ONE_MORE_BYTE (c); - } - while (c >= 0xA0); + When the starter 0x80 and the following header elements are found, + this annotation header is produced. - if (src - src_base <= 4) - break; - found = CATEGORY_MASK_EMACS_MULE; - if (c == 0x80) - goto repeat; - } + [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ] - if (c < 0x80) - { - if (c < 0x20 - && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)) - break; - } - else - { - int more_bytes = emacs_mule_bytes[*src_base] - 1; + NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5). + NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5). - while (more_bytes > 0) - { - ONE_MORE_BYTE (c); - if (c < 0xA0) - { - src--; /* Unread the last byte. */ - break; - } - more_bytes--; - } - if (more_bytes != 0) - break; - found = CATEGORY_MASK_EMACS_MULE; - } - } - detect_info->rejected |= CATEGORY_MASK_EMACS_MULE; - return 0; + Then, upon reading the following elements, these codes are produced + until the composition end is found: - no_more_source: - if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK) - { - detect_info->rejected |= CATEGORY_MASK_EMACS_MULE; - return 0; - } - detect_info->found |= found; - return 1; -} + (1) CHAR ... CHAR + (2) ALT ... ALT CHAR ... CHAR + (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR + (4) CHAR ... CHAR + (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR + When the composition end is found, LENGTH and NCHARS in the + annotation header is updated as below: -/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */ + (1) LENGTH: unchanged, NCHARS: unchanged + (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged + (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged + (4) LENGTH: unchanged, NCHARS: number of CHARs + (5) LENGTH: unchanged, NCHARS: number of CHARs -/* Decode a character represented as a component of composition - sequence of Emacs 20/21 style at SRC. Set C to that character and - update SRC to the head of next character (or an encoded composition - rule). If SRC doesn't points a composition component, set C to -1. - If SRC points an invalid byte sequence, global exit by a return - value 0. */ - -#define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \ - do \ - { \ - int c; \ - int nbytes, nchars; \ - \ - if (src == src_end) \ - break; \ - c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\ - if (c < 0) \ - { \ - if (c == -2) \ - break; \ - goto invalid_code; \ - } \ - *buf++ = c; \ - src += nbytes; \ - consumed_chars += nchars; \ - } \ - while (0) - - -/* Decode a composition rule represented as a component of composition - sequence of Emacs 20 style at SRC. Store the decoded rule in *BUF, - and increment BUF. If SRC points an invalid byte sequence, set C - to -1. */ - -#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf) \ + If an error is found while composing, the annotation header is + changed to the original composition header (plus filler -1s) as + below: + + (1),(2),(3) [ 0x80 0xF2+METHOD BYTES CHARS -1 ] + (5) [ 0x80 0xFF -1 -1- -1 ] + + and the sequence [ -2 DECODED-RULE ] is changed to the original + byte sequence as below: + o the original byte sequence is B: [ B -1 ] + o the original byte sequence is B1 B2: [ B1 B2 ] + + Most of the routines are implemented by macros because many + variables and labels in the caller decode_coding_emacs_mule must be + accessible, and they are usually called just once (thus doesn't + increase the size of compiled object). */ + +/* Decode a composition rule represented by C as a component of + composition sequence of Emacs 20 style. Set RULE to the decoded + rule. */ + +#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule) \ do { \ - int c, gref, nref; \ - \ - if (src >= src_end) \ - goto invalid_code; \ - ONE_MORE_BYTE_NO_CHECK (c); \ + int gref, nref; \ + \ c -= 0xA0; \ if (c < 0 || c >= 81) \ goto invalid_code; \ - \ gref = c / 9, nref = c % 9; \ - *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \ + if (gref == 4) gref = 10; \ + if (nref == 4) nref = 10; \ + rule = COMPOSITION_ENCODE_RULE (gref, nref); \ } while (0) -/* Decode a composition rule represented as a component of composition - sequence of Emacs 21 style at SRC. Store the decoded rule in *BUF, - and increment BUF. If SRC points an invalid byte sequence, set C - to -1. */ +/* Decode a composition rule represented by C and the following byte + at SRC as a component of composition sequence of Emacs 21 style. + Set RULE to the decoded rule. */ -#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf) \ +#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule) \ do { \ int gref, nref; \ - \ - if (src + 1>= src_end) \ + \ + gref = c - 0x20; \ + if (gref < 0 || gref >= 81) \ goto invalid_code; \ - ONE_MORE_BYTE_NO_CHECK (gref); \ - gref -= 0x20; \ - ONE_MORE_BYTE_NO_CHECK (nref); \ - nref -= 0x20; \ - if (gref < 0 || gref >= 81 \ - || nref < 0 || nref >= 81) \ + ONE_MORE_BYTE (c); \ + nref = c - 0x20; \ + if (nref < 0 || nref >= 81) \ goto invalid_code; \ - *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \ + rule = COMPOSITION_ENCODE_RULE (gref, nref); \ } while (0) -#define DECODE_EMACS_MULE_21_COMPOSITION(c) \ +/* Start of Emacs 21 style format. The first three bytes at SRC are + (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is the + byte length of this composition information, CHARS is the number of + characters composed by this composition. */ + +#define DECODE_EMACS_MULE_21_COMPOSITION() \ do { \ - /* Emacs 21 style format. The first three bytes at SRC are \ - (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \ - the byte length of this composition information, CHARS is the \ - number of characters composed by this composition. */ \ enum composition_method method = c - 0xF2; \ int *charbuf_base = charbuf; \ - int consumed_chars_limit; \ int nbytes, nchars; \ - \ + \ ONE_MORE_BYTE (c); \ if (c < 0) \ goto invalid_code; \ nbytes = c - 0xA0; \ - if (nbytes < 3) \ + if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4)) \ goto invalid_code; \ ONE_MORE_BYTE (c); \ - if (c < 0) \ - goto invalid_code; \ nchars = c - 0xA0; \ - ADD_COMPOSITION_DATA (charbuf, nchars, method); \ - consumed_chars_limit = consumed_chars_base + nbytes; \ - if (method != COMPOSITION_RELATIVE) \ - { \ - int i = 0; \ - while (consumed_chars < consumed_chars_limit) \ - { \ - if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \ - DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf); \ - else \ - DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \ - i++; \ - } \ - if (consumed_chars < consumed_chars_limit) \ - goto invalid_code; \ - charbuf_base[0] -= i; \ - } \ + if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS) \ + goto invalid_code; \ + cmp_status->old_form = 0; \ + cmp_status->method = method; \ + if (method == COMPOSITION_RELATIVE) \ + cmp_status->state = COMPOSING_CHAR; \ + else \ + cmp_status->state = COMPOSING_COMPONENT_CHAR; \ + cmp_status->length = MAX_ANNOTATION_LENGTH; \ + cmp_status->nchars = nchars; \ + cmp_status->ncomps = nbytes - 4; \ + ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method); \ } while (0) -#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \ - do { \ - /* Emacs 20 style format for relative composition. */ \ - /* Store multibyte form of characters to be composed. */ \ - enum composition_method method = COMPOSITION_RELATIVE; \ - int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \ - int *buf = components; \ - int i, j; \ - \ - src = src_base; \ - ONE_MORE_BYTE (c); /* skip 0x80 */ \ - for (i = 0; *src >= 0xA0 && i < MAX_COMPOSITION_COMPONENTS; i++) \ - DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \ - if (i < 2) \ - goto invalid_code; \ - ADD_COMPOSITION_DATA (charbuf, i, method); \ - for (j = 0; j < i; j++) \ - *charbuf++ = components[j]; \ +/* Start of Emacs 20 style format for relative composition. */ + +#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION() \ + do { \ + cmp_status->old_form = 1; \ + cmp_status->method = COMPOSITION_RELATIVE; \ + cmp_status->state = COMPOSING_CHAR; \ + cmp_status->length = MAX_ANNOTATION_LENGTH; \ + cmp_status->nchars = cmp_status->ncomps = 0; \ + ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \ } while (0) -#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \ +/* Start of Emacs 20 style format for rule-base composition. */ + +#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION() \ + do { \ + cmp_status->old_form = 1; \ + cmp_status->method = COMPOSITION_WITH_RULE; \ + cmp_status->state = COMPOSING_CHAR; \ + cmp_status->length = MAX_ANNOTATION_LENGTH; \ + cmp_status->nchars = cmp_status->ncomps = 0; \ + ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \ + } while (0) + + +#define DECODE_EMACS_MULE_COMPOSITION_START() \ + do { \ + const unsigned char *current_src = src; \ + \ + ONE_MORE_BYTE (c); \ + if (c < 0) \ + goto invalid_code; \ + if (c - 0xF2 >= COMPOSITION_RELATIVE \ + && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS) \ + DECODE_EMACS_MULE_21_COMPOSITION (); \ + else if (c < 0xA0) \ + goto invalid_code; \ + else if (c < 0xC0) \ + { \ + DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (); \ + /* Re-read C as a composition component. */ \ + src = current_src; \ + } \ + else if (c == 0xFF) \ + DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (); \ + else \ + goto invalid_code; \ + } while (0) + +#define EMACS_MULE_COMPOSITION_END() \ do { \ - /* Emacs 20 style format for rule-base composition. */ \ - /* Store multibyte form of characters to be composed. */ \ - enum composition_method method = COMPOSITION_WITH_RULE; \ - int *charbuf_base = charbuf; \ - int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \ - int *buf = components; \ - int i, j; \ + int idx = - cmp_status->length; \ \ - DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \ - for (i = 1; i < MAX_COMPOSITION_COMPONENTS; i++) \ - { \ - if (*src < 0xA0) \ - break; \ - DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \ - DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \ - } \ - if (i <= 1 || (buf - components) % 2 == 0) \ - goto invalid_code; \ - if (charbuf + i + (i / 2) + 1 >= charbuf_end) \ - goto no_more_source; \ - ADD_COMPOSITION_DATA (charbuf, i, method); \ - i = i * 2 - 1; \ - for (j = 0; j < i; j++) \ - *charbuf++ = components[j]; \ - charbuf_base[0] -= i; \ - for (j = 0; j < i; j += 2) \ - *charbuf++ = components[j]; \ + if (cmp_status->old_form) \ + charbuf[idx + 2] = cmp_status->nchars; \ + else if (cmp_status->method > COMPOSITION_RELATIVE) \ + charbuf[idx] = charbuf[idx + 2] - cmp_status->length; \ + cmp_status->state = COMPOSING_NO; \ + } while (0) + + +static int +emacs_mule_finish_composition (charbuf, cmp_status) + int *charbuf; + struct composition_status *cmp_status; +{ + int idx = - cmp_status->length; + int new_chars; + + if (cmp_status->old_form && cmp_status->nchars > 0) + { + charbuf[idx + 2] = cmp_status->nchars; + new_chars = 0; + if (cmp_status->method == COMPOSITION_WITH_RULE + && cmp_status->state == COMPOSING_CHAR) + { + /* The last rule was invalid. */ + int rule = charbuf[-1] + 0xA0; + + charbuf[-2] = BYTE8_TO_CHAR (rule); + charbuf[-1] = -1; + new_chars = 1; + } + } + else + { + charbuf[idx++] = BYTE8_TO_CHAR (0x80); + + if (cmp_status->method == COMPOSITION_WITH_RULE) + { + charbuf[idx++] = BYTE8_TO_CHAR (0xFF); + charbuf[idx++] = -3; + charbuf[idx++] = 0; + new_chars = 1; + } + else + { + int nchars = charbuf[idx + 1] + 0xA0; + int nbytes = charbuf[idx + 2] + 0xA0; + + charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method); + charbuf[idx++] = BYTE8_TO_CHAR (nbytes); + charbuf[idx++] = BYTE8_TO_CHAR (nchars); + charbuf[idx++] = -1; + new_chars = 4; + } + } + cmp_status->state = COMPOSING_NO; + return new_chars; +} + +#define EMACS_MULE_MAYBE_FINISH_COMPOSITION() \ + do { \ + if (cmp_status->state != COMPOSING_NO) \ + char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \ } while (0) @@ -2273,83 +2450,194 @@ decode_coding_emacs_mule (coding) int char_offset = coding->produced_char; int last_offset = char_offset; int last_id = charset_ascii; - int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos); + int eol_crlf = + !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos); int byte_after_cr = -1; + struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status; CODING_GET_INFO (coding, attrs, charset_list); + if (cmp_status->state != COMPOSING_NO) + { + int i; + + for (i = 0; i < cmp_status->length; i++) + *charbuf++ = cmp_status->carryover[i]; + coding->annotated = 1; + } + while (1) { - int c; + int c, id; src_base = src; consumed_chars_base = consumed_chars; if (charbuf >= charbuf_end) - break; + { + if (byte_after_cr >= 0) + src_base--; + break; + } if (byte_after_cr >= 0) c = byte_after_cr, byte_after_cr = -1; else ONE_MORE_BYTE (c); - if (c < 0) + + if (c < 0 || c == 0x80) { - *charbuf++ = -c; - char_offset++; + EMACS_MULE_MAYBE_FINISH_COMPOSITION (); + if (c < 0) + { + *charbuf++ = -c; + char_offset++; + } + else + DECODE_EMACS_MULE_COMPOSITION_START (); + continue; } - else if (c < 0x80) + + if (c < 0x80) { if (eol_crlf && c == '\r') ONE_MORE_BYTE (byte_after_cr); - *charbuf++ = c; - char_offset++; - } - else if (c == 0x80) - { - ONE_MORE_BYTE (c); - if (c < 0) - goto invalid_code; - if (c - 0xF2 >= COMPOSITION_RELATIVE - && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS) - DECODE_EMACS_MULE_21_COMPOSITION (c); - else if (c < 0xC0) - DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c); - else if (c == 0xFF) - DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c); - else - goto invalid_code; + id = charset_ascii; + if (cmp_status->state != COMPOSING_NO) + { + if (cmp_status->old_form) + EMACS_MULE_MAYBE_FINISH_COMPOSITION (); + else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR) + cmp_status->ncomps--; + } } - else if (c < 0xA0 && emacs_mule_bytes[c] > 1) + else { - int nbytes, nchars; - int id; + int nchars, nbytes; - src = src_base; - consumed_chars = consumed_chars_base; - c = emacs_mule_char (coding, src, &nbytes, &nchars, &id); + c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id, + cmp_status); if (c < 0) { + if (c == -1) + goto invalid_code; if (c == -2) break; - goto invalid_code; } + src = src_base + nbytes; + consumed_chars = consumed_chars_base + nchars; + if (cmp_status->state >= COMPOSING_COMPONENT_CHAR) + cmp_status->ncomps -= nchars; + } + + /* Now if C >= 0, we found a normally encoded characer, if C < + 0, we found an old-style composition component character or + rule. */ + + if (cmp_status->state == COMPOSING_NO) + { if (last_id != id) { if (last_id != charset_ascii) - ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id); + ADD_CHARSET_DATA (charbuf, char_offset - last_offset, + last_id); last_id = id; last_offset = char_offset; } *charbuf++ = c; - src += nbytes; - consumed_chars += nchars; char_offset++; } - else - goto invalid_code; + else if (cmp_status->state == COMPOSING_CHAR) + { + if (cmp_status->old_form) + { + if (c >= 0) + { + EMACS_MULE_MAYBE_FINISH_COMPOSITION (); + *charbuf++ = c; + char_offset++; + } + else + { + *charbuf++ = -c; + cmp_status->nchars++; + cmp_status->length++; + if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS) + EMACS_MULE_COMPOSITION_END (); + else if (cmp_status->method == COMPOSITION_WITH_RULE) + cmp_status->state = COMPOSING_RULE; + } + } + else + { + *charbuf++ = c; + cmp_status->length++; + cmp_status->nchars--; + if (cmp_status->nchars == 0) + EMACS_MULE_COMPOSITION_END (); + } + } + else if (cmp_status->state == COMPOSING_RULE) + { + int rule; + + if (c >= 0) + { + EMACS_MULE_COMPOSITION_END (); + *charbuf++ = c; + char_offset++; + } + else + { + c = -c; + DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule); + if (rule < 0) + goto invalid_code; + *charbuf++ = -2; + *charbuf++ = rule; + cmp_status->length += 2; + cmp_status->state = COMPOSING_CHAR; + } + } + else if (cmp_status->state == COMPOSING_COMPONENT_CHAR) + { + *charbuf++ = c; + cmp_status->length++; + if (cmp_status->ncomps == 0) + cmp_status->state = COMPOSING_CHAR; + else if (cmp_status->ncomps > 0) + { + if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS) + cmp_status->state = COMPOSING_COMPONENT_RULE; + } + else + EMACS_MULE_MAYBE_FINISH_COMPOSITION (); + } + else /* COMPOSING_COMPONENT_RULE */ + { + int rule; + + DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule); + if (rule < 0) + goto invalid_code; + *charbuf++ = -2; + *charbuf++ = rule; + cmp_status->length += 2; + cmp_status->ncomps--; + if (cmp_status->ncomps > 0) + cmp_status->state = COMPOSING_COMPONENT_CHAR; + else + EMACS_MULE_MAYBE_FINISH_COMPOSITION (); + } + continue; + + retry: + src = src_base; + consumed_chars = consumed_chars_base; continue; invalid_code: + EMACS_MULE_MAYBE_FINISH_COMPOSITION (); src = src_base; consumed_chars = consumed_chars_base; ONE_MORE_BYTE (c); @@ -2359,6 +2647,19 @@ decode_coding_emacs_mule (coding) } no_more_source: + if (cmp_status->state != COMPOSING_NO) + { + if (coding->mode & CODING_MODE_LAST_BLOCK) + EMACS_MULE_MAYBE_FINISH_COMPOSITION (); + else + { + int i; + + charbuf -= cmp_status->length; + for (i = 0; i < cmp_status->length; i++) + cmp_status->carryover[i] = charbuf[i]; + } + } if (last_id != charset_ascii) ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id); coding->consumed_char += consumed_chars_base; @@ -2449,8 +2750,10 @@ encode_coding_emacs_mule (coding) if (preferred_charset_id >= 0) { charset = CHARSET_FROM_ID (preferred_charset_id); - if (! CHAR_CHARSET_P (c, charset)) - charset = char_charset (c, charset_list, NULL); + if (CHAR_CHARSET_P (c, charset)) + code = ENCODE_CHAR (charset, c); + else + charset = char_charset (c, charset_list, &code); } else charset = char_charset (c, charset_list, &code); @@ -2668,7 +2971,7 @@ enum iso_code_class_type iso_code_class[256]; #define SAFE_CHARSET_P(coding, id) \ ((id) <= (coding)->max_charset_id \ - && (coding)->safe_charsets[id] >= 0) + && (coding)->safe_charsets[id] != 255) #define SHIFT_OUT_OK(category) \ @@ -2706,8 +3009,8 @@ setup_iso_safe_charsets (attrs) max_charset_id = id; } - safe_charsets = Fmake_string (make_number (max_charset_id + 1), - make_number (255)); + safe_charsets = make_uninit_string (max_charset_id + 1); + memset (SDATA (safe_charsets), 255, max_charset_id + 1); request = AREF (attrs, coding_attr_iso_request); reg_usage = AREF (attrs, coding_attr_iso_usage); reg94 = XINT (XCAR (reg_usage)); @@ -2758,6 +3061,7 @@ detect_coding_iso_2022 (coding, detect_info) int i; int rejected = 0; int found = 0; + int composition_count = -1; detect_info->checked |= CATEGORY_MASK_ISO; @@ -2770,11 +3074,11 @@ detect_coding_iso_2022 (coding, detect_info) continue; attrs = CODING_ID_ATTRS (this->id); if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT - && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list)) + && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list)) setup_iso_safe_charsets (attrs); val = CODING_ATTR_SAFE_CHARSETS (attrs); this->max_charset_id = SCHARS (val) - 1; - this->safe_charsets = (char *) SDATA (val); + this->safe_charsets = SDATA (val); } /* A coding system of this category is always ASCII compatible. */ @@ -2826,10 +3130,20 @@ detect_coding_iso_2022 (coding, detect_info) rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT; break; } + else if (c == '1') + { + /* End of composition. */ + if (composition_count < 0 + || composition_count > MAX_COMPOSITION_COMPONENTS) + /* Invalid */ + break; + composition_count = -1; + found |= CATEGORY_MASK_ISO; + } else if (c >= '0' && c <= '4') { /* ESC for start/end composition. */ - found |= CATEGORY_MASK_ISO; + composition_count = 0; break; } else @@ -2900,6 +3214,8 @@ detect_coding_iso_2022 (coding, detect_info) continue; if (c < 0x80) { + if (composition_count >= 0) + composition_count++; single_shifting = 0; break; } @@ -2924,9 +3240,17 @@ detect_coding_iso_2022 (coding, detect_info) } if (i & 1 && src < src_end) - rejected |= CATEGORY_MASK_ISO_8_2; + { + rejected |= CATEGORY_MASK_ISO_8_2; + if (composition_count >= 0) + composition_count += i; + } else - found |= CATEGORY_MASK_ISO_8_2; + { + found |= CATEGORY_MASK_ISO_8_2; + if (composition_count >= 0) + composition_count += i / 2; + } } break; } @@ -2976,146 +3300,251 @@ detect_coding_iso_2022 (coding, detect_info) if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \ id = charset_ascii; \ } \ - else if (id == charset_jisx0208_1978) \ + else if (id == charset_jisx0208_1978) \ + { \ + if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \ + id = charset_jisx0208; \ + } \ + CODING_ISO_DESIGNATION (coding, reg) = id; \ + /* If there was an invalid designation to REG previously, and this \ + designation is ASCII to REG, we should keep this designation \ + sequence. */ \ + if (prev == -2 && id == charset_ascii) \ + chars_96 = -1; \ + } while (0) + + +/* Handle these composition sequence (ALT: alternate char): + + (1) relative composition: ESC 0 CHAR ... ESC 1 + (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1 + (3) altchar composition: ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1 + (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1 + + When the start sequence (ESC 0/2/3/4) is found, this annotation + header is produced. + + [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ] + + Then, upon reading CHAR or RULE (one or two bytes), these codes are + produced until the end sequence (ESC 1) is found: + + (1) CHAR ... CHAR + (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR + (3) ALT ... ALT -1 -1 CHAR ... CHAR + (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR + + When the end sequence (ESC 1) is found, LENGTH and NCHARS in the + annotation header is updated as below: + + (1) LENGTH: unchanged, NCHARS: number of CHARs + (2) LENGTH: unchanged, NCHARS: number of CHARs + (3) LENGTH: += number of ALTs + 2, NCHARS: number of CHARs + (4) LENGTH: += number of ALTs * 3, NCHARS: number of CHARs + + If an error is found while composing, the annotation header is + changed to: + + [ ESC '0'/'2'/'3'/'4' -2 0 ] + + and the sequence [ -2 DECODED-RULE ] is changed to the original + byte sequence as below: + o the original byte sequence is B: [ B -1 ] + o the original byte sequence is B1 B2: [ B1 B2 ] + and the sequence [ -1 -1 ] is changed to the original byte + sequence: + [ ESC '0' ] +*/ + +/* Decode a composition rule C1 and maybe one more byte from the + source, and set RULE to the encoded composition rule, NBYTES to the + length of the composition rule. If the rule is invalid, set RULE + to some negative value. */ + +#define DECODE_COMPOSITION_RULE(rule, nbytes) \ + do { \ + rule = c1 - 32; \ + if (rule < 0) \ + break; \ + if (rule < 81) /* old format (before ver.21) */ \ + { \ + int gref = (rule) / 9; \ + int nref = (rule) % 9; \ + if (gref == 4) gref = 10; \ + if (nref == 4) nref = 10; \ + rule = COMPOSITION_ENCODE_RULE (gref, nref); \ + nbytes = 1; \ + } \ + else /* new format (after ver.21) */ \ { \ - if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \ - id = charset_jisx0208; \ + int c; \ + \ + ONE_MORE_BYTE (c); \ + rule = COMPOSITION_ENCODE_RULE (rule - 81, c - 32); \ + if (rule >= 0) \ + rule += 0x100; /* to destinguish it from the old format */ \ + nbytes = 2; \ } \ - CODING_ISO_DESIGNATION (coding, reg) = id; \ - /* If there was an invalid designation to REG previously, and this \ - designation is ASCII to REG, we should keep this designation \ - sequence. */ \ - if (prev == -2 && id == charset_ascii) \ - chars_96 = -1; \ } while (0) - -#define MAYBE_FINISH_COMPOSITION() \ +#define ENCODE_COMPOSITION_RULE(rule) \ do { \ - int i; \ - if (composition_state == COMPOSING_NO) \ - break; \ - /* It is assured that we have enough room for producing \ - characters stored in the table `components'. */ \ - if (charbuf + component_idx > charbuf_end) \ - goto no_more_source; \ - composition_state = COMPOSING_NO; \ - if (method == COMPOSITION_RELATIVE \ - || method == COMPOSITION_WITH_ALTCHARS) \ + int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \ + \ + if (rule < 0x100) /* old format */ \ { \ - for (i = 0; i < component_idx; i++) \ - *charbuf++ = components[i]; \ - char_offset += component_idx; \ + if (gref == 10) gref = 4; \ + if (nref == 10) nref = 4; \ + charbuf[idx] = 32 + gref * 9 + nref; \ + charbuf[idx + 1] = -1; \ + new_chars++; \ } \ - else \ + else /* new format */ \ { \ - for (i = 0; i < component_idx; i += 2) \ - *charbuf++ = components[i]; \ - char_offset += (component_idx / 2) + 1; \ + charbuf[idx] = 32 + 81 + gref; \ + charbuf[idx + 1] = 32 + nref; \ + new_chars += 2; \ } \ } while (0) +/* Finish the current composition as invalid. */ + +static int finish_composition P_ ((int *, struct composition_status *)); + +static int +finish_composition (charbuf, cmp_status) + int *charbuf; + struct composition_status *cmp_status; +{ + int idx = - cmp_status->length; + int new_chars; + + /* Recover the original ESC sequence */ + charbuf[idx++] = ISO_CODE_ESC; + charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0' + : cmp_status->method == COMPOSITION_WITH_RULE ? '2' + : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3' + /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */ + : '4'); + charbuf[idx++] = -2; + charbuf[idx++] = 0; + charbuf[idx++] = -1; + new_chars = cmp_status->nchars; + if (cmp_status->method >= COMPOSITION_WITH_RULE) + for (; idx < 0; idx++) + { + int elt = charbuf[idx]; + + if (elt == -2) + { + ENCODE_COMPOSITION_RULE (charbuf[idx + 1]); + idx++; + } + else if (elt == -1) + { + charbuf[idx++] = ISO_CODE_ESC; + charbuf[idx] = '0'; + new_chars += 2; + } + } + cmp_status->state = COMPOSING_NO; + return new_chars; +} + +/* If characers are under composition, finish the composition. */ +#define MAYBE_FINISH_COMPOSITION() \ + do { \ + if (cmp_status->state != COMPOSING_NO) \ + char_offset += finish_composition (charbuf, cmp_status); \ + } while (0) /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4. + ESC 0 : relative composition : ESC 0 CHAR ... ESC 1 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1 - */ -#define DECODE_COMPOSITION_START(c1) \ - do { \ - if (c1 == '0' \ - && composition_state == COMPOSING_COMPONENT_RULE) \ - { \ - component_len = component_idx; \ - composition_state = COMPOSING_CHAR; \ - } \ - else \ - { \ - const unsigned char *p; \ - \ - MAYBE_FINISH_COMPOSITION (); \ - if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \ - goto no_more_source; \ - for (p = src; p < src_end - 1; p++) \ - if (*p == ISO_CODE_ESC && p[1] == '1') \ - break; \ - if (p == src_end - 1) \ - { \ - /* The current composition doesn't end in the current \ - source. */ \ - record_conversion_result \ - (coding, CODING_RESULT_INSUFFICIENT_SRC); \ - goto no_more_source; \ - } \ - \ - /* This is surely the start of a composition. */ \ - method = (c1 == '0' ? COMPOSITION_RELATIVE \ - : c1 == '2' ? COMPOSITION_WITH_RULE \ - : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \ - : COMPOSITION_WITH_RULE_ALTCHARS); \ - composition_state = (c1 <= '2' ? COMPOSING_CHAR \ - : COMPOSING_COMPONENT_CHAR); \ - component_idx = component_len = 0; \ - } \ + Produce this annotation sequence now: + + [ -LENGTH(==-4) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) METHOD ] +*/ + +#define DECODE_COMPOSITION_START(c1) \ + do { \ + if (c1 == '0' \ + && ((cmp_status->state == COMPOSING_COMPONENT_CHAR \ + && cmp_status->method == COMPOSITION_WITH_ALTCHARS) \ + || (cmp_status->state == COMPOSING_COMPONENT_RULE \ + && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \ + { \ + *charbuf++ = -1; \ + *charbuf++= -1; \ + cmp_status->state = COMPOSING_CHAR; \ + cmp_status->length += 2; \ + } \ + else \ + { \ + MAYBE_FINISH_COMPOSITION (); \ + cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE \ + : c1 == '2' ? COMPOSITION_WITH_RULE \ + : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \ + : COMPOSITION_WITH_RULE_ALTCHARS); \ + cmp_status->state \ + = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR); \ + ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \ + cmp_status->length = MAX_ANNOTATION_LENGTH; \ + cmp_status->nchars = cmp_status->ncomps = 0; \ + coding->annotated = 1; \ + } \ } while (0) -/* Handle compositoin end sequence ESC 1. */ +/* Handle composition end sequence ESC 1. */ #define DECODE_COMPOSITION_END() \ do { \ - int nchars = (component_len > 0 ? component_idx - component_len \ - : method == COMPOSITION_RELATIVE ? component_idx \ - : (component_idx + 1) / 2); \ - int i; \ - int *saved_charbuf = charbuf; \ - \ - ADD_COMPOSITION_DATA (charbuf, nchars, method); \ - if (method != COMPOSITION_RELATIVE) \ + if (cmp_status->nchars == 0 \ + || ((cmp_status->state == COMPOSING_CHAR) \ + == (cmp_status->method == COMPOSITION_WITH_RULE))) \ { \ - if (component_len == 0) \ - for (i = 0; i < component_idx; i++) \ - *charbuf++ = components[i]; \ - else \ - for (i = 0; i < component_len; i++) \ - *charbuf++ = components[i]; \ - *saved_charbuf = saved_charbuf - charbuf; \ + MAYBE_FINISH_COMPOSITION (); \ + goto invalid_code; \ } \ - if (method == COMPOSITION_WITH_RULE) \ - for (i = 0; i < component_idx; i += 2, char_offset++) \ - *charbuf++ = components[i]; \ - else \ - for (i = component_len; i < component_idx; i++, char_offset++) \ - *charbuf++ = components[i]; \ - coding->annotated = 1; \ - composition_state = COMPOSING_NO; \ + if (cmp_status->method == COMPOSITION_WITH_ALTCHARS) \ + charbuf[- cmp_status->length] -= cmp_status->ncomps + 2; \ + else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS) \ + charbuf[- cmp_status->length] -= cmp_status->ncomps * 3; \ + charbuf[- cmp_status->length + 2] = cmp_status->nchars; \ + char_offset += cmp_status->nchars; \ + cmp_status->state = COMPOSING_NO; \ } while (0) +/* Store a composition rule RULE in charbuf, and update cmp_status. */ -/* Decode a composition rule from the byte C1 (and maybe one more byte - from SRC) and store one encoded composition rule in - coding->cmp_data. */ +#define STORE_COMPOSITION_RULE(rule) \ + do { \ + *charbuf++ = -2; \ + *charbuf++ = rule; \ + cmp_status->length += 2; \ + cmp_status->state--; \ + } while (0) -#define DECODE_COMPOSITION_RULE(c1) \ +/* Store a composed char or a component char C in charbuf, and update + cmp_status. */ + +#define STORE_COMPOSITION_CHAR(c) \ do { \ - (c1) -= 32; \ - if (c1 < 81) /* old format (before ver.21) */ \ - { \ - int gref = (c1) / 9; \ - int nref = (c1) % 9; \ - if (gref == 4) gref = 10; \ - if (nref == 4) nref = 10; \ - c1 = COMPOSITION_ENCODE_RULE (gref, nref); \ - } \ - else if (c1 < 93) /* new format (after ver.21) */ \ - { \ - ONE_MORE_BYTE (c2); \ - c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \ - } \ + *charbuf++ = (c); \ + cmp_status->length++; \ + if (cmp_status->state == COMPOSING_CHAR) \ + cmp_status->nchars++; \ else \ - c1 = 0; \ + cmp_status->ncomps++; \ + if (cmp_status->method == COMPOSITION_WITH_RULE \ + || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS \ + && cmp_status->state == COMPOSING_COMPONENT_CHAR)) \ + cmp_status->state++; \ } while (0) @@ -3130,7 +3559,7 @@ decode_coding_iso_2022 (coding) const unsigned char *src_base; int *charbuf = coding->charbuf + coding->charbuf_used; int *charbuf_end - = coding->charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH; + = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH; int consumed_chars = 0, consumed_chars_base; int multibytep = coding->src_multibyte; /* Charsets invoked to graphic plane 0 and 1 respectively. */ @@ -3139,30 +3568,28 @@ decode_coding_iso_2022 (coding) int charset_id_2, charset_id_3; struct charset *charset; int c; - /* For handling composition sequence. */ -#define COMPOSING_NO 0 -#define COMPOSING_CHAR 1 -#define COMPOSING_RULE 2 -#define COMPOSING_COMPONENT_CHAR 3 -#define COMPOSING_COMPONENT_RULE 4 - - int composition_state = COMPOSING_NO; - enum composition_method method; - int components[MAX_COMPOSITION_COMPONENTS * 2 + 1]; - int component_idx; - int component_len; + struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding); Lisp_Object attrs, charset_list; int char_offset = coding->produced_char; int last_offset = char_offset; int last_id = charset_ascii; - int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos); + int eol_crlf = + !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos); int byte_after_cr = -1; + int i; CODING_GET_INFO (coding, attrs, charset_list); setup_iso_safe_charsets (attrs); /* Charset list may have been changed. */ charset_list = CODING_ATTR_CHARSET_LIST (attrs); - coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs)); + coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs)); + + if (cmp_status->state != COMPOSING_NO) + { + for (i = 0; i < cmp_status->length; i++) + *charbuf++ = cmp_status->carryover[i]; + coding->annotated = 1; + } while (1) { @@ -3172,7 +3599,11 @@ decode_coding_iso_2022 (coding) consumed_chars_base = consumed_chars; if (charbuf >= charbuf_end) - break; + { + if (byte_after_cr >= 0) + src_base--; + break; + } if (byte_after_cr >= 0) c1 = byte_after_cr, byte_after_cr = -1; @@ -3181,21 +3612,58 @@ decode_coding_iso_2022 (coding) if (c1 < 0) goto invalid_code; - /* We produce at most one character. */ - switch (iso_code_class [c1]) + if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0) { - case ISO_0x20_or_0x7F: - if (composition_state != COMPOSING_NO) + *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1); + char_offset++; + CODING_ISO_EXTSEGMENT_LEN (coding)--; + continue; + } + + if (CODING_ISO_EMBEDDED_UTF_8 (coding)) + { + if (c1 == ISO_CODE_ESC) { - if (composition_state == COMPOSING_RULE - || composition_state == COMPOSING_COMPONENT_RULE) + if (src + 1 >= src_end) + goto no_more_source; + *charbuf++ = ISO_CODE_ESC; + char_offset++; + if (src[0] == '%' && src[1] == '@') { - DECODE_COMPOSITION_RULE (c1); - components[component_idx++] = c1; - composition_state--; - continue; + src += 2; + consumed_chars += 2; + char_offset += 2; + /* We are sure charbuf can contain two more chars. */ + *charbuf++ = '%'; + *charbuf++ = '@'; + CODING_ISO_EMBEDDED_UTF_8 (coding) = 0; } } + else + { + *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1); + char_offset++; + } + continue; + } + + if ((cmp_status->state == COMPOSING_RULE + || cmp_status->state == COMPOSING_COMPONENT_RULE) + && c1 != ISO_CODE_ESC) + { + int rule, nbytes; + + DECODE_COMPOSITION_RULE (rule, nbytes); + if (rule < 0) + goto invalid_code; + STORE_COMPOSITION_RULE (rule); + continue; + } + + /* We produce at most one character. */ + switch (iso_code_class [c1]) + { + case ISO_0x20_or_0x7F: if (charset_id_0 < 0 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0))) /* This is SPACE or DEL. */ @@ -3205,17 +3673,6 @@ decode_coding_iso_2022 (coding) break; case ISO_graphic_plane_0: - if (composition_state != COMPOSING_NO) - { - if (composition_state == COMPOSING_RULE - || composition_state == COMPOSING_COMPONENT_RULE) - { - DECODE_COMPOSITION_RULE (c1); - components[component_idx++] = c1; - composition_state--; - continue; - } - } if (charset_id_0 < 0) charset = CHARSET_FROM_ID (charset_ascii); else @@ -3243,7 +3700,6 @@ decode_coding_iso_2022 (coding) break; case ISO_control_1: - MAYBE_FINISH_COMPOSITION (); goto invalid_code; case ISO_shift_out: @@ -3381,11 +3837,17 @@ decode_coding_iso_2022 (coding) case '0': case '2': case '3': case '4': /* start composition */ if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)) goto invalid_code; + if (last_id != charset_ascii) + { + ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id); + last_id = charset_ascii; + last_offset = char_offset; + } DECODE_COMPOSITION_START (c1); continue; case '1': /* end composition */ - if (composition_state == COMPOSING_NO) + if (cmp_status->state == COMPOSING_NO) goto invalid_code; DECODE_COMPOSITION_END (); continue; @@ -3436,10 +3898,16 @@ decode_coding_iso_2022 (coding) int size; ONE_MORE_BYTE (dim); + if (dim < 0 || dim > 4) + goto invalid_code; ONE_MORE_BYTE (M); + if (M < 128) + goto invalid_code; ONE_MORE_BYTE (L); + if (L < 128) + goto invalid_code; size = ((M - 128) * 128) + (L - 128); - if (charbuf + 8 + size > charbuf_end) + if (charbuf + 6 > charbuf_end) goto break_loop; *charbuf++ = ISO_CODE_ESC; *charbuf++ = '%'; @@ -3447,11 +3915,7 @@ decode_coding_iso_2022 (coding) *charbuf++ = dim; *charbuf++ = BYTE8_TO_CHAR (M); *charbuf++ = BYTE8_TO_CHAR (L); - while (size-- > 0) - { - ONE_MORE_BYTE (c1); - *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1); - } + CODING_ISO_EXTSEGMENT_LEN (coding) = size; } else if (c1 == 'G') { @@ -3459,32 +3923,12 @@ decode_coding_iso_2022 (coding) ESC % G --UTF-8-BYTES-- ESC % @ We keep these bytes as is for the moment. They may be decoded by post-read-conversion. */ - int *p = charbuf; - - if (p + 6 > charbuf_end) - goto break_loop; - *p++ = ISO_CODE_ESC; - *p++ = '%'; - *p++ = 'G'; - while (p < charbuf_end) - { - ONE_MORE_BYTE (c1); - if (c1 == ISO_CODE_ESC - && src + 1 < src_end - && src[0] == '%' - && src[1] == '@') - { - src += 2; - break; - } - *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1); - } - if (p + 3 > charbuf_end) + if (charbuf + 3 > charbuf_end) goto break_loop; - *p++ = ISO_CODE_ESC; - *p++ = '%'; - *p++ = '@'; - charbuf = p; + *charbuf++ = ISO_CODE_ESC; + *charbuf++ = '%'; + *charbuf++ = 'G'; + CODING_ISO_EMBEDDED_UTF_8 (coding) = 1; } else goto invalid_code; @@ -3522,7 +3966,8 @@ decode_coding_iso_2022 (coding) } } - if (charset->id != charset_ascii + if (cmp_status->state == COMPOSING_NO + && charset->id != charset_ascii && last_id != charset->id) { if (last_id != charset_ascii) @@ -3564,19 +4009,23 @@ decode_coding_iso_2022 (coding) *charbuf++ = BYTE8_TO_CHAR (*src_base); } } - else if (composition_state == COMPOSING_NO) + else if (cmp_status->state == COMPOSING_NO) { *charbuf++ = c; char_offset++; } - else + else if ((cmp_status->state == COMPOSING_CHAR + ? cmp_status->nchars + : cmp_status->ncomps) + >= MAX_COMPOSITION_COMPONENTS) { - components[component_idx++] = c; - if (method == COMPOSITION_WITH_RULE - || (method == COMPOSITION_WITH_RULE_ALTCHARS - && composition_state == COMPOSING_COMPONENT_CHAR)) - composition_state++; + /* Too long composition. */ + MAYBE_FINISH_COMPOSITION (); + *charbuf++ = c; + char_offset++; } + else + STORE_COMPOSITION_CHAR (c); continue; invalid_code: @@ -3594,7 +4043,18 @@ decode_coding_iso_2022 (coding) } no_more_source: - if (last_id != charset_ascii) + if (cmp_status->state != COMPOSING_NO) + { + if (coding->mode & CODING_MODE_LAST_BLOCK) + MAYBE_FINISH_COMPOSITION (); + else + { + charbuf -= cmp_status->length; + for (i = 0; i < cmp_status->length; i++) + cmp_status->carryover[i] = charbuf[i]; + } + } + else if (last_id != charset_ascii) ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id); coding->consumed_char += consumed_chars_base; coding->consumed = src_base - coding->source; @@ -4020,14 +4480,14 @@ encode_coding_iso_2022 (coding) int preferred_charset_id = -1; CODING_GET_INFO (coding, attrs, charset_list); - eol_type = CODING_ID_EOL_TYPE (coding->id); + eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id); if (VECTORP (eol_type)) eol_type = Qunix; setup_iso_safe_charsets (attrs); /* Charset list may have been changed. */ charset_list = CODING_ATTR_CHARSET_LIST (attrs); - coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs)); + coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs)); ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)); @@ -4308,7 +4768,8 @@ decode_coding_sjis (coding) int char_offset = coding->produced_char; int last_offset = char_offset; int last_id = charset_ascii; - int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos); + int eol_crlf = + !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos); int byte_after_cr = -1; CODING_GET_INFO (coding, attrs, charset_list); @@ -4328,7 +4789,11 @@ decode_coding_sjis (coding) consumed_chars_base = consumed_chars; if (charbuf >= charbuf_end) - break; + { + if (byte_after_cr >= 0) + src_base--; + break; + } if (byte_after_cr >= 0) c = byte_after_cr, byte_after_cr = -1; @@ -4419,7 +4884,8 @@ decode_coding_big5 (coding) int char_offset = coding->produced_char; int last_offset = char_offset; int last_id = charset_ascii; - int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos); + int eol_crlf = + !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos); int byte_after_cr = -1; CODING_GET_INFO (coding, attrs, charset_list); @@ -4436,7 +4902,11 @@ decode_coding_big5 (coding) consumed_chars_base = consumed_chars; if (charbuf >= charbuf_end) - break; + { + if (byte_after_cr >= 0) + src_base--; + break; + } if (byte_after_cr >= 0) c = byte_after_cr, byte_after_cr = -1; @@ -4867,7 +5337,8 @@ static void decode_coding_raw_text (coding) struct coding_system *coding; { - int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos); + int eol_crlf = + !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos); coding->chars_at_source = 1; coding->consumed_char = coding->src_chars; @@ -4975,15 +5446,22 @@ detect_coding_charset (coding, detect_info) const unsigned char *src_end = coding->source + coding->src_bytes; int multibytep = coding->src_multibyte; int consumed_chars = 0; - Lisp_Object attrs, valids; + Lisp_Object attrs, valids, name; int found = 0; int head_ascii = coding->head_ascii; + int check_latin_extra = 0; detect_info->checked |= CATEGORY_MASK_CHARSET; coding = &coding_categories[coding_category_charset]; attrs = CODING_ID_ATTRS (coding->id); valids = AREF (attrs, coding_attr_charset_valids); + name = CODING_ID_NAME (coding->id); + if (strncmp ((char *) SDATA (SYMBOL_NAME (name)), + "iso-8859-", sizeof ("iso-8859-") - 1) == 0 + || strncmp ((char *) SDATA (SYMBOL_NAME (name)), + "iso-latin-", sizeof ("iso-latin-") - 1) == 0) + check_latin_extra = 1; if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))) src += head_ascii; @@ -5003,7 +5481,14 @@ detect_coding_charset (coding, detect_info) if (NILP (val)) break; if (c >= 0x80) - found = CATEGORY_MASK_CHARSET; + { + if (c < 0xA0 + && check_latin_extra + && (!VECTORP (Vlatin_extra_code_table) + || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))) + break; + found = CATEGORY_MASK_CHARSET; + } if (INTEGERP (val)) { charset = CHARSET_FROM_ID (XFASTINT (val)); @@ -5072,7 +5557,8 @@ decode_coding_charset (coding) int char_offset = coding->produced_char; int last_offset = char_offset; int last_id = charset_ascii; - int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos); + int eol_crlf = + !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos); int byte_after_cr = -1; CODING_GET_INFO (coding, attrs, charset_list); @@ -5091,7 +5577,11 @@ decode_coding_charset (coding) consumed_chars_base = consumed_chars; if (charbuf >= charbuf_end) - break; + { + if (byte_after_cr >= 0) + src_base--; + break; + } if (byte_after_cr >= 0) { @@ -5109,7 +5599,7 @@ decode_coding_charset (coding) code = c; val = AREF (valids, c); - if (NILP (val)) + if (! INTEGERP (val) && ! CONSP (val)) goto invalid_code; if (INTEGERP (val)) { @@ -5265,7 +5755,7 @@ setup_coding_system (coding_system, coding) CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id); attrs = CODING_ID_ATTRS (coding->id); - eol_type = CODING_ID_EOL_TYPE (coding->id); + eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id); coding->mode = 0; coding->head_ascii = -1; @@ -5286,7 +5776,7 @@ setup_coding_system (coding_system, coding) val = CODING_ATTR_SAFE_CHARSETS (attrs); coding->max_charset_id = SCHARS (val) - 1; - coding->safe_charsets = (char *) SDATA (val); + coding->safe_charsets = SDATA (val); coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs)); coding_type = CODING_ATTR_TYPE (attrs); @@ -5331,9 +5821,13 @@ setup_coding_system (coding_system, coding) setup_iso_safe_charsets (attrs); val = CODING_ATTR_SAFE_CHARSETS (attrs); coding->max_charset_id = SCHARS (val) - 1; - coding->safe_charsets = (char *) SDATA (val); + coding->safe_charsets = SDATA (val); } CODING_ISO_FLAGS (coding) = flags; + CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO; + CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO; + CODING_ISO_EXTSEGMENT_LEN (coding) = 0; + CODING_ISO_EMBEDDED_UTF_8 (coding) = 0; } else if (EQ (coding_type, Qcharset)) { @@ -5391,6 +5885,7 @@ setup_coding_system (coding_system, coding) coding->encoder = encode_coding_emacs_mule; coding->common_flags |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK); + coding->spec.emacs_mule.full_support = 1; if (! NILP (AREF (attrs, coding_attr_emacs_mule_full)) && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list)) { @@ -5401,14 +5896,17 @@ setup_coding_system (coding_system, coding) tail = XCDR (tail)) if (max_charset_id < XFASTINT (XCAR (tail))) max_charset_id = XFASTINT (XCAR (tail)); - safe_charsets = Fmake_string (make_number (max_charset_id + 1), - make_number (255)); + safe_charsets = make_uninit_string (max_charset_id + 1); + memset (SDATA (safe_charsets), 255, max_charset_id + 1); for (tail = Vemacs_mule_charset_list; CONSP (tail); tail = XCDR (tail)) SSET (safe_charsets, XFASTINT (XCAR (tail)), 0); coding->max_charset_id = max_charset_id; - coding->safe_charsets = (char *) SDATA (safe_charsets); + coding->safe_charsets = SDATA (safe_charsets); + coding->spec.emacs_mule.full_support = 1; } + coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO; + coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO; } else if (EQ (coding_type, Qshift_jis)) { @@ -5467,6 +5965,39 @@ coding_charset_list (coding) } +/* Return a list of charsets supported by CODING-SYSTEM. */ + +Lisp_Object +coding_system_charset_list (coding_system) + Lisp_Object coding_system; +{ + int id; + Lisp_Object attrs, charset_list; + + CHECK_CODING_SYSTEM_GET_ID (coding_system, id); + attrs = CODING_ID_ATTRS (id); + + if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022)) + { + int flags = XINT (AREF (attrs, coding_attr_iso_flags)); + + if (flags & CODING_ISO_FLAG_FULL_SUPPORT) + charset_list = Viso_2022_charset_list; + else + charset_list = CODING_ATTR_CHARSET_LIST (attrs); + } + else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule)) + { + charset_list = Vemacs_mule_charset_list; + } + else + { + charset_list = CODING_ATTR_CHARSET_LIST (attrs); + } + return charset_list; +} + + /* Return raw-text or one of its subsidiaries that has the same eol_type as CODING-SYSTEM. */ @@ -5691,16 +6222,26 @@ detect_eol (source, src_bytes, category) || src[lsb + 2] != '\n') this_eol = EOL_SEEN_CR; else - this_eol = EOL_SEEN_CRLF; + { + this_eol = EOL_SEEN_CRLF; + src += 2; + } if (eol_seen == EOL_SEEN_NONE) /* This is the first end-of-line. */ eol_seen = this_eol; else if (eol_seen != this_eol) { - /* The found type is different from what found before. */ - eol_seen = EOL_SEEN_LF; - break; + /* The found type is different from what found before. + Allow for stray ^M characters in DOS EOL files. */ + if (eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF + || eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR) + eol_seen = EOL_SEEN_CRLF; + else + { + eol_seen = EOL_SEEN_LF; + break; + } } if (++total == MAX_EOL_CHECK_COUNT) break; @@ -5729,9 +6270,16 @@ detect_eol (source, src_bytes, category) eol_seen = this_eol; else if (eol_seen != this_eol) { - /* The found type is different from what found before. */ - eol_seen = EOL_SEEN_LF; - break; + /* The found type is different from what found before. + Allow for stray ^M characters in DOS EOL files. */ + if (eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF + || eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR) + eol_seen = EOL_SEEN_CRLF; + else + { + eol_seen = EOL_SEEN_LF; + break; + } } if (++total == MAX_EOL_CHECK_COUNT) break; @@ -5777,6 +6325,7 @@ detect_coding (coding) struct coding_system *coding; { const unsigned char *src, *src_end; + int saved_mode = coding->mode; coding->consumed = coding->consumed_char = 0; coding->produced = coding->produced_char = 0; @@ -5825,7 +6374,7 @@ detect_coding (coding) break; } } - else if (! c) + else if (! c && !inhibit_null_byte_detection) { null_byte_found = 1; if (eight_bit_found) @@ -5947,6 +6496,7 @@ detect_coding (coding) setup_coding_system (XCDR (coding_systems), coding); } } + coding->mode = saved_mode; } @@ -5958,7 +6508,7 @@ decode_eol (coding) unsigned char *p, *pbeg, *pend; eol_type = CODING_ID_EOL_TYPE (coding->id); - if (EQ (eol_type, Qunix)) + if (EQ (eol_type, Qunix) || inhibit_eol_conversion) return; if (NILP (coding->dst_object)) @@ -5986,7 +6536,12 @@ decode_eol (coding) eol_seen |= EOL_SEEN_CR; } } - if (eol_seen != EOL_SEEN_NONE + /* Handle DOS-style EOLs in a file with stray ^M characters. */ + if ((eol_seen & EOL_SEEN_CRLF) != 0 + && (eol_seen & EOL_SEEN_CR) != 0 + && (eol_seen & EOL_SEEN_LF) == 0) + eol_seen = EOL_SEEN_CRLF; + else if (eol_seen != EOL_SEEN_NONE && eol_seen != EOL_SEEN_LF && eol_seen != EOL_SEEN_CRLF && eol_seen != EOL_SEEN_CR) @@ -6139,51 +6694,39 @@ get_translation_table (attrs, encodep, max_lookup) } while (0) +/* Return a translation of character(s) at BUF according to TRANS. + TRANS is TO-CHAR or ((FROM . TO) ...) where + FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...]. + The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a + translation is found, and Qnil if not found.. + If BUF is too short to lookup characters in FROM, return Qt. */ + static Lisp_Object -get_translation (val, buf, buf_end, last_block, from_nchars, to_nchars) - Lisp_Object val; +get_translation (trans, buf, buf_end) + Lisp_Object trans; int *buf, *buf_end; - int last_block; - int *from_nchars, *to_nchars; { - /* VAL is TO or (([FROM-CHAR ...] . TO) ...) where TO is TO-CHAR or - [TO-CHAR ...]. */ - if (CONSP (val)) + + if (INTEGERP (trans)) + return trans; + for (; CONSP (trans); trans = XCDR (trans)) { - Lisp_Object from, tail; - int i, len; + Lisp_Object val = XCAR (trans); + Lisp_Object from = XCAR (val); + int len = ASIZE (from); + int i; - for (tail = val; CONSP (tail); tail = XCDR (tail)) + for (i = 0; i < len; i++) { - val = XCAR (tail); - from = XCAR (val); - len = ASIZE (from); - for (i = 0; i < len; i++) - { - if (buf + i == buf_end) - { - if (! last_block) - return Qt; - break; - } - if (XINT (AREF (from, i)) != buf[i]) - break; - } - if (i == len) - { - val = XCDR (val); - *from_nchars = len; - break; - } + if (buf + i == buf_end) + return Qt; + if (XINT (AREF (from, i)) != buf[i]) + break; } - if (! CONSP (tail)) - return Qnil; + if (i == len) + return val; } - if (VECTORP (val)) - *buf = XINT (AREF (val, 0)), *to_nchars = ASIZE (val); - else - *buf = XINT (val); - return val; + return Qnil; } @@ -6223,11 +6766,23 @@ produce_chars (coding, translation_table, last_block) LOOKUP_TRANSLATION_TABLE (translation_table, c, trans); if (! NILP (trans)) { - trans = get_translation (trans, buf, buf_end, last_block, - &from_nchars, &to_nchars); - if (EQ (trans, Qt)) + trans = get_translation (trans, buf, buf_end); + if (INTEGERP (trans)) + c = XINT (trans); + else if (CONSP (trans)) + { + from_nchars = ASIZE (XCAR (trans)); + trans = XCDR (trans); + if (INTEGERP (trans)) + c = XINT (trans); + else + { + to_nchars = ASIZE (trans); + c = XINT (AREF (trans, 0)); + } + } + else if (EQ (trans, Qt) && ! last_block) break; - c = *buf; } if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end) @@ -6239,7 +6794,8 @@ produce_chars (coding, translation_table, last_block) if (EQ (coding->src_object, coding->dst_object)) { coding_set_source (coding); - dst_end = ((unsigned char *) coding->source) + coding->consumed; + dst_end = (((unsigned char *) coding->source) + + coding->consumed); } else dst_end = coding->destination + coding->dst_bytes; @@ -6256,9 +6812,7 @@ produce_chars (coding, translation_table, last_block) *dst++ = CHAR_TO_BYTE8 (c); } produced_chars += to_nchars; - *buf++ = to_nchars; - while (--from_nchars > 0) - *buf++ = 0; + buf += from_nchars; } else /* This is an annotation datum. (-C) is the length. */ @@ -6279,7 +6833,7 @@ produce_chars (coding, translation_table, last_block) if (coding->src_multibyte) { int multibytep = 1; - EMACS_INT consumed_chars; + EMACS_INT consumed_chars = 0; while (1) { @@ -6374,7 +6928,7 @@ produce_chars (coding, translation_table, last_block) /* Compose text in CODING->object according to the annotation data at CHARBUF. CHARBUF is an array: - [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ] + [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ] */ static INLINE void @@ -6388,33 +6942,33 @@ produce_composition (coding, charbuf, pos) enum composition_method method; Lisp_Object components; - len = -charbuf[0]; + len = -charbuf[0] - MAX_ANNOTATION_LENGTH; to = pos + charbuf[2]; - if (to <= pos) - return; - method = (enum composition_method) (charbuf[3]); + method = (enum composition_method) (charbuf[4]); if (method == COMPOSITION_RELATIVE) components = Qnil; - else if (method >= COMPOSITION_WITH_RULE - && method <= COMPOSITION_WITH_RULE_ALTCHARS) + else { Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1]; - int i; + int i, j; - len -= 4; - charbuf += 4; - for (i = 0; i < len; i++) + if (method == COMPOSITION_WITH_RULE) + len = charbuf[2] * 3 - 2; + charbuf += MAX_ANNOTATION_LENGTH; + /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */ + for (i = j = 0; i < len && charbuf[i] != -1; i++, j++) { - args[i] = make_number (charbuf[i]); - if (charbuf[i] < 0) - return; + if (charbuf[i] >= 0) + args[j] = make_number (charbuf[i]); + else + { + i++; + args[j] = make_number (charbuf[i] % 0x100); + } } - components = (method == COMPOSITION_WITH_ALTCHARS - ? Fstring (len, args) : Fvector (len, args)); + components = (i == j ? Fstring (j, args) : Fvector (j, args)); } - else - return; compose_text (pos, to, components, Qnil, coding->dst_object); } @@ -6443,7 +6997,7 @@ produce_charset (coding, charbuf, pos) #define ALLOC_CONVERSION_WORK_AREA(coding) \ do { \ - int size = CHARBUF_SIZE;; \ + int size = CHARBUF_SIZE; \ \ coding->charbuf = NULL; \ while (size > 1024) \ @@ -6476,21 +7030,21 @@ produce_annotation (coding, pos) while (charbuf < charbuf_end) { if (*charbuf >= 0) - pos += *charbuf++; + pos++, charbuf++; else { int len = -*charbuf; - switch (charbuf[1]) - { - case CODING_ANNOTATE_COMPOSITION_MASK: - produce_composition (coding, charbuf, pos); - break; - case CODING_ANNOTATE_CHARSET_MASK: - produce_charset (coding, charbuf, pos); - break; - default: - abort (); - } + + if (len > 2) + switch (charbuf[1]) + { + case CODING_ANNOTATE_COMPOSITION_MASK: + produce_composition (coding, charbuf, pos); + break; + case CODING_ANNOTATE_CHARSET_MASK: + produce_charset (coding, charbuf, pos); + break; + } charbuf += len; } } @@ -6601,6 +7155,8 @@ decode_coding (coding) that the number of data is less than the size of coding->charbuf. */ coding->charbuf_used = 0; + coding->chars_at_source = 0; + while (nbytes-- > 0) { int c = *src++; @@ -6618,6 +7174,8 @@ decode_coding (coding) coding->carryover. */ unsigned char *p = coding->carryover; + if (nbytes > sizeof coding->carryover) + nbytes = sizeof coding->carryover; coding->carryover_bytes = nbytes; while (nbytes-- > 0) *p++ = *src++; @@ -6625,7 +7183,8 @@ decode_coding (coding) coding->consumed = coding->src_bytes; } - if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)) + if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix) + && !inhibit_eol_conversion) decode_eol (coding); if (BUFFERP (coding->dst_object)) { @@ -6671,7 +7230,7 @@ handle_composition_annotation (pos, limit, coding, buf, stop) enum composition_method method = COMPOSITION_METHOD (prop); int nchars = COMPOSITION_LENGTH (prop); - ADD_COMPOSITION_DATA (buf, nchars, method); + ADD_COMPOSITION_DATA (buf, nchars, 0, method); if (method != COMPOSITION_RELATIVE) { Lisp_Object components; @@ -6776,7 +7335,7 @@ consume_chars (coding, translation_table, max_lookup) if (! NILP (translation_table)) lookup_buf = alloca (sizeof (int) * max_lookup); - eol_type = CODING_ID_EOL_TYPE (coding->id); + eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id); if (VECTORP (eol_type)) eol_type = Qunix; @@ -6858,12 +7417,26 @@ consume_chars (coding, translation_table, max_lookup) for (i = 1; i < max_lookup && p < src_end; i++) lookup_buf[i] = STRING_CHAR_ADVANCE (p); lookup_buf_end = lookup_buf + i; - trans = get_translation (trans, lookup_buf, lookup_buf_end, 1, - &from_nchars, &to_nchars); - if (EQ (trans, Qt) - || buf + to_nchars > buf_end) + trans = get_translation (trans, lookup_buf, lookup_buf_end); + if (INTEGERP (trans)) + c = XINT (trans); + else if (CONSP (trans)) + { + from_nchars = ASIZE (XCAR (trans)); + trans = XCDR (trans); + if (INTEGERP (trans)) + c = XINT (trans); + else + { + to_nchars = ASIZE (trans); + if (buf + to_nchars > buf_end) + break; + c = XINT (AREF (trans, 0)); + } + } + else break; - *buf++ = *lookup_buf; + *buf++ = c; for (i = 1; i < to_nchars; i++) *buf++ = XINT (AREF (trans, i)); for (i = 1; i < from_nchars; i++, pos++) @@ -6972,13 +7545,17 @@ make_conversion_work_buffer (multibyte) } else { - name = Vcode_conversion_workbuf_name; - workbuf = Fget_buffer_create (name); - if (NILP (Vcode_conversion_reused_workbuf)) - Vcode_conversion_reused_workbuf = workbuf; + if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf))) + Vcode_conversion_reused_workbuf + = Fget_buffer_create (Vcode_conversion_workbuf_name); + workbuf = Vcode_conversion_reused_workbuf; } current = current_buffer; set_buffer_internal (XBUFFER (workbuf)); + /* We can't allow modification hooks to run in the work buffer. For + instance, directory_files_internal assumes that file decoding + doesn't compile new regexps. */ + Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt); Ferase_buffer (); current_buffer->undo_list = Qt; current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil; @@ -7639,7 +8216,7 @@ detect_coding_system (src, src_chars, src_bytes, highest, multibytep, { const unsigned char *src_end = src + src_bytes; Lisp_Object attrs, eol_type; - Lisp_Object val; + Lisp_Object val = Qnil; struct coding_system coding; int id; struct coding_detection_info detect_info; @@ -7703,7 +8280,7 @@ detect_coding_system (src, src_chars, src_bytes, highest, multibytep, break; } } - else if (! c) + else if (! c && !inhibit_null_byte_detection) { null_byte_found = 1; if (eight_bit_found) @@ -7771,10 +8348,11 @@ detect_coding_system (src, src_chars, src_bytes, highest, multibytep, } } - if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY) + if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY + || null_byte_found) { detect_info.found = CATEGORY_MASK_RAW_TEXT; - id = coding_categories[coding_category_raw_text].id; + id = CODING_SYSTEM_ID (Qno_conversion); val = Fcons (make_number (id), Qnil); } else if (! detect_info.rejected && ! detect_info.found) @@ -7804,7 +8382,6 @@ detect_coding_system (src, src_chars, src_bytes, highest, multibytep, { int mask = detect_info.rejected | detect_info.found; int found = 0; - val = Qnil; for (i = coding_category_raw_text - 1; i >= 0; i--) { @@ -7867,7 +8444,7 @@ detect_coding_system (src, src_chars, src_bytes, highest, multibytep, /* Then, detect eol-format if necessary. */ { - int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol; + int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1; Lisp_Object tail; if (VECTORP (eol_type)) @@ -7933,7 +8510,7 @@ detect_coding_system (src, src_chars, src_bytes, highest, multibytep, } } - return (highest ? XCAR (val) : val); + return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val); } @@ -8272,7 +8849,10 @@ value is nil. START may be a string. In that case, check if the string is encodable, and the value contains indices to the string instead of -buffer positions. END is ignored. */) +buffer positions. END is ignored. + +If the current buffer (or START if it is a string) is unibyte, the value +is nil. */) (start, end, coding_system_list) Lisp_Object start, end, coding_system_list; { @@ -8286,7 +8866,7 @@ buffer positions. END is ignored. */) if (STRINGP (start)) { if (!STRING_MULTIBYTE (start) - && SCHARS (start) != SBYTES (start)) + || SCHARS (start) == SBYTES (start)) return Qnil; start_byte = 0; end_byte = SBYTES (start); @@ -8303,7 +8883,7 @@ buffer positions. END is ignored. */) start_byte = CHAR_TO_BYTE (XINT (start)); end_byte = CHAR_TO_BYTE (XINT (end)); if (XINT (end) - XINT (start) == end_byte - start_byte) - return Qt; + return Qnil; if (XINT (start) < GPT && XINT (end) > GPT) { @@ -8432,7 +9012,8 @@ START and END are buffer positions. Optional 4th arguments DESTINATION specifies where the decoded text goes. If nil, the region between START and END is replaced by the decoded text. -If buffer, the decoded text is inserted in the buffer. +If buffer, the decoded text is inserted in that buffer after point (point +does not move). In those cases, the length of the decoded text is returned. If DESTINATION is t, the decoded text is returned. @@ -8454,7 +9035,8 @@ START and END are buffer positions. Optional 4th arguments DESTINATION specifies where the encoded text goes. If nil, the region between START and END is replace by the encoded text. -If buffer, the encoded text is inserted in the buffer. +If buffer, the encoded text is inserted in that buffer after point (point +does not move). In those cases, the length of the encoded text is returned. If DESTINATION is t, the encoded text is returned. @@ -8534,8 +9116,8 @@ Optional third arg NOCOPY non-nil means it is OK to return STRING itself if the decoding operation is trivial. Optional fourth arg BUFFER non-nil means that the decoded text is -inserted in BUFFER instead of returned as a string. In this case, -the return value is the length of the decoded text. +inserted in that buffer after point (point does not move). In this +case, the return value is the length of the decoded text. This function sets `last-coding-system-used' to the precise coding system used (which may be different from CODING-SYSTEM if CODING-SYSTEM is @@ -8555,8 +9137,8 @@ Optional third arg NOCOPY non-nil means it is OK to return STRING itself if the encoding operation is trivial. Optional fourth arg BUFFER non-nil means that the encoded text is -inserted in BUFFER instead of returned as a string. In this case, -the return value is the length of the encoded text. +inserted in that buffer after point (point does not move). In this +case, the return value is the length of the encoded text. This function sets `last-coding-system-used' to the precise coding system used (which may be different from CODING-SYSTEM if CODING-SYSTEM is @@ -9095,8 +9677,8 @@ usage: (define-coding-system-internal ...) */) } CODING_ATTR_CHARSET_LIST (attrs) = charset_list; - safe_charsets = Fmake_string (make_number (max_charset_id + 1), - make_number (255)); + safe_charsets = make_uninit_string (max_charset_id + 1); + memset (SDATA (safe_charsets), 255, max_charset_id + 1); for (tail = charset_list; CONSP (tail); tail = XCDR (tail)) SSET (safe_charsets, XFASTINT (XCAR (tail)), 0); CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets; @@ -9563,7 +10145,7 @@ DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put, CHECK_CHARACTER (val); CODING_ATTR_MNEMONIC (attrs) = val; } - else if (EQ (prop, QCdefalut_char)) + else if (EQ (prop, QCdefault_char)) { if (NILP (val)) val = make_number (' '); @@ -9869,7 +10451,7 @@ syms_of_coding () DEFSYM (QCcategory, ":category"); DEFSYM (QCmnemonic, ":mnemonic"); - DEFSYM (QCdefalut_char, ":default-char"); + DEFSYM (QCdefault_char, ":default-char"); DEFSYM (QCdecode_translation_table, ":decode-translation-table"); DEFSYM (QCencode_translation_table, ":encode-translation-table"); DEFSYM (QCpost_read_conversion, ":post-read-conversion"); @@ -10207,18 +10789,18 @@ called even if `coding-system-for-write' is non-nil. The command DEFVAR_BOOL ("inhibit-iso-escape-detection", &inhibit_iso_escape_detection, doc: /* -If non-nil, Emacs ignores ISO2022's escape sequence on code detection. +If non-nil, Emacs ignores ISO-2022 escape sequences during code detection. -By default, on reading a file, Emacs tries to detect how the text is -encoded. This code detection is sensitive to escape sequences. If -the sequence is valid as ISO2022, the code is determined as one of -the ISO2022 encodings, and the file is decoded by the corresponding -coding system (e.g. `iso-2022-7bit'). +When Emacs reads text, it tries to detect how the text is encoded. +This code detection is sensitive to escape sequences. If Emacs sees +a valid ISO-2022 escape sequence, it assumes the text is encoded in one +of the ISO2022 encodings, and decodes text by the corresponding coding +system (e.g. `iso-2022-7bit'). However, there may be a case that you want to read escape sequences in a file as is. In such a case, you can set this variable to non-nil. -Then, as the code detection ignores any escape sequences, no file is -detected as encoded in some ISO2022 encoding. The result is that all +Then the code detection will ignore any escape sequences, and no text is +detected as encoded in some ISO-2022 encoding. The result is that all escape sequences become visible in a buffer. The default value is nil, and it is strongly recommended not to change @@ -10228,14 +10810,31 @@ in Emacs's distribution, and they won't be decoded correctly on reading if you suppress escape sequence detection. The other way to read escape sequences in a file without decoding is -to explicitly specify some coding system that doesn't use ISO2022's +to explicitly specify some coding system that doesn't use ISO-2022 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */); inhibit_iso_escape_detection = 0; + DEFVAR_BOOL ("inhibit-null-byte-detection", + &inhibit_null_byte_detection, + doc: /* If non-nil, Emacs ignores null bytes on code detection. +By default, Emacs treats it as binary data, and does not attempt to +decode it. The effect is as if you specified `no-conversion' for +reading that text. + +Set this to non-nil when a regular text happens to include null bytes. +Examples are Index nodes of Info files and null-byte delimited output +from GNU Find and GNU Grep. Emacs will then ignore the null bytes and +decode text as usual. */); + inhibit_null_byte_detection = 0; + DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input, doc: /* Char table for translating self-inserting characters. This is applied to the result of input methods, not their input. -See also `keyboard-translate-table'. */); +See also `keyboard-translate-table'. + +Use of this variable for character code unification was rendered +obsolete in Emacs 23.1 and later, since Unicode is now the basis of +internal character representation. */); Vtranslation_table_for_input = Qnil; {