X-Git-Url: http://git.hcoop.net/bpt/emacs.git/blobdiff_plain/1560f91a925c564191cecb2ac3c95f5025c415cf..964b0e76b0c609ddd0dd71b7ab7c7c44627ec044:/src/coding.c diff --git a/src/coding.c b/src/coding.c index ea0a066eb7..555e662338 100644 --- a/src/coding.c +++ b/src/coding.c @@ -1,8 +1,8 @@ /* Coding system handler (conversion, detection, etc). Copyright (C) 2001, 2002, 2003, 2004, 2005, - 2006, 2007, 2008 Free Software Foundation, Inc. + 2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc. Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, - 2005, 2006, 2007, 2008 + 2005, 2006, 2007, 2008, 2009, 2010, 2011 National Institute of Advanced Industrial Science and Technology (AIST) Registration Number H14PRO021 Copyright (C) 2003 @@ -167,7 +167,7 @@ detect_coding_XXX (coding, detect_info) while (1) { - /* Get one byte from the source. If the souce is exausted, jump + /* Get one byte from the source. If the source is exhausted, jump to no_more_source:. */ ONE_MORE_BYTE (c); @@ -181,7 +181,7 @@ detect_coding_XXX (coding, detect_info) return 0; no_more_source: - /* The source exausted successfully. */ + /* The source exhausted successfully. */ detect_info->found |= found; return 1; } @@ -289,6 +289,7 @@ encode_coding_XXX (coding) #include #include +#include #include "lisp.h" #include "buffer.h" @@ -314,7 +315,7 @@ Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5; Lisp_Object Qbig, Qlittle; Lisp_Object Qcoding_system_history; Lisp_Object Qvalid_codes; -Lisp_Object QCcategory, QCmnemonic, QCdefalut_char; +Lisp_Object QCcategory, QCmnemonic, QCdefault_char; Lisp_Object QCdecode_translation_table, QCencode_translation_table; Lisp_Object QCpost_read_conversion, QCpre_write_conversion; Lisp_Object QCascii_compatible_p; @@ -380,6 +381,9 @@ int inhibit_eol_conversion; /* Flag to inhibit ISO2022 escape sequence detection. */ int inhibit_iso_escape_detection; +/* Flag to inhibit detection of binary files through null bytes. */ +int inhibit_null_byte_detection; + /* Flag to make buffer-file-coding-system inherit from process-coding. */ int inherit_process_coding_system; @@ -429,9 +433,11 @@ Lisp_Object Vbig5_coding_system; reg))) -#define CODING_ISO_REQUEST(coding, charset_id) \ - ((charset_id <= (coding)->max_charset_id \ - ? (coding)->safe_charsets[charset_id] \ +#define CODING_ISO_REQUEST(coding, charset_id) \ + (((charset_id) <= (coding)->max_charset_id \ + ? ((coding)->safe_charsets[charset_id] != 255 \ + ? (coding)->safe_charsets[charset_id] \ + : -1) \ : -1)) @@ -447,6 +453,12 @@ Lisp_Object Vbig5_coding_system; ((coding)->spec.iso_2022.bol) #define CODING_ISO_INVOKED_CHARSET(coding, plane) \ CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane))) +#define CODING_ISO_CMP_STATUS(coding) \ + (&(coding)->spec.iso_2022.cmp_status) +#define CODING_ISO_EXTSEGMENT_LEN(coding) \ + ((coding)->spec.iso_2022.ctext_extended_segment_len) +#define CODING_ISO_EMBEDDED_UTF_8(coding) \ + ((coding)->spec.iso_2022.embedded_utf_8) /* Control characters of ISO2022. */ /* code */ /* function */ @@ -525,7 +537,7 @@ enum iso_code_class_type on output. */ #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400 -/* If set, do not encode unsafe charactes on output. */ +/* If set, do not encode unsafe characters on output. */ #define CODING_ISO_FLAG_SAFE 0x0800 /* If set, extra latin codes (128..159) are accepted as a valid code @@ -681,7 +693,7 @@ enum coding_category static Lisp_Object Vcoding_category_list; /* Table of coding categories (Lisp symbols). This variable is for - internal use oly. */ + internal use only. */ static Lisp_Object Vcoding_category_table; /* Table of coding-categories ordered by priority. */ @@ -740,6 +752,45 @@ static struct coding_system coding_categories[coding_category_max]; consumed_chars++; \ } while (0) +/* Safely get two bytes from the source text pointed by SRC which ends + at SRC_END, and set C1 and C2 to those bytes while skipping the + heading multibyte characters. If there are not enough bytes in the + source, it jumps to `no_more_source'. If multibytep is nonzero and + a multibyte character is found for C2, set C2 to the negative value + of the character code. The caller should declare and set these + variables appropriately in advance: + src, src_end, multibytep + It is intended that this macro is used in detect_coding_utf_16. */ + +#define TWO_MORE_BYTES(c1, c2) \ + do { \ + do { \ + if (src == src_end) \ + goto no_more_source; \ + c1 = *src++; \ + if (multibytep && (c1 & 0x80)) \ + { \ + if ((c1 & 0xFE) == 0xC0) \ + c1 = ((c1 & 1) << 6) | *src++; \ + else \ + { \ + src += BYTES_BY_CHAR_HEAD (c1) - 1; \ + c1 = -1; \ + } \ + } \ + } while (c1 < 0); \ + if (src == src_end) \ + goto no_more_source; \ + c2 = *src++; \ + if (multibytep && (c2 & 0x80)) \ + { \ + if ((c2 & 0xFE) == 0xC0) \ + c2 = ((c2 & 1) << 6) | *src++; \ + else \ + c2 = -1; \ + } \ + } while (0) + #define ONE_MORE_BYTE_NO_CHECK(c) \ do { \ @@ -774,7 +825,7 @@ static struct coding_system coding_categories[coding_category_max]; } while (0) -/* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2. */ +/* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2. */ #define EMIT_TWO_ASCII_BYTES(c1, c2) \ do { \ @@ -901,11 +952,8 @@ static int detect_eol P_ ((const unsigned char *, static Lisp_Object adjust_coding_eol_type P_ ((struct coding_system *, int)); static void decode_eol P_ ((struct coding_system *)); static Lisp_Object get_translation_table P_ ((Lisp_Object, int, int *)); -static Lisp_Object get_translation P_ ((Lisp_Object, int *, int *, - int, int *, int *)); +static Lisp_Object get_translation P_ ((Lisp_Object, int *, int *)); static int produce_chars P_ ((struct coding_system *, Lisp_Object, int)); -static INLINE void produce_composition P_ ((struct coding_system *, int *, - EMACS_INT)); static INLINE void produce_charset P_ ((struct coding_system *, int *, EMACS_INT)); static void produce_annotation P_ ((struct coding_system *, EMACS_INT)); @@ -945,11 +993,22 @@ record_conversion_result (struct coding_system *coding, case CODING_RESULT_INSUFFICIENT_MEM: Vlast_code_conversion_error = Qinsufficient_memory; break; + case CODING_RESULT_INSUFFICIENT_DST: + /* Don't record this error in Vlast_code_conversion_error + because it happens just temporarily and is resolved when the + whole conversion is finished. */ + break; + case CODING_RESULT_SUCCESS: + break; default: Vlast_code_conversion_error = intern ("Unknown error"); } } +/* This wrapper macro is used to preserve validity of pointers into + buffer text across calls to decode_char, which could cause + relocation of buffers if it loads a charset map, because loading a + charset map allocates large structures. */ #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \ do { \ charset_map_loaded = 0; \ @@ -1156,7 +1215,6 @@ alloc_destination (coding, nbytes, dst) } else coding_alloc_by_realloc (coding, nbytes); - record_conversion_result (coding, CODING_RESULT_SUCCESS); coding_set_destination (coding); dst = coding->destination + offset; return dst; @@ -1164,10 +1222,6 @@ alloc_destination (coding, nbytes, dst) /** Macros for annotations. */ -/* Maximum length of annotation data (sum of annotations for - composition and charset). */ -#define MAX_ANNOTATION_LENGTH (4 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 4) - /* An annotation data is stored in the array coding->charbuf in this format: [ -LENGTH ANNOTATION_MASK NCHARS ... ] @@ -1179,13 +1233,26 @@ alloc_destination (coding, nbytes, dst) In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements follows: - ... METHOD [ COMPOSITION-COMPONENTS ... ] + ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ] + + NBYTES is the number of bytes specified in the header part of + old-style emacs-mule encoding, or 0 for the other kind of + composition. + METHOD is one of enum composition_method. - Optionnal COMPOSITION-COMPONENTS are characters and composition + + Optional COMPOSITION-COMPONENTS are characters and composition rules. In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID - follows. */ + follows. + + If ANNOTATION_MASK is 0, this annotation is just a space holder to + recover from an invalid annotation, and should be skipped by + produce_annotation. */ + +/* Maximum length of the header of annotation data. */ +#define MAX_ANNOTATION_LENGTH 5 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars) \ do { \ @@ -1195,9 +1262,10 @@ alloc_destination (coding, nbytes, dst) coding->annotated = 1; \ } while (0); -#define ADD_COMPOSITION_DATA(buf, nchars, method) \ +#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method) \ do { \ - ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \ + ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \ + *buf++ = nbytes; \ *buf++ = method; \ } while (0) @@ -1326,11 +1394,12 @@ decode_coding_utf_8 (coding) const unsigned char *src_base; int *charbuf = coding->charbuf + coding->charbuf_used; int *charbuf_end = coding->charbuf + coding->charbuf_size; - int consumed_chars = 0, consumed_chars_base; + int consumed_chars = 0, consumed_chars_base = 0; int multibytep = coding->src_multibyte; enum utf_bom_type bom = CODING_UTF_8_BOM (coding); Lisp_Object attr, charset_list; - int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos); + int eol_crlf = + !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos); int byte_after_cr = -1; CODING_GET_INFO (coding, attr, charset_list); @@ -1345,12 +1414,12 @@ decode_coding_utf_8 (coding) src = src_base; else { - ONE_MORE_BYTE (c2); + ONE_MORE_BYTE (c2); if (! UTF_8_EXTRA_OCTET_P (c2)) src = src_base; else { - ONE_MORE_BYTE (c3); + ONE_MORE_BYTE (c3); if (! UTF_8_EXTRA_OCTET_P (c3)) src = src_base; else @@ -1376,7 +1445,11 @@ decode_coding_utf_8 (coding) consumed_chars_base = consumed_chars; if (charbuf >= charbuf_end) - break; + { + if (byte_after_cr >= 0) + src_base--; + break; + } if (byte_after_cr >= 0) c1 = byte_after_cr, byte_after_cr = -1; @@ -1568,8 +1641,7 @@ detect_coding_utf_16 (coding, detect_info) return 0; } - ONE_MORE_BYTE (c1); - ONE_MORE_BYTE (c2); + TWO_MORE_BYTES (c1, c2); if ((c1 == 0xFF) && (c2 == 0xFE)) { detect_info->found |= (CATEGORY_MASK_UTF_16_LE @@ -1586,6 +1658,11 @@ detect_coding_utf_16 (coding, detect_info) | CATEGORY_MASK_UTF_16_BE_NOSIG | CATEGORY_MASK_UTF_16_LE_NOSIG); } + else if (c2 < 0) + { + detect_info->rejected |= CATEGORY_MASK_UTF_16; + return 0; + } else { /* We check the dispersion of Eth and Oth bytes where E is even and @@ -1598,29 +1675,31 @@ detect_coding_utf_16 (coding, detect_info) e[c1] = 1; o[c2] = 1; - detect_info->rejected - |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE); + detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO + |CATEGORY_MASK_UTF_16_BE + | CATEGORY_MASK_UTF_16_LE); - while (1) + while ((detect_info->rejected & CATEGORY_MASK_UTF_16) + != CATEGORY_MASK_UTF_16) { - ONE_MORE_BYTE (c1); - ONE_MORE_BYTE (c2); + TWO_MORE_BYTES (c1, c2); + if (c2 < 0) + break; if (! e[c1]) { e[c1] = 1; e_num++; if (e_num >= 128) - break; + detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG; } if (! o[c2]) { - o[c1] = 1; + o[c2] = 1; o_num++; if (o_num >= 128) - break; + detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG; } } - detect_info->rejected |= CATEGORY_MASK_UTF_16; return 0; } @@ -1636,14 +1715,16 @@ decode_coding_utf_16 (coding) const unsigned char *src_end = coding->source + coding->src_bytes; const unsigned char *src_base; int *charbuf = coding->charbuf + coding->charbuf_used; - int *charbuf_end = coding->charbuf + coding->charbuf_size; - int consumed_chars = 0, consumed_chars_base; + /* We may produces at most 3 chars in one loop. */ + int *charbuf_end = coding->charbuf + coding->charbuf_size - 2; + int consumed_chars = 0, consumed_chars_base = 0; int multibytep = coding->src_multibyte; enum utf_bom_type bom = CODING_UTF_16_BOM (coding); enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding); int surrogate = CODING_UTF_16_SURROGATE (coding); Lisp_Object attr, charset_list; - int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos); + int eol_crlf = + !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos); int byte_after_cr1 = -1, byte_after_cr2 = -1; CODING_GET_INFO (coding, attr, charset_list); @@ -1681,8 +1762,12 @@ decode_coding_utf_16 (coding) src_base = src; consumed_chars_base = consumed_chars; - if (charbuf + 2 >= charbuf_end) - break; + if (charbuf >= charbuf_end) + { + if (byte_after_cr1 >= 0) + src_base -= 2; + break; + } if (byte_after_cr1 >= 0) c1 = byte_after_cr1, byte_after_cr1 = -1; @@ -1783,7 +1868,7 @@ encode_coding_utf_16 (coding) { ASSURE_DESTINATION (safe_room); c = *charbuf++; - if (c >= MAX_UNICODE_CHAR) + if (c > MAX_UNICODE_CHAR) c = coding->default_char; if (c < 0x10000) @@ -1861,15 +1946,15 @@ encode_coding_utf_16 (coding) Next, character composition data are represented by the byte sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ..., where, - METHOD is 0xF0 plus one of composition method (enum + METHOD is 0xF2 plus one of composition method (enum composition_method), BYTES is 0xA0 plus a byte length of this composition data, - CHARS is 0x20 plus a number of characters composed by this + CHARS is 0xA0 plus a number of characters composed by this data, - COMPONENTs are characters of multibye form or composition + COMPONENTs are characters of multibyte form or composition rules encoded by two-byte of ASCII codes. In addition, for backward compatibility, the following formats are @@ -1888,44 +1973,158 @@ encode_coding_utf_16 (coding) char emacs_mule_bytes[256]; + +/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". + Check if a text is encoded in `emacs-mule'. If it is, return 1, + else return 0. */ + +static int +detect_coding_emacs_mule (coding, detect_info) + struct coding_system *coding; + struct coding_detection_info *detect_info; +{ + const unsigned char *src = coding->source, *src_base; + const unsigned char *src_end = coding->source + coding->src_bytes; + int multibytep = coding->src_multibyte; + int consumed_chars = 0; + int c; + int found = 0; + + detect_info->checked |= CATEGORY_MASK_EMACS_MULE; + /* A coding system of this category is always ASCII compatible. */ + src += coding->head_ascii; + + while (1) + { + src_base = src; + ONE_MORE_BYTE (c); + if (c < 0) + continue; + if (c == 0x80) + { + /* Perhaps the start of composite character. We simply skip + it because analyzing it is too heavy for detecting. But, + at least, we check that the composite character + constitutes of more than 4 bytes. */ + const unsigned char *src_base; + + repeat: + src_base = src; + do + { + ONE_MORE_BYTE (c); + } + while (c >= 0xA0); + + if (src - src_base <= 4) + break; + found = CATEGORY_MASK_EMACS_MULE; + if (c == 0x80) + goto repeat; + } + + if (c < 0x80) + { + if (c < 0x20 + && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)) + break; + } + else + { + int more_bytes = emacs_mule_bytes[c] - 1; + + while (more_bytes > 0) + { + ONE_MORE_BYTE (c); + if (c < 0xA0) + { + src--; /* Unread the last byte. */ + break; + } + more_bytes--; + } + if (more_bytes != 0) + break; + found = CATEGORY_MASK_EMACS_MULE; + } + } + detect_info->rejected |= CATEGORY_MASK_EMACS_MULE; + return 0; + + no_more_source: + if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK) + { + detect_info->rejected |= CATEGORY_MASK_EMACS_MULE; + return 0; + } + detect_info->found |= found; + return 1; +} + + +/* Parse emacs-mule multibyte sequence at SRC and return the decoded + character. If CMP_STATUS indicates that we must expect MSEQ or + RULE described above, decode it and return the negative value of + the decoded character or rule. If an invalid byte is found, return + -1. If SRC is too short, return -2. */ + int -emacs_mule_char (coding, src, nbytes, nchars, id) +emacs_mule_char (coding, src, nbytes, nchars, id, cmp_status) struct coding_system *coding; const unsigned char *src; int *nbytes, *nchars, *id; + struct composition_status *cmp_status; { const unsigned char *src_end = coding->source + coding->src_bytes; const unsigned char *src_base = src; int multibytep = coding->src_multibyte; - struct charset *charset; + int charset_id; unsigned code; int c; int consumed_chars = 0; + int mseq_found = 0; ONE_MORE_BYTE (c); if (c < 0) { c = -c; - charset = emacs_mule_charset[0]; + charset_id = emacs_mule_charset[0]; } else { if (c >= 0xA0) { - /* Old style component character of a composition. */ - if (c == 0xA0) + if (cmp_status->state != COMPOSING_NO + && cmp_status->old_form) { - ONE_MORE_BYTE (c); - c -= 0x80; + if (cmp_status->state == COMPOSING_CHAR) + { + if (c == 0xA0) + { + ONE_MORE_BYTE (c); + c -= 0x80; + if (c < 0) + goto invalid_code; + } + else + c -= 0x20; + mseq_found = 1; + } + else + { + *nbytes = src - src_base; + *nchars = consumed_chars; + return -c; + } } else - c -= 0x20; + goto invalid_code; } switch (emacs_mule_bytes[c]) { case 2: - if (! (charset = emacs_mule_charset[c])) + if ((charset_id = emacs_mule_charset[c]) < 0) goto invalid_code; ONE_MORE_BYTE (c); if (c < 0xA0) @@ -1938,7 +2137,7 @@ emacs_mule_char (coding, src, nbytes, nchars, id) || c == EMACS_MULE_LEADING_CODE_PRIVATE_12) { ONE_MORE_BYTE (c); - if (c < 0xA0 || ! (charset = emacs_mule_charset[c])) + if (c < 0xA0 || (charset_id = emacs_mule_charset[c]) < 0) goto invalid_code; ONE_MORE_BYTE (c); if (c < 0xA0) @@ -1947,7 +2146,7 @@ emacs_mule_char (coding, src, nbytes, nchars, id) } else { - if (! (charset = emacs_mule_charset[c])) + if ((charset_id = emacs_mule_charset[c]) < 0) goto invalid_code; ONE_MORE_BYTE (c); if (c < 0xA0) @@ -1962,7 +2161,7 @@ emacs_mule_char (coding, src, nbytes, nchars, id) case 4: ONE_MORE_BYTE (c); - if (c < 0 || ! (charset = emacs_mule_charset[c])) + if (c < 0 || (charset_id = emacs_mule_charset[c]) < 0) goto invalid_code; ONE_MORE_BYTE (c); if (c < 0xA0) @@ -1976,22 +2175,22 @@ emacs_mule_char (coding, src, nbytes, nchars, id) case 1: code = c; - charset = CHARSET_FROM_ID (ASCII_BYTE_P (code) - ? charset_ascii : charset_eight_bit); + charset_id = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit; break; default: abort (); } - c = DECODE_CHAR (charset, code); + CODING_DECODE_CHAR (coding, src, src_base, src_end, + CHARSET_FROM_ID (charset_id), code, c); if (c < 0) goto invalid_code; } *nbytes = src - src_base; *nchars = consumed_chars; if (id) - *id = charset->id; - return c; + *id = charset_id; + return (mseq_found ? -c : c); no_more_source: return -2; @@ -2001,259 +2200,250 @@ emacs_mule_char (coding, src, nbytes, nchars, id) } -/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". - Check if a text is encoded in `emacs-mule'. If it is, return 1, - else return 0. */ +/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */ -static int -detect_coding_emacs_mule (coding, detect_info) - struct coding_system *coding; - struct coding_detection_info *detect_info; -{ - const unsigned char *src = coding->source, *src_base; - const unsigned char *src_end = coding->source + coding->src_bytes; - int multibytep = coding->src_multibyte; - int consumed_chars = 0; - int c; - int found = 0; +/* Handle these composition sequence ('|': the end of header elements, + BYTES and CHARS >= 0xA0): - detect_info->checked |= CATEGORY_MASK_EMACS_MULE; - /* A coding system of this category is always ASCII compatible. */ - src += coding->head_ascii; + (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ... + (2) altchar composition: 0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ... + (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ... - while (1) - { - src_base = src; - ONE_MORE_BYTE (c); - if (c < 0) - continue; - if (c == 0x80) - { - /* Perhaps the start of composite character. We simple skip - it because analyzing it is too heavy for detecting. But, - at least, we check that the composite character - constitutes of more than 4 bytes. */ - const unsigned char *src_base; + and these old form: + + (4) relative composition: 0x80 | MSEQ ... MSEQ + (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ - repeat: - src_base = src; - do - { - ONE_MORE_BYTE (c); - } - while (c >= 0xA0); + When the starter 0x80 and the following header elements are found, + this annotation header is produced. - if (src - src_base <= 4) - break; - found = CATEGORY_MASK_EMACS_MULE; - if (c == 0x80) - goto repeat; - } + [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ] - if (c < 0x80) - { - if (c < 0x20 - && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)) - break; - } - else - { - int more_bytes = emacs_mule_bytes[*src_base] - 1; + NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5). + NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5). - while (more_bytes > 0) - { - ONE_MORE_BYTE (c); - if (c < 0xA0) - { - src--; /* Unread the last byte. */ - break; - } - more_bytes--; - } - if (more_bytes != 0) - break; - found = CATEGORY_MASK_EMACS_MULE; - } - } - detect_info->rejected |= CATEGORY_MASK_EMACS_MULE; - return 0; + Then, upon reading the following elements, these codes are produced + until the composition end is found: - no_more_source: - if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK) - { - detect_info->rejected |= CATEGORY_MASK_EMACS_MULE; - return 0; - } - detect_info->found |= found; - return 1; -} + (1) CHAR ... CHAR + (2) ALT ... ALT CHAR ... CHAR + (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR + (4) CHAR ... CHAR + (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR + When the composition end is found, LENGTH and NCHARS in the + annotation header is updated as below: -/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */ + (1) LENGTH: unchanged, NCHARS: unchanged + (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged + (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged + (4) LENGTH: unchanged, NCHARS: number of CHARs + (5) LENGTH: unchanged, NCHARS: number of CHARs -/* Decode a character represented as a component of composition - sequence of Emacs 20/21 style at SRC. Set C to that character and - update SRC to the head of next character (or an encoded composition - rule). If SRC doesn't points a composition component, set C to -1. - If SRC points an invalid byte sequence, global exit by a return - value 0. */ - -#define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \ - do \ - { \ - int c; \ - int nbytes, nchars; \ - \ - if (src == src_end) \ - break; \ - c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\ - if (c < 0) \ - { \ - if (c == -2) \ - break; \ - goto invalid_code; \ - } \ - *buf++ = c; \ - src += nbytes; \ - consumed_chars += nchars; \ - } \ - while (0) - - -/* Decode a composition rule represented as a component of composition - sequence of Emacs 20 style at SRC. Store the decoded rule in *BUF, - and increment BUF. If SRC points an invalid byte sequence, set C - to -1. */ - -#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf) \ + If an error is found while composing, the annotation header is + changed to the original composition header (plus filler -1s) as + below: + + (1),(2),(3) [ 0x80 0xF2+METHOD BYTES CHARS -1 ] + (5) [ 0x80 0xFF -1 -1- -1 ] + + and the sequence [ -2 DECODED-RULE ] is changed to the original + byte sequence as below: + o the original byte sequence is B: [ B -1 ] + o the original byte sequence is B1 B2: [ B1 B2 ] + + Most of the routines are implemented by macros because many + variables and labels in the caller decode_coding_emacs_mule must be + accessible, and they are usually called just once (thus doesn't + increase the size of compiled object). */ + +/* Decode a composition rule represented by C as a component of + composition sequence of Emacs 20 style. Set RULE to the decoded + rule. */ + +#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule) \ do { \ - int c, gref, nref; \ - \ - if (src >= src_end) \ - goto invalid_code; \ - ONE_MORE_BYTE_NO_CHECK (c); \ + int gref, nref; \ + \ c -= 0xA0; \ if (c < 0 || c >= 81) \ goto invalid_code; \ - \ gref = c / 9, nref = c % 9; \ - *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \ + if (gref == 4) gref = 10; \ + if (nref == 4) nref = 10; \ + rule = COMPOSITION_ENCODE_RULE (gref, nref); \ } while (0) -/* Decode a composition rule represented as a component of composition - sequence of Emacs 21 style at SRC. Store the decoded rule in *BUF, - and increment BUF. If SRC points an invalid byte sequence, set C - to -1. */ +/* Decode a composition rule represented by C and the following byte + at SRC as a component of composition sequence of Emacs 21 style. + Set RULE to the decoded rule. */ -#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf) \ +#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule) \ do { \ int gref, nref; \ - \ - if (src + 1>= src_end) \ + \ + gref = c - 0x20; \ + if (gref < 0 || gref >= 81) \ goto invalid_code; \ - ONE_MORE_BYTE_NO_CHECK (gref); \ - gref -= 0x20; \ - ONE_MORE_BYTE_NO_CHECK (nref); \ - nref -= 0x20; \ - if (gref < 0 || gref >= 81 \ - || nref < 0 || nref >= 81) \ + ONE_MORE_BYTE (c); \ + nref = c - 0x20; \ + if (nref < 0 || nref >= 81) \ goto invalid_code; \ - *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \ + rule = COMPOSITION_ENCODE_RULE (gref, nref); \ } while (0) -#define DECODE_EMACS_MULE_21_COMPOSITION(c) \ +/* Start of Emacs 21 style format. The first three bytes at SRC are + (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is the + byte length of this composition information, CHARS is the number of + characters composed by this composition. */ + +#define DECODE_EMACS_MULE_21_COMPOSITION() \ do { \ - /* Emacs 21 style format. The first three bytes at SRC are \ - (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \ - the byte length of this composition information, CHARS is the \ - number of characters composed by this composition. */ \ enum composition_method method = c - 0xF2; \ int *charbuf_base = charbuf; \ - int consumed_chars_limit; \ int nbytes, nchars; \ - \ + \ ONE_MORE_BYTE (c); \ if (c < 0) \ goto invalid_code; \ nbytes = c - 0xA0; \ - if (nbytes < 3) \ + if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4)) \ goto invalid_code; \ ONE_MORE_BYTE (c); \ - if (c < 0) \ - goto invalid_code; \ nchars = c - 0xA0; \ - ADD_COMPOSITION_DATA (charbuf, nchars, method); \ - consumed_chars_limit = consumed_chars_base + nbytes; \ - if (method != COMPOSITION_RELATIVE) \ - { \ - int i = 0; \ - while (consumed_chars < consumed_chars_limit) \ - { \ - if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \ - DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf); \ - else \ - DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \ - i++; \ - } \ - if (consumed_chars < consumed_chars_limit) \ - goto invalid_code; \ - charbuf_base[0] -= i; \ - } \ + if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS) \ + goto invalid_code; \ + cmp_status->old_form = 0; \ + cmp_status->method = method; \ + if (method == COMPOSITION_RELATIVE) \ + cmp_status->state = COMPOSING_CHAR; \ + else \ + cmp_status->state = COMPOSING_COMPONENT_CHAR; \ + cmp_status->length = MAX_ANNOTATION_LENGTH; \ + cmp_status->nchars = nchars; \ + cmp_status->ncomps = nbytes - 4; \ + ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method); \ } while (0) -#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \ - do { \ - /* Emacs 20 style format for relative composition. */ \ - /* Store multibyte form of characters to be composed. */ \ - enum composition_method method = COMPOSITION_RELATIVE; \ - int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \ - int *buf = components; \ - int i, j; \ - \ - src = src_base; \ - ONE_MORE_BYTE (c); /* skip 0x80 */ \ - for (i = 0; *src >= 0xA0 && i < MAX_COMPOSITION_COMPONENTS; i++) \ - DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \ - if (i < 2) \ - goto invalid_code; \ - ADD_COMPOSITION_DATA (charbuf, i, method); \ - for (j = 0; j < i; j++) \ - *charbuf++ = components[j]; \ +/* Start of Emacs 20 style format for relative composition. */ + +#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION() \ + do { \ + cmp_status->old_form = 1; \ + cmp_status->method = COMPOSITION_RELATIVE; \ + cmp_status->state = COMPOSING_CHAR; \ + cmp_status->length = MAX_ANNOTATION_LENGTH; \ + cmp_status->nchars = cmp_status->ncomps = 0; \ + ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \ } while (0) -#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \ +/* Start of Emacs 20 style format for rule-base composition. */ + +#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION() \ + do { \ + cmp_status->old_form = 1; \ + cmp_status->method = COMPOSITION_WITH_RULE; \ + cmp_status->state = COMPOSING_CHAR; \ + cmp_status->length = MAX_ANNOTATION_LENGTH; \ + cmp_status->nchars = cmp_status->ncomps = 0; \ + ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \ + } while (0) + + +#define DECODE_EMACS_MULE_COMPOSITION_START() \ + do { \ + const unsigned char *current_src = src; \ + \ + ONE_MORE_BYTE (c); \ + if (c < 0) \ + goto invalid_code; \ + if (c - 0xF2 >= COMPOSITION_RELATIVE \ + && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS) \ + DECODE_EMACS_MULE_21_COMPOSITION (); \ + else if (c < 0xA0) \ + goto invalid_code; \ + else if (c < 0xC0) \ + { \ + DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (); \ + /* Re-read C as a composition component. */ \ + src = current_src; \ + } \ + else if (c == 0xFF) \ + DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (); \ + else \ + goto invalid_code; \ + } while (0) + +#define EMACS_MULE_COMPOSITION_END() \ do { \ - /* Emacs 20 style format for rule-base composition. */ \ - /* Store multibyte form of characters to be composed. */ \ - enum composition_method method = COMPOSITION_WITH_RULE; \ - int *charbuf_base = charbuf; \ - int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \ - int *buf = components; \ - int i, j; \ + int idx = - cmp_status->length; \ \ - DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \ - for (i = 1; i < MAX_COMPOSITION_COMPONENTS; i++) \ - { \ - if (*src < 0xA0) \ - break; \ - DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \ - DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \ - } \ - if (i <= 1 || (buf - components) % 2 == 0) \ - goto invalid_code; \ - if (charbuf + i + (i / 2) + 1 >= charbuf_end) \ - goto no_more_source; \ - ADD_COMPOSITION_DATA (charbuf, i, method); \ - i = i * 2 - 1; \ - for (j = 0; j < i; j++) \ - *charbuf++ = components[j]; \ - charbuf_base[0] -= i; \ - for (j = 0; j < i; j += 2) \ - *charbuf++ = components[j]; \ + if (cmp_status->old_form) \ + charbuf[idx + 2] = cmp_status->nchars; \ + else if (cmp_status->method > COMPOSITION_RELATIVE) \ + charbuf[idx] = charbuf[idx + 2] - cmp_status->length; \ + cmp_status->state = COMPOSING_NO; \ + } while (0) + + +static int +emacs_mule_finish_composition (charbuf, cmp_status) + int *charbuf; + struct composition_status *cmp_status; +{ + int idx = - cmp_status->length; + int new_chars; + + if (cmp_status->old_form && cmp_status->nchars > 0) + { + charbuf[idx + 2] = cmp_status->nchars; + new_chars = 0; + if (cmp_status->method == COMPOSITION_WITH_RULE + && cmp_status->state == COMPOSING_CHAR) + { + /* The last rule was invalid. */ + int rule = charbuf[-1] + 0xA0; + + charbuf[-2] = BYTE8_TO_CHAR (rule); + charbuf[-1] = -1; + new_chars = 1; + } + } + else + { + charbuf[idx++] = BYTE8_TO_CHAR (0x80); + + if (cmp_status->method == COMPOSITION_WITH_RULE) + { + charbuf[idx++] = BYTE8_TO_CHAR (0xFF); + charbuf[idx++] = -3; + charbuf[idx++] = 0; + new_chars = 1; + } + else + { + int nchars = charbuf[idx + 1] + 0xA0; + int nbytes = charbuf[idx + 2] + 0xA0; + + charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method); + charbuf[idx++] = BYTE8_TO_CHAR (nbytes); + charbuf[idx++] = BYTE8_TO_CHAR (nchars); + charbuf[idx++] = -1; + new_chars = 4; + } + } + cmp_status->state = COMPOSING_NO; + return new_chars; +} + +#define EMACS_MULE_MAYBE_FINISH_COMPOSITION() \ + do { \ + if (cmp_status->state != COMPOSING_NO) \ + char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \ } while (0) @@ -2265,91 +2455,218 @@ decode_coding_emacs_mule (coding) const unsigned char *src_end = coding->source + coding->src_bytes; const unsigned char *src_base; int *charbuf = coding->charbuf + coding->charbuf_used; + /* We may produce two annotations (charset and composition) in one + loop and one more charset annotation at the end. */ int *charbuf_end - = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH; + = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3); int consumed_chars = 0, consumed_chars_base; int multibytep = coding->src_multibyte; Lisp_Object attrs, charset_list; int char_offset = coding->produced_char; int last_offset = char_offset; int last_id = charset_ascii; - int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos); + int eol_crlf = + !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos); int byte_after_cr = -1; + struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status; CODING_GET_INFO (coding, attrs, charset_list); + if (cmp_status->state != COMPOSING_NO) + { + int i; + + for (i = 0; i < cmp_status->length; i++) + *charbuf++ = cmp_status->carryover[i]; + coding->annotated = 1; + } + while (1) { - int c; + int c, id; src_base = src; consumed_chars_base = consumed_chars; if (charbuf >= charbuf_end) - break; + { + if (byte_after_cr >= 0) + src_base--; + break; + } if (byte_after_cr >= 0) c = byte_after_cr, byte_after_cr = -1; else ONE_MORE_BYTE (c); - if (c < 0) + + if (c < 0 || c == 0x80) { - *charbuf++ = -c; - char_offset++; + EMACS_MULE_MAYBE_FINISH_COMPOSITION (); + if (c < 0) + { + *charbuf++ = -c; + char_offset++; + } + else + DECODE_EMACS_MULE_COMPOSITION_START (); + continue; } - else if (c < 0x80) + + if (c < 0x80) { if (eol_crlf && c == '\r') ONE_MORE_BYTE (byte_after_cr); - *charbuf++ = c; - char_offset++; - } - else if (c == 0x80) - { - ONE_MORE_BYTE (c); - if (c < 0) - goto invalid_code; - if (c - 0xF2 >= COMPOSITION_RELATIVE - && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS) - DECODE_EMACS_MULE_21_COMPOSITION (c); - else if (c < 0xC0) - DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c); - else if (c == 0xFF) - DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c); - else - goto invalid_code; + id = charset_ascii; + if (cmp_status->state != COMPOSING_NO) + { + if (cmp_status->old_form) + EMACS_MULE_MAYBE_FINISH_COMPOSITION (); + else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR) + cmp_status->ncomps--; + } } - else if (c < 0xA0 && emacs_mule_bytes[c] > 1) + else { - int nbytes, nchars; - int id; - - src = src_base; - consumed_chars = consumed_chars_base; - c = emacs_mule_char (coding, src, &nbytes, &nchars, &id); + int nchars, nbytes; + /* emacs_mule_char can load a charset map from a file, which + allocates a large structure and might cause buffer text + to be relocated as result. Thus, we need to remember the + original pointer to buffer text, and fix up all related + pointers after the call. */ + const unsigned char *orig = coding->source; + EMACS_INT offset; + + c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id, + cmp_status); + offset = coding->source - orig; + if (offset) + { + src += offset; + src_base += offset; + src_end += offset; + } if (c < 0) { + if (c == -1) + goto invalid_code; if (c == -2) break; - goto invalid_code; } + src = src_base + nbytes; + consumed_chars = consumed_chars_base + nchars; + if (cmp_status->state >= COMPOSING_COMPONENT_CHAR) + cmp_status->ncomps -= nchars; + } + + /* Now if C >= 0, we found a normally encoded character, if C < + 0, we found an old-style composition component character or + rule. */ + + if (cmp_status->state == COMPOSING_NO) + { if (last_id != id) { if (last_id != charset_ascii) - ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id); + ADD_CHARSET_DATA (charbuf, char_offset - last_offset, + last_id); last_id = id; last_offset = char_offset; } *charbuf++ = c; - src += nbytes; - consumed_chars += nchars; char_offset++; } - else - goto invalid_code; + else if (cmp_status->state == COMPOSING_CHAR) + { + if (cmp_status->old_form) + { + if (c >= 0) + { + EMACS_MULE_MAYBE_FINISH_COMPOSITION (); + *charbuf++ = c; + char_offset++; + } + else + { + *charbuf++ = -c; + cmp_status->nchars++; + cmp_status->length++; + if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS) + EMACS_MULE_COMPOSITION_END (); + else if (cmp_status->method == COMPOSITION_WITH_RULE) + cmp_status->state = COMPOSING_RULE; + } + } + else + { + *charbuf++ = c; + cmp_status->length++; + cmp_status->nchars--; + if (cmp_status->nchars == 0) + EMACS_MULE_COMPOSITION_END (); + } + } + else if (cmp_status->state == COMPOSING_RULE) + { + int rule; + + if (c >= 0) + { + EMACS_MULE_COMPOSITION_END (); + *charbuf++ = c; + char_offset++; + } + else + { + c = -c; + DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule); + if (rule < 0) + goto invalid_code; + *charbuf++ = -2; + *charbuf++ = rule; + cmp_status->length += 2; + cmp_status->state = COMPOSING_CHAR; + } + } + else if (cmp_status->state == COMPOSING_COMPONENT_CHAR) + { + *charbuf++ = c; + cmp_status->length++; + if (cmp_status->ncomps == 0) + cmp_status->state = COMPOSING_CHAR; + else if (cmp_status->ncomps > 0) + { + if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS) + cmp_status->state = COMPOSING_COMPONENT_RULE; + } + else + EMACS_MULE_MAYBE_FINISH_COMPOSITION (); + } + else /* COMPOSING_COMPONENT_RULE */ + { + int rule; + + DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule); + if (rule < 0) + goto invalid_code; + *charbuf++ = -2; + *charbuf++ = rule; + cmp_status->length += 2; + cmp_status->ncomps--; + if (cmp_status->ncomps > 0) + cmp_status->state = COMPOSING_COMPONENT_CHAR; + else + EMACS_MULE_MAYBE_FINISH_COMPOSITION (); + } + continue; + + retry: + src = src_base; + consumed_chars = consumed_chars_base; continue; invalid_code: + EMACS_MULE_MAYBE_FINISH_COMPOSITION (); src = src_base; consumed_chars = consumed_chars_base; ONE_MORE_BYTE (c); @@ -2359,6 +2676,19 @@ decode_coding_emacs_mule (coding) } no_more_source: + if (cmp_status->state != COMPOSING_NO) + { + if (coding->mode & CODING_MODE_LAST_BLOCK) + EMACS_MULE_MAYBE_FINISH_COMPOSITION (); + else + { + int i; + + charbuf -= cmp_status->length; + for (i = 0; i < cmp_status->length; i++) + cmp_status->carryover[i] = charbuf[i]; + } + } if (last_id != charset_ascii) ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id); coding->consumed_char += consumed_chars_base; @@ -2449,8 +2779,10 @@ encode_coding_emacs_mule (coding) if (preferred_charset_id >= 0) { charset = CHARSET_FROM_ID (preferred_charset_id); - if (! CHAR_CHARSET_P (c, charset)) - charset = char_charset (c, charset_list, NULL); + if (CHAR_CHARSET_P (c, charset)) + code = ENCODE_CHAR (charset, c); + else + charset = char_charset (c, charset_list, &code); } else charset = char_charset (c, charset_list, &code); @@ -2668,7 +3000,7 @@ enum iso_code_class_type iso_code_class[256]; #define SAFE_CHARSET_P(coding, id) \ ((id) <= (coding)->max_charset_id \ - && (coding)->safe_charsets[id] >= 0) + && (coding)->safe_charsets[id] != 255) #define SHIFT_OUT_OK(category) \ @@ -2706,8 +3038,8 @@ setup_iso_safe_charsets (attrs) max_charset_id = id; } - safe_charsets = Fmake_string (make_number (max_charset_id + 1), - make_number (255)); + safe_charsets = make_uninit_string (max_charset_id + 1); + memset (SDATA (safe_charsets), 255, max_charset_id + 1); request = AREF (attrs, coding_attr_iso_request); reg_usage = AREF (attrs, coding_attr_iso_usage); reg94 = XINT (XCAR (reg_usage)); @@ -2740,7 +3072,7 @@ setup_iso_safe_charsets (attrs) /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". - Check if a text is encoded in one of ISO-2022 based codig systems. + Check if a text is encoded in one of ISO-2022 based coding systems. If it is, return 1, else return 0. */ static int @@ -2758,6 +3090,7 @@ detect_coding_iso_2022 (coding, detect_info) int i; int rejected = 0; int found = 0; + int composition_count = -1; detect_info->checked |= CATEGORY_MASK_ISO; @@ -2770,11 +3103,11 @@ detect_coding_iso_2022 (coding, detect_info) continue; attrs = CODING_ID_ATTRS (this->id); if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT - && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list)) + && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list)) setup_iso_safe_charsets (attrs); val = CODING_ATTR_SAFE_CHARSETS (attrs); this->max_charset_id = SCHARS (val) - 1; - this->safe_charsets = (char *) SDATA (val); + this->safe_charsets = SDATA (val); } /* A coding system of this category is always ASCII compatible. */ @@ -2826,10 +3159,20 @@ detect_coding_iso_2022 (coding, detect_info) rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT; break; } + else if (c == '1') + { + /* End of composition. */ + if (composition_count < 0 + || composition_count > MAX_COMPOSITION_COMPONENTS) + /* Invalid */ + break; + composition_count = -1; + found |= CATEGORY_MASK_ISO; + } else if (c >= '0' && c <= '4') { /* ESC for start/end composition. */ - found |= CATEGORY_MASK_ISO; + composition_count = 0; break; } else @@ -2900,6 +3243,8 @@ detect_coding_iso_2022 (coding, detect_info) continue; if (c < 0x80) { + if (composition_count >= 0) + composition_count++; single_shifting = 0; break; } @@ -2917,16 +3262,28 @@ detect_coding_iso_2022 (coding, detect_info) int i = 1; while (src < src_end) { + src_base = src; ONE_MORE_BYTE (c); if (c < 0xA0) - break; + { + src = src_base; + break; + } i++; } if (i & 1 && src < src_end) - rejected |= CATEGORY_MASK_ISO_8_2; + { + rejected |= CATEGORY_MASK_ISO_8_2; + if (composition_count >= 0) + composition_count += i; + } else - found |= CATEGORY_MASK_ISO_8_2; + { + found |= CATEGORY_MASK_ISO_8_2; + if (composition_count >= 0) + composition_count += i / 2; + } } break; } @@ -2990,132 +3347,237 @@ detect_coding_iso_2022 (coding, detect_info) } while (0) -#define MAYBE_FINISH_COMPOSITION() \ +/* Handle these composition sequence (ALT: alternate char): + + (1) relative composition: ESC 0 CHAR ... ESC 1 + (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1 + (3) altchar composition: ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1 + (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1 + + When the start sequence (ESC 0/2/3/4) is found, this annotation + header is produced. + + [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ] + + Then, upon reading CHAR or RULE (one or two bytes), these codes are + produced until the end sequence (ESC 1) is found: + + (1) CHAR ... CHAR + (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR + (3) ALT ... ALT -1 -1 CHAR ... CHAR + (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR + + When the end sequence (ESC 1) is found, LENGTH and NCHARS in the + annotation header is updated as below: + + (1) LENGTH: unchanged, NCHARS: number of CHARs + (2) LENGTH: unchanged, NCHARS: number of CHARs + (3) LENGTH: += number of ALTs + 2, NCHARS: number of CHARs + (4) LENGTH: += number of ALTs * 3, NCHARS: number of CHARs + + If an error is found while composing, the annotation header is + changed to: + + [ ESC '0'/'2'/'3'/'4' -2 0 ] + + and the sequence [ -2 DECODED-RULE ] is changed to the original + byte sequence as below: + o the original byte sequence is B: [ B -1 ] + o the original byte sequence is B1 B2: [ B1 B2 ] + and the sequence [ -1 -1 ] is changed to the original byte + sequence: + [ ESC '0' ] +*/ + +/* Decode a composition rule C1 and maybe one more byte from the + source, and set RULE to the encoded composition rule, NBYTES to the + length of the composition rule. If the rule is invalid, set RULE + to some negative value. */ + +#define DECODE_COMPOSITION_RULE(rule, nbytes) \ + do { \ + rule = c1 - 32; \ + if (rule < 0) \ + break; \ + if (rule < 81) /* old format (before ver.21) */ \ + { \ + int gref = (rule) / 9; \ + int nref = (rule) % 9; \ + if (gref == 4) gref = 10; \ + if (nref == 4) nref = 10; \ + rule = COMPOSITION_ENCODE_RULE (gref, nref); \ + nbytes = 1; \ + } \ + else /* new format (after ver.21) */ \ + { \ + int c; \ + \ + ONE_MORE_BYTE (c); \ + rule = COMPOSITION_ENCODE_RULE (rule - 81, c - 32); \ + if (rule >= 0) \ + rule += 0x100; /* to destinguish it from the old format */ \ + nbytes = 2; \ + } \ + } while (0) + +#define ENCODE_COMPOSITION_RULE(rule) \ do { \ - int i; \ - if (composition_state == COMPOSING_NO) \ - break; \ - /* It is assured that we have enough room for producing \ - characters stored in the table `components'. */ \ - if (charbuf + component_idx > charbuf_end) \ - goto no_more_source; \ - composition_state = COMPOSING_NO; \ - if (method == COMPOSITION_RELATIVE \ - || method == COMPOSITION_WITH_ALTCHARS) \ + int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \ + \ + if (rule < 0x100) /* old format */ \ { \ - for (i = 0; i < component_idx; i++) \ - *charbuf++ = components[i]; \ - char_offset += component_idx; \ + if (gref == 10) gref = 4; \ + if (nref == 10) nref = 4; \ + charbuf[idx] = 32 + gref * 9 + nref; \ + charbuf[idx + 1] = -1; \ + new_chars++; \ } \ - else \ + else /* new format */ \ { \ - for (i = 0; i < component_idx; i += 2) \ - *charbuf++ = components[i]; \ - char_offset += (component_idx / 2) + 1; \ + charbuf[idx] = 32 + 81 + gref; \ + charbuf[idx + 1] = 32 + nref; \ + new_chars += 2; \ } \ } while (0) +/* Finish the current composition as invalid. */ + +static int finish_composition P_ ((int *, struct composition_status *)); + +static int +finish_composition (charbuf, cmp_status) + int *charbuf; + struct composition_status *cmp_status; +{ + int idx = - cmp_status->length; + int new_chars; + + /* Recover the original ESC sequence */ + charbuf[idx++] = ISO_CODE_ESC; + charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0' + : cmp_status->method == COMPOSITION_WITH_RULE ? '2' + : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3' + /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */ + : '4'); + charbuf[idx++] = -2; + charbuf[idx++] = 0; + charbuf[idx++] = -1; + new_chars = cmp_status->nchars; + if (cmp_status->method >= COMPOSITION_WITH_RULE) + for (; idx < 0; idx++) + { + int elt = charbuf[idx]; + + if (elt == -2) + { + ENCODE_COMPOSITION_RULE (charbuf[idx + 1]); + idx++; + } + else if (elt == -1) + { + charbuf[idx++] = ISO_CODE_ESC; + charbuf[idx] = '0'; + new_chars += 2; + } + } + cmp_status->state = COMPOSING_NO; + return new_chars; +} + +/* If characters are under composition, finish the composition. */ +#define MAYBE_FINISH_COMPOSITION() \ + do { \ + if (cmp_status->state != COMPOSING_NO) \ + char_offset += finish_composition (charbuf, cmp_status); \ + } while (0) /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4. + ESC 0 : relative composition : ESC 0 CHAR ... ESC 1 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1 - */ -#define DECODE_COMPOSITION_START(c1) \ - do { \ - if (c1 == '0' \ - && composition_state == COMPOSING_COMPONENT_RULE) \ - { \ - component_len = component_idx; \ - composition_state = COMPOSING_CHAR; \ - } \ - else \ - { \ - const unsigned char *p; \ - \ - MAYBE_FINISH_COMPOSITION (); \ - if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \ - goto no_more_source; \ - for (p = src; p < src_end - 1; p++) \ - if (*p == ISO_CODE_ESC && p[1] == '1') \ - break; \ - if (p == src_end - 1) \ - { \ - /* The current composition doesn't end in the current \ - source. */ \ - record_conversion_result \ - (coding, CODING_RESULT_INSUFFICIENT_SRC); \ - goto no_more_source; \ - } \ - \ - /* This is surely the start of a composition. */ \ - method = (c1 == '0' ? COMPOSITION_RELATIVE \ - : c1 == '2' ? COMPOSITION_WITH_RULE \ - : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \ - : COMPOSITION_WITH_RULE_ALTCHARS); \ - composition_state = (c1 <= '2' ? COMPOSING_CHAR \ - : COMPOSING_COMPONENT_CHAR); \ - component_idx = component_len = 0; \ - } \ + Produce this annotation sequence now: + + [ -LENGTH(==-4) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) METHOD ] +*/ + +#define DECODE_COMPOSITION_START(c1) \ + do { \ + if (c1 == '0' \ + && ((cmp_status->state == COMPOSING_COMPONENT_CHAR \ + && cmp_status->method == COMPOSITION_WITH_ALTCHARS) \ + || (cmp_status->state == COMPOSING_COMPONENT_RULE \ + && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \ + { \ + *charbuf++ = -1; \ + *charbuf++= -1; \ + cmp_status->state = COMPOSING_CHAR; \ + cmp_status->length += 2; \ + } \ + else \ + { \ + MAYBE_FINISH_COMPOSITION (); \ + cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE \ + : c1 == '2' ? COMPOSITION_WITH_RULE \ + : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \ + : COMPOSITION_WITH_RULE_ALTCHARS); \ + cmp_status->state \ + = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR); \ + ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \ + cmp_status->length = MAX_ANNOTATION_LENGTH; \ + cmp_status->nchars = cmp_status->ncomps = 0; \ + coding->annotated = 1; \ + } \ } while (0) -/* Handle compositoin end sequence ESC 1. */ +/* Handle composition end sequence ESC 1. */ #define DECODE_COMPOSITION_END() \ do { \ - int nchars = (component_len > 0 ? component_idx - component_len \ - : method == COMPOSITION_RELATIVE ? component_idx \ - : (component_idx + 1) / 2); \ - int i; \ - int *saved_charbuf = charbuf; \ - \ - ADD_COMPOSITION_DATA (charbuf, nchars, method); \ - if (method != COMPOSITION_RELATIVE) \ + if (cmp_status->nchars == 0 \ + || ((cmp_status->state == COMPOSING_CHAR) \ + == (cmp_status->method == COMPOSITION_WITH_RULE))) \ { \ - if (component_len == 0) \ - for (i = 0; i < component_idx; i++) \ - *charbuf++ = components[i]; \ - else \ - for (i = 0; i < component_len; i++) \ - *charbuf++ = components[i]; \ - *saved_charbuf = saved_charbuf - charbuf; \ + MAYBE_FINISH_COMPOSITION (); \ + goto invalid_code; \ } \ - if (method == COMPOSITION_WITH_RULE) \ - for (i = 0; i < component_idx; i += 2, char_offset++) \ - *charbuf++ = components[i]; \ - else \ - for (i = component_len; i < component_idx; i++, char_offset++) \ - *charbuf++ = components[i]; \ - coding->annotated = 1; \ - composition_state = COMPOSING_NO; \ + if (cmp_status->method == COMPOSITION_WITH_ALTCHARS) \ + charbuf[- cmp_status->length] -= cmp_status->ncomps + 2; \ + else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS) \ + charbuf[- cmp_status->length] -= cmp_status->ncomps * 3; \ + charbuf[- cmp_status->length + 2] = cmp_status->nchars; \ + char_offset += cmp_status->nchars; \ + cmp_status->state = COMPOSING_NO; \ } while (0) +/* Store a composition rule RULE in charbuf, and update cmp_status. */ + +#define STORE_COMPOSITION_RULE(rule) \ + do { \ + *charbuf++ = -2; \ + *charbuf++ = rule; \ + cmp_status->length += 2; \ + cmp_status->state--; \ + } while (0) -/* Decode a composition rule from the byte C1 (and maybe one more byte - from SRC) and store one encoded composition rule in - coding->cmp_data. */ +/* Store a composed char or a component char C in charbuf, and update + cmp_status. */ -#define DECODE_COMPOSITION_RULE(c1) \ +#define STORE_COMPOSITION_CHAR(c) \ do { \ - (c1) -= 32; \ - if (c1 < 81) /* old format (before ver.21) */ \ - { \ - int gref = (c1) / 9; \ - int nref = (c1) % 9; \ - if (gref == 4) gref = 10; \ - if (nref == 4) nref = 10; \ - c1 = COMPOSITION_ENCODE_RULE (gref, nref); \ - } \ - else if (c1 < 93) /* new format (after ver.21) */ \ - { \ - ONE_MORE_BYTE (c2); \ - c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \ - } \ + *charbuf++ = (c); \ + cmp_status->length++; \ + if (cmp_status->state == COMPOSING_CHAR) \ + cmp_status->nchars++; \ else \ - c1 = 0; \ + cmp_status->ncomps++; \ + if (cmp_status->method == COMPOSITION_WITH_RULE \ + || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS \ + && cmp_status->state == COMPOSING_COMPONENT_CHAR)) \ + cmp_status->state++; \ } while (0) @@ -3129,8 +3591,10 @@ decode_coding_iso_2022 (coding) const unsigned char *src_end = coding->source + coding->src_bytes; const unsigned char *src_base; int *charbuf = coding->charbuf + coding->charbuf_used; + /* We may produce two annotations (charset and composition) in one + loop and one more charset annotation at the end. */ int *charbuf_end - = coding->charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH; + = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3); int consumed_chars = 0, consumed_chars_base; int multibytep = coding->src_multibyte; /* Charsets invoked to graphic plane 0 and 1 respectively. */ @@ -3139,40 +3603,42 @@ decode_coding_iso_2022 (coding) int charset_id_2, charset_id_3; struct charset *charset; int c; - /* For handling composition sequence. */ -#define COMPOSING_NO 0 -#define COMPOSING_CHAR 1 -#define COMPOSING_RULE 2 -#define COMPOSING_COMPONENT_CHAR 3 -#define COMPOSING_COMPONENT_RULE 4 - - int composition_state = COMPOSING_NO; - enum composition_method method; - int components[MAX_COMPOSITION_COMPONENTS * 2 + 1]; - int component_idx; - int component_len; + struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding); Lisp_Object attrs, charset_list; int char_offset = coding->produced_char; int last_offset = char_offset; int last_id = charset_ascii; - int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos); + int eol_crlf = + !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos); int byte_after_cr = -1; + int i; CODING_GET_INFO (coding, attrs, charset_list); setup_iso_safe_charsets (attrs); /* Charset list may have been changed. */ charset_list = CODING_ATTR_CHARSET_LIST (attrs); - coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs)); + coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs)); + + if (cmp_status->state != COMPOSING_NO) + { + for (i = 0; i < cmp_status->length; i++) + *charbuf++ = cmp_status->carryover[i]; + coding->annotated = 1; + } while (1) { - int c1, c2; + int c1, c2, c3; src_base = src; consumed_chars_base = consumed_chars; if (charbuf >= charbuf_end) - break; + { + if (byte_after_cr >= 0) + src_base--; + break; + } if (byte_after_cr >= 0) c1 = byte_after_cr, byte_after_cr = -1; @@ -3181,21 +3647,58 @@ decode_coding_iso_2022 (coding) if (c1 < 0) goto invalid_code; - /* We produce at most one character. */ - switch (iso_code_class [c1]) + if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0) { - case ISO_0x20_or_0x7F: - if (composition_state != COMPOSING_NO) + *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1); + char_offset++; + CODING_ISO_EXTSEGMENT_LEN (coding)--; + continue; + } + + if (CODING_ISO_EMBEDDED_UTF_8 (coding)) + { + if (c1 == ISO_CODE_ESC) { - if (composition_state == COMPOSING_RULE - || composition_state == COMPOSING_COMPONENT_RULE) + if (src + 1 >= src_end) + goto no_more_source; + *charbuf++ = ISO_CODE_ESC; + char_offset++; + if (src[0] == '%' && src[1] == '@') { - DECODE_COMPOSITION_RULE (c1); - components[component_idx++] = c1; - composition_state--; - continue; + src += 2; + consumed_chars += 2; + char_offset += 2; + /* We are sure charbuf can contain two more chars. */ + *charbuf++ = '%'; + *charbuf++ = '@'; + CODING_ISO_EMBEDDED_UTF_8 (coding) = 0; } } + else + { + *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1); + char_offset++; + } + continue; + } + + if ((cmp_status->state == COMPOSING_RULE + || cmp_status->state == COMPOSING_COMPONENT_RULE) + && c1 != ISO_CODE_ESC) + { + int rule, nbytes; + + DECODE_COMPOSITION_RULE (rule, nbytes); + if (rule < 0) + goto invalid_code; + STORE_COMPOSITION_RULE (rule); + continue; + } + + /* We produce at most one character. */ + switch (iso_code_class [c1]) + { + case ISO_0x20_or_0x7F: if (charset_id_0 < 0 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0))) /* This is SPACE or DEL. */ @@ -3205,17 +3708,6 @@ decode_coding_iso_2022 (coding) break; case ISO_graphic_plane_0: - if (composition_state != COMPOSING_NO) - { - if (composition_state == COMPOSING_RULE - || composition_state == COMPOSING_COMPONENT_RULE) - { - DECODE_COMPOSITION_RULE (c1); - components[component_idx++] = c1; - composition_state--; - continue; - } - } if (charset_id_0 < 0) charset = CHARSET_FROM_ID (charset_ascii); else @@ -3243,7 +3735,6 @@ decode_coding_iso_2022 (coding) break; case ISO_control_1: - MAYBE_FINISH_COMPOSITION (); goto invalid_code; case ISO_shift_out: @@ -3262,6 +3753,8 @@ decode_coding_iso_2022 (coding) continue; case ISO_single_shift_2_7: + if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)) + goto invalid_code; case ISO_single_shift_2: if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)) goto invalid_code; @@ -3381,21 +3874,27 @@ decode_coding_iso_2022 (coding) case '0': case '2': case '3': case '4': /* start composition */ if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)) goto invalid_code; + if (last_id != charset_ascii) + { + ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id); + last_id = charset_ascii; + last_offset = char_offset; + } DECODE_COMPOSITION_START (c1); continue; case '1': /* end composition */ - if (composition_state == COMPOSING_NO) + if (cmp_status->state == COMPOSING_NO) goto invalid_code; DECODE_COMPOSITION_END (); continue; case '[': /* specification of direction */ - if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION) + if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)) goto invalid_code; /* For the moment, nested direction is not supported. So, `coding->mode & CODING_MODE_DIRECTION' zero means - left-to-right, and nozero means right-to-left. */ + left-to-right, and nonzero means right-to-left. */ ONE_MORE_BYTE (c1); switch (c1) { @@ -3436,10 +3935,16 @@ decode_coding_iso_2022 (coding) int size; ONE_MORE_BYTE (dim); + if (dim < '0' || dim > '4') + goto invalid_code; ONE_MORE_BYTE (M); + if (M < 128) + goto invalid_code; ONE_MORE_BYTE (L); + if (L < 128) + goto invalid_code; size = ((M - 128) * 128) + (L - 128); - if (charbuf + 8 + size > charbuf_end) + if (charbuf + 6 > charbuf_end) goto break_loop; *charbuf++ = ISO_CODE_ESC; *charbuf++ = '%'; @@ -3447,11 +3952,7 @@ decode_coding_iso_2022 (coding) *charbuf++ = dim; *charbuf++ = BYTE8_TO_CHAR (M); *charbuf++ = BYTE8_TO_CHAR (L); - while (size-- > 0) - { - ONE_MORE_BYTE (c1); - *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1); - } + CODING_ISO_EXTSEGMENT_LEN (coding) = size; } else if (c1 == 'G') { @@ -3459,32 +3960,12 @@ decode_coding_iso_2022 (coding) ESC % G --UTF-8-BYTES-- ESC % @ We keep these bytes as is for the moment. They may be decoded by post-read-conversion. */ - int *p = charbuf; - - if (p + 6 > charbuf_end) - goto break_loop; - *p++ = ISO_CODE_ESC; - *p++ = '%'; - *p++ = 'G'; - while (p < charbuf_end) - { - ONE_MORE_BYTE (c1); - if (c1 == ISO_CODE_ESC - && src + 1 < src_end - && src[0] == '%' - && src[1] == '@') - { - src += 2; - break; - } - *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1); - } - if (p + 3 > charbuf_end) + if (charbuf + 3 > charbuf_end) goto break_loop; - *p++ = ISO_CODE_ESC; - *p++ = '%'; - *p++ = '@'; - charbuf = p; + *charbuf++ = ISO_CODE_ESC; + *charbuf++ = '%'; + *charbuf++ = 'G'; + CODING_ISO_EMBEDDED_UTF_8 (coding) = 1; } else goto invalid_code; @@ -3522,7 +4003,8 @@ decode_coding_iso_2022 (coding) } } - if (charset->id != charset_ascii + if (cmp_status->state == COMPOSING_NO + && charset->id != charset_ascii && last_id != charset->id) { if (last_id != charset_ascii) @@ -3532,26 +4014,28 @@ decode_coding_iso_2022 (coding) } /* Now we know CHARSET and 1st position code C1 of a character. - Produce a decoded character while getting 2nd position code - C2 if necessary. */ - c1 &= 0x7F; + Produce a decoded character while getting 2nd and 3rd + position codes C2, C3 if necessary. */ if (CHARSET_DIMENSION (charset) > 1) { ONE_MORE_BYTE (c2); - if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)) + if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0) + || ((c1 & 0x80) != (c2 & 0x80))) /* C2 is not in a valid range. */ goto invalid_code; - c1 = (c1 << 8) | (c2 & 0x7F); - if (CHARSET_DIMENSION (charset) > 2) + if (CHARSET_DIMENSION (charset) == 2) + c1 = (c1 << 8) | c2; + else { - ONE_MORE_BYTE (c2); - if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)) - /* C2 is not in a valid range. */ + ONE_MORE_BYTE (c3); + if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0) + || ((c1 & 0x80) != (c3 & 0x80))) + /* C3 is not in a valid range. */ goto invalid_code; - c1 = (c1 << 8) | (c2 & 0x7F); + c1 = (c1 << 16) | (c2 << 8) | c2; } } - + c1 &= 0x7F7F7F; CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c); if (c < 0) { @@ -3564,19 +4048,23 @@ decode_coding_iso_2022 (coding) *charbuf++ = BYTE8_TO_CHAR (*src_base); } } - else if (composition_state == COMPOSING_NO) + else if (cmp_status->state == COMPOSING_NO) { *charbuf++ = c; char_offset++; } - else + else if ((cmp_status->state == COMPOSING_CHAR + ? cmp_status->nchars + : cmp_status->ncomps) + >= MAX_COMPOSITION_COMPONENTS) { - components[component_idx++] = c; - if (method == COMPOSITION_WITH_RULE - || (method == COMPOSITION_WITH_RULE_ALTCHARS - && composition_state == COMPOSING_COMPONENT_CHAR)) - composition_state++; + /* Too long composition. */ + MAYBE_FINISH_COMPOSITION (); + *charbuf++ = c; + char_offset++; } + else + STORE_COMPOSITION_CHAR (c); continue; invalid_code: @@ -3594,7 +4082,18 @@ decode_coding_iso_2022 (coding) } no_more_source: - if (last_id != charset_ascii) + if (cmp_status->state != COMPOSING_NO) + { + if (coding->mode & CODING_MODE_LAST_BLOCK) + MAYBE_FINISH_COMPOSITION (); + else + { + charbuf -= cmp_status->length; + for (i = 0; i < cmp_status->length; i++) + cmp_status->carryover[i] = charbuf[i]; + } + } + else if (last_id != charset_ascii) ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id); coding->consumed_char += consumed_chars_base; coding->consumed = src_base - coding->source; @@ -4020,16 +4519,19 @@ encode_coding_iso_2022 (coding) int preferred_charset_id = -1; CODING_GET_INFO (coding, attrs, charset_list); - eol_type = CODING_ID_EOL_TYPE (coding->id); + eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id); if (VECTORP (eol_type)) eol_type = Qunix; setup_iso_safe_charsets (attrs); /* Charset list may have been changed. */ charset_list = CODING_ATTR_CHARSET_LIST (attrs); - coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs)); + coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs)); - ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)); + ascii_compatible + = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)) + && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION + | CODING_ISO_FLAG_LOCKING_SHIFT))); while (charbuf < charbuf_end) { @@ -4202,6 +4704,12 @@ detect_coding_sjis (coding, detect_info) int consumed_chars = 0; int found = 0; int c; + Lisp_Object attrs, charset_list; + int max_first_byte_of_2_byte_code; + + CODING_GET_INFO (coding, attrs, charset_list); + max_first_byte_of_2_byte_code + = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF); detect_info->checked |= CATEGORY_MASK_SJIS; /* A coding system of this category is always ASCII compatible. */ @@ -4213,7 +4721,8 @@ detect_coding_sjis (coding, detect_info) ONE_MORE_BYTE (c); if (c < 0x80) continue; - if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF)) + if ((c >= 0x81 && c <= 0x9F) + || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code)) { ONE_MORE_BYTE (c); if (c < 0x40 || c == 0x7F || c > 0xFC) @@ -4298,8 +4807,10 @@ decode_coding_sjis (coding) const unsigned char *src_end = coding->source + coding->src_bytes; const unsigned char *src_base; int *charbuf = coding->charbuf + coding->charbuf_used; + /* We may produce one charset annotation in one loop and one more at + the end. */ int *charbuf_end - = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH; + = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2); int consumed_chars = 0, consumed_chars_base; int multibytep = coding->src_multibyte; struct charset *charset_roman, *charset_kanji, *charset_kana; @@ -4308,7 +4819,8 @@ decode_coding_sjis (coding) int char_offset = coding->produced_char; int last_offset = char_offset; int last_id = charset_ascii; - int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos); + int eol_crlf = + !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos); int byte_after_cr = -1; CODING_GET_INFO (coding, attrs, charset_list); @@ -4328,7 +4840,11 @@ decode_coding_sjis (coding) consumed_chars_base = consumed_chars; if (charbuf >= charbuf_end) - break; + { + if (byte_after_cr >= 0) + src_base--; + break; + } if (byte_after_cr >= 0) c = byte_after_cr, byte_after_cr = -1; @@ -4410,8 +4926,10 @@ decode_coding_big5 (coding) const unsigned char *src_end = coding->source + coding->src_bytes; const unsigned char *src_base; int *charbuf = coding->charbuf + coding->charbuf_used; + /* We may produce one charset annotation in one loop and one more at + the end. */ int *charbuf_end - = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH; + = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2); int consumed_chars = 0, consumed_chars_base; int multibytep = coding->src_multibyte; struct charset *charset_roman, *charset_big5; @@ -4419,7 +4937,8 @@ decode_coding_big5 (coding) int char_offset = coding->produced_char; int last_offset = char_offset; int last_id = charset_ascii; - int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos); + int eol_crlf = + !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos); int byte_after_cr = -1; CODING_GET_INFO (coding, attrs, charset_list); @@ -4436,7 +4955,11 @@ decode_coding_big5 (coding) consumed_chars_base = consumed_chars; if (charbuf >= charbuf_end) - break; + { + if (byte_after_cr >= 0) + src_base--; + break; + } if (byte_after_cr >= 0) c = byte_after_cr, byte_after_cr = -1; @@ -4572,7 +5095,8 @@ encode_coding_sjis (coding) int c1, c2; c1 = code >> 8; - if (c1 == 0x21 || (c1 >= 0x23 && c1 < 0x25) + if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25) + || c1 == 0x28 || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E) { JIS_TO_SJIS2 (code); @@ -4722,62 +5246,52 @@ decode_coding_ccl (coding) int *charbuf_end = coding->charbuf + coding->charbuf_size; int consumed_chars = 0; int multibytep = coding->src_multibyte; - struct ccl_program ccl; + struct ccl_program *ccl = &coding->spec.ccl->ccl; int source_charbuf[1024]; - int source_byteidx[1024]; + int source_byteidx[1025]; Lisp_Object attrs, charset_list; CODING_GET_INFO (coding, attrs, charset_list); - setup_ccl_program (&ccl, CODING_CCL_DECODER (coding)); - while (src < src_end) + while (1) { const unsigned char *p = src; - int *source, *source_end; int i = 0; if (multibytep) - while (i < 1024 && p < src_end) - { - source_byteidx[i] = p - src; - source_charbuf[i++] = STRING_CHAR_ADVANCE (p); - } + { + while (i < 1024 && p < src_end) + { + source_byteidx[i] = p - src; + source_charbuf[i++] = STRING_CHAR_ADVANCE (p); + } + source_byteidx[i] = p - src; + } else while (i < 1024 && p < src_end) source_charbuf[i++] = *p++; if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK) - ccl.last_block = 1; - - source = source_charbuf; - source_end = source + i; - while (source < source_end) - { - ccl_driver (&ccl, source, charbuf, - source_end - source, charbuf_end - charbuf, - charset_list); - source += ccl.consumed; - charbuf += ccl.produced; - if (ccl.status != CCL_STAT_SUSPEND_BY_DST) - break; - } - if (source < source_end) - src += source_byteidx[source - source_charbuf]; + ccl->last_block = 1; + ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf, + charset_list); + charbuf += ccl->produced; + if (multibytep) + src += source_byteidx[ccl->consumed]; else - src = p; - consumed_chars += source - source_charbuf; - - if (ccl.status != CCL_STAT_SUSPEND_BY_SRC - && ccl.status != CODING_RESULT_INSUFFICIENT_SRC) + src += ccl->consumed; + consumed_chars += ccl->consumed; + if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC) break; } - switch (ccl.status) + switch (ccl->status) { case CCL_STAT_SUSPEND_BY_SRC: record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC); break; case CCL_STAT_SUSPEND_BY_DST: + record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST); break; case CCL_STAT_QUIT: case CCL_STAT_INVALID_CMD: @@ -4796,7 +5310,7 @@ static int encode_coding_ccl (coding) struct coding_system *coding; { - struct ccl_program ccl; + struct ccl_program *ccl = &coding->spec.ccl->ccl; int multibytep = coding->dst_multibyte; int *charbuf = coding->charbuf; int *charbuf_end = charbuf + coding->charbuf_used; @@ -4807,35 +5321,34 @@ encode_coding_ccl (coding) Lisp_Object attrs, charset_list; CODING_GET_INFO (coding, attrs, charset_list); - setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding)); - - ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK; - ccl.dst_multibyte = coding->dst_multibyte; + if (coding->consumed_char == coding->src_chars + && coding->mode & CODING_MODE_LAST_BLOCK) + ccl->last_block = 1; while (charbuf < charbuf_end) { - ccl_driver (&ccl, charbuf, destination_charbuf, + ccl_driver (ccl, charbuf, destination_charbuf, charbuf_end - charbuf, 1024, charset_list); if (multibytep) { - ASSURE_DESTINATION (ccl.produced * 2); - for (i = 0; i < ccl.produced; i++) + ASSURE_DESTINATION (ccl->produced * 2); + for (i = 0; i < ccl->produced; i++) EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF); } else { - ASSURE_DESTINATION (ccl.produced); - for (i = 0; i < ccl.produced; i++) + ASSURE_DESTINATION (ccl->produced); + for (i = 0; i < ccl->produced; i++) *dst++ = destination_charbuf[i] & 0xFF; - produced_chars += ccl.produced; + produced_chars += ccl->produced; } - charbuf += ccl.consumed; - if (ccl.status == CCL_STAT_QUIT - || ccl.status == CCL_STAT_INVALID_CMD) + charbuf += ccl->consumed; + if (ccl->status == CCL_STAT_QUIT + || ccl->status == CCL_STAT_INVALID_CMD) break; } - switch (ccl.status) + switch (ccl->status) { case CCL_STAT_SUSPEND_BY_SRC: record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC); @@ -4867,7 +5380,8 @@ static void decode_coding_raw_text (coding) struct coding_system *coding; { - int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos); + int eol_crlf = + !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos); coding->chars_at_source = 1; coding->consumed_char = coding->src_chars; @@ -4986,9 +5500,12 @@ detect_coding_charset (coding, detect_info) attrs = CODING_ID_ATTRS (coding->id); valids = AREF (attrs, coding_attr_charset_valids); name = CODING_ID_NAME (coding->id); - if (VECTORP (Vlatin_extra_code_table) - && strcmp ((char *) SDATA (SYMBOL_NAME (name)), "iso-8859-")) + if (strncmp ((char *) SDATA (SYMBOL_NAME (name)), + "iso-8859-", sizeof ("iso-8859-") - 1) == 0 + || strncmp ((char *) SDATA (SYMBOL_NAME (name)), + "iso-latin-", sizeof ("iso-latin-") - 1) == 0) check_latin_extra = 1; + if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))) src += head_ascii; @@ -5010,7 +5527,8 @@ detect_coding_charset (coding, detect_info) { if (c < 0xA0 && check_latin_extra - && NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])) + && (!VECTORP (Vlatin_extra_code_table) + || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))) break; found = CATEGORY_MASK_CHARSET; } @@ -5074,15 +5592,18 @@ decode_coding_charset (coding) const unsigned char *src_end = coding->source + coding->src_bytes; const unsigned char *src_base; int *charbuf = coding->charbuf + coding->charbuf_used; + /* We may produce one charset annotation in one loop and one more at + the end. */ int *charbuf_end - = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH; + = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2); int consumed_chars = 0, consumed_chars_base; int multibytep = coding->src_multibyte; Lisp_Object attrs, charset_list, valids; int char_offset = coding->produced_char; int last_offset = char_offset; int last_id = charset_ascii; - int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos); + int eol_crlf = + !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos); int byte_after_cr = -1; CODING_GET_INFO (coding, attrs, charset_list); @@ -5101,7 +5622,11 @@ decode_coding_charset (coding) consumed_chars_base = consumed_chars; if (charbuf >= charbuf_end) - break; + { + if (byte_after_cr >= 0) + src_base--; + break; + } if (byte_after_cr >= 0) { @@ -5119,7 +5644,7 @@ decode_coding_charset (coding) code = c; val = AREF (valids, c); - if (NILP (val)) + if (! INTEGERP (val) && ! CONSP (val)) goto invalid_code; if (INTEGERP (val)) { @@ -5275,7 +5800,7 @@ setup_coding_system (coding_system, coding) CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id); attrs = CODING_ID_ATTRS (coding->id); - eol_type = CODING_ID_EOL_TYPE (coding->id); + eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id); coding->mode = 0; coding->head_ascii = -1; @@ -5296,8 +5821,9 @@ setup_coding_system (coding_system, coding) val = CODING_ATTR_SAFE_CHARSETS (attrs); coding->max_charset_id = SCHARS (val) - 1; - coding->safe_charsets = (char *) SDATA (val); + coding->safe_charsets = SDATA (val); coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs)); + coding->carryover_bytes = 0; coding_type = CODING_ATTR_TYPE (attrs); if (EQ (coding_type, Qundecided)) @@ -5341,9 +5867,13 @@ setup_coding_system (coding_system, coding) setup_iso_safe_charsets (attrs); val = CODING_ATTR_SAFE_CHARSETS (attrs); coding->max_charset_id = SCHARS (val) - 1; - coding->safe_charsets = (char *) SDATA (val); + coding->safe_charsets = SDATA (val); } CODING_ISO_FLAGS (coding) = flags; + CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO; + CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO; + CODING_ISO_EXTSEGMENT_LEN (coding) = 0; + CODING_ISO_EMBEDDED_UTF_8 (coding) = 0; } else if (EQ (coding_type, Qcharset)) { @@ -5401,6 +5931,7 @@ setup_coding_system (coding_system, coding) coding->encoder = encode_coding_emacs_mule; coding->common_flags |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK); + coding->spec.emacs_mule.full_support = 1; if (! NILP (AREF (attrs, coding_attr_emacs_mule_full)) && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list)) { @@ -5411,14 +5942,17 @@ setup_coding_system (coding_system, coding) tail = XCDR (tail)) if (max_charset_id < XFASTINT (XCAR (tail))) max_charset_id = XFASTINT (XCAR (tail)); - safe_charsets = Fmake_string (make_number (max_charset_id + 1), - make_number (255)); + safe_charsets = make_uninit_string (max_charset_id + 1); + memset (SDATA (safe_charsets), 255, max_charset_id + 1); for (tail = Vemacs_mule_charset_list; CONSP (tail); tail = XCDR (tail)) SSET (safe_charsets, XFASTINT (XCAR (tail)), 0); coding->max_charset_id = max_charset_id; - coding->safe_charsets = (char *) SDATA (safe_charsets); + coding->safe_charsets = SDATA (safe_charsets); + coding->spec.emacs_mule.full_support = 1; } + coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO; + coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO; } else if (EQ (coding_type, Qshift_jis)) { @@ -5477,6 +6011,39 @@ coding_charset_list (coding) } +/* Return a list of charsets supported by CODING-SYSTEM. */ + +Lisp_Object +coding_system_charset_list (coding_system) + Lisp_Object coding_system; +{ + int id; + Lisp_Object attrs, charset_list; + + CHECK_CODING_SYSTEM_GET_ID (coding_system, id); + attrs = CODING_ID_ATTRS (id); + + if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022)) + { + int flags = XINT (AREF (attrs, coding_attr_iso_flags)); + + if (flags & CODING_ISO_FLAG_FULL_SUPPORT) + charset_list = Viso_2022_charset_list; + else + charset_list = CODING_ATTR_CHARSET_LIST (attrs); + } + else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule)) + { + charset_list = Vemacs_mule_charset_list; + } + else + { + charset_list = CODING_ATTR_CHARSET_LIST (attrs); + } + return charset_list; +} + + /* Return raw-text or one of its subsidiaries that has the same eol_type as CODING-SYSTEM. */ @@ -5506,10 +6073,9 @@ raw_text_coding_system (coding_system) } -/* If CODING_SYSTEM doesn't specify end-of-line format but PARENT - does, return one of the subsidiary that has the same eol-spec as - PARENT. Otherwise, return CODING_SYSTEM. If PARENT is nil, - inherit end-of-line format from the system's setting +/* If CODING_SYSTEM doesn't specify end-of-line format, return one of + the subsidiary that has the same eol-spec as PARENT (if it is not + nil and specifies end-of-line format) or the system's setting (system_eol_type). */ Lisp_Object @@ -5532,6 +6098,8 @@ coding_inherit_eol_type (coding_system, parent) parent_spec = CODING_SYSTEM_SPEC (parent); parent_eol_type = AREF (parent_spec, 2); + if (VECTORP (parent_eol_type)) + parent_eol_type = system_eol_type; } else parent_eol_type = system_eol_type; @@ -5545,6 +6113,46 @@ coding_inherit_eol_type (coding_system, parent) return coding_system; } + +/* Check if text-conversion and eol-conversion of CODING_SYSTEM are + decided for writing to a process. If not, complement them, and + return a new coding system. */ + +Lisp_Object +complement_process_encoding_system (coding_system) + Lisp_Object coding_system; +{ + Lisp_Object coding_base = Qnil, eol_base = Qnil; + Lisp_Object spec, attrs; + int i; + + for (i = 0; i < 3; i++) + { + if (i == 1) + coding_system = CDR_SAFE (Vdefault_process_coding_system); + else if (i == 2) + coding_system = preferred_coding_system (); + spec = CODING_SYSTEM_SPEC (coding_system); + if (NILP (spec)) + continue; + attrs = AREF (spec, 0); + if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided)) + coding_base = CODING_ATTR_BASE_NAME (attrs); + if (NILP (eol_base) && ! VECTORP (AREF (spec, 2))) + eol_base = coding_system; + if (! NILP (coding_base) && ! NILP (eol_base)) + break; + } + + if (i > 0) + /* The original CODING_SYSTEM didn't specify text-conversion or + eol-conversion. Be sure that we return a fully complemented + coding system. */ + coding_system = coding_inherit_eol_type (coding_base, eol_base); + return coding_system; +} + + /* Emacs has a mechanism to automatically detect a coding system if it is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But, it's impossible to distinguish some coding systems accurately @@ -5595,14 +6203,14 @@ coding_inherit_eol_type (coding_system, parent) o coding-category-iso-7-else The category for a coding system which has the same code range - as ISO2022 of 7-bit environemnt but uses locking shift or + as ISO2022 of 7-bit environment but uses locking shift or single shift functions. Assigned the coding-system (Lisp symbol) `iso-2022-7bit-lock' by default. o coding-category-iso-8-else The category for a coding system which has the same code range - as ISO2022 of 8-bit environemnt but uses locking shift or + as ISO2022 of 8-bit environment but uses locking shift or single shift functions. Assigned the coding-system (Lisp symbol) `iso-2022-8bit-ss2' by default. @@ -5701,16 +6309,26 @@ detect_eol (source, src_bytes, category) || src[lsb + 2] != '\n') this_eol = EOL_SEEN_CR; else - this_eol = EOL_SEEN_CRLF; + { + this_eol = EOL_SEEN_CRLF; + src += 2; + } if (eol_seen == EOL_SEEN_NONE) /* This is the first end-of-line. */ eol_seen = this_eol; else if (eol_seen != this_eol) { - /* The found type is different from what found before. */ - eol_seen = EOL_SEEN_LF; - break; + /* The found type is different from what found before. + Allow for stray ^M characters in DOS EOL files. */ + if (eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF + || eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR) + eol_seen = EOL_SEEN_CRLF; + else + { + eol_seen = EOL_SEEN_LF; + break; + } } if (++total == MAX_EOL_CHECK_COUNT) break; @@ -5739,9 +6357,16 @@ detect_eol (source, src_bytes, category) eol_seen = this_eol; else if (eol_seen != this_eol) { - /* The found type is different from what found before. */ - eol_seen = EOL_SEEN_LF; - break; + /* The found type is different from what found before. + Allow for stray ^M characters in DOS EOL files. */ + if (eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF + || eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR) + eol_seen = EOL_SEEN_CRLF; + else + { + eol_seen = EOL_SEEN_LF; + break; + } } if (++total == MAX_EOL_CHECK_COUNT) break; @@ -5787,6 +6412,7 @@ detect_coding (coding) struct coding_system *coding; { const unsigned char *src, *src_end; + int saved_mode = coding->mode; coding->consumed = coding->consumed_char = 0; coding->produced = coding->produced_char = 0; @@ -5835,7 +6461,7 @@ detect_coding (coding) break; } } - else if (! c) + else if (! c && !inhibit_null_byte_detection) { null_byte_found = 1; if (eight_bit_found) @@ -5957,6 +6583,7 @@ detect_coding (coding) setup_coding_system (XCDR (coding_systems), coding); } } + coding->mode = saved_mode; } @@ -5968,7 +6595,7 @@ decode_eol (coding) unsigned char *p, *pbeg, *pend; eol_type = CODING_ID_EOL_TYPE (coding->id); - if (EQ (eol_type, Qunix)) + if (EQ (eol_type, Qunix) || inhibit_eol_conversion) return; if (NILP (coding->dst_object)) @@ -5996,7 +6623,12 @@ decode_eol (coding) eol_seen |= EOL_SEEN_CR; } } - if (eol_seen != EOL_SEEN_NONE + /* Handle DOS-style EOLs in a file with stray ^M characters. */ + if ((eol_seen & EOL_SEEN_CRLF) != 0 + && (eol_seen & EOL_SEEN_CR) != 0 + && (eol_seen & EOL_SEEN_LF) == 0) + eol_seen = EOL_SEEN_CRLF; + else if (eol_seen != EOL_SEEN_NONE && eol_seen != EOL_SEEN_LF && eol_seen != EOL_SEEN_CRLF && eol_seen != EOL_SEEN_CR) @@ -6066,6 +6698,12 @@ get_translation_table (attrs, encodep, max_lookup) Lisp_Object standard, translation_table; Lisp_Object val; + if (NILP (Venable_character_translation)) + { + if (max_lookup) + *max_lookup = 0; + return Qnil; + } if (encodep) translation_table = CODING_ATTR_ENCODE_TBL (attrs), standard = Vstandard_translation_table_for_encode; @@ -6149,51 +6787,39 @@ get_translation_table (attrs, encodep, max_lookup) } while (0) +/* Return a translation of character(s) at BUF according to TRANS. + TRANS is TO-CHAR or ((FROM . TO) ...) where + FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...]. + The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a + translation is found, and Qnil if not found.. + If BUF is too short to lookup characters in FROM, return Qt. */ + static Lisp_Object -get_translation (val, buf, buf_end, last_block, from_nchars, to_nchars) - Lisp_Object val; +get_translation (trans, buf, buf_end) + Lisp_Object trans; int *buf, *buf_end; - int last_block; - int *from_nchars, *to_nchars; { - /* VAL is TO or (([FROM-CHAR ...] . TO) ...) where TO is TO-CHAR or - [TO-CHAR ...]. */ - if (CONSP (val)) + + if (INTEGERP (trans)) + return trans; + for (; CONSP (trans); trans = XCDR (trans)) { - Lisp_Object from, tail; - int i, len; + Lisp_Object val = XCAR (trans); + Lisp_Object from = XCAR (val); + int len = ASIZE (from); + int i; - for (tail = val; CONSP (tail); tail = XCDR (tail)) + for (i = 0; i < len; i++) { - val = XCAR (tail); - from = XCAR (val); - len = ASIZE (from); - for (i = 0; i < len; i++) - { - if (buf + i == buf_end) - { - if (! last_block) - return Qt; - break; - } - if (XINT (AREF (from, i)) != buf[i]) - break; - } - if (i == len) - { - val = XCDR (val); - *from_nchars = len; - break; - } + if (buf + i == buf_end) + return Qt; + if (XINT (AREF (from, i)) != buf[i]) + break; } - if (! CONSP (tail)) - return Qnil; + if (i == len) + return val; } - if (VECTORP (val)) - *buf = XINT (AREF (val, 0)), *to_nchars = ASIZE (val); - else - *buf = XINT (val); - return val; + return Qnil; } @@ -6233,11 +6859,23 @@ produce_chars (coding, translation_table, last_block) LOOKUP_TRANSLATION_TABLE (translation_table, c, trans); if (! NILP (trans)) { - trans = get_translation (trans, buf, buf_end, last_block, - &from_nchars, &to_nchars); - if (EQ (trans, Qt)) + trans = get_translation (trans, buf, buf_end); + if (INTEGERP (trans)) + c = XINT (trans); + else if (CONSP (trans)) + { + from_nchars = ASIZE (XCAR (trans)); + trans = XCDR (trans); + if (INTEGERP (trans)) + c = XINT (trans); + else + { + to_nchars = ASIZE (trans); + c = XINT (AREF (trans, 0)); + } + } + else if (EQ (trans, Qt) && ! last_block) break; - c = *buf; } if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end) @@ -6249,7 +6887,8 @@ produce_chars (coding, translation_table, last_block) if (EQ (coding->src_object, coding->dst_object)) { coding_set_source (coding); - dst_end = ((unsigned char *) coding->source) + coding->consumed; + dst_end = (((unsigned char *) coding->source) + + coding->consumed); } else dst_end = coding->destination + coding->dst_bytes; @@ -6266,9 +6905,7 @@ produce_chars (coding, translation_table, last_block) *dst++ = CHAR_TO_BYTE8 (c); } produced_chars += to_nchars; - *buf++ = to_nchars; - while (--from_nchars > 0) - *buf++ = 0; + buf += from_nchars; } else /* This is an annotation datum. (-C) is the length. */ @@ -6289,7 +6926,7 @@ produce_chars (coding, translation_table, last_block) if (coding->src_multibyte) { int multibytep = 1; - EMACS_INT consumed_chars; + EMACS_INT consumed_chars = 0; while (1) { @@ -6384,7 +7021,7 @@ produce_chars (coding, translation_table, last_block) /* Compose text in CODING->object according to the annotation data at CHARBUF. CHARBUF is an array: - [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ] + [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ] */ static INLINE void @@ -6398,33 +7035,33 @@ produce_composition (coding, charbuf, pos) enum composition_method method; Lisp_Object components; - len = -charbuf[0]; + len = -charbuf[0] - MAX_ANNOTATION_LENGTH; to = pos + charbuf[2]; - if (to <= pos) - return; - method = (enum composition_method) (charbuf[3]); + method = (enum composition_method) (charbuf[4]); if (method == COMPOSITION_RELATIVE) components = Qnil; - else if (method >= COMPOSITION_WITH_RULE - && method <= COMPOSITION_WITH_RULE_ALTCHARS) + else { Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1]; - int i; + int i, j; - len -= 4; - charbuf += 4; - for (i = 0; i < len; i++) + if (method == COMPOSITION_WITH_RULE) + len = charbuf[2] * 3 - 2; + charbuf += MAX_ANNOTATION_LENGTH; + /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */ + for (i = j = 0; i < len && charbuf[i] != -1; i++, j++) { - args[i] = make_number (charbuf[i]); - if (charbuf[i] < 0) - return; + if (charbuf[i] >= 0) + args[j] = make_number (charbuf[i]); + else + { + i++; + args[j] = make_number (charbuf[i] % 0x100); + } } - components = (method == COMPOSITION_WITH_ALTCHARS - ? Fstring (len, args) : Fvector (len, args)); + components = (i == j ? Fstring (j, args) : Fvector (j, args)); } - else - return; compose_text (pos, to, components, Qnil, coding->dst_object); } @@ -6453,7 +7090,7 @@ produce_charset (coding, charbuf, pos) #define ALLOC_CONVERSION_WORK_AREA(coding) \ do { \ - int size = CHARBUF_SIZE;; \ + int size = CHARBUF_SIZE; \ \ coding->charbuf = NULL; \ while (size > 1024) \ @@ -6486,21 +7123,21 @@ produce_annotation (coding, pos) while (charbuf < charbuf_end) { if (*charbuf >= 0) - pos += *charbuf++; + pos++, charbuf++; else { int len = -*charbuf; - switch (charbuf[1]) - { - case CODING_ANNOTATE_COMPOSITION_MASK: - produce_composition (coding, charbuf, pos); - break; - case CODING_ANNOTATE_CHARSET_MASK: - produce_charset (coding, charbuf, pos); - break; - default: - abort (); - } + + if (len > 2) + switch (charbuf[1]) + { + case CODING_ANNOTATE_COMPOSITION_MASK: + produce_composition (coding, charbuf, pos); + break; + case CODING_ANNOTATE_CHARSET_MASK: + produce_charset (coding, charbuf, pos); + break; + } charbuf += len; } } @@ -6536,6 +7173,7 @@ decode_coding (coding) Lisp_Object attrs; Lisp_Object undo_list; Lisp_Object translation_table; + struct ccl_spec cclspec; int carryover; int i; @@ -6568,6 +7206,11 @@ decode_coding (coding) translation_table = get_translation_table (attrs, 0, NULL); carryover = 0; + if (coding->decoder == decode_coding_ccl) + { + coding->spec.ccl = &cclspec; + setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding)); + } do { EMACS_INT pos = coding->dst_pos + coding->produced_char; @@ -6584,9 +7227,10 @@ decode_coding (coding) coding->charbuf[i] = coding->charbuf[coding->charbuf_used - carryover + i]; } - while (coding->consumed < coding->src_bytes - && (coding->result == CODING_RESULT_SUCCESS - || coding->result == CODING_RESULT_INVALID_SRC)); + while (coding->result == CODING_RESULT_INSUFFICIENT_DST + || (coding->consumed < coding->src_bytes + && (coding->result == CODING_RESULT_SUCCESS + || coding->result == CODING_RESULT_INVALID_SRC))); if (carryover > 0) { @@ -6611,6 +7255,8 @@ decode_coding (coding) that the number of data is less than the size of coding->charbuf. */ coding->charbuf_used = 0; + coding->chars_at_source = 0; + while (nbytes-- > 0) { int c = *src++; @@ -6628,6 +7274,8 @@ decode_coding (coding) coding->carryover. */ unsigned char *p = coding->carryover; + if (nbytes > sizeof coding->carryover) + nbytes = sizeof coding->carryover; coding->carryover_bytes = nbytes; while (nbytes-- > 0) *p++ = *src++; @@ -6635,7 +7283,8 @@ decode_coding (coding) coding->consumed = coding->src_bytes; } - if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)) + if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix) + && !inhibit_eol_conversion) decode_eol (coding); if (BUFFERP (coding->dst_object)) { @@ -6681,7 +7330,7 @@ handle_composition_annotation (pos, limit, coding, buf, stop) enum composition_method method = COMPOSITION_METHOD (prop); int nchars = COMPOSITION_LENGTH (prop); - ADD_COMPOSITION_DATA (buf, nchars, method); + ADD_COMPOSITION_DATA (buf, nchars, 0, method); if (method != COMPOSITION_RELATIVE) { Lisp_Object components; @@ -6690,7 +7339,7 @@ handle_composition_annotation (pos, limit, coding, buf, stop) components = COMPOSITION_COMPONENTS (prop); if (VECTORP (components)) { - len = XVECTOR (components)->size; + len = XVECTOR_SIZE (components); for (i = 0; i < len; i++) *buf++ = XINT (AREF (components, i)); } @@ -6786,7 +7435,7 @@ consume_chars (coding, translation_table, max_lookup) if (! NILP (translation_table)) lookup_buf = alloca (sizeof (int) * max_lookup); - eol_type = CODING_ID_EOL_TYPE (coding->id); + eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id); if (VECTORP (eol_type)) eol_type = Qunix; @@ -6831,7 +7480,8 @@ consume_chars (coding, translation_table, max_lookup) { EMACS_INT bytes; - if (coding->encoder == encode_coding_raw_text) + if (coding->encoder == encode_coding_raw_text + || coding->encoder == encode_coding_ccl) c = *src++, pos++; else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0) c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes; @@ -6868,12 +7518,26 @@ consume_chars (coding, translation_table, max_lookup) for (i = 1; i < max_lookup && p < src_end; i++) lookup_buf[i] = STRING_CHAR_ADVANCE (p); lookup_buf_end = lookup_buf + i; - trans = get_translation (trans, lookup_buf, lookup_buf_end, 1, - &from_nchars, &to_nchars); - if (EQ (trans, Qt) - || buf + to_nchars > buf_end) + trans = get_translation (trans, lookup_buf, lookup_buf_end); + if (INTEGERP (trans)) + c = XINT (trans); + else if (CONSP (trans)) + { + from_nchars = ASIZE (XCAR (trans)); + trans = XCDR (trans); + if (INTEGERP (trans)) + c = XINT (trans); + else + { + to_nchars = ASIZE (trans); + if (buf + to_nchars > buf_end) + break; + c = XINT (AREF (trans, 0)); + } + } + else break; - *buf++ = *lookup_buf; + *buf++ = c; for (i = 1; i < to_nchars; i++) *buf++ = XINT (AREF (trans, i)); for (i = 1; i < from_nchars; i++, pos++) @@ -6916,6 +7580,7 @@ encode_coding (coding) Lisp_Object attrs; Lisp_Object translation_table; int max_lookup; + struct ccl_spec cclspec; attrs = CODING_ID_ATTRS (coding->id); if (coding->encoder == encode_coding_raw_text) @@ -6937,6 +7602,11 @@ encode_coding (coding) ALLOC_CONVERSION_WORK_AREA (coding); + if (coding->encoder == encode_coding_ccl) + { + coding->spec.ccl = &cclspec; + setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding)); + } do { coding_set_source (coding); consume_chars (coding, translation_table, max_lookup); @@ -6965,7 +7635,7 @@ static Lisp_Object Vcode_conversion_reused_workbuf; static int reused_workbuf_in_use; -/* Return a working buffer of code convesion. MULTIBYTE specifies the +/* Return a working buffer of code conversion. MULTIBYTE specifies the multibyteness of returning buffer. */ static Lisp_Object @@ -6982,13 +7652,17 @@ make_conversion_work_buffer (multibyte) } else { - if (NILP (Vcode_conversion_reused_workbuf)) + if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf))) Vcode_conversion_reused_workbuf = Fget_buffer_create (Vcode_conversion_workbuf_name); workbuf = Vcode_conversion_reused_workbuf; } current = current_buffer; set_buffer_internal (XBUFFER (workbuf)); + /* We can't allow modification hooks to run in the work buffer. For + instance, directory_files_internal assumes that file decoding + doesn't compile new regexps. */ + Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt); Ferase_buffer (); current_buffer->undo_list = Qt; current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil; @@ -7266,7 +7940,7 @@ decode_coding_object (coding, src_object, from, from_byte, to, to_byte, if (! destination) { record_conversion_result (coding, - CODING_RESULT_INSUFFICIENT_DST); + CODING_RESULT_INSUFFICIENT_MEM); unbind_to (count, Qnil); return; } @@ -7624,7 +8298,7 @@ function `define-coding-system'. */) /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If HIGHEST is nonzero, return the coding system of the highest - priority among the detected coding systems. Otherwize return a + priority among the detected coding systems. Otherwise return a list of detected coding systems sorted by their priorities. If MULTIBYTEP is nonzero, it is assumed that the bytes are in correct multibyte form but contains only ASCII and eight-bit chars. @@ -7649,7 +8323,7 @@ detect_coding_system (src, src_chars, src_bytes, highest, multibytep, { const unsigned char *src_end = src + src_bytes; Lisp_Object attrs, eol_type; - Lisp_Object val; + Lisp_Object val = Qnil; struct coding_system coding; int id; struct coding_detection_info detect_info; @@ -7713,7 +8387,7 @@ detect_coding_system (src, src_chars, src_bytes, highest, multibytep, break; } } - else if (! c) + else if (! c && !inhibit_null_byte_detection) { null_byte_found = 1; if (eight_bit_found) @@ -7781,10 +8455,11 @@ detect_coding_system (src, src_chars, src_bytes, highest, multibytep, } } - if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY) + if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY + || null_byte_found) { detect_info.found = CATEGORY_MASK_RAW_TEXT; - id = coding_categories[coding_category_raw_text].id; + id = CODING_SYSTEM_ID (Qno_conversion); val = Fcons (make_number (id), Qnil); } else if (! detect_info.rejected && ! detect_info.found) @@ -7814,7 +8489,6 @@ detect_coding_system (src, src_chars, src_bytes, highest, multibytep, { int mask = detect_info.rejected | detect_info.found; int found = 0; - val = Qnil; for (i = coding_category_raw_text - 1; i >= 0; i--) { @@ -7877,7 +8551,7 @@ detect_coding_system (src, src_chars, src_bytes, highest, multibytep, /* Then, detect eol-format if necessary. */ { - int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol; + int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1; Lisp_Object tail; if (VECTORP (eol_type)) @@ -7943,7 +8617,7 @@ detect_coding_system (src, src_chars, src_bytes, highest, multibytep, } } - return (highest ? XCAR (val) : val); + return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val); } @@ -7951,6 +8625,8 @@ DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region, 2, 3, 0, doc: /* Detect coding system of the text in the region between START and END. Return a list of possible coding systems ordered by priority. +The coding systems to try and their priorities follows what +the function `coding-system-priority-list' (which see) returns. If only ASCII characters are found (except for such ISO-2022 control characters as ESC), it returns a list of single element `undecided' @@ -7988,6 +8664,8 @@ DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string, 1, 2, 0, doc: /* Detect coding system of the text in STRING. Return a list of possible coding systems ordered by priority. +The coding systems to try and their priorities follows what +the function `coding-system-priority-list' (which see) returns. If only ASCII characters are found (except for such ISO-2022 control characters as ESC), it returns a list of single element `undecided' @@ -8048,7 +8726,7 @@ DEFUN ("find-coding-systems-region-internal", EMACS_INT start_byte, end_byte; const unsigned char *p, *pbeg, *pend; int c; - Lisp_Object tail, elt; + Lisp_Object tail, elt, work_table; if (STRINGP (start)) { @@ -8106,6 +8784,7 @@ DEFUN ("find-coding-systems-region-internal", while (p < pend && ASCII_BYTE_P (*p)) p++; while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--; + work_table = Fmake_char_table (Qnil, Qnil); while (p < pend) { if (ASCII_BYTE_P (*p)) @@ -8113,6 +8792,9 @@ DEFUN ("find-coding-systems-region-internal", else { c = STRING_CHAR_ADVANCE (p); + if (!NILP (char_table_ref (work_table, c))) + /* This character was already checked. Ignore it. */ + continue; charset_map_loaded = 0; for (tail = coding_attrs_list; CONSP (tail);) @@ -8144,6 +8826,7 @@ DEFUN ("find-coding-systems-region-internal", p = pbeg + p_offset; pend = pbeg + pend_offset; } + char_table_set (work_table, c, Qt); } } @@ -8282,7 +8965,10 @@ value is nil. START may be a string. In that case, check if the string is encodable, and the value contains indices to the string instead of -buffer positions. END is ignored. */) +buffer positions. END is ignored. + +If the current buffer (or START if it is a string) is unibyte, the value +is nil. */) (start, end, coding_system_list) Lisp_Object start, end, coding_system_list; { @@ -8296,7 +8982,7 @@ buffer positions. END is ignored. */) if (STRINGP (start)) { if (!STRING_MULTIBYTE (start) - && SCHARS (start) != SBYTES (start)) + || SCHARS (start) == SBYTES (start)) return Qnil; start_byte = 0; end_byte = SBYTES (start); @@ -8313,7 +8999,7 @@ buffer positions. END is ignored. */) start_byte = CHAR_TO_BYTE (XINT (start)); end_byte = CHAR_TO_BYTE (XINT (end)); if (XINT (end) - XINT (start) == end_byte - start_byte) - return Qt; + return Qnil; if (XINT (start) < GPT && XINT (end) > GPT) { @@ -8737,7 +9423,7 @@ DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_intern setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding); /* We had better not send unsafe characters to terminal. */ terminal_coding->mode |= CODING_MODE_SAFE_ENCODING; - /* Characer composition should be disabled. */ + /* Character composition should be disabled. */ terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK; terminal_coding->src_multibyte = 1; terminal_coding->dst_multibyte = 0; @@ -8754,7 +9440,7 @@ DEFUN ("set-safe-terminal-coding-system-internal", CHECK_SYMBOL (coding_system); setup_coding_system (Fcheck_coding_system (coding_system), &safe_terminal_coding); - /* Characer composition should be disabled. */ + /* Character composition should be disabled. */ safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK; safe_terminal_coding.src_multibyte = 1; safe_terminal_coding.dst_multibyte = 0; @@ -8764,7 +9450,7 @@ DEFUN ("set-safe-terminal-coding-system-internal", DEFUN ("terminal-coding-system", Fterminal_coding_system, Sterminal_coding_system, 0, 1, 0, doc: /* Return coding system specified for terminal output on the given terminal. -TERMINAL may be a terminal id, a frame, or nil for the selected +TERMINAL may be a terminal object, a frame, or nil for the selected frame's terminal device. */) (terminal) Lisp_Object terminal; @@ -8786,9 +9472,12 @@ DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_intern { struct terminal *t = get_terminal (terminal, 1); CHECK_SYMBOL (coding_system); - setup_coding_system (Fcheck_coding_system (coding_system), - TERMINAL_KEYBOARD_CODING (t)); - /* Characer composition should be disabled. */ + if (NILP (coding_system)) + coding_system = Qno_conversion; + else + Fcheck_coding_system (coding_system); + setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t)); + /* Character composition should be disabled. */ TERMINAL_KEYBOARD_CODING (t)->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK; return Qnil; @@ -8982,6 +9671,9 @@ usage: (set-coding-system-priority &rest coding-systems) */) DEFUN ("coding-system-priority-list", Fcoding_system_priority_list, Scoding_system_priority_list, 0, 1, 0, doc: /* Return a list of coding systems ordered by their priorities. +The list contains a subset of coding systems; i.e. coding systems +assigned to each coding category (see `coding-category-list'). + HIGHESTP non-nil means just return the highest priority one. */) (highestp) Lisp_Object highestp; @@ -9005,7 +9697,7 @@ HIGHESTP non-nil means just return the highest priority one. */) return Fnreverse (val); } -static char *suffixes[] = { "-unix", "-dos", "-mac" }; +static const char *const suffixes[] = { "-unix", "-dos", "-mac" }; static Lisp_Object make_subsidiaries (base) @@ -9107,8 +9799,8 @@ usage: (define-coding-system-internal ...) */) } CODING_ATTR_CHARSET_LIST (attrs) = charset_list; - safe_charsets = Fmake_string (make_number (max_charset_id + 1), - make_number (255)); + safe_charsets = make_uninit_string (max_charset_id + 1); + memset (SDATA (safe_charsets), 255, max_charset_id + 1); for (tail = charset_list; CONSP (tail); tail = XCDR (tail)) SSET (safe_charsets, XFASTINT (XCAR (tail)), 0); CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets; @@ -9162,7 +9854,7 @@ usage: (define-coding-system-internal ...) */) If Nth element is a list of charset IDs, N is the first byte of one of them. The list is sorted by dimensions of the - charsets. A charset of smaller dimension comes firtst. */ + charsets. A charset of smaller dimension comes first. */ val = Fmake_vector (make_number (256), Qnil); for (tail = charset_list; CONSP (tail); tail = XCDR (tail)) @@ -9575,7 +10267,7 @@ DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put, CHECK_CHARACTER (val); CODING_ATTR_MNEMONIC (attrs) = val; } - else if (EQ (prop, QCdefalut_char)) + else if (EQ (prop, QCdefault_char)) { if (NILP (val)) val = make_number (' '); @@ -9799,7 +10491,7 @@ syms_of_coding () Vcode_conversion_reused_workbuf = Qnil; staticpro (&Vcode_conversion_workbuf_name); - Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*"); + Vcode_conversion_workbuf_name = make_pure_c_string (" *code-conversion-work*"); reused_workbuf_in_use = 0; @@ -9860,14 +10552,14 @@ syms_of_coding () DEFSYM (Qcoding_system_error, "coding-system-error"); Fput (Qcoding_system_error, Qerror_conditions, - Fcons (Qcoding_system_error, Fcons (Qerror, Qnil))); + pure_cons (Qcoding_system_error, pure_cons (Qerror, Qnil))); Fput (Qcoding_system_error, Qerror_message, - build_string ("Invalid coding system")); + make_pure_c_string ("Invalid coding system")); /* Intern this now in case it isn't already done. Setting this variable twice is harmless. But don't staticpro it here--that is done in alloc.c. */ - Qchar_table_extra_slots = intern ("char-table-extra-slots"); + Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots"); DEFSYM (Qtranslation_table, "translation-table"); Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2)); @@ -9881,7 +10573,7 @@ syms_of_coding () DEFSYM (QCcategory, ":category"); DEFSYM (QCmnemonic, ":mnemonic"); - DEFSYM (QCdefalut_char, ":default-char"); + DEFSYM (QCdefault_char, ":default-char"); DEFSYM (QCdecode_translation_table, ":decode-translation-table"); DEFSYM (QCencode_translation_table, ":encode-translation-table"); DEFSYM (QCpost_read_conversion, ":post-read-conversion"); @@ -9893,48 +10585,48 @@ syms_of_coding () staticpro (&Vcoding_category_table); /* Followings are target of code detection. */ ASET (Vcoding_category_table, coding_category_iso_7, - intern ("coding-category-iso-7")); + intern_c_string ("coding-category-iso-7")); ASET (Vcoding_category_table, coding_category_iso_7_tight, - intern ("coding-category-iso-7-tight")); + intern_c_string ("coding-category-iso-7-tight")); ASET (Vcoding_category_table, coding_category_iso_8_1, - intern ("coding-category-iso-8-1")); + intern_c_string ("coding-category-iso-8-1")); ASET (Vcoding_category_table, coding_category_iso_8_2, - intern ("coding-category-iso-8-2")); + intern_c_string ("coding-category-iso-8-2")); ASET (Vcoding_category_table, coding_category_iso_7_else, - intern ("coding-category-iso-7-else")); + intern_c_string ("coding-category-iso-7-else")); ASET (Vcoding_category_table, coding_category_iso_8_else, - intern ("coding-category-iso-8-else")); + intern_c_string ("coding-category-iso-8-else")); ASET (Vcoding_category_table, coding_category_utf_8_auto, - intern ("coding-category-utf-8-auto")); + intern_c_string ("coding-category-utf-8-auto")); ASET (Vcoding_category_table, coding_category_utf_8_nosig, - intern ("coding-category-utf-8")); + intern_c_string ("coding-category-utf-8")); ASET (Vcoding_category_table, coding_category_utf_8_sig, - intern ("coding-category-utf-8-sig")); + intern_c_string ("coding-category-utf-8-sig")); ASET (Vcoding_category_table, coding_category_utf_16_be, - intern ("coding-category-utf-16-be")); + intern_c_string ("coding-category-utf-16-be")); ASET (Vcoding_category_table, coding_category_utf_16_auto, - intern ("coding-category-utf-16-auto")); + intern_c_string ("coding-category-utf-16-auto")); ASET (Vcoding_category_table, coding_category_utf_16_le, - intern ("coding-category-utf-16-le")); + intern_c_string ("coding-category-utf-16-le")); ASET (Vcoding_category_table, coding_category_utf_16_be_nosig, - intern ("coding-category-utf-16-be-nosig")); + intern_c_string ("coding-category-utf-16-be-nosig")); ASET (Vcoding_category_table, coding_category_utf_16_le_nosig, - intern ("coding-category-utf-16-le-nosig")); + intern_c_string ("coding-category-utf-16-le-nosig")); ASET (Vcoding_category_table, coding_category_charset, - intern ("coding-category-charset")); + intern_c_string ("coding-category-charset")); ASET (Vcoding_category_table, coding_category_sjis, - intern ("coding-category-sjis")); + intern_c_string ("coding-category-sjis")); ASET (Vcoding_category_table, coding_category_big5, - intern ("coding-category-big5")); + intern_c_string ("coding-category-big5")); ASET (Vcoding_category_table, coding_category_ccl, - intern ("coding-category-ccl")); + intern_c_string ("coding-category-ccl")); ASET (Vcoding_category_table, coding_category_emacs_mule, - intern ("coding-category-emacs-mule")); + intern_c_string ("coding-category-emacs-mule")); /* Followings are NOT target of code detection. */ ASET (Vcoding_category_table, coding_category_raw_text, - intern ("coding-category-raw-text")); + intern_c_string ("coding-category-raw-text")); ASET (Vcoding_category_table, coding_category_undecided, - intern ("coding-category-undecided")); + intern_c_string ("coding-category-undecided")); DEFSYM (Qinsufficient_source, "insufficient-source"); DEFSYM (Qinconsistent_eol, "inconsistent-eol"); @@ -10135,22 +10827,22 @@ Also used for decoding keyboard input on X Window system. */); DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix, doc: /* *String displayed in mode line for UNIX-like (LF) end-of-line format. */); - eol_mnemonic_unix = build_string (":"); + eol_mnemonic_unix = make_pure_c_string (":"); DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos, doc: /* *String displayed in mode line for DOS-like (CRLF) end-of-line format. */); - eol_mnemonic_dos = build_string ("\\"); + eol_mnemonic_dos = make_pure_c_string ("\\"); DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac, doc: /* *String displayed in mode line for MAC-like (CR) end-of-line format. */); - eol_mnemonic_mac = build_string ("/"); + eol_mnemonic_mac = make_pure_c_string ("/"); DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided, doc: /* *String displayed in mode line when end-of-line format is not yet determined. */); - eol_mnemonic_undecided = build_string (":"); + eol_mnemonic_undecided = make_pure_c_string (":"); DEFVAR_LISP ("enable-character-translation", &Venable_character_translation, doc: /* @@ -10219,18 +10911,18 @@ called even if `coding-system-for-write' is non-nil. The command DEFVAR_BOOL ("inhibit-iso-escape-detection", &inhibit_iso_escape_detection, doc: /* -If non-nil, Emacs ignores ISO2022's escape sequence on code detection. +If non-nil, Emacs ignores ISO-2022 escape sequences during code detection. -By default, on reading a file, Emacs tries to detect how the text is -encoded. This code detection is sensitive to escape sequences. If -the sequence is valid as ISO2022, the code is determined as one of -the ISO2022 encodings, and the file is decoded by the corresponding -coding system (e.g. `iso-2022-7bit'). +When Emacs reads text, it tries to detect how the text is encoded. +This code detection is sensitive to escape sequences. If Emacs sees +a valid ISO-2022 escape sequence, it assumes the text is encoded in one +of the ISO2022 encodings, and decodes text by the corresponding coding +system (e.g. `iso-2022-7bit'). However, there may be a case that you want to read escape sequences in a file as is. In such a case, you can set this variable to non-nil. -Then, as the code detection ignores any escape sequences, no file is -detected as encoded in some ISO2022 encoding. The result is that all +Then the code detection will ignore any escape sequences, and no text is +detected as encoded in some ISO-2022 encoding. The result is that all escape sequences become visible in a buffer. The default value is nil, and it is strongly recommended not to change @@ -10240,14 +10932,31 @@ in Emacs's distribution, and they won't be decoded correctly on reading if you suppress escape sequence detection. The other way to read escape sequences in a file without decoding is -to explicitly specify some coding system that doesn't use ISO2022's +to explicitly specify some coding system that doesn't use ISO-2022 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */); inhibit_iso_escape_detection = 0; + DEFVAR_BOOL ("inhibit-null-byte-detection", + &inhibit_null_byte_detection, + doc: /* If non-nil, Emacs ignores null bytes on code detection. +By default, Emacs treats it as binary data, and does not attempt to +decode it. The effect is as if you specified `no-conversion' for +reading that text. + +Set this to non-nil when a regular text happens to include null bytes. +Examples are Index nodes of Info files and null-byte delimited output +from GNU Find and GNU Grep. Emacs will then ignore the null bytes and +decode text as usual. */); + inhibit_null_byte_detection = 0; + DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input, doc: /* Char table for translating self-inserting characters. This is applied to the result of input methods, not their input. -See also `keyboard-translate-table'. */); +See also `keyboard-translate-table'. + +Use of this variable for character code unification was rendered +obsolete in Emacs 23.1 and later, since Unicode is now the basis of +internal character representation. */); Vtranslation_table_for_input = Qnil; { @@ -10258,25 +10967,25 @@ See also `keyboard-translate-table'. */); for (i = 0; i < coding_arg_max; i++) args[i] = Qnil; - plist[0] = intern (":name"); + plist[0] = intern_c_string (":name"); plist[1] = args[coding_arg_name] = Qno_conversion; - plist[2] = intern (":mnemonic"); + plist[2] = intern_c_string (":mnemonic"); plist[3] = args[coding_arg_mnemonic] = make_number ('='); - plist[4] = intern (":coding-type"); + plist[4] = intern_c_string (":coding-type"); plist[5] = args[coding_arg_coding_type] = Qraw_text; - plist[6] = intern (":ascii-compatible-p"); + plist[6] = intern_c_string (":ascii-compatible-p"); plist[7] = args[coding_arg_ascii_compatible_p] = Qt; - plist[8] = intern (":default-char"); + plist[8] = intern_c_string (":default-char"); plist[9] = args[coding_arg_default_char] = make_number (0); - plist[10] = intern (":for-unibyte"); + plist[10] = intern_c_string (":for-unibyte"); plist[11] = args[coding_arg_for_unibyte] = Qt; - plist[12] = intern (":docstring"); - plist[13] = build_string ("Do no conversion.\n\ + plist[12] = intern_c_string (":docstring"); + plist[13] = make_pure_c_string ("Do no conversion.\n\ \n\ When you visit a file with this coding, the file is read into a\n\ unibyte buffer as is, thus each byte of a file is treated as a\n\ character."); - plist[14] = intern (":eol-type"); + plist[14] = intern_c_string (":eol-type"); plist[15] = args[coding_arg_eol_type] = Qunix; args[coding_arg_plist] = Flist (16, plist); Fdefine_coding_system_internal (coding_arg_max, args); @@ -10286,10 +10995,10 @@ character."); plist[5] = args[coding_arg_coding_type] = Qundecided; /* This is already set. plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */ - plist[8] = intern (":charset-list"); + plist[8] = intern_c_string (":charset-list"); plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil); plist[11] = args[coding_arg_for_unibyte] = Qnil; - plist[13] = build_string ("No conversion on encoding, automatic conversion on decoding."); + plist[13] = make_pure_c_string ("No conversion on encoding, automatic conversion on decoding."); plist[15] = args[coding_arg_eol_type] = Qnil; args[coding_arg_plist] = Flist (16, plist); Fdefine_coding_system_internal (coding_arg_max, args);