/* Coding system handler (conversion, detection, etc).
Copyright (C) 2001, 2002, 2003, 2004, 2005,
- 2006, 2007, 2008 Free Software Foundation, Inc.
+ 2006, 2007, 2008, 2009 Free Software Foundation, Inc.
Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
- 2005, 2006, 2007, 2008
+ 2005, 2006, 2007, 2008, 2009
National Institute of Advanced Industrial Science and Technology (AIST)
Registration Number H14PRO021
Copyright (C) 2003
This file is part of GNU Emacs.
-GNU Emacs is free software; you can redistribute it and/or modify
+GNU Emacs is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 3, or (at your option)
-any later version.
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
GNU Emacs is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
-along with GNU Emacs; see the file COPYING. If not, write to
-the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
-Boston, MA 02110-1301, USA. */
+along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
/*** TABLE OF CONTENTS ***
Lisp_Object Qbig, Qlittle;
Lisp_Object Qcoding_system_history;
Lisp_Object Qvalid_codes;
-Lisp_Object QCcategory, QCmnemonic, QCdefalut_char;
+Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
Lisp_Object QCdecode_translation_table, QCencode_translation_table;
Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
Lisp_Object QCascii_compatible_p;
/* Flag to inhibit ISO2022 escape sequence detection. */
int inhibit_iso_escape_detection;
+/* Flag to inhibit detection of binary files through null bytes. */
+int inhibit_null_byte_detection;
+
/* Flag to make buffer-file-coding-system inherit from process-coding. */
int inherit_process_coding_system;
character is prohibited by CODING_ISO_FLAG_SAFE. */
#define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
+/* UTF-8 section.  CODING_UTF_8_BOM yields the utf_8_bom member of
+   CODING's spec.  It expands to an lvalue, so it can be used both to
+   read and to assign the value.  */
+#define CODING_UTF_8_BOM(coding) \
+ ((coding)->spec.utf_8_bom)
/* UTF-16 section */
#define CODING_UTF_16_BOM(coding) \
coding_category_iso_8_2,
coding_category_iso_7_else,
coding_category_iso_8_else,
- coding_category_utf_8,
+ coding_category_utf_8_auto,
+ coding_category_utf_8_nosig,
+ coding_category_utf_8_sig,
coding_category_utf_16_auto,
coding_category_utf_16_be,
coding_category_utf_16_le,
#define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
#define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
#define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
-#define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8)
+#define CATEGORY_MASK_UTF_8_AUTO (1 << coding_category_utf_8_auto)
+#define CATEGORY_MASK_UTF_8_NOSIG (1 << coding_category_utf_8_nosig)
+#define CATEGORY_MASK_UTF_8_SIG (1 << coding_category_utf_8_sig)
#define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
#define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
#define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
| CATEGORY_MASK_ISO_8_2 \
| CATEGORY_MASK_ISO_7_ELSE \
| CATEGORY_MASK_ISO_8_ELSE \
- | CATEGORY_MASK_UTF_8 \
+ | CATEGORY_MASK_UTF_8_AUTO \
+ | CATEGORY_MASK_UTF_8_NOSIG \
+ | CATEGORY_MASK_UTF_8_SIG \
| CATEGORY_MASK_UTF_16_AUTO \
| CATEGORY_MASK_UTF_16_BE \
| CATEGORY_MASK_UTF_16_LE \
| CATEGORY_MASK_UTF_16_BE_NOSIG \
| CATEGORY_MASK_UTF_16_LE_NOSIG)
+#define CATEGORY_MASK_UTF_8 \
+ (CATEGORY_MASK_UTF_8_AUTO \
+ | CATEGORY_MASK_UTF_8_NOSIG \
+ | CATEGORY_MASK_UTF_8_SIG) /* Union of the three UTF-8 category masks.  */
/* List of symbols `coding-category-xxx' ordered by priority. This
variable is exposed to Emacs Lisp. */
consumed_chars++; \
} while (0)
+/* Safely get two bytes from the source text pointed to by SRC, which
+   ends at SRC_END, and set C1 and C2 to those bytes.
+
+   If multibytep is nonzero, a byte with the high bit set is treated as
+   the head of a multibyte sequence:
+     - a head byte of 0xC0 or 0xC1 is combined with the byte that
+       follows it into a single value (presumably the two-byte form of
+       a raw 8-bit byte -- confirm against the character encoding
+       definitions);
+     - for any other head byte seen while reading C1, the whole
+       sequence is skipped (BYTES_BY_CHAR_HEAD gives its length) and
+       reading C1 is retried;
+     - for any other head byte seen while reading C2, C2 is set to -1
+       and the bytes that follow it are not skipped.
+
+   If the source is exhausted before C1 or C2 is read, jump to
+   `no_more_source'.  NOTE(review): the byte following a 0xC0/0xC1
+   head is read without comparing SRC against SRC_END; this appears to
+   rely on the source being well-formed multibyte text -- confirm.
+
+   The caller should declare and set these variables appropriately in
+   advance:
+	src, src_end, multibytep
+   and should provide the label `no_more_source'.  C1 and C2 must be
+   able to hold negative values.
+   It is intended that this macro is used in detect_coding_utf_16.  */
+
+#define TWO_MORE_BYTES(c1, c2) \
+ do { \
+ do { \
+ if (src == src_end) \
+ goto no_more_source; \
+ c1 = *src++; \
+ if (multibytep && (c1 & 0x80)) \
+ { \
+ if ((c1 & 0xFE) == 0xC0) \
+ c1 = ((c1 & 1) << 6) | *src++; \
+ else \
+ { \
+ src += BYTES_BY_CHAR_HEAD (c1) - 1; \
+ c1 = -1; \
+ } \
+ } \
+ } while (c1 < 0); \
+ if (src == src_end) \
+ goto no_more_source; \
+ c2 = *src++; \
+ if (multibytep && (c2 & 0x80)) \
+ { \
+ if ((c2 & 0xFE) == 0xC0) \
+ c2 = ((c2 & 1) << 6) | *src++; \
+ else \
+ c2 = -1; \
+ } \
+ } while (0)
+
#define ONE_MORE_BYTE_NO_CHECK(c) \
do { \
#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
+#define UTF_BOM 0xFEFF /* Character code of the byte order mark (BOM).  */
+#define UTF_8_BOM_1 0xEF /* First byte of the BOM encoded in UTF-8.  */
+#define UTF_8_BOM_2 0xBB /* Second byte of the BOM encoded in UTF-8.  */
+#define UTF_8_BOM_3 0xBF /* Third byte of the BOM encoded in UTF-8.  */
+
static int
detect_coding_utf_8 (coding, detect_info)
struct coding_system *coding;
const unsigned char *src_end = coding->source + coding->src_bytes;
int multibytep = coding->src_multibyte;
int consumed_chars = 0;
+ int bom_found = 0;
int found = 0;
detect_info->checked |= CATEGORY_MASK_UTF_8;
break;
if (UTF_8_2_OCTET_LEADING_P (c))
{
- found = CATEGORY_MASK_UTF_8;
+ found = 1;
continue;
}
ONE_MORE_BYTE (c2);
break;
if (UTF_8_3_OCTET_LEADING_P (c))
{
- found = CATEGORY_MASK_UTF_8;
+ found = 1;
+ if (src_base == coding->source
+ && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
+ bom_found = 1;
continue;
}
ONE_MORE_BYTE (c3);
break;
if (UTF_8_4_OCTET_LEADING_P (c))
{
- found = CATEGORY_MASK_UTF_8;
+ found = 1;
continue;
}
ONE_MORE_BYTE (c4);
break;
if (UTF_8_5_OCTET_LEADING_P (c))
{
- found = CATEGORY_MASK_UTF_8;
+ found = 1;
continue;
}
break;
detect_info->rejected |= CATEGORY_MASK_UTF_8;
return 0;
}
- detect_info->found |= found;
+ if (bom_found)
+ {
+ /* The first character 0xFEFF doesn't necessarily mean a BOM. */
+ detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
+ }
+ else
+ {
+ detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
+ if (found)
+ detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
+ }
return 1;
}
const unsigned char *src_base;
int *charbuf = coding->charbuf + coding->charbuf_used;
int *charbuf_end = coding->charbuf + coding->charbuf_size;
- int consumed_chars = 0, consumed_chars_base;
+ int consumed_chars = 0, consumed_chars_base = 0;
int multibytep = coding->src_multibyte;
+ enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
Lisp_Object attr, charset_list;
int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
int byte_after_cr = -1;
CODING_GET_INFO (coding, attr, charset_list);
+ if (bom != utf_without_bom)
+ {
+ int c1, c2, c3;
+
+ src_base = src;
+ ONE_MORE_BYTE (c1);
+ if (! UTF_8_3_OCTET_LEADING_P (c1))
+ src = src_base;
+ else
+ {
+ ONE_MORE_BYTE (c2);
+ if (! UTF_8_EXTRA_OCTET_P (c2))
+ src = src_base;
+ else
+ {
+ ONE_MORE_BYTE (c3);
+ if (! UTF_8_EXTRA_OCTET_P (c3))
+ src = src_base;
+ else
+ {
+ if ((c1 != UTF_8_BOM_1)
+ || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
+ src = src_base;
+ else
+ CODING_UTF_8_BOM (coding) = utf_without_bom;
+ }
+ }
+ }
+ }
+ CODING_UTF_8_BOM (coding) = utf_without_bom;
+
+
+
while (1)
{
int c, c1, c2, c3, c4, c5;
consumed_chars_base = consumed_chars;
if (charbuf >= charbuf_end)
- break;
+ {
+ if (byte_after_cr >= 0)
+ src_base--;
+ break;
+ }
if (byte_after_cr >= 0)
c1 = byte_after_cr, byte_after_cr = -1;
int produced_chars = 0;
int c;
+ if (CODING_UTF_8_BOM (coding) == utf_with_bom)
+ {
+ ASSURE_DESTINATION (3);
+ EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
+ CODING_UTF_8_BOM (coding) = utf_without_bom;
+ }
+
if (multibytep)
{
int safe_room = MAX_MULTIBYTE_LENGTH * 2;
return 0;
}
- ONE_MORE_BYTE (c1);
- ONE_MORE_BYTE (c2);
+ TWO_MORE_BYTES (c1, c2);
if ((c1 == 0xFF) && (c2 == 0xFE))
{
detect_info->found |= (CATEGORY_MASK_UTF_16_LE
| CATEGORY_MASK_UTF_16_BE_NOSIG
| CATEGORY_MASK_UTF_16_LE_NOSIG);
}
+ else if (c2 < 0)
+ {
+ detect_info->rejected |= CATEGORY_MASK_UTF_16;
+ return 0;
+ }
else
{
/* We check the dispersion of Eth and Oth bytes where E is even and
while (1)
{
- ONE_MORE_BYTE (c1);
- ONE_MORE_BYTE (c2);
+ TWO_MORE_BYTES (c1, c2);
+ if (c2 < 0)
+ break;
if (! e[c1])
{
e[c1] = 1;
const unsigned char *src_base;
int *charbuf = coding->charbuf + coding->charbuf_used;
int *charbuf_end = coding->charbuf + coding->charbuf_size;
- int consumed_chars = 0, consumed_chars_base;
+ int consumed_chars = 0, consumed_chars_base = 0;
int multibytep = coding->src_multibyte;
- enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
+ enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
int surrogate = CODING_UTF_16_SURROGATE (coding);
Lisp_Object attr, charset_list;
CODING_GET_INFO (coding, attr, charset_list);
- if (bom == utf_16_with_bom)
+ if (bom == utf_with_bom)
{
int c, c1, c2;
src = src_base;
coding->errors++;
}
- CODING_UTF_16_BOM (coding) = utf_16_without_bom;
+ CODING_UTF_16_BOM (coding) = utf_without_bom;
}
- else if (bom == utf_16_detect_bom)
+ else if (bom == utf_detect_bom)
{
/* We have already tried to detect BOM and failed in
detect_coding. */
- CODING_UTF_16_BOM (coding) = utf_16_without_bom;
+ CODING_UTF_16_BOM (coding) = utf_without_bom;
}
while (1)
consumed_chars_base = consumed_chars;
if (charbuf + 2 >= charbuf_end)
- break;
+ {
+ if (byte_after_cr1 >= 0)
+ src_base -= 2;
+ break;
+ }
if (byte_after_cr1 >= 0)
c1 = byte_after_cr1, byte_after_cr1 = -1;
unsigned char *dst = coding->destination + coding->produced;
unsigned char *dst_end = coding->destination + coding->dst_bytes;
int safe_room = 8;
- enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
+ enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
int produced_chars = 0;
Lisp_Object attrs, charset_list;
CODING_GET_INFO (coding, attrs, charset_list);
- if (bom != utf_16_without_bom)
+ if (bom != utf_without_bom)
{
ASSURE_DESTINATION (safe_room);
if (big_endian)
EMIT_TWO_BYTES (0xFE, 0xFF);
else
EMIT_TWO_BYTES (0xFF, 0xFE);
- CODING_UTF_16_BOM (coding) = utf_16_without_bom;
+ CODING_UTF_16_BOM (coding) = utf_without_bom;
}
while (charbuf < charbuf_end)
consumed_chars_base = consumed_chars;
if (charbuf >= charbuf_end)
- break;
+ {
+ if (byte_after_cr >= 0)
+ src_base--;
+ break;
+ }
if (byte_after_cr >= 0)
c = byte_after_cr, byte_after_cr = -1;
if (preferred_charset_id >= 0)
{
charset = CHARSET_FROM_ID (preferred_charset_id);
- if (! CHAR_CHARSET_P (c, charset))
- charset = char_charset (c, charset_list, NULL);
+ if (CHAR_CHARSET_P (c, charset))
+ code = ENCODE_CHAR (charset, c);
+ else
+ charset = char_charset (c, charset_list, &code);
}
else
charset = char_charset (c, charset_list, &code);
int i;
int rejected = 0;
int found = 0;
+ int composition_count = -1;
detect_info->checked |= CATEGORY_MASK_ISO;
struct coding_system *this = &(coding_categories[i]);
Lisp_Object attrs, val;
+ if (this->id < 0)
+ continue;
attrs = CODING_ID_ATTRS (this->id);
if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
&& ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
break;
}
+ else if (c == '1')
+ {
+ /* End of composition. */
+ if (composition_count < 0
+ || composition_count > MAX_COMPOSITION_COMPONENTS)
+ /* Invalid */
+ break;
+ composition_count = -1;
+ found |= CATEGORY_MASK_ISO;
+ }
else if (c >= '0' && c <= '4')
{
/* ESC <Fp> for start/end composition. */
- found |= CATEGORY_MASK_ISO;
+ composition_count = 0;
break;
}
else
continue;
if (c < 0x80)
{
+ if (composition_count >= 0)
+ composition_count++;
single_shifting = 0;
break;
}
}
if (i & 1 && src < src_end)
- rejected |= CATEGORY_MASK_ISO_8_2;
+ {
+ rejected |= CATEGORY_MASK_ISO_8_2;
+ if (composition_count >= 0)
+ composition_count += i;
+ }
else
- found |= CATEGORY_MASK_ISO_8_2;
+ {
+ found |= CATEGORY_MASK_ISO_8_2;
+ if (composition_count >= 0)
+ composition_count += i / 2;
+ }
}
break;
}
break; \
if (p == src_end - 1) \
{ \
+ if (coding->mode & CODING_MODE_LAST_BLOCK) \
+ goto invalid_code; \
/* The current composition doesn't end in the current \
source. */ \
record_conversion_result \
consumed_chars_base = consumed_chars;
if (charbuf >= charbuf_end)
- break;
+ {
+ if (byte_after_cr >= 0)
+ src_base--;
+ break;
+ }
if (byte_after_cr >= 0)
c1 = byte_after_cr, byte_after_cr = -1;
if (composition_state == COMPOSING_RULE
|| composition_state == COMPOSING_COMPONENT_RULE)
{
- DECODE_COMPOSITION_RULE (c1);
- components[component_idx++] = c1;
- composition_state--;
- continue;
+ if (component_idx < MAX_COMPOSITION_COMPONENTS * 2 + 1)
+ {
+ DECODE_COMPOSITION_RULE (c1);
+ components[component_idx++] = c1;
+ composition_state--;
+ continue;
+ }
+ /* Too long composition. */
+ MAYBE_FINISH_COMPOSITION ();
}
}
if (charset_id_0 < 0
if (composition_state == COMPOSING_RULE
|| composition_state == COMPOSING_COMPONENT_RULE)
{
- DECODE_COMPOSITION_RULE (c1);
- components[component_idx++] = c1;
- composition_state--;
- continue;
+ if (component_idx < MAX_COMPOSITION_COMPONENTS * 2 + 1)
+ {
+ DECODE_COMPOSITION_RULE (c1);
+ components[component_idx++] = c1;
+ composition_state--;
+ continue;
+ }
+ MAYBE_FINISH_COMPOSITION ();
}
}
if (charset_id_0 < 0)
}
else
{
- components[component_idx++] = c;
- if (method == COMPOSITION_WITH_RULE
- || (method == COMPOSITION_WITH_RULE_ALTCHARS
- && composition_state == COMPOSING_COMPONENT_CHAR))
- composition_state++;
+ if (component_idx < MAX_COMPOSITION_COMPONENTS * 2 + 1)
+ {
+ components[component_idx++] = c;
+ if (method == COMPOSITION_WITH_RULE
+ || (method == COMPOSITION_WITH_RULE_ALTCHARS
+ && composition_state == COMPOSING_COMPONENT_CHAR))
+ composition_state++;
+ }
+ else
+ {
+ MAYBE_FINISH_COMPOSITION ();
+ *charbuf++ = c;
+ char_offset++;
+ }
}
continue;
consumed_chars_base = consumed_chars;
if (charbuf >= charbuf_end)
- break;
+ {
+ if (byte_after_cr >= 0)
+ src_base--;
+ break;
+ }
if (byte_after_cr >= 0)
c = byte_after_cr, byte_after_cr = -1;
consumed_chars_base = consumed_chars;
if (charbuf >= charbuf_end)
- break;
+ {
+ if (byte_after_cr >= 0)
+ src_base--;
+ break;
+ }
if (byte_after_cr >= 0)
c = byte_after_cr, byte_after_cr = -1;
*dst++ = CHAR_TO_BYTE8 (c);
else
CHAR_STRING_ADVANCE (c, dst);
- produced_chars++;
}
}
else
ASSURE_DESTINATION (charbuf_end - charbuf);
while (charbuf < charbuf_end && dst < dst_end)
*dst++ = *charbuf++;
- produced_chars = dst - (coding->destination + coding->dst_bytes);
}
+ produced_chars = dst - (coding->destination + coding->produced);
}
record_conversion_result (coding, CODING_RESULT_SUCCESS);
coding->produced_char += produced_chars;
const unsigned char *src_end = coding->source + coding->src_bytes;
int multibytep = coding->src_multibyte;
int consumed_chars = 0;
- Lisp_Object attrs, valids;
+ Lisp_Object attrs, valids, name;
int found = 0;
int head_ascii = coding->head_ascii;
+ int check_latin_extra = 0;
detect_info->checked |= CATEGORY_MASK_CHARSET;
coding = &coding_categories[coding_category_charset];
attrs = CODING_ID_ATTRS (coding->id);
valids = AREF (attrs, coding_attr_charset_valids);
-
+ name = CODING_ID_NAME (coding->id);
+ /* Enable the extra Latin check only when Vlatin_extra_code_table is
+    a vector and the coding system's name starts with "iso-8859-".
+    This must be a prefix comparison: an exact comparison against the
+    bare prefix would never match a name that carries a suffix after
+    it, and the check would stay disabled.  */
+ if (VECTORP (Vlatin_extra_code_table)
+     && strncmp ((char *) SDATA (SYMBOL_NAME (name)), "iso-8859-",
+                 sizeof ("iso-8859-") - 1) == 0)
+ check_latin_extra = 1;
if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
src += head_ascii;
if (NILP (val))
break;
if (c >= 0x80)
- found = CATEGORY_MASK_CHARSET;
+ {
+ if (c < 0xA0
+ && check_latin_extra
+ && NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
+ break;
+ found = CATEGORY_MASK_CHARSET;
+ }
if (INTEGERP (val))
{
charset = CHARSET_FROM_ID (XFASTINT (val));
consumed_chars_base = consumed_chars;
if (charbuf >= charbuf_end)
- break;
+ {
+ if (byte_after_cr >= 0)
+ src_base--;
+ break;
+ }
if (byte_after_cr >= 0)
{
code = c;
val = AREF (valids, c);
- if (NILP (val))
+ if (! INTEGERP (val) && ! CONSP (val))
goto invalid_code;
if (INTEGERP (val))
{
}
else if (EQ (coding_type, Qutf_8))
{
+ val = AREF (attrs, coding_attr_utf_bom);
+ CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
+ : EQ (val, Qt) ? utf_with_bom
+ : utf_without_bom);
coding->detector = detect_coding_utf_8;
coding->decoder = decode_coding_utf_8;
coding->encoder = encode_coding_utf_8;
coding->common_flags
|= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
+ if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
+ coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
}
else if (EQ (coding_type, Qutf_16))
{
- val = AREF (attrs, coding_attr_utf_16_bom);
- CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
- : EQ (val, Qt) ? utf_16_with_bom
- : utf_16_without_bom);
+ val = AREF (attrs, coding_attr_utf_bom);
+ CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
+ : EQ (val, Qt) ? utf_with_bom
+ : utf_without_bom);
val = AREF (attrs, coding_attr_utf_16_endian);
CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
: utf_16_little_endian);
coding->encoder = encode_coding_utf_16;
coding->common_flags
|= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
- if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom)
+ if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
}
else if (EQ (coding_type, Qccl))
}
+/* Return the list of charsets supported by CODING-SYSTEM.
+
+   CODING_SYSTEM is validated and resolved to an ID by
+   CHECK_CODING_SYSTEM_GET_ID.  The result depends on the coding
+   system's type attribute:
+     - emacs-mule: Vemacs_mule_charset_list;
+     - iso-2022 with CODING_ISO_FLAG_FULL_SUPPORT set in its flags:
+       Viso_2022_charset_list;
+     - anything else (including iso-2022 without that flag): the
+       charset list stored in the coding system's own attributes.  */
+
+Lisp_Object
+coding_system_charset_list (coding_system)
+ Lisp_Object coding_system;
+{
+ int id;
+ Lisp_Object attrs, type;
+
+ CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
+ attrs = CODING_ID_ATTRS (id);
+ type = CODING_ATTR_TYPE (attrs);
+
+ if (EQ (type, Qemacs_mule))
+   return Vemacs_mule_charset_list;
+
+ /* Only a fully supporting iso-2022 coding system gets the global
+    ISO-2022 charset list.  */
+ if (EQ (type, Qiso_2022)
+     && (XINT (AREF (attrs, coding_attr_iso_flags))
+         & CODING_ISO_FLAG_FULL_SUPPORT))
+   return Viso_2022_charset_list;
+
+ return CODING_ATTR_CHARSET_LIST (attrs);
+}
+
+
/* Return raw-text or one of its subsidiaries that has the same
eol_type as CODING-SYSTEM. */
|| src[lsb + 2] != '\n')
this_eol = EOL_SEEN_CR;
else
- this_eol = EOL_SEEN_CRLF;
+ {
+ this_eol = EOL_SEEN_CRLF;
+ src += 2;
+ }
if (eol_seen == EOL_SEEN_NONE)
/* This is the first end-of-line. */
eol_seen = this_eol;
else if (eol_seen != this_eol)
{
- /* The found type is different from what found before. */
- eol_seen = EOL_SEEN_LF;
- break;
+ /* The found type is different from what was found before.
+ Allow for stray ^M characters in DOS EOL files. */
+ if (eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF
+ || eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR)
+ eol_seen = EOL_SEEN_CRLF;
+ else
+ {
+ eol_seen = EOL_SEEN_LF;
+ break;
+ }
}
if (++total == MAX_EOL_CHECK_COUNT)
break;
eol_seen = this_eol;
else if (eol_seen != this_eol)
{
- /* The found type is different from what found before. */
- eol_seen = EOL_SEEN_LF;
- break;
+ /* The found type is different from what was found before.
+ Allow for stray ^M characters in DOS EOL files. */
+ if (eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF
+ || eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR)
+ eol_seen = EOL_SEEN_CRLF;
+ else
+ {
+ eol_seen = EOL_SEEN_LF;
+ break;
+ }
}
if (++total == MAX_EOL_CHECK_COUNT)
break;
struct coding_system *coding;
{
const unsigned char *src, *src_end;
+ int saved_mode = coding->mode;
coding->consumed = coding->consumed_char = 0;
coding->produced = coding->produced_char = 0;
coding_set_source (coding);
src_end = coding->source + coding->src_bytes;
+ coding->head_ascii = 0;
/* If we have not yet decided the text encoding type, detect it
now. */
int null_byte_found = 0, eight_bit_found = 0;
detect_info.checked = detect_info.found = detect_info.rejected = 0;
- coding->head_ascii = -1;
for (src = coding->source; src < src_end; src++)
{
c = *src;
if (c & 0x80)
{
eight_bit_found = 1;
- if (coding->head_ascii < 0)
- coding->head_ascii = src - coding->source;
if (null_byte_found)
break;
}
&& ! inhibit_iso_escape_detection
&& ! detect_info.checked)
{
- if (coding->head_ascii < 0)
- coding->head_ascii = src - coding->source;
if (detect_coding_iso_2022 (coding, &detect_info))
{
/* We have scanned the whole data. */
if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
- /* We didn't find an 8-bit code. We may have
- found a null-byte, but it's very rare that
- a binary file confirm to ISO-2022. */
- src = src_end;
+ {
+ /* We didn't find an 8-bit code. We may
+ have found a null-byte, but it's very
+ rare that a binary file conforms to
+ ISO-2022. */
+ src = src_end;
+ coding->head_ascii = src - coding->source;
+ }
+ detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
break;
}
}
- else if (! c)
+ else if (! c && !inhibit_null_byte_detection)
{
null_byte_found = 1;
if (eight_bit_found)
break;
}
+ if (! eight_bit_found)
+ coding->head_ascii++;
}
+ else if (! eight_bit_found)
+ coding->head_ascii++;
}
- if (coding->head_ascii < 0)
- coding->head_ascii = src - coding->source;
if (null_byte_found || eight_bit_found
|| coding->head_ascii < coding->src_bytes
break;
}
}
-
- if (i < coding_category_raw_text)
- setup_coding_system (CODING_ID_NAME (this->id), coding);
- else if (null_byte_found)
- setup_coding_system (Qno_conversion, coding);
- else if ((detect_info.rejected & CATEGORY_MASK_ANY)
- == CATEGORY_MASK_ANY)
- setup_coding_system (Qraw_text, coding);
- else if (detect_info.rejected)
- for (i = 0; i < coding_category_raw_text; i++)
- if (! (detect_info.rejected & (1 << coding_priorities[i])))
- {
- this = coding_categories + coding_priorities[i];
- setup_coding_system (CODING_ID_NAME (this->id), coding);
- break;
- }
}
+
+ if (i < coding_category_raw_text)
+ setup_coding_system (CODING_ID_NAME (this->id), coding);
+ else if (null_byte_found)
+ setup_coding_system (Qno_conversion, coding);
+ else if ((detect_info.rejected & CATEGORY_MASK_ANY)
+ == CATEGORY_MASK_ANY)
+ setup_coding_system (Qraw_text, coding);
+ else if (detect_info.rejected)
+ for (i = 0; i < coding_category_raw_text; i++)
+ if (! (detect_info.rejected & (1 << coding_priorities[i])))
+ {
+ this = coding_categories + coding_priorities[i];
+ setup_coding_system (CODING_ID_NAME (this->id), coding);
+ break;
+ }
+ }
+ }
+ else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
+ == coding_category_utf_8_auto)
+ {
+ Lisp_Object coding_systems;
+ struct coding_detection_info detect_info;
+
+ coding_systems
+ = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
+ detect_info.found = detect_info.rejected = 0;
+ coding->head_ascii = 0;
+ if (CONSP (coding_systems)
+ && detect_coding_utf_8 (coding, &detect_info))
+ {
+ if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
+ setup_coding_system (XCAR (coding_systems), coding);
+ else
+ setup_coding_system (XCDR (coding_systems), coding);
}
}
else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
struct coding_detection_info detect_info;
coding_systems
- = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom);
+ = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
detect_info.found = detect_info.rejected = 0;
+ coding->head_ascii = 0;
if (CONSP (coding_systems)
&& detect_coding_utf_16 (coding, &detect_info))
{
setup_coding_system (XCDR (coding_systems), coding);
}
}
+ coding->mode = saved_mode;
}
eol_seen |= EOL_SEEN_CR;
}
}
- if (eol_seen != EOL_SEEN_NONE
+ /* Handle DOS-style EOLs in a file with stray ^M characters. */
+ if ((eol_seen & EOL_SEEN_CRLF) != 0
+ && (eol_seen & EOL_SEEN_CR) != 0
+ && (eol_seen & EOL_SEEN_LF) == 0)
+ eol_seen = EOL_SEEN_CRLF;
+ else if (eol_seen != EOL_SEEN_NONE
&& eol_seen != EOL_SEEN_LF
&& eol_seen != EOL_SEEN_CRLF
&& eol_seen != EOL_SEEN_CR)
if (coding->src_multibyte)
{
int multibytep = 1;
- EMACS_INT consumed_chars;
+ EMACS_INT consumed_chars = 0;
while (1)
{
#define ALLOC_CONVERSION_WORK_AREA(coding) \
do { \
- int size = CHARBUF_SIZE;; \
+ int size = CHARBUF_SIZE; \
\
coding->charbuf = NULL; \
while (size > 1024) \
that the number of data is less than the size of
coding->charbuf. */
coding->charbuf_used = 0;
+ coding->chars_at_source = 0;
+
while (nbytes-- > 0)
{
int c = *src++;
}
else
{
- name = Vcode_conversion_workbuf_name;
- workbuf = Fget_buffer_create (name);
- if (NILP (Vcode_conversion_reused_workbuf))
- Vcode_conversion_reused_workbuf = workbuf;
+ if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
+ Vcode_conversion_reused_workbuf
+ = Fget_buffer_create (Vcode_conversion_workbuf_name);
+ workbuf = Vcode_conversion_reused_workbuf;
}
current = current_buffer;
set_buffer_internal (XBUFFER (workbuf));
+ /* We can't allow modification hooks to run in the work buffer. For
+ instance, directory_files_internal assumes that file decoding
+ doesn't compile new regexps. */
+ Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
Ferase_buffer ();
current_buffer->undo_list = Qt;
current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
}
else
{
- coding->dst_pos = BUF_PT (XBUFFER (dst_object));
- coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
+ struct buffer *current = current_buffer;
+
+ set_buffer_temp (XBUFFER (dst_object));
+ coding->dst_pos = PT;
+ coding->dst_pos_byte = PT_BYTE;
+ move_gap_both (coding->dst_pos, coding->dst_pos_byte);
+ set_buffer_temp (current);
}
coding->dst_multibyte
= ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
doc: /* Return t if OBJECT is nil or a coding-system.
See the documentation of `define-coding-system' for information
about coding-system objects. */)
- (obj)
- Lisp_Object obj;
+ (object)
+ Lisp_Object object;
{
- if (NILP (obj)
- || CODING_SYSTEM_ID (obj) >= 0)
+ if (NILP (object)
+ || CODING_SYSTEM_ID (object) >= 0)
return Qt;
- if (! SYMBOLP (obj)
- || NILP (Fget (obj, Qcoding_system_define_form)))
+ if (! SYMBOLP (object)
+ || NILP (Fget (object, Qcoding_system_define_form)))
return Qnil;
return Qt;
}
{
const unsigned char *src_end = src + src_bytes;
Lisp_Object attrs, eol_type;
- Lisp_Object val;
+ Lisp_Object val = Qnil;
struct coding_system coding;
int id;
struct coding_detection_info detect_info;
coding.src_multibyte = multibytep;
coding.consumed = 0;
coding.mode |= CODING_MODE_LAST_BLOCK;
+ coding.head_ascii = 0;
detect_info.checked = detect_info.found = detect_info.rejected = 0;
struct coding_system *this;
int c, i;
- coding.head_ascii = -1;
/* Skip all ASCII bytes except for a few ISO2022 controls. */
for (; src < src_end; src++)
{
if (c & 0x80)
{
eight_bit_found = 1;
- if (coding.head_ascii < 0)
- coding.head_ascii = src - coding.source;
if (null_byte_found)
break;
}
- if (c < 0x20)
+ else if (c < 0x20)
{
if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
&& ! inhibit_iso_escape_detection
&& ! detect_info.checked)
{
- if (coding.head_ascii < 0)
- coding.head_ascii = src - coding.source;
if (detect_coding_iso_2022 (&coding, &detect_info))
{
/* We have scanned the whole data. */
if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
- /* We didn't find an 8-bit code. We may have
- found a null-byte, but it's very rare that
- a binary file confirm to ISO-2022. */
- src = src_end;
+ {
+ /* We didn't find an 8-bit code. We may
+ have found a null-byte, but it's very
+ rare that a binary file conforms to
+ ISO-2022. */
+ src = src_end;
+ coding.head_ascii = src - coding.source;
+ }
+ detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
break;
}
}
- else if (! c)
+ else if (! c && !inhibit_null_byte_detection)
{
null_byte_found = 1;
if (eight_bit_found)
break;
}
+ if (! eight_bit_found)
+ coding.head_ascii++;
}
+ else if (! eight_bit_found)
+ coding.head_ascii++;
}
- if (coding.head_ascii < 0)
- coding.head_ascii = src - coding.source;
if (null_byte_found || eight_bit_found
|| coding.head_ascii < coding.src_bytes
}
}
- if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY)
+ if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
+ || null_byte_found)
{
detect_info.found = CATEGORY_MASK_RAW_TEXT;
- id = coding_categories[coding_category_raw_text].id;
+ id = CODING_SYSTEM_ID (Qno_conversion);
val = Fcons (make_number (id), Qnil);
}
else if (! detect_info.rejected && ! detect_info.found)
{
int mask = detect_info.rejected | detect_info.found;
int found = 0;
- val = Qnil;
for (i = coding_category_raw_text - 1; i >= 0; i--)
{
detect_info.found |= found;
}
}
+ else if (base_category == coding_category_utf_8_auto)
+ {
+ if (detect_coding_utf_8 (&coding, &detect_info))
+ {
+ struct coding_system *this;
+
+ if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
+ this = coding_categories + coding_category_utf_8_sig;
+ else
+ this = coding_categories + coding_category_utf_8_nosig;
+ val = Fcons (make_number (this->id), Qnil);
+ }
+ }
else if (base_category == coding_category_utf_16_auto)
{
if (detect_coding_utf_16 (&coding, &detect_info))
/* Then, detect eol-format if necessary. */
{
- int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
+ int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
Lisp_Object tail;
if (VECTORP (eol_type))
}
}
- return (highest ? XCAR (val) : val);
+ return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
}
Return a list of possible coding systems ordered by priority.
If only ASCII characters are found (except for such ISO-2022 control
-characters ISO-2022 as ESC), it returns a list of single element
-`undecided' or its subsidiary coding system according to a detected
-end-of-line format.
+characters as ESC), it returns a list of single element `undecided'
+or its subsidiary coding system according to a detected end-of-line
+format.
If optional argument HIGHEST is non-nil, return the coding system of
highest priority. */)
Return a list of possible coding systems ordered by priority.
If only ASCII characters are found (except for such ISO-2022 control
-characters ISO-2022 as ESC), it returns a list of single element
-`undecided' or its subsidiary coding system according to a detected
-end-of-line format.
+characters as ESC), it returns a list of single element `undecided'
+or its subsidiary coding system according to a detected end-of-line
+format.
If optional argument HIGHEST is non-nil, return the coding system of
highest priority. */)
Sunencodable_char_position, 3, 5, 0,
doc: /*
Return position of first un-encodable character in a region.
-START and END specfiy the region and CODING-SYSTEM specifies the
+START and END specify the region and CODING-SYSTEM specifies the
encoding to check. Return nil if CODING-SYSTEM does encode the region.
If optional 4th argument COUNT is non-nil, it specifies at most how
CODING-SYSTEM-LIST is a list of coding systems to check.
The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
-CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
+CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
whole region, POS0, POS1, ... are buffer positions where non-encodable
characters are found.
START may be a string. In that case, check if the string is
encodable, and the value contains indices to the string instead of
-buffer positions. END is ignored. */)
+buffer positions. END is ignored.
+
+If the current buffer (or START if it is a string) is unibyte, the value
+is nil. */)
(start, end, coding_system_list)
Lisp_Object start, end, coding_system_list;
{
if (STRINGP (start))
{
if (!STRING_MULTIBYTE (start)
- && SCHARS (start) != SBYTES (start))
+ || SCHARS (start) == SBYTES (start))
return Qnil;
start_byte = 0;
end_byte = SBYTES (start);
start_byte = CHAR_TO_BYTE (XINT (start));
end_byte = CHAR_TO_BYTE (XINT (end));
if (XINT (end) - XINT (start) == end_byte - start_byte)
- return Qt;
+ return Qnil;
if (XINT (start) < GPT && XINT (end) > GPT)
{
Optional 4th argument DESTINATION specifies where the decoded text goes.
If nil, the region between START and END is replaced by the decoded text.
-If buffer, the decoded text is inserted in the buffer.
-If t, the decoded text is returned.
+If buffer, the decoded text is inserted in that buffer after point (point
+does not move).
+In those cases, the length of the decoded text is returned.
+If DESTINATION is t, the decoded text is returned.
This function sets `last-coding-system-used' to the precise coding system
used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
-not fully specified.)
-It returns the length of the decoded text. */)
+not fully specified.) */)
(start, end, coding_system, destination)
Lisp_Object start, end, coding_system, destination;
{
DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
3, 4, "r\nzCoding system: ",
doc: /* Encode the current region by specified coding system.
-When called from a program, takes three arguments:
-START, END, and CODING-SYSTEM. START and END are buffer positions.
+When called from a program, takes four arguments:
+ START, END, CODING-SYSTEM and DESTINATION.
+START and END are buffer positions.
Optional 4th argument DESTINATION specifies where the encoded text goes.
If nil, the region between START and END is replaced by the encoded text.
-If buffer, the encoded text is inserted in the buffer.
-If t, the encoded text is returned.
+If buffer, the encoded text is inserted in that buffer after point (point
+does not move).
+In those cases, the length of the encoded text is returned.
+If DESTINATION is t, the encoded text is returned.
This function sets `last-coding-system-used' to the precise coding system
used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
-not fully specified.)
-It returns the length of the encoded text. */)
+not fully specified.) */)
(start, end, coding_system, destination)
Lisp_Object start, end, coding_system, destination;
{
Optional third arg NOCOPY non-nil means it is OK to return STRING itself
if the decoding operation is trivial.
-Optional fourth arg BUFFER non-nil meant that the decoded text is
-inserted in BUFFER instead of returned as a string. In this case,
-the return value is BUFFER.
+Optional fourth arg BUFFER non-nil means that the decoded text is
+inserted in that buffer after point (point does not move). In this
+case, the return value is the length of the decoded text.
This function sets `last-coding-system-used' to the precise coding system
used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
-not fully specified. */)
+not fully specified.) */)
(string, coding_system, nocopy, buffer)
Lisp_Object string, coding_system, nocopy, buffer;
{
Optional third arg NOCOPY non-nil means it is OK to return STRING
itself if the encoding operation is trivial.
-Optional fourth arg BUFFER non-nil meant that the encoded text is
-inserted in BUFFER instead of returned as a string. In this case,
-the return value is BUFFER.
+Optional fourth arg BUFFER non-nil means that the encoded text is
+inserted in that buffer after point (point does not move). In this
+case, the return value is the length of the encoded text.
This function sets `last-coding-system-used' to the precise coding system
used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
TARGET has a meaning which depends on OPERATION:
For file I/O, TARGET is a file name (except for the special case below).
For process I/O, TARGET is a process name.
- For network I/O, TARGET is a service name or a port number
+ For network I/O, TARGET is a service name or a port number.
-This function looks up what specified for TARGET in,
+This function looks up what is specified for TARGET in
`file-coding-system-alist', `process-coding-system-alist',
or `network-coding-system-alist' depending on OPERATION.
They may specify a coding system, a cons of coding systems,
DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
Sset_coding_system_priority, 0, MANY, 0,
doc: /* Assign higher priority to the coding systems given as arguments.
-If multiple coding systems belongs to the same category,
+If multiple coding systems belong to the same category,
all but the first one are ignored.
-usage: (set-coding-system-priority ...) */)
+usage: (set-coding-system-priority &rest coding-systems) */)
(nargs, args)
int nargs;
Lisp_Object *args;
val = XCDR (bom);
CHECK_CODING_SYSTEM (val);
}
- ASET (attrs, coding_attr_utf_16_bom, bom);
+ ASET (attrs, coding_attr_utf_bom, bom);
endian = args[coding_arg_utf16_endian];
CHECK_SYMBOL (endian);
}
else if (EQ (coding_type, Qutf_8))
{
- category = coding_category_utf_8;
+ Lisp_Object bom;
+
CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
+
+ if (nargs < coding_arg_utf8_max)
+ goto short_args;
+
+ bom = args[coding_arg_utf8_bom];
+ if (! NILP (bom) && ! EQ (bom, Qt))
+ {
+ CHECK_CONS (bom);
+ val = XCAR (bom);
+ CHECK_CODING_SYSTEM (val);
+ val = XCDR (bom);
+ CHECK_CODING_SYSTEM (val);
+ }
+ ASET (attrs, coding_attr_utf_bom, bom);
+
+ category = (CONSP (bom) ? coding_category_utf_8_auto
+ : NILP (bom) ? coding_category_utf_8_nosig
+ : coding_category_utf_8_sig);
}
else if (EQ (coding_type, Qundecided))
category = coding_category_undecided;
CHECK_CHARACTER (val);
CODING_ATTR_MNEMONIC (attrs) = val;
}
- else if (EQ (prop, QCdefalut_char))
+ else if (EQ (prop, QCdefault_char))
{
if (NILP (val))
val = make_number (' ');
CHECK_SYMBOL (alias);
CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
aliases = AREF (spec, 1);
- /* ALISES should be a list of length more than zero, and the first
+ /* ALIASES should be a list of length more than zero, and the first
element is a base coding system. Append ALIAS at the tail of the
list. */
while (!NILP (XCDR (aliases)))
DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
Scoding_system_eol_type, 1, 1, 0,
doc: /* Return eol-type of CODING-SYSTEM.
-An eol-type is integer 0, 1, 2, or a vector of coding systems.
+An eol-type is an integer 0, 1, 2, or a vector of coding systems.
Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
and CR respectively.
DEFSYM (QCcategory, ":category");
DEFSYM (QCmnemonic, ":mnemonic");
- DEFSYM (QCdefalut_char, ":default-char");
+ DEFSYM (QCdefault_char, ":default-char");
DEFSYM (QCdecode_translation_table, ":decode-translation-table");
DEFSYM (QCencode_translation_table, ":encode-translation-table");
DEFSYM (QCpost_read_conversion, ":post-read-conversion");
intern ("coding-category-iso-7-else"));
ASET (Vcoding_category_table, coding_category_iso_8_else,
intern ("coding-category-iso-8-else"));
- ASET (Vcoding_category_table, coding_category_utf_8,
+ ASET (Vcoding_category_table, coding_category_utf_8_auto,
+ intern ("coding-category-utf-8-auto"));
+ ASET (Vcoding_category_table, coding_category_utf_8_nosig,
intern ("coding-category-utf-8"));
+ ASET (Vcoding_category_table, coding_category_utf_8_sig,
+ intern ("coding-category-utf-8-sig"));
ASET (Vcoding_category_table, coding_category_utf_16_be,
intern ("coding-category-utf-16-be"));
ASET (Vcoding_category_table, coding_category_utf_16_auto,
DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
doc: /* Alist of coding system names.
Each element is a one-element list containing a coding system name.
-This variable is given to `completing-read' as TABLE argument.
+This variable is given to `completing-read' as COLLECTION argument.
Do not alter the value of this variable manually. This variable should be
updated by the functions `make-coding-system' and
doc: /* Specify the coding system for read operations.
It is useful to bind this variable with `let', but do not set it globally.
If the value is a coding system, it is used for decoding on read operation.
-If not, an appropriate element is used from one of the coding system alists:
-There are three such tables, `file-coding-system-alist',
+If not, an appropriate element is used from one of the coding system alists.
+There are three such tables: `file-coding-system-alist',
`process-coding-system-alist', and `network-coding-system-alist'. */);
Vcoding_system_for_read = Qnil;
when writing it to a file and when sending it to a file or subprocess.
If this does not specify a coding system, an appropriate element
-is used from one of the coding system alists:
-There are three such tables, `file-coding-system-alist',
+is used from one of the coding system alists.
+There are three such tables: `file-coding-system-alist',
`process-coding-system-alist', and `network-coding-system-alist'.
For output to files, if the above procedure does not specify a coding system,
the value of `buffer-file-coding-system' is used. */);
a coding system of ISO 2022 variant which has a flag
`accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
or reading output of a subprocess.
-Only 128th through 159th elements has a meaning. */);
+Only 128th through 159th elements have a meaning. */);
Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
DEFVAR_LISP ("select-safe-coding-system-function",
DEFVAR_BOOL ("inhibit-iso-escape-detection",
&inhibit_iso_escape_detection,
doc: /*
-If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
+If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
-By default, on reading a file, Emacs tries to detect how the text is
-encoded. This code detection is sensitive to escape sequences. If
-the sequence is valid as ISO2022, the code is determined as one of
-the ISO2022 encodings, and the file is decoded by the corresponding
-coding system (e.g. `iso-2022-7bit').
+When Emacs reads text, it tries to detect how the text is encoded.
+This code detection is sensitive to escape sequences. If Emacs sees
+a valid ISO-2022 escape sequence, it assumes the text is encoded in one
+of the ISO-2022 encodings, and decodes text by the corresponding coding
+system (e.g. `iso-2022-7bit').
However, there may be a case that you want to read escape sequences in
a file as is. In such a case, you can set this variable to non-nil.
-Then, as the code detection ignores any escape sequences, no file is
-detected as encoded in some ISO2022 encoding. The result is that all
+Then the code detection will ignore any escape sequences, and no text is
+detected as encoded in some ISO-2022 encoding. The result is that all
escape sequences become visible in a buffer.
The default value is nil, and it is strongly recommended not to change
reading if you suppress escape sequence detection.
The other way to read escape sequences in a file without decoding is
-to explicitly specify some coding system that doesn't use ISO2022's
+to explicitly specify some coding system that doesn't use ISO-2022
escape sequences (e.g. `latin-1') on reading by \\[universal-coding-system-argument]. */);
inhibit_iso_escape_detection = 0;
+ DEFVAR_BOOL ("inhibit-null-byte-detection",
+ &inhibit_null_byte_detection,
+ doc: /* If non-nil, Emacs ignores null bytes on code detection.
+By default, Emacs treats text containing null bytes as binary data, and
+does not attempt to decode it. The effect is as if you specified
+`no-conversion' for reading that text.
+
+Set this to non-nil when a regular text happens to include null bytes.
+Examples are Index nodes of Info files and null-byte delimited output
+from GNU Find and GNU Grep. Emacs will then ignore the null bytes and
+decode text as usual. */);
+ inhibit_null_byte_detection = 0;
+
DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
doc: /* Char table for translating self-inserting characters.
-This is applied to the result of input methods, not their input. See also
-`keyboard-translate-table'. */);
+This is applied to the result of input methods, not their input.
+See also `keyboard-translate-table'.
+
+Use of this variable for character code unification was rendered
+obsolete in Emacs 23.1 and later, since Unicode is now the basis of
+internal character representation. */);
Vtranslation_table_for_input = Qnil;
{