/* Coding system handler (conversion, detection, etc).
Copyright (C) 2001, 2002, 2003, 2004, 2005,
- 2006, 2007, 2008 Free Software Foundation, Inc.
+ 2006, 2007, 2008, 2009 Free Software Foundation, Inc.
Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
- 2005, 2006, 2007, 2008
+ 2005, 2006, 2007, 2008, 2009
National Institute of Advanced Industrial Science and Technology (AIST)
Registration Number H14PRO021
Copyright (C) 2003
This file is part of GNU Emacs.
-GNU Emacs is free software; you can redistribute it and/or modify
+GNU Emacs is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 3, or (at your option)
-any later version.
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
GNU Emacs is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
-along with GNU Emacs; see the file COPYING. If not, write to
-the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
-Boston, MA 02110-1301, USA. */
+along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
/*** TABLE OF CONTENTS ***
Lisp_Object Qbig, Qlittle;
Lisp_Object Qcoding_system_history;
Lisp_Object Qvalid_codes;
-Lisp_Object QCcategory, QCmnemonic, QCdefalut_char;
+Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
Lisp_Object QCdecode_translation_table, QCencode_translation_table;
Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
Lisp_Object QCascii_compatible_p;
/* Flag to inhibit ISO2022 escape sequence detection. */
int inhibit_iso_escape_detection;
+/* Flag to inhibit detection of binary files through null bytes. */
+int inhibit_null_byte_detection;
+
/* Flag to make buffer-file-coding-system inherit from process-coding. */
int inherit_process_coding_system;
reg)))
-#define CODING_ISO_REQUEST(coding, charset_id) \
- ((charset_id <= (coding)->max_charset_id \
- ? (coding)->safe_charsets[charset_id] \
+#define CODING_ISO_REQUEST(coding, charset_id) \
+ (((charset_id) <= (coding)->max_charset_id \
+ ? ((coding)->safe_charsets[charset_id] != 255 \
+ ? (coding)->safe_charsets[charset_id] \
+ : -1) \
: -1))
((coding)->spec.iso_2022.bol)
#define CODING_ISO_INVOKED_CHARSET(coding, plane) \
CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
+#define CODING_ISO_CMP_STATUS(coding) \
+ (&(coding)->spec.iso_2022.cmp_status)
+#define CODING_ISO_EXTSEGMENT_LEN(coding) \
+ ((coding)->spec.iso_2022.ctext_extended_segment_len)
+#define CODING_ISO_EMBEDDED_UTF_8(coding) \
+ ((coding)->spec.iso_2022.embedded_utf_8)
/* Control characters of ISO2022. */
/* code */ /* function */
character is prohibited by CODING_ISO_FLAG_SAFE. */
#define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
+/* UTF-8 section */
+#define CODING_UTF_8_BOM(coding) \
+ ((coding)->spec.utf_8_bom)
/* UTF-16 section */
#define CODING_UTF_16_BOM(coding) \
coding_category_iso_8_2,
coding_category_iso_7_else,
coding_category_iso_8_else,
- coding_category_utf_8,
+ coding_category_utf_8_auto,
+ coding_category_utf_8_nosig,
+ coding_category_utf_8_sig,
coding_category_utf_16_auto,
coding_category_utf_16_be,
coding_category_utf_16_le,
#define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
#define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
#define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
-#define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8)
+#define CATEGORY_MASK_UTF_8_AUTO (1 << coding_category_utf_8_auto)
+#define CATEGORY_MASK_UTF_8_NOSIG (1 << coding_category_utf_8_nosig)
+#define CATEGORY_MASK_UTF_8_SIG (1 << coding_category_utf_8_sig)
#define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
#define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
#define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
| CATEGORY_MASK_ISO_8_2 \
| CATEGORY_MASK_ISO_7_ELSE \
| CATEGORY_MASK_ISO_8_ELSE \
- | CATEGORY_MASK_UTF_8 \
+ | CATEGORY_MASK_UTF_8_AUTO \
+ | CATEGORY_MASK_UTF_8_NOSIG \
+ | CATEGORY_MASK_UTF_8_SIG \
+ | CATEGORY_MASK_UTF_16_AUTO \
| CATEGORY_MASK_UTF_16_BE \
| CATEGORY_MASK_UTF_16_LE \
| CATEGORY_MASK_UTF_16_BE_NOSIG \
| CATEGORY_MASK_ISO_ELSE)
#define CATEGORY_MASK_UTF_16 \
- (CATEGORY_MASK_UTF_16_BE \
+ (CATEGORY_MASK_UTF_16_AUTO \
+ | CATEGORY_MASK_UTF_16_BE \
| CATEGORY_MASK_UTF_16_LE \
| CATEGORY_MASK_UTF_16_BE_NOSIG \
| CATEGORY_MASK_UTF_16_LE_NOSIG)
+#define CATEGORY_MASK_UTF_8 \
+ (CATEGORY_MASK_UTF_8_AUTO \
+ | CATEGORY_MASK_UTF_8_NOSIG \
+ | CATEGORY_MASK_UTF_8_SIG)
/* List of symbols `coding-category-xxx' ordered by priority. This
variable is exposed to Emacs Lisp. */
consumed_chars++; \
} while (0)
+/* Safely fetch two bytes from the source text at SRC, which ends at
+   SRC_END, and store them in C1 and C2.  If the source runs out
+   before a byte is fetched, jump to the label `no_more_source'.
+
+   When MULTIBYTEP is nonzero and a fetched byte has its high bit set:
+     - a lead byte of 0xC0 or 0xC1 is combined with the byte that
+       follows it into one value, ((LEAD & 1) << 6) | NEXT;
+     - for C1, any other multibyte character is skipped (using
+       BYTES_BY_CHAR_HEAD) and the fetch is retried, so C1 is never
+       negative when the macro finishes;
+     - for C2, any other lead byte sets C2 to -1, and SRC is left
+       just after that lead byte.
+
+   The caller must declare and set these variables in advance:
+     src, src_end, multibytep
+   and must provide the label `no_more_source'.
+
+   NOTE(review): the byte following a 0xC0/0xC1 lead byte is read
+   without comparing SRC against SRC_END; presumably multibyte source
+   text is always well formed here -- confirm.
+
+   This macro is intended for use in detect_coding_utf_16.  */
+
+#define TWO_MORE_BYTES(c1, c2) \
+ do { \
+ do { \
+ if (src == src_end) \
+ goto no_more_source; \
+ c1 = *src++; \
+ if (multibytep && (c1 & 0x80)) \
+ { \
+ if ((c1 & 0xFE) == 0xC0) \
+ c1 = ((c1 & 1) << 6) | *src++; \
+ else \
+ { \
+ src += BYTES_BY_CHAR_HEAD (c1) - 1; \
+ c1 = -1; \
+ } \
+ } \
+ } while (c1 < 0); \
+ if (src == src_end) \
+ goto no_more_source; \
+ c2 = *src++; \
+ if (multibytep && (c2 & 0x80)) \
+ { \
+ if ((c2 & 0xFE) == 0xC0) \
+ c2 = ((c2 & 1) << 6) | *src++; \
+ else \
+ c2 = -1; \
+ } \
+ } while (0)
+
#define ONE_MORE_BYTE_NO_CHECK(c) \
do { \
static Lisp_Object adjust_coding_eol_type P_ ((struct coding_system *, int));
static void decode_eol P_ ((struct coding_system *));
static Lisp_Object get_translation_table P_ ((Lisp_Object, int, int *));
-static Lisp_Object get_translation P_ ((Lisp_Object, int *, int *,
- int, int *, int *));
+static Lisp_Object get_translation P_ ((Lisp_Object, int *, int *));
static int produce_chars P_ ((struct coding_system *, Lisp_Object, int));
-static INLINE void produce_composition P_ ((struct coding_system *, int *,
- EMACS_INT));
static INLINE void produce_charset P_ ((struct coding_system *, int *,
EMACS_INT));
static void produce_annotation P_ ((struct coding_system *, EMACS_INT));
static int decode_coding P_ ((struct coding_system *));
static INLINE int *handle_composition_annotation P_ ((EMACS_INT, EMACS_INT,
- struct coding_system *,
+ struct coding_system *,
int *, EMACS_INT *));
static INLINE int *handle_charset_annotation P_ ((EMACS_INT, EMACS_INT,
struct coding_system *,
} while (0)
+/* If no more than BYTES bytes of room remain at dst, allocate more
+   memory for coding->destination and update dst and dst_end.  We
+   don't have to take care of coding->source, which will be relocated;
+   that is handled by calling coding_set_source in encode_coding.  */
+
#define ASSURE_DESTINATION(bytes) \
do { \
if (dst + (bytes) >= dst_end) \
} while (0)
+/* Store the multibyte form of the character C at P, and advance P
+   past the bytes written.  This is like CHAR_STRING_ADVANCE but it
+   never calls MAYBE_UNIFY_CHAR.
+
+   C is compared against MAX_1_BYTE_CHAR ... MAX_5_BYTE_CHAR to choose
+   a sequence of 1 to 5 bytes; a 5-byte sequence always starts with
+   0xF8.  Any larger C is handed to BYTE8_STRING after subtracting
+   0x3FFF80, and P is advanced by the value BYTE8_STRING returns.
+
+   C and P are each evaluated several times: C must have no side
+   effects and P must be a modifiable pointer lvalue.  Every use of C
+   is parenthesized so that an expression argument is parsed as a
+   unit.  */
+
+#define CHAR_STRING_ADVANCE_NO_UNIFY(c, p) \
+  do { \
+    if ((c) <= MAX_1_BYTE_CHAR) \
+      *(p)++ = (c); \
+    else if ((c) <= MAX_2_BYTE_CHAR) \
+      *(p)++ = (0xC0 | ((c) >> 6)), \
+        *(p)++ = (0x80 | ((c) & 0x3F)); \
+    else if ((c) <= MAX_3_BYTE_CHAR) \
+      *(p)++ = (0xE0 | ((c) >> 12)), \
+        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)), \
+        *(p)++ = (0x80 | ((c) & 0x3F)); \
+    else if ((c) <= MAX_4_BYTE_CHAR) \
+      *(p)++ = (0xF0 | ((c) >> 18)), \
+        *(p)++ = (0x80 | (((c) >> 12) & 0x3F)), \
+        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)), \
+        *(p)++ = (0x80 | ((c) & 0x3F)); \
+    else if ((c) <= MAX_5_BYTE_CHAR) \
+      *(p)++ = 0xF8, \
+        *(p)++ = (0x80 | (((c) >> 18) & 0x0F)), \
+        *(p)++ = (0x80 | (((c) >> 12) & 0x3F)), \
+        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)), \
+        *(p)++ = (0x80 | ((c) & 0x3F)); \
+    else \
+      (p) += BYTE8_STRING ((c) - 0x3FFF80, p); \
+  } while (0)
+
+
+/* Return the character code of the character whose multibyte form is
+   at P, and advance P to the end of that multibyte form.  This is
+   like STRING_CHAR_ADVANCE, but it never calls MAYBE_UNIFY_CHAR.
+
+   The length is chosen from the bits of the lead byte P[0]:
+     0x80 clear -> 1 byte, returned as is;
+     0x20 clear -> 2 bytes; 0x3FFF80 is OR'ed into the result when
+                   the lead byte is less than 0xC2;
+     0x10 clear -> 3 bytes;
+     0x08 clear -> 4 bytes;
+     otherwise  -> 5 bytes, where the lead byte itself contributes no
+                   bits to the result.
+   The trailing bytes are masked but not validated.
+
+   P is evaluated several times and is modified, so it must be a
+   pointer lvalue whose evaluation has no other side effects.  */
+
+#define STRING_CHAR_ADVANCE_NO_UNIFY(p) \
+ (!((p)[0] & 0x80) \
+ ? *(p)++ \
+ : ! ((p)[0] & 0x20) \
+ ? ((p) += 2, \
+ ((((p)[-2] & 0x1F) << 6) \
+ | ((p)[-1] & 0x3F) \
+ | ((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0))) \
+ : ! ((p)[0] & 0x10) \
+ ? ((p) += 3, \
+ ((((p)[-3] & 0x0F) << 12) \
+ | (((p)[-2] & 0x3F) << 6) \
+ | ((p)[-1] & 0x3F))) \
+ : ! ((p)[0] & 0x08) \
+ ? ((p) += 4, \
+ ((((p)[-4] & 0xF) << 18) \
+ | (((p)[-3] & 0x3F) << 12) \
+ | (((p)[-2] & 0x3F) << 6) \
+ | ((p)[-1] & 0x3F))) \
+ : ((p) += 5, \
+ ((((p)[-4] & 0x3F) << 18) \
+ | (((p)[-3] & 0x3F) << 12) \
+ | (((p)[-2] & 0x3F) << 6) \
+ | ((p)[-1] & 0x3F))))
+
static void
coding_set_source (coding)
{
if (coding->src_pos < 0)
{
- coding->destination = BEG_ADDR + coding->dst_pos_byte - 1;
+ coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
coding->dst_bytes = (GAP_END_ADDR
- (coding->src_bytes - coding->consumed)
- coding->destination);
/* We are sure that coding->dst_pos_byte is before the gap
of the buffer. */
coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
- + coding->dst_pos_byte - 1);
+ + coding->dst_pos_byte - BEG_BYTE);
coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
- coding->destination);
}
}
static void
-coding_alloc_by_making_gap (coding, offset, bytes)
+coding_alloc_by_making_gap (coding, gap_head_used, bytes)
struct coding_system *coding;
- EMACS_INT offset, bytes;
+ EMACS_INT gap_head_used, bytes;
{
- if (BUFFERP (coding->dst_object)
- && EQ (coding->src_object, coding->dst_object))
+ if (EQ (coding->src_object, coding->dst_object))
{
- EMACS_INT add = offset + (coding->src_bytes - coding->consumed);
+ /* The gap may contain the produced data at the head and not-yet
+ consumed data at the tail. To preserve those data, we at
+ first make the gap size to zero, then increase the gap
+ size. */
+ EMACS_INT add = GAP_SIZE;
- GPT += offset, GPT_BYTE += offset;
- GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
+ GPT += gap_head_used, GPT_BYTE += gap_head_used;
+ GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
make_gap (bytes);
GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
- GPT -= offset, GPT_BYTE -= offset;
+ GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
}
else
{
EMACS_INT offset = dst - coding->destination;
if (BUFFERP (coding->dst_object))
- coding_alloc_by_making_gap (coding, offset, nbytes);
+ {
+ struct buffer *buf = XBUFFER (coding->dst_object);
+
+ coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
+ }
else
coding_alloc_by_realloc (coding, nbytes);
record_conversion_result (coding, CODING_RESULT_SUCCESS);
/** Macros for annotations. */
-/* Maximum length of annotation data (sum of annotations for
- composition and charset). */
-#define MAX_ANNOTATION_LENGTH (4 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 4)
-
/* An annotation data is stored in the array coding->charbuf in this
format:
[ -LENGTH ANNOTATION_MASK NCHARS ... ]
In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
follows:
- ... METHOD [ COMPOSITION-COMPONENTS ... ]
+ ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
+
+ NBYTES is the number of bytes specified in the header part of
+ old-style emacs-mule encoding, or 0 for the other kind of
+ composition.
+
METHOD is one of enum composition_method.
+
Optionnal COMPOSITION-COMPONENTS are characters and composition
rules.
In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
- follows. */
+ follows.
+
+ If ANNOTATION_MASK is 0, this annotation is just a space holder to
+ recover from an invalid annotation, and should be skipped by
+ produce_annotation. */
+
+/* Maximum length of the header of annotation data. */
+#define MAX_ANNOTATION_LENGTH 5
#define ADD_ANNOTATION_DATA(buf, len, mask, nchars) \
do { \
coding->annotated = 1; \
} while (0);
-#define ADD_COMPOSITION_DATA(buf, nchars, method) \
+#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method) \
do { \
- ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
+ ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
+ *buf++ = nbytes; \
*buf++ = method; \
} while (0)
#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
+#define UTF_BOM 0xFEFF
+#define UTF_8_BOM_1 0xEF
+#define UTF_8_BOM_2 0xBB
+#define UTF_8_BOM_3 0xBF
+
static int
detect_coding_utf_8 (coding, detect_info)
struct coding_system *coding;
const unsigned char *src_end = coding->source + coding->src_bytes;
int multibytep = coding->src_multibyte;
int consumed_chars = 0;
+ int bom_found = 0;
int found = 0;
detect_info->checked |= CATEGORY_MASK_UTF_8;
break;
if (UTF_8_2_OCTET_LEADING_P (c))
{
- found = CATEGORY_MASK_UTF_8;
+ found = 1;
continue;
}
ONE_MORE_BYTE (c2);
break;
if (UTF_8_3_OCTET_LEADING_P (c))
{
- found = CATEGORY_MASK_UTF_8;
+ found = 1;
+ if (src_base == coding->source
+ && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
+ bom_found = 1;
continue;
}
ONE_MORE_BYTE (c3);
break;
if (UTF_8_4_OCTET_LEADING_P (c))
{
- found = CATEGORY_MASK_UTF_8;
+ found = 1;
continue;
}
ONE_MORE_BYTE (c4);
break;
if (UTF_8_5_OCTET_LEADING_P (c))
{
- found = CATEGORY_MASK_UTF_8;
+ found = 1;
continue;
}
break;
detect_info->rejected |= CATEGORY_MASK_UTF_8;
return 0;
}
- detect_info->found |= found;
+ if (bom_found)
+ {
+ /* The first character 0xFEFF doesn't necessarily mean a BOM. */
+ detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
+ }
+ else
+ {
+ detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
+ if (found)
+ detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
+ }
return 1;
}
const unsigned char *src_base;
int *charbuf = coding->charbuf + coding->charbuf_used;
int *charbuf_end = coding->charbuf + coding->charbuf_size;
- int consumed_chars = 0, consumed_chars_base;
+ int consumed_chars = 0, consumed_chars_base = 0;
int multibytep = coding->src_multibyte;
+ enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
Lisp_Object attr, charset_list;
+ int eol_crlf =
+ !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
+ int byte_after_cr = -1;
CODING_GET_INFO (coding, attr, charset_list);
+ if (bom != utf_without_bom)
+ {
+ int c1, c2, c3;
+
+ src_base = src;
+ ONE_MORE_BYTE (c1);
+ if (! UTF_8_3_OCTET_LEADING_P (c1))
+ src = src_base;
+ else
+ {
+ ONE_MORE_BYTE (c2);
+ if (! UTF_8_EXTRA_OCTET_P (c2))
+ src = src_base;
+ else
+ {
+ ONE_MORE_BYTE (c3);
+ if (! UTF_8_EXTRA_OCTET_P (c3))
+ src = src_base;
+ else
+ {
+ if ((c1 != UTF_8_BOM_1)
+ || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
+ src = src_base;
+ else
+ CODING_UTF_8_BOM (coding) = utf_without_bom;
+ }
+ }
+ }
+ }
+ CODING_UTF_8_BOM (coding) = utf_without_bom;
+
+
+
while (1)
{
int c, c1, c2, c3, c4, c5;
consumed_chars_base = consumed_chars;
if (charbuf >= charbuf_end)
- break;
+ {
+ if (byte_after_cr >= 0)
+ src_base--;
+ break;
+ }
- ONE_MORE_BYTE (c1);
+ if (byte_after_cr >= 0)
+ c1 = byte_after_cr, byte_after_cr = -1;
+ else
+ ONE_MORE_BYTE (c1);
if (c1 < 0)
{
c = - c1;
}
else if (UTF_8_1_OCTET_P(c1))
{
+ if (eol_crlf && c1 == '\r')
+ ONE_MORE_BYTE (byte_after_cr);
c = c1;
}
else
int produced_chars = 0;
int c;
+ if (CODING_UTF_8_BOM (coding) == utf_with_bom)
+ {
+ ASSURE_DESTINATION (3);
+ EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
+ CODING_UTF_8_BOM (coding) = utf_without_bom;
+ }
+
if (multibytep)
{
int safe_room = MAX_MULTIBYTE_LENGTH * 2;
}
else
{
- CHAR_STRING_ADVANCE (c, pend);
+ CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
for (p = str; p < pend; p++)
EMIT_ONE_BYTE (*p);
}
if (CHAR_BYTE8_P (c))
*dst++ = CHAR_TO_BYTE8 (c);
else
- dst += CHAR_STRING (c, dst);
+ CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
produced_chars++;
}
}
return 0;
}
- ONE_MORE_BYTE (c1);
- ONE_MORE_BYTE (c2);
+ TWO_MORE_BYTES (c1, c2);
if ((c1 == 0xFF) && (c2 == 0xFE))
{
detect_info->found |= (CATEGORY_MASK_UTF_16_LE
| CATEGORY_MASK_UTF_16_BE_NOSIG
| CATEGORY_MASK_UTF_16_LE_NOSIG);
}
- else if (c1 >= 0 && c2 >= 0)
+ else if (c2 < 0)
{
+ detect_info->rejected |= CATEGORY_MASK_UTF_16;
+ return 0;
+ }
+ else
+ {
+ /* We check the dispersion of Eth and Oth bytes where E is even and
+ O is odd. If both are high, we assume binary data.*/
+ unsigned char e[256], o[256];
+ unsigned e_num = 1, o_num = 1;
+
+ memset (e, 0, 256);
+ memset (o, 0, 256);
+ e[c1] = 1;
+ o[c2] = 1;
+
detect_info->rejected
|= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE);
+
+      /* Scan the rest of the source two bytes at a time, counting how
+         many distinct byte values occur at even (E) and at odd (O)
+         offsets.  Leaving the loop through `break' reaches the
+         rejection that follows it; running out of source makes
+         TWO_MORE_BYTES jump to `no_more_source' instead.  */
+      while (1)
+        {
+          TWO_MORE_BYTES (c1, c2);
+          if (c2 < 0)
+            break;
+          if (! e[c1])
+            {
+              e[c1] = 1;
+              e_num++;
+              if (e_num >= 128)
+                break;
+            }
+          if (! o[c2])
+            {
+              /* Mark C2 itself.  Marking o[c1] here would leave
+                 o[c2] unset, so the same odd byte would be counted
+                 on every occurrence and O_NUM would reach the limit
+                 even on ordinary text.  */
+              o[c2] = 1;
+              o_num++;
+              if (o_num >= 128)
+                break;
+            }
+        }
+ detect_info->rejected |= CATEGORY_MASK_UTF_16;
+ return 0;
}
+
no_more_source:
return 1;
}
const unsigned char *src_base;
int *charbuf = coding->charbuf + coding->charbuf_used;
int *charbuf_end = coding->charbuf + coding->charbuf_size;
- int consumed_chars = 0, consumed_chars_base;
+ int consumed_chars = 0, consumed_chars_base = 0;
int multibytep = coding->src_multibyte;
- enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
+ enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
int surrogate = CODING_UTF_16_SURROGATE (coding);
Lisp_Object attr, charset_list;
+ int eol_crlf =
+ !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
+ int byte_after_cr1 = -1, byte_after_cr2 = -1;
CODING_GET_INFO (coding, attr, charset_list);
- if (bom == utf_16_with_bom)
+ if (bom == utf_with_bom)
{
int c, c1, c2;
src = src_base;
coding->errors++;
}
- CODING_UTF_16_BOM (coding) = utf_16_without_bom;
+ CODING_UTF_16_BOM (coding) = utf_without_bom;
}
- else if (bom == utf_16_detect_bom)
+ else if (bom == utf_detect_bom)
{
/* We have already tried to detect BOM and failed in
detect_coding. */
- CODING_UTF_16_BOM (coding) = utf_16_without_bom;
+ CODING_UTF_16_BOM (coding) = utf_without_bom;
}
while (1)
consumed_chars_base = consumed_chars;
if (charbuf + 2 >= charbuf_end)
- break;
+ {
+ if (byte_after_cr1 >= 0)
+ src_base -= 2;
+ break;
+ }
- ONE_MORE_BYTE (c1);
+ if (byte_after_cr1 >= 0)
+ c1 = byte_after_cr1, byte_after_cr1 = -1;
+ else
+ ONE_MORE_BYTE (c1);
if (c1 < 0)
{
*charbuf++ = -c1;
continue;
}
- ONE_MORE_BYTE (c2);
+ if (byte_after_cr2 >= 0)
+ c2 = byte_after_cr2, byte_after_cr2 = -1;
+ else
+ ONE_MORE_BYTE (c2);
if (c2 < 0)
{
*charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
}
c = (endian == utf_16_big_endian
? ((c1 << 8) | c2) : ((c2 << 8) | c1));
+
if (surrogate)
{
if (! UTF_16_LOW_SURROGATE_P (c))
if (UTF_16_HIGH_SURROGATE_P (c))
CODING_UTF_16_SURROGATE (coding) = surrogate = c;
else
- *charbuf++ = c;
+ {
+ if (eol_crlf && c == '\r')
+ {
+ ONE_MORE_BYTE (byte_after_cr1);
+ ONE_MORE_BYTE (byte_after_cr2);
+ }
+ *charbuf++ = c;
+ }
}
}
unsigned char *dst = coding->destination + coding->produced;
unsigned char *dst_end = coding->destination + coding->dst_bytes;
int safe_room = 8;
- enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
+ enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
int produced_chars = 0;
Lisp_Object attrs, charset_list;
CODING_GET_INFO (coding, attrs, charset_list);
- if (bom != utf_16_without_bom)
+ if (bom != utf_without_bom)
{
ASSURE_DESTINATION (safe_room);
if (big_endian)
EMIT_TWO_BYTES (0xFE, 0xFF);
else
EMIT_TWO_BYTES (0xFF, 0xFE);
- CODING_UTF_16_BOM (coding) = utf_16_without_bom;
+ CODING_UTF_16_BOM (coding) = utf_without_bom;
}
while (charbuf < charbuf_end)
Next, character composition data are represented by the byte
sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
where,
- METHOD is 0xF0 plus one of composition method (enum
+ METHOD is 0xF2 plus one of composition method (enum
composition_method),
BYTES is 0xA0 plus a byte length of this composition data,
- CHARS is 0x20 plus a number of characters composed by this
+ CHARS is 0xA0 plus a number of characters composed by this
data,
COMPONENTs are characters of multibye form or composition
char emacs_mule_bytes[256];
+
+/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
+   Check if a text is encoded in `emacs-mule'.  If it is, return 1,
+   else return 0.
+
+   CATEGORY_MASK_EMACS_MULE is always added to DETECT_INFO->checked.
+   On rejection (return 0) it is also added to DETECT_INFO->rejected.
+   On acceptance (return 1) it is added to DETECT_INFO->found only if
+   at least one composite sequence or multibyte emacs-mule sequence
+   was actually seen.  */
+
+static int
+detect_coding_emacs_mule (coding, detect_info)
+ struct coding_system *coding;
+ struct coding_detection_info *detect_info;
+{
+ const unsigned char *src = coding->source, *src_base;
+ const unsigned char *src_end = coding->source + coding->src_bytes;
+ int multibytep = coding->src_multibyte;
+ int consumed_chars = 0;
+ int c;
+ int found = 0;
+
+ detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
+ /* A coding system of this category is always ASCII compatible. */
+ src += coding->head_ascii;
+
+ /* Every `break' out of this loop means "not emacs-mule".  The only
+    other exit is the label `no_more_source' below.  */
+ while (1)
+ {
+ /* Remember where the current sequence starts. */
+ src_base = src;
+ ONE_MORE_BYTE (c);
+ /* A negative C is skipped (presumably a multibyte character
+    reported by ONE_MORE_BYTE -- confirm against that macro). */
+ if (c < 0)
+ continue;
+ if (c == 0x80)
+ {
+ /* Perhaps the start of composite character. We simply skip
+ it because analyzing it is too heavy for detecting. But,
+ at least, we check that the composite character
+ consists of more than 4 bytes. */
+ /* This inner SRC_BASE shadows the function-level one, which
+    keeps pointing at the 0x80 starter. */
+ const unsigned char *src_base;
+
+ repeat:
+ src_base = src;
+ /* Consume the run of bytes >= 0xA0; C ends up holding the
+    first byte below 0xA0. */
+ do
+ {
+ ONE_MORE_BYTE (c);
+ }
+ while (c >= 0xA0);
+
+ /* The count includes the byte that ended the run. */
+ if (src - src_base <= 4)
+ break;
+ found = CATEGORY_MASK_EMACS_MULE;
+ /* Another composite sequence follows immediately. */
+ if (c == 0x80)
+ goto repeat;
+ }
+
+ if (c < 0x80)
+ {
+ /* ESC, SI and SO reject the text as emacs-mule. */
+ if (c < 0x20
+ && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
+ break;
+ }
+ else
+ {
+ /* Number of bytes that must still follow the leading byte.
+    NOTE(review): when control arrives here after the composite
+    block above, the outer SRC_BASE still points at the 0x80
+    starter rather than at C -- confirm this is intended. */
+ int more_bytes = emacs_mule_bytes[*src_base] - 1;
+
+ while (more_bytes > 0)
+ {
+ ONE_MORE_BYTE (c);
+ if (c < 0xA0)
+ {
+ src--; /* Unread the last byte. */
+ break;
+ }
+ more_bytes--;
+ }
+ /* Too few bytes >= 0xA0 followed the leading byte. */
+ if (more_bytes != 0)
+ break;
+ found = CATEGORY_MASK_EMACS_MULE;
+ }
+ }
+ detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
+ return 0;
+
+ /* Reached when the source is exhausted (presumably via a jump from
+    ONE_MORE_BYTE -- confirm).  If bytes of an unfinished sequence
+    were consumed and this is the last block, the text is rejected. */
+ no_more_source:
+ if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
+ {
+ detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
+ return 0;
+ }
+ detect_info->found |= found;
+ return 1;
+}
+
+
+/* Parse emacs-mule multibyte sequence at SRC and return the decoded
+ character. If CMP_STATUS indicates that we must expect MSEQ or
+ RULE described above, decode it and return the negative value of
+ the decoded character or rule. If an invalid byte is found, return
+ -1. If SRC is too short, return -2. */
+
int
-emacs_mule_char (coding, src, nbytes, nchars, id)
+emacs_mule_char (coding, src, nbytes, nchars, id, cmp_status)
struct coding_system *coding;
const unsigned char *src;
int *nbytes, *nchars, *id;
+ struct composition_status *cmp_status;
{
const unsigned char *src_end = coding->source + coding->src_bytes;
const unsigned char *src_base = src;
unsigned code;
int c;
int consumed_chars = 0;
+ int mseq_found = 0;
ONE_MORE_BYTE (c);
if (c < 0)
{
if (c >= 0xA0)
{
- /* Old style component character of a compostion. */
- if (c == 0xA0)
+ if (cmp_status->state != COMPOSING_NO
+ && cmp_status->old_form)
{
- ONE_MORE_BYTE (c);
- c -= 0x80;
+ if (cmp_status->state == COMPOSING_CHAR)
+ {
+ if (c == 0xA0)
+ {
+ ONE_MORE_BYTE (c);
+ c -= 0x80;
+ if (c < 0)
+ goto invalid_code;
+ }
+ else
+ c -= 0x20;
+ mseq_found = 1;
+ }
+ else
+ {
+ *nbytes = src - src_base;
+ *nchars = consumed_chars;
+ return -c;
+ }
}
else
- c -= 0x20;
+ goto invalid_code;
}
switch (emacs_mule_bytes[c])
*nchars = consumed_chars;
if (id)
*id = charset->id;
- return c;
+ return (mseq_found ? -c : c);
no_more_source:
return -2;
}
-/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
- Check if a text is encoded in `emacs-mule'. If it is, return 1,
- else return 0. */
+/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
-static int
-detect_coding_emacs_mule (coding, detect_info)
- struct coding_system *coding;
- struct coding_detection_info *detect_info;
-{
- const unsigned char *src = coding->source, *src_base;
- const unsigned char *src_end = coding->source + coding->src_bytes;
- int multibytep = coding->src_multibyte;
- int consumed_chars = 0;
- int c;
- int found = 0;
+/* Handle these composition sequence ('|': the end of header elements,
+ BYTES and CHARS >= 0xA0):
- detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
- /* A coding system of this category is always ASCII compatible. */
- src += coding->head_ascii;
+ (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
+ (2) altchar composition: 0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
+ (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
- while (1)
- {
- src_base = src;
- ONE_MORE_BYTE (c);
- if (c < 0)
- continue;
- if (c == 0x80)
- {
- /* Perhaps the start of composite character. We simple skip
- it because analyzing it is too heavy for detecting. But,
- at least, we check that the composite character
- constitues of more than 4 bytes. */
- const unsigned char *src_base;
+ and these old form:
+
+ (4) relative composition: 0x80 | MSEQ ... MSEQ
+ (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
- repeat:
- src_base = src;
- do
- {
- ONE_MORE_BYTE (c);
- }
- while (c >= 0xA0);
+ When the starter 0x80 and the following header elements are found,
+ this annotation header is produced.
- if (src - src_base <= 4)
- break;
- found = CATEGORY_MASK_EMACS_MULE;
- if (c == 0x80)
- goto repeat;
- }
+ [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
- if (c < 0x80)
- {
- if (c < 0x20
- && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
- break;
- }
- else
- {
- int more_bytes = emacs_mule_bytes[*src_base] - 1;
+ NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
+ NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
- while (more_bytes > 0)
- {
- ONE_MORE_BYTE (c);
- if (c < 0xA0)
- {
- src--; /* Unread the last byte. */
- break;
- }
- more_bytes--;
- }
- if (more_bytes != 0)
- break;
- found = CATEGORY_MASK_EMACS_MULE;
- }
- }
- detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
- return 0;
+ Then, upon reading the following elements, these codes are produced
+ until the composition end is found:
- no_more_source:
- if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
- {
- detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
- return 0;
- }
- detect_info->found |= found;
- return 1;
-}
+ (1) CHAR ... CHAR
+ (2) ALT ... ALT CHAR ... CHAR
+ (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
+ (4) CHAR ... CHAR
+ (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
+ When the composition end is found, LENGTH and NCHARS in the
+ annotation header is updated as below:
-/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
+ (1) LENGTH: unchanged, NCHARS: unchanged
+ (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
+ (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
+ (4) LENGTH: unchanged, NCHARS: number of CHARs
+ (5) LENGTH: unchanged, NCHARS: number of CHARs
-/* Decode a character represented as a component of composition
- sequence of Emacs 20/21 style at SRC. Set C to that character and
- update SRC to the head of next character (or an encoded composition
- rule). If SRC doesn't points a composition component, set C to -1.
- If SRC points an invalid byte sequence, global exit by a return
- value 0. */
-
-#define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
- if (1) \
- { \
- int c; \
- int nbytes, nchars; \
- \
- if (src == src_end) \
- break; \
- c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
- if (c < 0) \
- { \
- if (c == -2) \
- break; \
- goto invalid_code; \
- } \
- *buf++ = c; \
- src += nbytes; \
- consumed_chars += nchars; \
- } \
- else
+ If an error is found while composing, the annotation header is
+ changed to the original composition header (plus filler -1s) as
+ below:
+
+ (1),(2),(3) [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
+ (5) [ 0x80 0xFF -1 -1 -1 ]
+
+ and the sequence [ -2 DECODED-RULE ] is changed to the original
+ byte sequence as below:
+ o the original byte sequence is B: [ B -1 ]
+ o the original byte sequence is B1 B2: [ B1 B2 ]
+ Most of the routines are implemented as macros because many
+ variables and labels in the caller decode_coding_emacs_mule must be
+ accessible, and they are usually called just once (and thus do not
+ increase the size of the compiled object). */
-/* Decode a composition rule represented as a component of composition
- sequence of Emacs 20 style at SRC. Store the decoded rule in *BUF,
- and increment BUF. If SRC points an invalid byte sequence, set C
- to -1. */
+/* Decode a composition rule represented by C as a component of
+ composition sequence of Emacs 20 style. Set RULE to the decoded
+ rule. */
-#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf) \
+#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule) \
do { \
- int c, gref, nref; \
- \
- if (src >= src_end) \
- goto invalid_code; \
- ONE_MORE_BYTE_NO_CHECK (c); \
+ int gref, nref; \
+ \
c -= 0xA0; \
if (c < 0 || c >= 81) \
goto invalid_code; \
- \
gref = c / 9, nref = c % 9; \
- *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
+ if (gref == 4) gref = 10; \
+ if (nref == 4) nref = 10; \
+ rule = COMPOSITION_ENCODE_RULE (gref, nref); \
} while (0)
-/* Decode a composition rule represented as a component of composition
- sequence of Emacs 21 style at SRC. Store the decoded rule in *BUF,
- and increment BUF. If SRC points an invalid byte sequence, set C
- to -1. */
+/* Decode a composition rule represented by C and the following byte
+ at SRC as a component of composition sequence of Emacs 21 style.
+ Set RULE to the decoded rule. */
-#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf) \
+#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule) \
do { \
int gref, nref; \
- \
- if (src + 1>= src_end) \
+ \
+ gref = c - 0x20; \
+ if (gref < 0 || gref >= 81) \
goto invalid_code; \
- ONE_MORE_BYTE_NO_CHECK (gref); \
- gref -= 0x20; \
- ONE_MORE_BYTE_NO_CHECK (nref); \
- nref -= 0x20; \
- if (gref < 0 || gref >= 81 \
- || nref < 0 || nref >= 81) \
+ ONE_MORE_BYTE (c); \
+ nref = c - 0x20; \
+ if (nref < 0 || nref >= 81) \
goto invalid_code; \
- *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
+ rule = COMPOSITION_ENCODE_RULE (gref, nref); \
} while (0)
-#define DECODE_EMACS_MULE_21_COMPOSITION(c) \
+/* Start of Emacs 21 style format. The first three bytes at SRC are
+ (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is the
+ byte length of this composition information, CHARS is the number of
+ characters composed by this composition. */
+
+#define DECODE_EMACS_MULE_21_COMPOSITION() \
do { \
- /* Emacs 21 style format. The first three bytes at SRC are \
- (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
- the byte length of this composition information, CHARS is the \
- number of characters composed by this composition. */ \
enum composition_method method = c - 0xF2; \
int *charbuf_base = charbuf; \
- int consumed_chars_limit; \
int nbytes, nchars; \
- \
+ \
ONE_MORE_BYTE (c); \
if (c < 0) \
goto invalid_code; \
nbytes = c - 0xA0; \
- if (nbytes < 3) \
+ if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4)) \
goto invalid_code; \
ONE_MORE_BYTE (c); \
- if (c < 0) \
- goto invalid_code; \
nchars = c - 0xA0; \
- ADD_COMPOSITION_DATA (charbuf, nchars, method); \
- consumed_chars_limit = consumed_chars_base + nbytes; \
- if (method != COMPOSITION_RELATIVE) \
- { \
- int i = 0; \
- while (consumed_chars < consumed_chars_limit) \
- { \
- if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
- DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf); \
- else \
- DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
- i++; \
- } \
- if (consumed_chars < consumed_chars_limit) \
- goto invalid_code; \
- charbuf_base[0] -= i; \
- } \
+ if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS) \
+ goto invalid_code; \
+ cmp_status->old_form = 0; \
+ cmp_status->method = method; \
+ if (method == COMPOSITION_RELATIVE) \
+ cmp_status->state = COMPOSING_CHAR; \
+ else \
+ cmp_status->state = COMPOSING_COMPONENT_CHAR; \
+ cmp_status->length = MAX_ANNOTATION_LENGTH; \
+ cmp_status->nchars = nchars; \
+ cmp_status->ncomps = nbytes - 4; \
+ ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method); \
} while (0)
-#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
- do { \
- /* Emacs 20 style format for relative composition. */ \
- /* Store multibyte form of characters to be composed. */ \
- enum composition_method method = COMPOSITION_RELATIVE; \
- int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
- int *buf = components; \
- int i, j; \
- \
- src = src_base; \
- ONE_MORE_BYTE (c); /* skip 0x80 */ \
- for (i = 0; *src >= 0xA0 && i < MAX_COMPOSITION_COMPONENTS; i++) \
- DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
- if (i < 2) \
- goto invalid_code; \
- ADD_COMPOSITION_DATA (charbuf, i, method); \
- for (j = 0; j < i; j++) \
- *charbuf++ = components[j]; \
+/* Start of Emacs 20 style format for relative composition.
+ Initialize *cmp_status for an old-form relative composition and
+ emit a composition annotation header at charbuf with both count
+ arguments zero. Uses `cmp_status' and `charbuf' of the expansion
+ site. */
+
+#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION() \
+ do { \
+ cmp_status->old_form = 1; \
+ cmp_status->method = COMPOSITION_RELATIVE; \
+ cmp_status->state = COMPOSING_CHAR; \
+ cmp_status->length = MAX_ANNOTATION_LENGTH; /* presumably the size of the header emitted below -- confirm */ \
+ cmp_status->nchars = cmp_status->ncomps = 0; \
+ ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
} while (0)
-#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
+/* Start of Emacs 20 style format for rule-base composition. */
+
+#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION() \
do { \
- /* Emacs 20 style format for rule-base composition. */ \
- /* Store multibyte form of characters to be composed. */ \
- enum composition_method method = COMPOSITION_WITH_RULE; \
- int *charbuf_base = charbuf; \
- int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
- int *buf = components; \
- int i, j; \
+ cmp_status->old_form = 1; \
+ cmp_status->method = COMPOSITION_WITH_RULE; \
+ cmp_status->state = COMPOSING_CHAR; \
+ cmp_status->length = MAX_ANNOTATION_LENGTH; \
+ cmp_status->nchars = cmp_status->ncomps = 0; \
+ ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
+ } while (0)
+
+/* Read the byte that follows 0x80 and start the composition it
+ announces: Emacs 21 style when the byte minus 0xF2 is within the
+ composition_method range, Emacs 20 relative style for 0xA0..0xBF,
+ Emacs 20 rule-base style for 0xFF. A negative value from
+ ONE_MORE_BYTE or any other byte jumps to `invalid_code'. Uses
+ `src' and `c' of the expansion site. */
+#define DECODE_EMACS_MULE_COMPOSITION_START() \
+ do { \
+ const unsigned char *current_src = src; \
+ /* current_src marks the byte about to be read into C. */ \
+ ONE_MORE_BYTE (c); \
+ if (c < 0) \
+ goto invalid_code; \
+ if (c - 0xF2 >= COMPOSITION_RELATIVE \
+ && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS) \
+ DECODE_EMACS_MULE_21_COMPOSITION (); \
+ else if (c < 0xA0) \
+ goto invalid_code; \
+ else if (c < 0xC0) \
+ { \
+ DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (); \
+ /* In this format C is already the first component, so rewind \
+ SRC and let the main loop re-read it as a component. */ \
+ src = current_src; \
+ } \
+ else if (c == 0xFF) \
+ DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (); \
+ else \
+ goto invalid_code; \
+ } while (0)
+
+#define EMACS_MULE_COMPOSITION_END() \
+ do { \
+ int idx = - cmp_status->length; \
\
- DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
- for (i = 1; i < MAX_COMPOSITION_COMPONENTS; i++) \
- { \
- if (*src < 0xA0) \
- break; \
- DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \
- DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
- } \
- if (i <= 1 || (buf - components) % 2 == 0) \
- goto invalid_code; \
- if (charbuf + i + (i / 2) + 1 >= charbuf_end) \
- goto no_more_source; \
- ADD_COMPOSITION_DATA (charbuf, i, method); \
- i = i * 2 - 1; \
- for (j = 0; j < i; j++) \
- *charbuf++ = components[j]; \
- charbuf_base[0] -= i; \
- for (j = 0; j < i; j += 2) \
- *charbuf++ = components[j]; \
+ if (cmp_status->old_form) \
+ charbuf[idx + 2] = cmp_status->nchars; \
+ else if (cmp_status->method > COMPOSITION_RELATIVE) \
+ charbuf[idx] = charbuf[idx + 2] - cmp_status->length; \
+ cmp_status->state = COMPOSING_NO; \
+ } while (0)
+
+/* Finish the composition recorded in CMP_STATUS, whose
+ CMP_STATUS->length buffered elements end just before CHARBUF.
+ The leading elements are rewritten in place, CMP_STATUS->state is
+ reset to COMPOSING_NO, and the value returned is the count the
+ caller adds to its character offset. */
+static int
+emacs_mule_finish_composition (charbuf, cmp_status)
+ int *charbuf;
+ struct composition_status *cmp_status;
+{
+ int idx = - cmp_status->length;
+ int new_chars;
+ /* Old-form composition that has already collected characters:
+ keep it and store the character count at IDX + 2. */
+ if (cmp_status->old_form && cmp_status->nchars > 0)
+ {
+ charbuf[idx + 2] = cmp_status->nchars;
+ new_chars = 0;
+ if (cmp_status->method == COMPOSITION_WITH_RULE
+ && cmp_status->state == COMPOSING_CHAR)
+ {
+ /* The last rule was invalid: a character was still expected
+ after it. Replace the trailing [ -2 RULE ] pair by one raw
+ byte followed by -1.
+ NOTE(review): charbuf[-1] holds the rule after
+ COMPOSITION_ENCODE_RULE; adding 0xA0 gives back the source
+ byte only if that encoding kept the original value --
+ confirm. */
+ int rule = charbuf[-1] + 0xA0;
+
+ charbuf[-2] = BYTE8_TO_CHAR (rule);
+ charbuf[-1] = -1;
+ new_chars = 1;
+ }
+ }
+ else
+ {
+ charbuf[idx++] = BYTE8_TO_CHAR (0x80);
+ /* The header now starts with the raw byte 0x80; restore what
+ followed it in the source. */
+ if (cmp_status->method == COMPOSITION_WITH_RULE)
+ {
+ charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
+ charbuf[idx++] = -3;
+ charbuf[idx++] = 0;
+ new_chars = 1;
+ }
+ else
+ {
+ int nchars = charbuf[idx + 1] + 0xA0;
+ int nbytes = charbuf[idx + 2] + 0xA0;
+ /* IDX has already advanced by one, so the two reads above take
+ header elements 2 and 3. Write back the method byte and the
+ two count bytes, then -1 in the last header element. */
+ charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
+ charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
+ charbuf[idx++] = BYTE8_TO_CHAR (nchars);
+ charbuf[idx++] = -1;
+ new_chars = 4;
+ }
+ }
+ cmp_status->state = COMPOSING_NO;
+ return new_chars;
+}
+
+#define EMACS_MULE_MAYBE_FINISH_COMPOSITION() \
+ do { \
+ if (cmp_status->state != COMPOSING_NO) \
+ char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
} while (0)
int char_offset = coding->produced_char;
int last_offset = char_offset;
int last_id = charset_ascii;
+ int eol_crlf =
+ !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
+ int byte_after_cr = -1;
+ struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
CODING_GET_INFO (coding, attrs, charset_list);
+ if (cmp_status->state != COMPOSING_NO)
+ {
+ int i;
+
+ for (i = 0; i < cmp_status->length; i++)
+ *charbuf++ = cmp_status->carryover[i];
+ coding->annotated = 1;
+ }
+
while (1)
{
- int c;
+ int c, id;
src_base = src;
consumed_chars_base = consumed_chars;
if (charbuf >= charbuf_end)
- break;
+ {
+ if (byte_after_cr >= 0)
+ src_base--;
+ break;
+ }
- ONE_MORE_BYTE (c);
- if (c < 0)
+ if (byte_after_cr >= 0)
+ c = byte_after_cr, byte_after_cr = -1;
+ else
+ ONE_MORE_BYTE (c);
+
+ if (c < 0 || c == 0x80)
{
- *charbuf++ = -c;
- char_offset++;
+ EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
+ if (c < 0)
+ {
+ *charbuf++ = -c;
+ char_offset++;
+ }
+ else
+ DECODE_EMACS_MULE_COMPOSITION_START ();
+ continue;
}
- else if (c < 0x80)
+
+ if (c < 0x80)
+ {
+ if (eol_crlf && c == '\r')
+ ONE_MORE_BYTE (byte_after_cr);
+ id = charset_ascii;
+ if (cmp_status->state != COMPOSING_NO)
+ {
+ if (cmp_status->old_form)
+ EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
+ else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
+ cmp_status->ncomps--;
+ }
+ }
+ else
{
+ int nchars, nbytes;
+
+ c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
+ cmp_status);
+ if (c < 0)
+ {
+ if (c == -1)
+ goto invalid_code;
+ if (c == -2)
+ break;
+ }
+ src = src_base + nbytes;
+ consumed_chars = consumed_chars_base + nchars;
+ if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
+ cmp_status->ncomps -= nchars;
+ }
+
+ /* Now if C >= 0, we found a normally encoded character; if C <
+ 0, we found an old-style composition component character or
+ rule. */
+
+ if (cmp_status->state == COMPOSING_NO)
+ {
+ if (last_id != id)
+ {
+ if (last_id != charset_ascii)
+ ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
+ last_id);
+ last_id = id;
+ last_offset = char_offset;
+ }
*charbuf++ = c;
char_offset++;
}
- else if (c == 0x80)
+ else if (cmp_status->state == COMPOSING_CHAR)
{
- ONE_MORE_BYTE (c);
- if (c < 0)
- goto invalid_code;
- if (c - 0xF2 >= COMPOSITION_RELATIVE
- && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
- DECODE_EMACS_MULE_21_COMPOSITION (c);
- else if (c < 0xC0)
- DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
- else if (c == 0xFF)
- DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
+ if (cmp_status->old_form)
+ {
+ if (c >= 0)
+ {
+ EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
+ *charbuf++ = c;
+ char_offset++;
+ }
+ else
+ {
+ *charbuf++ = -c;
+ cmp_status->nchars++;
+ cmp_status->length++;
+ if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
+ EMACS_MULE_COMPOSITION_END ();
+ else if (cmp_status->method == COMPOSITION_WITH_RULE)
+ cmp_status->state = COMPOSING_RULE;
+ }
+ }
else
- goto invalid_code;
+ {
+ *charbuf++ = c;
+ cmp_status->length++;
+ cmp_status->nchars--;
+ if (cmp_status->nchars == 0)
+ EMACS_MULE_COMPOSITION_END ();
+ }
}
- else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
+ else if (cmp_status->state == COMPOSING_RULE)
{
- int nbytes, nchars;
- int id;
+ int rule;
- src = src_base;
- consumed_chars = consumed_chars_base;
- c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
- if (c < 0)
+ if (c >= 0)
{
- if (c == -2)
- break;
- goto invalid_code;
+ EMACS_MULE_COMPOSITION_END ();
+ *charbuf++ = c;
+ char_offset++;
}
- if (last_id != id)
+ else
{
- if (last_id != charset_ascii)
- ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
- last_id = id;
- last_offset = char_offset;
+ c = -c;
+ DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
+ if (rule < 0)
+ goto invalid_code;
+ *charbuf++ = -2;
+ *charbuf++ = rule;
+ cmp_status->length += 2;
+ cmp_status->state = COMPOSING_CHAR;
}
+ }
+ else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
+ {
*charbuf++ = c;
- src += nbytes;
- consumed_chars += nchars;
- char_offset++;
+ cmp_status->length++;
+ if (cmp_status->ncomps == 0)
+ cmp_status->state = COMPOSING_CHAR;
+ else if (cmp_status->ncomps > 0)
+ {
+ if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
+ cmp_status->state = COMPOSING_COMPONENT_RULE;
+ }
+ else
+ EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
}
- else
- goto invalid_code;
+ else /* COMPOSING_COMPONENT_RULE */
+ {
+ int rule;
+
+ DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
+ if (rule < 0)
+ goto invalid_code;
+ *charbuf++ = -2;
+ *charbuf++ = rule;
+ cmp_status->length += 2;
+ cmp_status->ncomps--;
+ if (cmp_status->ncomps > 0)
+ cmp_status->state = COMPOSING_COMPONENT_CHAR;
+ else
+ EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
+ }
+ continue;
+
+ retry:
+ src = src_base;
+ consumed_chars = consumed_chars_base;
continue;
invalid_code:
+ EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
src = src_base;
consumed_chars = consumed_chars_base;
ONE_MORE_BYTE (c);
}
no_more_source:
+ if (cmp_status->state != COMPOSING_NO)
+ {
+ if (coding->mode & CODING_MODE_LAST_BLOCK)
+ EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
+ else
+ {
+ int i;
+
+ charbuf -= cmp_status->length;
+ for (i = 0; i < cmp_status->length; i++)
+ cmp_status->carryover[i] = charbuf[i];
+ }
+ }
if (last_id != charset_ascii)
ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
coding->consumed_char += consumed_chars_base;
if (preferred_charset_id >= 0)
{
charset = CHARSET_FROM_ID (preferred_charset_id);
- if (! CHAR_CHARSET_P (c, charset))
- charset = char_charset (c, charset_list, NULL);
+ if (CHAR_CHARSET_P (c, charset))
+ code = ENCODE_CHAR (charset, c);
+ else
+ charset = char_charset (c, charset_list, &code);
}
else
charset = char_charset (c, charset_list, &code);
#define SAFE_CHARSET_P(coding, id) \
((id) <= (coding)->max_charset_id \
- && (coding)->safe_charsets[id] >= 0)
+ && (coding)->safe_charsets[id] != 255)
#define SHIFT_OUT_OK(category) \
max_charset_id = id;
}
- safe_charsets = Fmake_string (make_number (max_charset_id + 1),
- make_number (255));
+ safe_charsets = make_uninit_string (max_charset_id + 1);
+ memset (SDATA (safe_charsets), 255, max_charset_id + 1);
request = AREF (attrs, coding_attr_iso_request);
reg_usage = AREF (attrs, coding_attr_iso_usage);
reg94 = XINT (XCAR (reg_usage));
int i;
int rejected = 0;
int found = 0;
+ int composition_count = -1;
detect_info->checked |= CATEGORY_MASK_ISO;
struct coding_system *this = &(coding_categories[i]);
Lisp_Object attrs, val;
+ if (this->id < 0)
+ continue;
attrs = CODING_ID_ATTRS (this->id);
if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
- && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
+ && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
setup_iso_safe_charsets (attrs);
val = CODING_ATTR_SAFE_CHARSETS (attrs);
this->max_charset_id = SCHARS (val) - 1;
- this->safe_charsets = (char *) SDATA (val);
+ this->safe_charsets = SDATA (val);
}
/* A coding system of this category is always ASCII compatible. */
rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
break;
}
+ else if (c == '1')
+ {
+ /* End of composition. */
+ if (composition_count < 0
+ || composition_count > MAX_COMPOSITION_COMPONENTS)
+ /* Invalid */
+ break;
+ composition_count = -1;
+ found |= CATEGORY_MASK_ISO;
+ }
else if (c >= '0' && c <= '4')
{
/* ESC <Fp> for start/end composition. */
- found |= CATEGORY_MASK_ISO;
+ composition_count = 0;
break;
}
else
continue;
if (c < 0x80)
{
+ if (composition_count >= 0)
+ composition_count++;
single_shifting = 0;
break;
}
}
if (i & 1 && src < src_end)
- rejected |= CATEGORY_MASK_ISO_8_2;
+ {
+ rejected |= CATEGORY_MASK_ISO_8_2;
+ if (composition_count >= 0)
+ composition_count += i;
+ }
else
- found |= CATEGORY_MASK_ISO_8_2;
+ {
+ found |= CATEGORY_MASK_ISO_8_2;
+ if (composition_count >= 0)
+ composition_count += i / 2;
+ }
}
break;
}
} while (0)
-#define MAYBE_FINISH_COMPOSITION() \
+/* Handle these composition sequences (ALT: alternate char):
+
+ (1) relative composition: ESC 0 CHAR ... ESC 1
+ (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
+ (3) altchar composition: ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
+ (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
+
+ When the start sequence (ESC 0/2/3/4) is found, this annotation
+ header is produced.
+
+ [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
+
+ Then, upon reading CHAR or RULE (one or two bytes), these codes are
+ produced until the end sequence (ESC 1) is found:
+
+ (1) CHAR ... CHAR
+ (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
+ (3) ALT ... ALT -1 -1 CHAR ... CHAR
+ (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
+
+ When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
+ annotation header are updated as below:
+
+ (1) LENGTH: unchanged, NCHARS: number of CHARs
+ (2) LENGTH: unchanged, NCHARS: number of CHARs
+ (3) LENGTH: += number of ALTs + 2, NCHARS: number of CHARs
+ (4) LENGTH: += number of ALTs * 3, NCHARS: number of CHARs
+
+ If an error is found while composing, the annotation header is
+ changed to:
+
+ [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
+
+ and the sequence [ -2 DECODED-RULE ] is changed to the original
+ byte sequence as below:
+ o the original byte sequence is B: [ B -1 ]
+ o the original byte sequence is B1 B2: [ B1 B2 ]
+ and the sequence [ -1 -1 ] is changed to the original byte
+ sequence:
+ [ ESC '0' ]
+*/
+
+/* Decode a composition rule from C1 (a variable of the expansion
+ site, not a macro argument) and maybe one more byte from the
+ source. Set RULE to the encoded composition rule and NBYTES to
+ the number of bytes the rule occupied (1 or 2). If C1 is below
+ 32, RULE is left negative and NBYTES is not assigned; callers
+ must test RULE before using NBYTES. */
+
+#define DECODE_COMPOSITION_RULE(rule, nbytes) \
+ do { \
+ rule = c1 - 32; \
+ if (rule < 0) \
+ break; \
+ if (rule < 81) /* old format (before ver.21) */ \
+ { \
+ int gref = (rule) / 9; \
+ int nref = (rule) % 9; \
+ if (gref == 4) gref = 10; \
+ if (nref == 4) nref = 10; \
+ rule = COMPOSITION_ENCODE_RULE (gref, nref); \
+ nbytes = 1; \
+ } \
+ else /* new format (after ver.21) */ \
+ { \
+ int c; \
+ \
+ ONE_MORE_BYTE (c); \
+ rule = COMPOSITION_ENCODE_RULE (rule - 81, c - 32); \
+ if (rule >= 0) \
+ rule += 0x100; /* to distinguish it from the old format */ \
+ nbytes = 2; \
+ } \
+ } while (0)
+
+#define ENCODE_COMPOSITION_RULE(rule) \
do { \
- int i; \
- if (composition_state == COMPOSING_NO) \
- break; \
- /* It is assured that we have enough room for producing \
- characters stored in the table `components'. */ \
- if (charbuf + component_idx > charbuf_end) \
- goto no_more_source; \
- composition_state = COMPOSING_NO; \
- if (method == COMPOSITION_RELATIVE \
- || method == COMPOSITION_WITH_ALTCHARS) \
+ int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
+ \
+ if (rule < 0x100) /* old format */ \
{ \
- for (i = 0; i < component_idx; i++) \
- *charbuf++ = components[i]; \
- char_offset += component_idx; \
+ if (gref == 10) gref = 4; \
+ if (nref == 10) nref = 4; \
+ charbuf[idx] = 32 + gref * 9 + nref; \
+ charbuf[idx + 1] = -1; \
+ new_chars++; \
} \
- else \
+ else /* new format */ \
{ \
- for (i = 0; i < component_idx; i += 2) \
- *charbuf++ = components[i]; \
- char_offset += (component_idx / 2) + 1; \
+ charbuf[idx] = 32 + 81 + gref; \
+ charbuf[idx + 1] = 32 + nref; \
+ new_chars += 2; \
} \
} while (0)
+/* Finish the current composition as invalid. The
+ CMP_STATUS->length elements that end just before CHARBUF are
+ rewritten in place: the header gets the ESC sequence back, and for
+ the rule/altchar methods each [ -2 RULE ] and [ -1 -1 ] pair gets
+ its source bytes back. CMP_STATUS->state is set to COMPOSING_NO.
+ The return value is the count MAYBE_FINISH_COMPOSITION adds to
+ char_offset. */
+
+static int finish_composition P_ ((int *, struct composition_status *));
+
+static int
+finish_composition (charbuf, cmp_status)
+ int *charbuf;
+ struct composition_status *cmp_status;
+{
+ int idx = - cmp_status->length;
+ int new_chars;
+
+ /* Recover the original ESC sequence in the first two header
+ elements. The remaining three become -2, 0, -1 (presumably
+ filler that later consumers skip -- confirm). */
+ charbuf[idx++] = ISO_CODE_ESC;
+ charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
+ : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
+ : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
+ /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
+ : '4');
+ charbuf[idx++] = -2;
+ charbuf[idx++] = 0;
+ charbuf[idx++] = -1;
+ new_chars = cmp_status->nchars; /* NOTE(review): ESC and the digit restored above are not counted here -- confirm intended. */
+ if (cmp_status->method >= COMPOSITION_WITH_RULE)
+ for (; idx < 0; idx++)
+ {
+ int elt = charbuf[idx];
+
+ if (elt == -2)
+ {
+ /* [ -2 RULE ]: the macro overwrites charbuf[idx] and
+ charbuf[idx + 1] and bumps new_chars of this scope. */
+ ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
+ idx++;
+ }
+ else if (elt == -1)
+ {
+ /* [ -1 -1 ]: put back the ESC 0 that ended the ALT part. */
+ charbuf[idx++] = ISO_CODE_ESC;
+ charbuf[idx] = '0';
+ new_chars += 2;
+ }
+ }
+ cmp_status->state = COMPOSING_NO;
+ return new_chars;
+}
+
+/* If characters are under composition, finish the composition. */
+#define MAYBE_FINISH_COMPOSITION() \
+ do { \
+ if (cmp_status->state != COMPOSING_NO) \
+ char_offset += finish_composition (charbuf, cmp_status); \
+ } while (0)
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
+
ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
- */
-#define DECODE_COMPOSITION_START(c1) \
- do { \
- if (c1 == '0' \
- && composition_state == COMPOSING_COMPONENT_RULE) \
- { \
- component_len = component_idx; \
- composition_state = COMPOSING_CHAR; \
- } \
- else \
- { \
- const unsigned char *p; \
- \
- MAYBE_FINISH_COMPOSITION (); \
- if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \
- goto no_more_source; \
- for (p = src; p < src_end - 1; p++) \
- if (*p == ISO_CODE_ESC && p[1] == '1') \
- break; \
- if (p == src_end - 1) \
- { \
- /* The current composition doesn't end in the current \
- source. */ \
- record_conversion_result \
- (coding, CODING_RESULT_INSUFFICIENT_SRC); \
- goto no_more_source; \
- } \
- \
- /* This is surely the start of a composition. */ \
- method = (c1 == '0' ? COMPOSITION_RELATIVE \
- : c1 == '2' ? COMPOSITION_WITH_RULE \
- : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
- : COMPOSITION_WITH_RULE_ALTCHARS); \
- composition_state = (c1 <= '2' ? COMPOSING_CHAR \
- : COMPOSING_COMPONENT_CHAR); \
- component_idx = component_len = 0; \
- } \
+ Produce this annotation sequence now:
+
+ [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
+*/
+
+#define DECODE_COMPOSITION_START(c1) \
+ do { /* C1 is the byte after ESC: '0', '2', '3' or '4'. */ \
+ if (c1 == '0' \
+ && ((cmp_status->state == COMPOSING_COMPONENT_CHAR \
+ && cmp_status->method == COMPOSITION_WITH_ALTCHARS) \
+ || (cmp_status->state == COMPOSING_COMPONENT_RULE \
+ && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
+ { /* ESC 0 inside an altchar composition ends the ALT part. */ \
+ *charbuf++ = -1; /* [ -1 -1 ] separates the ALTs from the CHARs. */ \
+ *charbuf++= -1; \
+ cmp_status->state = COMPOSING_CHAR; \
+ cmp_status->length += 2; \
+ } \
+ else \
+ { /* A new composition starts; close any one still open first. */ \
+ MAYBE_FINISH_COMPOSITION (); \
+ cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE \
+ : c1 == '2' ? COMPOSITION_WITH_RULE \
+ : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
+ : COMPOSITION_WITH_RULE_ALTCHARS); \
+ cmp_status->state /* '3' and '4' begin with the ALT components. */ \
+ = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR); \
+ ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
+ cmp_status->length = MAX_ANNOTATION_LENGTH; \
+ cmp_status->nchars = cmp_status->ncomps = 0; \
+ coding->annotated = 1; \
+ } \
+ } while (0)
-/* Handle compositoin end sequence ESC 1. */
+/* Handle composition end sequence ESC 1. */
#define DECODE_COMPOSITION_END() \
do { \
- int nchars = (component_len > 0 ? component_idx - component_len \
- : method == COMPOSITION_RELATIVE ? component_idx \
- : (component_idx + 1) / 2); \
- int i; \
- int *saved_charbuf = charbuf; \
- \
- ADD_COMPOSITION_DATA (charbuf, nchars, method); \
- if (method != COMPOSITION_RELATIVE) \
+ if (cmp_status->nchars == 0 \
+ || ((cmp_status->state == COMPOSING_CHAR) \
+ == (cmp_status->method == COMPOSITION_WITH_RULE))) \
{ \
- if (component_len == 0) \
- for (i = 0; i < component_idx; i++) \
- *charbuf++ = components[i]; \
- else \
- for (i = 0; i < component_len; i++) \
- *charbuf++ = components[i]; \
- *saved_charbuf = saved_charbuf - charbuf; \
+ MAYBE_FINISH_COMPOSITION (); \
+ goto invalid_code; \
} \
- if (method == COMPOSITION_WITH_RULE) \
- for (i = 0; i < component_idx; i += 2, char_offset++) \
- *charbuf++ = components[i]; \
- else \
- for (i = component_len; i < component_idx; i++, char_offset++) \
- *charbuf++ = components[i]; \
- coding->annotated = 1; \
- composition_state = COMPOSING_NO; \
+ if (cmp_status->method == COMPOSITION_WITH_ALTCHARS) \
+ charbuf[- cmp_status->length] -= cmp_status->ncomps + 2; \
+ else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS) \
+ charbuf[- cmp_status->length] -= cmp_status->ncomps * 3; \
+ charbuf[- cmp_status->length + 2] = cmp_status->nchars; \
+ char_offset += cmp_status->nchars; \
+ cmp_status->state = COMPOSING_NO; \
} while (0)
+/* Store a composition rule RULE in charbuf, and update cmp_status. */
-/* Decode a composition rule from the byte C1 (and maybe one more byte
- from SRC) and store one encoded composition rule in
- coding->cmp_data. */
+#define STORE_COMPOSITION_RULE(rule) \
+ do { /* Append the pair [ -2 RULE ] at charbuf. */ \
+ *charbuf++ = -2; \
+ *charbuf++ = rule; \
+ cmp_status->length += 2; \
+ cmp_status->state--; /* presumably steps from a *_RULE state to the matching *_CHAR state, which relies on their numeric order -- confirm against the state definitions */ \
+ } while (0)
-#define DECODE_COMPOSITION_RULE(c1) \
+/* Store a composed char or a component char C in charbuf, and update
+ cmp_status. */
+
+#define STORE_COMPOSITION_CHAR(c) \
do { \
- (c1) -= 32; \
- if (c1 < 81) /* old format (before ver.21) */ \
- { \
- int gref = (c1) / 9; \
- int nref = (c1) % 9; \
- if (gref == 4) gref = 10; \
- if (nref == 4) nref = 10; \
- c1 = COMPOSITION_ENCODE_RULE (gref, nref); \
- } \
- else if (c1 < 93) /* new format (after ver.21) */ \
- { \
- ONE_MORE_BYTE (c2); \
- c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
- } \
+ *charbuf++ = (c); \
+ cmp_status->length++; \
+ if (cmp_status->state == COMPOSING_CHAR) \
+ cmp_status->nchars++; \
else \
- c1 = 0; \
+ cmp_status->ncomps++; \
+ if (cmp_status->method == COMPOSITION_WITH_RULE \
+ || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS \
+ && cmp_status->state == COMPOSING_COMPONENT_CHAR)) \
+ cmp_status->state++; \
} while (0)
const unsigned char *src_base;
int *charbuf = coding->charbuf + coding->charbuf_used;
int *charbuf_end
- = coding->charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
+ = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
int consumed_chars = 0, consumed_chars_base;
int multibytep = coding->src_multibyte;
/* Charsets invoked to graphic plane 0 and 1 respectively. */
int charset_id_2, charset_id_3;
struct charset *charset;
int c;
- /* For handling composition sequence. */
-#define COMPOSING_NO 0
-#define COMPOSING_CHAR 1
-#define COMPOSING_RULE 2
-#define COMPOSING_COMPONENT_CHAR 3
-#define COMPOSING_COMPONENT_RULE 4
-
- int composition_state = COMPOSING_NO;
- enum composition_method method;
- int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
- int component_idx;
- int component_len;
+ struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
Lisp_Object attrs, charset_list;
int char_offset = coding->produced_char;
int last_offset = char_offset;
int last_id = charset_ascii;
+ int eol_crlf =
+ !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
+ int byte_after_cr = -1;
+ int i;
CODING_GET_INFO (coding, attrs, charset_list);
setup_iso_safe_charsets (attrs);
/* Charset list may have been changed. */
charset_list = CODING_ATTR_CHARSET_LIST (attrs);
- coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
+ coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
+
+ if (cmp_status->state != COMPOSING_NO)
+ {
+ for (i = 0; i < cmp_status->length; i++)
+ *charbuf++ = cmp_status->carryover[i];
+ coding->annotated = 1;
+ }
while (1)
{
consumed_chars_base = consumed_chars;
if (charbuf >= charbuf_end)
- break;
+ {
+ if (byte_after_cr >= 0)
+ src_base--;
+ break;
+ }
- ONE_MORE_BYTE (c1);
+ if (byte_after_cr >= 0)
+ c1 = byte_after_cr, byte_after_cr = -1;
+ else
+ ONE_MORE_BYTE (c1);
if (c1 < 0)
goto invalid_code;
- /* We produce at most one character. */
- switch (iso_code_class [c1])
+ if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
{
- case ISO_0x20_or_0x7F:
- if (composition_state != COMPOSING_NO)
+ *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
+ char_offset++;
+ CODING_ISO_EXTSEGMENT_LEN (coding)--;
+ continue;
+ }
+
+ if (CODING_ISO_EMBEDDED_UTF_8 (coding))
+ {
+ if (c1 == ISO_CODE_ESC)
{
- if (composition_state == COMPOSING_RULE
- || composition_state == COMPOSING_COMPONENT_RULE)
+ if (src + 1 >= src_end)
+ goto no_more_source;
+ *charbuf++ = ISO_CODE_ESC;
+ char_offset++;
+ if (src[0] == '%' && src[1] == '@')
{
- DECODE_COMPOSITION_RULE (c1);
- components[component_idx++] = c1;
- composition_state--;
- continue;
+ src += 2;
+ consumed_chars += 2;
+ char_offset += 2;
+ /* We are sure charbuf can contain two more chars. */
+ *charbuf++ = '%';
+ *charbuf++ = '@';
+ CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
}
}
+ else
+ {
+ *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
+ char_offset++;
+ }
+ continue;
+ }
+
+ if ((cmp_status->state == COMPOSING_RULE
+ || cmp_status->state == COMPOSING_COMPONENT_RULE)
+ && c1 != ISO_CODE_ESC)
+ {
+ int rule, nbytes;
+
+ DECODE_COMPOSITION_RULE (rule, nbytes);
+ if (rule < 0)
+ goto invalid_code;
+ STORE_COMPOSITION_RULE (rule);
+ continue;
+ }
+
+ /* We produce at most one character. */
+ switch (iso_code_class [c1])
+ {
+ case ISO_0x20_or_0x7F:
if (charset_id_0 < 0
|| ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
/* This is SPACE or DEL. */
break;
case ISO_graphic_plane_0:
- if (composition_state != COMPOSING_NO)
- {
- if (composition_state == COMPOSING_RULE
- || composition_state == COMPOSING_COMPONENT_RULE)
- {
- DECODE_COMPOSITION_RULE (c1);
- components[component_idx++] = c1;
- composition_state--;
- continue;
- }
- }
if (charset_id_0 < 0)
charset = CHARSET_FROM_ID (charset_ascii);
else
break;
case ISO_control_0:
+ if (eol_crlf && c1 == '\r')
+ ONE_MORE_BYTE (byte_after_cr);
MAYBE_FINISH_COMPOSITION ();
charset = CHARSET_FROM_ID (charset_ascii);
break;
case ISO_control_1:
- MAYBE_FINISH_COMPOSITION ();
goto invalid_code;
case ISO_shift_out:
case '0': case '2': case '3': case '4': /* start composition */
if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
goto invalid_code;
+ if (last_id != charset_ascii)
+ {
+ ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
+ last_id = charset_ascii;
+ last_offset = char_offset;
+ }
DECODE_COMPOSITION_START (c1);
continue;
case '1': /* end composition */
- if (composition_state == COMPOSING_NO)
+ if (cmp_status->state == COMPOSING_NO)
goto invalid_code;
DECODE_COMPOSITION_END ();
continue;
int size;
ONE_MORE_BYTE (dim);
+ if (dim < 0 || dim > 4)
+ goto invalid_code;
ONE_MORE_BYTE (M);
+ if (M < 128)
+ goto invalid_code;
ONE_MORE_BYTE (L);
+ if (L < 128)
+ goto invalid_code;
size = ((M - 128) * 128) + (L - 128);
- if (charbuf + 8 + size > charbuf_end)
+ if (charbuf + 6 > charbuf_end)
goto break_loop;
*charbuf++ = ISO_CODE_ESC;
*charbuf++ = '%';
*charbuf++ = dim;
*charbuf++ = BYTE8_TO_CHAR (M);
*charbuf++ = BYTE8_TO_CHAR (L);
- while (size-- > 0)
- {
- ONE_MORE_BYTE (c1);
- *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
- }
+ CODING_ISO_EXTSEGMENT_LEN (coding) = size;
}
else if (c1 == 'G')
{
ESC % G --UTF-8-BYTES-- ESC % @
We keep these bytes as is for the moment.
They may be decoded by post-read-conversion. */
- int *p = charbuf;
-
- if (p + 6 > charbuf_end)
- goto break_loop;
- *p++ = ISO_CODE_ESC;
- *p++ = '%';
- *p++ = 'G';
- while (p < charbuf_end)
- {
- ONE_MORE_BYTE (c1);
- if (c1 == ISO_CODE_ESC
- && src + 1 < src_end
- && src[0] == '%'
- && src[1] == '@')
- {
- src += 2;
- break;
- }
- *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
- }
- if (p + 3 > charbuf_end)
+ if (charbuf + 3 > charbuf_end)
goto break_loop;
- *p++ = ISO_CODE_ESC;
- *p++ = '%';
- *p++ = '@';
- charbuf = p;
+ *charbuf++ = ISO_CODE_ESC;
+ *charbuf++ = '%';
+ *charbuf++ = 'G';
+ CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
}
else
goto invalid_code;
}
}
- if (charset->id != charset_ascii
+ if (cmp_status->state == COMPOSING_NO
+ && charset->id != charset_ascii
&& last_id != charset->id)
{
if (last_id != charset_ascii)
*charbuf++ = BYTE8_TO_CHAR (*src_base);
}
}
- else if (composition_state == COMPOSING_NO)
+ else if (cmp_status->state == COMPOSING_NO)
{
*charbuf++ = c;
char_offset++;
}
- else
+ else if ((cmp_status->state == COMPOSING_CHAR
+ ? cmp_status->nchars
+ : cmp_status->ncomps)
+ >= MAX_COMPOSITION_COMPONENTS)
{
- components[component_idx++] = c;
- if (method == COMPOSITION_WITH_RULE
- || (method == COMPOSITION_WITH_RULE_ALTCHARS
- && composition_state == COMPOSING_COMPONENT_CHAR))
- composition_state++;
+ /* Too long composition. */
+ MAYBE_FINISH_COMPOSITION ();
+ *charbuf++ = c;
+ char_offset++;
}
+ else
+ STORE_COMPOSITION_CHAR (c);
continue;
invalid_code:
}
no_more_source:
- if (last_id != charset_ascii)
+ if (cmp_status->state != COMPOSING_NO)
+ {
+ if (coding->mode & CODING_MODE_LAST_BLOCK)
+ MAYBE_FINISH_COMPOSITION ();
+ else
+ {
+ charbuf -= cmp_status->length;
+ for (i = 0; i < cmp_status->length; i++)
+ cmp_status->carryover[i] = charbuf[i];
+ }
+ }
+ else if (last_id != charset_ascii)
ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
coding->consumed_char += consumed_chars_base;
coding->consumed = src_base - coding->source;
int preferred_charset_id = -1;
CODING_GET_INFO (coding, attrs, charset_list);
- eol_type = CODING_ID_EOL_TYPE (coding->id);
+ eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
if (VECTORP (eol_type))
eol_type = Qunix;
setup_iso_safe_charsets (attrs);
/* Charset list may have been changed. */
charset_list = CODING_ATTR_CHARSET_LIST (attrs);
- coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
+ coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
int char_offset = coding->produced_char;
int last_offset = char_offset;
int last_id = charset_ascii;
+ int eol_crlf =
+ !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
+ int byte_after_cr = -1;
CODING_GET_INFO (coding, attrs, charset_list);
consumed_chars_base = consumed_chars;
if (charbuf >= charbuf_end)
- break;
+ {
+ if (byte_after_cr >= 0)
+ src_base--;
+ break;
+ }
- ONE_MORE_BYTE (c);
+ if (byte_after_cr >= 0)
+ c = byte_after_cr, byte_after_cr = -1;
+ else
+ ONE_MORE_BYTE (c);
if (c < 0)
goto invalid_code;
if (c < 0x80)
- charset = charset_roman;
+ {
+ if (eol_crlf && c == '\r')
+ ONE_MORE_BYTE (byte_after_cr);
+ charset = charset_roman;
+ }
else if (c == 0x80 || c == 0xA0)
goto invalid_code;
else if (c >= 0xA1 && c <= 0xDF)
int char_offset = coding->produced_char;
int last_offset = char_offset;
int last_id = charset_ascii;
+ int eol_crlf =
+ !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
+ int byte_after_cr = -1;
CODING_GET_INFO (coding, attrs, charset_list);
val = charset_list;
consumed_chars_base = consumed_chars;
if (charbuf >= charbuf_end)
- break;
+ {
+ if (byte_after_cr >= 0)
+ src_base--;
+ break;
+ }
- ONE_MORE_BYTE (c);
+ if (byte_after_cr >= 0)
+ c = byte_after_cr, byte_after_cr = -1;
+ else
+ ONE_MORE_BYTE (c);
if (c < 0)
goto invalid_code;
if (c < 0x80)
- charset = charset_roman;
+ {
+ if (eol_crlf && c == '\r')
+ ONE_MORE_BYTE (byte_after_cr);
+ charset = charset_roman;
+ }
else
{
/* BIG5 -> Big5 */
else
{
ASSURE_DESTINATION (ccl.produced);
- for (i = 0; i < ccl.produced; i++)
+ for (i = 0; i < ccl.produced; i++)
*dst++ = destination_charbuf[i] & 0xFF;
produced_chars += ccl.produced;
}
decode_coding_raw_text (coding)
struct coding_system *coding;
{
+ int eol_crlf = /* Nonzero only when EOL conversion is not inhibited and the EOL type is Qdos.  */
+ !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
+
coding->chars_at_source = 1;
- coding->consumed_char = 0;
- coding->consumed = 0;
- record_conversion_result (coding, CODING_RESULT_SUCCESS);
+ coding->consumed_char = coding->src_chars;
+ coding->consumed = coding->src_bytes; /* Start by counting the whole source as consumed.
+    When CRLF decoding is active and the last source byte is a CR, that
+    one byte is left unconsumed below and the result is reported as
+    CODING_RESULT_INSUFFICIENT_SRC instead of CODING_RESULT_SUCCESS
+    (presumably so the CR can be paired with an LF arriving in a later
+    block -- confirm against the caller).
+    NOTE(review): the index src_bytes - 1 is read with no visible
+    src_bytes > 0 check; confirm that an empty source cannot reach
+    this point when eol_crlf is nonzero.  */
+ if (eol_crlf && coding->source[coding->src_bytes - 1] == '\r')
+ {
+ coding->consumed_char--;
+ coding->consumed--;
+ record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
+ }
+ else
+ record_conversion_result (coding, CODING_RESULT_SUCCESS);
}
static int
*dst++ = CHAR_TO_BYTE8 (c);
else
CHAR_STRING_ADVANCE (c, dst);
- produced_chars++;
}
}
else
ASSURE_DESTINATION (charbuf_end - charbuf);
while (charbuf < charbuf_end && dst < dst_end)
*dst++ = *charbuf++;
- produced_chars = dst - (coding->destination + coding->dst_bytes);
}
+ produced_chars = dst - (coding->destination + coding->produced);
}
record_conversion_result (coding, CODING_RESULT_SUCCESS);
coding->produced_char += produced_chars;
const unsigned char *src_end = coding->source + coding->src_bytes;
int multibytep = coding->src_multibyte;
int consumed_chars = 0;
- Lisp_Object attrs, valids;
+ Lisp_Object attrs, valids, name;
int found = 0;
int head_ascii = coding->head_ascii;
+ int check_latin_extra = 0;
detect_info->checked |= CATEGORY_MASK_CHARSET;
coding = &coding_categories[coding_category_charset];
attrs = CODING_ID_ATTRS (coding->id);
valids = AREF (attrs, coding_attr_charset_valids);
+ name = CODING_ID_NAME (coding->id);
+ if (strncmp ((char *) SDATA (SYMBOL_NAME (name)),
+ "iso-8859-", sizeof ("iso-8859-") - 1) == 0
+ || strncmp ((char *) SDATA (SYMBOL_NAME (name)),
+ "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
+ check_latin_extra = 1;
if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
src += head_ascii;
if (NILP (val))
break;
if (c >= 0x80)
- found = CATEGORY_MASK_CHARSET;
+ {
+ if (c < 0xA0
+ && check_latin_extra
+ && (!VECTORP (Vlatin_extra_code_table)
+ || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])))
+ break;
+ found = CATEGORY_MASK_CHARSET;
+ }
if (INTEGERP (val))
{
charset = CHARSET_FROM_ID (XFASTINT (val));
if (src == src_end)
goto too_short;
ONE_MORE_BYTE (c);
- if (c < charset->code_space[(dim - 1 - idx) * 2]
+ if (c < charset->code_space[(dim - 1 - idx) * 2]
|| c > charset->code_space[(dim - 1 - idx) * 2 + 1])
break;
}
int char_offset = coding->produced_char;
int last_offset = char_offset;
int last_id = charset_ascii;
+ int eol_crlf =
+ !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
+ int byte_after_cr = -1;
CODING_GET_INFO (coding, attrs, charset_list);
valids = AREF (attrs, coding_attr_charset_valids);
consumed_chars_base = consumed_chars;
if (charbuf >= charbuf_end)
- break;
+ {
+ if (byte_after_cr >= 0)
+ src_base--;
+ break;
+ }
- ONE_MORE_BYTE (c);
+ if (byte_after_cr >= 0)
+ {
+ c = byte_after_cr;
+ byte_after_cr = -1;
+ }
+ else
+ {
+ ONE_MORE_BYTE (c);
+ if (eol_crlf && c == '\r')
+ ONE_MORE_BYTE (byte_after_cr);
+ }
if (c < 0)
goto invalid_code;
code = c;
val = AREF (valids, c);
- if (NILP (val))
+ if (! INTEGERP (val) && ! CONSP (val))
goto invalid_code;
if (INTEGERP (val))
{
CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
attrs = CODING_ID_ATTRS (coding->id);
- eol_type = CODING_ID_EOL_TYPE (coding->id);
+ eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
coding->mode = 0;
coding->head_ascii = -1;
val = CODING_ATTR_SAFE_CHARSETS (attrs);
coding->max_charset_id = SCHARS (val) - 1;
- coding->safe_charsets = (char *) SDATA (val);
+ coding->safe_charsets = SDATA (val);
coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
coding_type = CODING_ATTR_TYPE (attrs);
setup_iso_safe_charsets (attrs);
val = CODING_ATTR_SAFE_CHARSETS (attrs);
coding->max_charset_id = SCHARS (val) - 1;
- coding->safe_charsets = (char *) SDATA (val);
+ coding->safe_charsets = SDATA (val);
}
CODING_ISO_FLAGS (coding) = flags;
+ CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
+ CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
+ CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
+ CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
}
else if (EQ (coding_type, Qcharset))
{
}
else if (EQ (coding_type, Qutf_8))
{
+ val = AREF (attrs, coding_attr_utf_bom);
+ CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
+ : EQ (val, Qt) ? utf_with_bom
+ : utf_without_bom);
coding->detector = detect_coding_utf_8;
coding->decoder = decode_coding_utf_8;
coding->encoder = encode_coding_utf_8;
coding->common_flags
|= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
+ if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
+ coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
}
else if (EQ (coding_type, Qutf_16))
{
- val = AREF (attrs, coding_attr_utf_16_bom);
- CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
- : EQ (val, Qt) ? utf_16_with_bom
- : utf_16_without_bom);
+ val = AREF (attrs, coding_attr_utf_bom);
+ CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
+ : EQ (val, Qt) ? utf_with_bom
+ : utf_without_bom);
val = AREF (attrs, coding_attr_utf_16_endian);
CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
: utf_16_little_endian);
coding->encoder = encode_coding_utf_16;
coding->common_flags
|= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
- if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom)
+ if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
}
else if (EQ (coding_type, Qccl))
coding->encoder = encode_coding_emacs_mule;
coding->common_flags
|= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
+ coding->spec.emacs_mule.full_support = 1;
if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
&& ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
{
tail = XCDR (tail))
if (max_charset_id < XFASTINT (XCAR (tail)))
max_charset_id = XFASTINT (XCAR (tail));
- safe_charsets = Fmake_string (make_number (max_charset_id + 1),
- make_number (255));
+ safe_charsets = make_uninit_string (max_charset_id + 1);
+ memset (SDATA (safe_charsets), 255, max_charset_id + 1);
for (tail = Vemacs_mule_charset_list; CONSP (tail);
tail = XCDR (tail))
SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
coding->max_charset_id = max_charset_id;
- coding->safe_charsets = (char *) SDATA (safe_charsets);
+ coding->safe_charsets = SDATA (safe_charsets);
+ coding->spec.emacs_mule.full_support = 1;
}
+ coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
+ coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
}
else if (EQ (coding_type, Qshift_jis))
{
}
+/* Return a list of charsets supported by CODING-SYSTEM.
+
+   The coding system id is obtained from CODING-SYSTEM with
+   CHECK_CODING_SYSTEM_GET_ID, and the attributes of that id select
+   the value returned:
+
+     - type Qiso_2022 with CODING_ISO_FLAG_FULL_SUPPORT set in its
+       coding_attr_iso_flags: Viso_2022_charset_list;
+     - type Qemacs_mule: Vemacs_mule_charset_list (the charset list
+       stored in the attributes is not consulted);
+     - any other case, including Qiso_2022 without the full-support
+       flag: CODING_ATTR_CHARSET_LIST of the attributes.  */
+
+Lisp_Object
+coding_system_charset_list (coding_system)
+ Lisp_Object coding_system;
+{
+ int id;
+ Lisp_Object attrs, charset_list;
+
+ CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
+ attrs = CODING_ID_ATTRS (id);
+
+ if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
+ {
+ int flags = XINT (AREF (attrs, coding_attr_iso_flags));
+
+ if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
+ charset_list = Viso_2022_charset_list;
+ else
+ charset_list = CODING_ATTR_CHARSET_LIST (attrs);
+ }
+ else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
+ {
+ charset_list = Vemacs_mule_charset_list;
+ }
+ else
+ {
+ charset_list = CODING_ATTR_CHARSET_LIST (attrs);
+ }
+ return charset_list;
+}
+
+
/* Return raw-text or one of its subsidiaries that has the same
eol_type as CODING-SYSTEM. */
|| src[lsb + 2] != '\n')
this_eol = EOL_SEEN_CR;
else
- this_eol = EOL_SEEN_CRLF;
+ {
+ this_eol = EOL_SEEN_CRLF;
+ src += 2;
+ }
if (eol_seen == EOL_SEEN_NONE)
/* This is the first end-of-line. */
eol_seen = this_eol;
else if (eol_seen != this_eol)
{
- /* The found type is different from what found before. */
- eol_seen = EOL_SEEN_LF;
- break;
+ /* The found type is different from what was found before.
+ Allow for stray ^M characters in DOS EOL files. */
+ if (eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF
+ || eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR)
+ eol_seen = EOL_SEEN_CRLF;
+ else
+ {
+ eol_seen = EOL_SEEN_LF;
+ break;
+ }
}
if (++total == MAX_EOL_CHECK_COUNT)
break;
eol_seen = this_eol;
else if (eol_seen != this_eol)
{
- /* The found type is different from what found before. */
- eol_seen = EOL_SEEN_LF;
- break;
+ /* The found type is different from what was found before.
+ Allow for stray ^M characters in DOS EOL files. */
+ if (eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF
+ || eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR)
+ eol_seen = EOL_SEEN_CRLF;
+ else
+ {
+ eol_seen = EOL_SEEN_LF;
+ break;
+ }
}
if (++total == MAX_EOL_CHECK_COUNT)
break;
struct coding_system *coding;
{
const unsigned char *src, *src_end;
+ int saved_mode = coding->mode;
coding->consumed = coding->consumed_char = 0;
coding->produced = coding->produced_char = 0;
coding_set_source (coding);
src_end = coding->source + coding->src_bytes;
+ coding->head_ascii = 0;
/* If we have not yet decided the text encoding type, detect it
now. */
{
int c, i;
struct coding_detection_info detect_info;
+ int null_byte_found = 0, eight_bit_found = 0;
detect_info.checked = detect_info.found = detect_info.rejected = 0;
- for (i = 0, src = coding->source; src < src_end; i++, src++)
+ for (src = coding->source; src < src_end; src++)
{
c = *src;
if (c & 0x80)
- break;
- if (c < 0x20
- && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
- && ! inhibit_iso_escape_detection
- && ! detect_info.checked)
{
- coding->head_ascii = src - (coding->source + coding->consumed);
- if (detect_coding_iso_2022 (coding, &detect_info))
+ eight_bit_found = 1;
+ if (null_byte_found)
+ break;
+ }
+ else if (c < 0x20)
+ {
+ if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
+ && ! inhibit_iso_escape_detection
+ && ! detect_info.checked)
{
- /* We have scanned the whole data. */
- if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
- /* We didn't find an 8-bit code. */
- src = src_end;
- break;
+ if (detect_coding_iso_2022 (coding, &detect_info))
+ {
+ /* We have scanned the whole data. */
+ if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
+ {
+ /* We didn't find an 8-bit code. We may
+ have found a null-byte, but it's very
+ rare that a binary file conforms to
+ ISO-2022. */
+ src = src_end;
+ coding->head_ascii = src - coding->source;
+ }
+ detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
+ break;
+ }
+ }
+ else if (! c && !inhibit_null_byte_detection)
+ {
+ null_byte_found = 1;
+ if (eight_bit_found)
+ break;
}
+ if (! eight_bit_found)
+ coding->head_ascii++;
}
+ else if (! eight_bit_found)
+ coding->head_ascii++;
}
- coding->head_ascii = src - (coding->source + coding->consumed);
- if (coding->head_ascii < coding->src_bytes
+ if (null_byte_found || eight_bit_found
+ || coding->head_ascii < coding->src_bytes
|| detect_info.found)
{
enum coding_category category;
break;
}
else
- for (i = 0; i < coding_category_raw_text; i++)
- {
- category = coding_priorities[i];
- this = coding_categories + category;
- if (this->id < 0)
- {
- /* No coding system of this category is defined. */
- detect_info.rejected |= (1 << category);
- }
- else if (category >= coding_category_raw_text)
- continue;
- else if (detect_info.checked & (1 << category))
- {
- if (detect_info.found & (1 << category))
+ {
+ if (null_byte_found)
+ {
+ detect_info.checked |= ~CATEGORY_MASK_UTF_16;
+ detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
+ }
+ for (i = 0; i < coding_category_raw_text; i++)
+ {
+ category = coding_priorities[i];
+ this = coding_categories + category;
+ if (this->id < 0)
+ {
+ /* No coding system of this category is defined. */
+ detect_info.rejected |= (1 << category);
+ }
+ else if (category >= coding_category_raw_text)
+ continue;
+ else if (detect_info.checked & (1 << category))
+ {
+ if (detect_info.found & (1 << category))
+ break;
+ }
+ else if ((*(this->detector)) (coding, &detect_info)
+ && detect_info.found & (1 << category))
+ {
+ if (category == coding_category_utf_16_auto)
+ {
+ if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
+ category = coding_category_utf_16_le;
+ else
+ category = coding_category_utf_16_be;
+ }
break;
- }
- else if ((*(this->detector)) (coding, &detect_info)
- && detect_info.found & (1 << category))
- {
- if (category == coding_category_utf_16_auto)
- {
- if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
- category = coding_category_utf_16_le;
- else
- category = coding_category_utf_16_be;
- }
- break;
- }
- }
-
+ }
+ }
+ }
+
if (i < coding_category_raw_text)
setup_coding_system (CODING_ID_NAME (this->id), coding);
- else if (detect_info.rejected == CATEGORY_MASK_ANY)
+ else if (null_byte_found)
+ setup_coding_system (Qno_conversion, coding);
+ else if ((detect_info.rejected & CATEGORY_MASK_ANY)
+ == CATEGORY_MASK_ANY)
setup_coding_system (Qraw_text, coding);
else if (detect_info.rejected)
for (i = 0; i < coding_category_raw_text; i++)
}
}
}
+ else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
+ == coding_category_utf_8_auto)
+ {
+ Lisp_Object coding_systems;
+ struct coding_detection_info detect_info;
+
+ coding_systems
+ = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
+ detect_info.found = detect_info.rejected = 0;
+ coding->head_ascii = 0;
+ if (CONSP (coding_systems)
+ && detect_coding_utf_8 (coding, &detect_info))
+ {
+ if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
+ setup_coding_system (XCAR (coding_systems), coding);
+ else
+ setup_coding_system (XCDR (coding_systems), coding);
+ }
+ }
else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
== coding_category_utf_16_auto)
{
struct coding_detection_info detect_info;
coding_systems
- = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom);
+ = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
detect_info.found = detect_info.rejected = 0;
+ coding->head_ascii = 0;
if (CONSP (coding_systems)
&& detect_coding_utf_16 (coding, &detect_info))
{
setup_coding_system (XCDR (coding_systems), coding);
}
}
+ coding->mode = saved_mode;
}
{
Lisp_Object eol_type;
unsigned char *p, *pbeg, *pend;
-
+
eol_type = CODING_ID_EOL_TYPE (coding->id);
- if (EQ (eol_type, Qunix))
+ if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
return;
if (NILP (coding->dst_object))
eol_seen |= EOL_SEEN_CR;
}
}
- if (eol_seen != EOL_SEEN_NONE
+ /* Handle DOS-style EOLs in a file with stray ^M characters. */
+ if ((eol_seen & EOL_SEEN_CRLF) != 0
+ && (eol_seen & EOL_SEEN_CR) != 0
+ && (eol_seen & EOL_SEEN_LF) == 0)
+ eol_seen = EOL_SEEN_CRLF;
+ else if (eol_seen != EOL_SEEN_NONE
&& eol_seen != EOL_SEEN_LF
&& eol_seen != EOL_SEEN_CRLF
&& eol_seen != EOL_SEEN_CR)
} while (0)
+/* Return a translation of character(s) at BUF according to TRANS.
+ TRANS is TO-CHAR or ((FROM . TO) ...) where
+ FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
+ The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
+ translation is found, and Qnil if not found.
+ If BUF is too short to look up characters in FROM, return Qt. */
+
static Lisp_Object
-get_translation (val, buf, buf_end, last_block, from_nchars, to_nchars)
- Lisp_Object val;
+get_translation (trans, buf, buf_end)
+ Lisp_Object trans;
int *buf, *buf_end;
- int last_block;
- int *from_nchars, *to_nchars;
{
- /* VAL is TO or (([FROM-CHAR ...] . TO) ...) where TO is TO-CHAR or
- [TO-CHAR ...]. */
- if (CONSP (val))
+
+ if (INTEGERP (trans))
+ return trans;
+ for (; CONSP (trans); trans = XCDR (trans))
{
- Lisp_Object from, tail;
- int i, len;
+ Lisp_Object val = XCAR (trans);
+ Lisp_Object from = XCAR (val);
+ int len = ASIZE (from);
+ int i;
- for (tail = val; CONSP (tail); tail = XCDR (tail))
+ for (i = 0; i < len; i++)
{
- val = XCAR (tail);
- from = XCAR (val);
- len = ASIZE (from);
- for (i = 0; i < len; i++)
- {
- if (buf + i == buf_end)
- {
- if (! last_block)
- return Qt;
- break;
- }
- if (XINT (AREF (from, i)) != buf[i])
- break;
- }
- if (i == len)
- {
- val = XCDR (val);
- *from_nchars = len;
- break;
- }
+ if (buf + i == buf_end)
+ return Qt;
+ if (XINT (AREF (from, i)) != buf[i])
+ break;
}
- if (! CONSP (tail))
- return Qnil;
+ if (i == len)
+ return val;
}
- if (VECTORP (val))
- *buf = XINT (AREF (val, 0)), *to_nchars = ASIZE (val);
- else
- *buf = XINT (val);
- return val;
+ return Qnil;
}
{
unsigned char *dst = coding->destination + coding->produced;
unsigned char *dst_end = coding->destination + coding->dst_bytes;
- int produced;
- int produced_chars = 0;
+ EMACS_INT produced;
+ EMACS_INT produced_chars = 0;
int carryover = 0;
if (! coding->chars_at_source)
{
- /* Characters are in coding->charbuf. */
+ /* Source characters are in coding->charbuf. */
int *buf = coding->charbuf;
int *buf_end = buf + coding->charbuf_used;
- if (BUFFERP (coding->src_object)
- && EQ (coding->src_object, coding->dst_object))
- dst_end = ((unsigned char *) coding->source) + coding->consumed;
+ if (EQ (coding->src_object, coding->dst_object))
+ {
+ coding_set_source (coding);
+ dst_end = ((unsigned char *) coding->source) + coding->consumed;
+ }
while (buf < buf_end)
{
LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
if (! NILP (trans))
{
- trans = get_translation (trans, buf, buf_end, last_block,
- &from_nchars, &to_nchars);
- if (EQ (trans, Qt))
+ trans = get_translation (trans, buf, buf_end);
+ if (INTEGERP (trans))
+ c = XINT (trans);
+ else if (CONSP (trans))
+ {
+ from_nchars = ASIZE (XCAR (trans));
+ trans = XCDR (trans);
+ if (INTEGERP (trans))
+ c = XINT (trans);
+ else
+ {
+ to_nchars = ASIZE (trans);
+ c = XINT (AREF (trans, 0));
+ }
+ }
+ else if (EQ (trans, Qt) && ! last_block)
break;
- c = *buf;
}
if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
buf_end - buf
+ MAX_MULTIBYTE_LENGTH * to_nchars,
dst);
- dst_end = coding->destination + coding->dst_bytes;
+ if (EQ (coding->src_object, coding->dst_object))
+ {
+ coding_set_source (coding);
+ dst_end = (((unsigned char *) coding->source)
+ + coding->consumed);
+ }
+ else
+ dst_end = coding->destination + coding->dst_bytes;
}
for (i = 0; i < to_nchars; i++)
c = XINT (AREF (trans, i));
if (coding->dst_multibyte
|| ! CHAR_BYTE8_P (c))
- CHAR_STRING_ADVANCE (c, dst);
+ CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
else
*dst++ = CHAR_TO_BYTE8 (c);
}
produced_chars += to_nchars;
- *buf++ = to_nchars;
- while (--from_nchars > 0)
- *buf++ = 0;
+ buf += from_nchars;
}
else
/* This is an annotation datum. (-C) is the length. */
}
else
{
+ /* Source characters are at coding->source. */
const unsigned char *src = coding->source;
- const unsigned char *src_end = src + coding->src_bytes;
- Lisp_Object eol_type;
-
- eol_type = CODING_ID_EOL_TYPE (coding->id);
+ const unsigned char *src_end = src + coding->consumed;
+ if (EQ (coding->dst_object, coding->src_object))
+ dst_end = (unsigned char *) src;
if (coding->src_multibyte != coding->dst_multibyte)
{
if (coding->src_multibyte)
{
int multibytep = 1;
- int consumed_chars;
+ EMACS_INT consumed_chars = 0;
while (1)
{
int c;
ONE_MORE_BYTE (c);
- if (c == '\r')
+ if (dst == dst_end)
{
- if (EQ (eol_type, Qdos))
+ if (EQ (coding->src_object, coding->dst_object))
+ dst_end = (unsigned char *) src;
+ if (dst == dst_end)
{
- if (src == src_end)
- {
- record_conversion_result
- (coding, CODING_RESULT_INSUFFICIENT_SRC);
- goto no_more_source;
- }
- if (*src == '\n')
- c = *src++;
+ EMACS_INT offset = src - coding->source;
+
+ dst = alloc_destination (coding, src_end - src + 1,
+ dst);
+ dst_end = coding->destination + coding->dst_bytes;
+ coding_set_source (coding);
+ src = coding->source + offset;
+ src_end = coding->source + coding->src_bytes;
+ if (EQ (coding->src_object, coding->dst_object))
+ dst_end = (unsigned char *) src;
}
- else if (EQ (eol_type, Qmac))
- c = '\n';
- }
- if (dst == dst_end)
- {
- coding->consumed = src - coding->source;
-
- if (EQ (coding->src_object, coding->dst_object))
- dst_end = (unsigned char *) src;
- if (dst == dst_end)
- {
- dst = alloc_destination (coding, src_end - src + 1,
- dst);
- dst_end = coding->destination + coding->dst_bytes;
- coding_set_source (coding);
- src = coding->source + coding->consumed;
- src_end = coding->source + coding->src_bytes;
- }
}
*dst++ = c;
produced_chars++;
int multibytep = 1;
int c = *src++;
- if (c == '\r')
- {
- if (EQ (eol_type, Qdos))
- {
- if (src < src_end
- && *src == '\n')
- c = *src++;
- }
- else if (EQ (eol_type, Qmac))
- c = '\n';
- }
if (dst >= dst_end - 1)
{
- coding->consumed = src - coding->source;
-
if (EQ (coding->src_object, coding->dst_object))
dst_end = (unsigned char *) src;
if (dst >= dst_end - 1)
{
- dst = alloc_destination (coding, src_end - src + 2,
- dst);
+ EMACS_INT offset = src - coding->source;
+ EMACS_INT more_bytes;
+
+ if (EQ (coding->src_object, coding->dst_object))
+ more_bytes = ((src_end - src) / 2) + 2;
+ else
+ more_bytes = src_end - src + 2;
+ dst = alloc_destination (coding, more_bytes, dst);
dst_end = coding->destination + coding->dst_bytes;
coding_set_source (coding);
- src = coding->source + coding->consumed;
+ src = coding->source + offset;
src_end = coding->source + coding->src_bytes;
+ if (EQ (coding->src_object, coding->dst_object))
+ dst_end = (unsigned char *) src;
}
}
EMIT_ONE_BYTE (c);
{
if (!EQ (coding->src_object, coding->dst_object))
{
- int require = coding->src_bytes - coding->dst_bytes;
+ EMACS_INT require = coding->src_bytes - coding->dst_bytes;
if (require > 0)
{
src_end = coding->source + coding->src_bytes;
}
}
- produced_chars = coding->src_chars;
+ produced_chars = coding->consumed_char;
while (src < src_end)
- {
- int c = *src++;
-
- if (c == '\r')
- {
- if (EQ (eol_type, Qdos))
- {
- if (src < src_end
- && *src == '\n')
- c = *src++;
- produced_chars--;
- }
- else if (EQ (eol_type, Qmac))
- c = '\n';
- }
- *dst++ = c;
- }
+ *dst++ = *src++;
}
- coding->consumed = coding->src_bytes;
- coding->consumed_char = coding->src_chars;
}
produced = dst - (coding->destination + coding->produced);
/* Compose text in CODING->object according to the annotation data at
CHARBUF. CHARBUF is an array:
- [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
+ [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
*/
static INLINE void
enum composition_method method;
Lisp_Object components;
- len = -charbuf[0];
+ len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
to = pos + charbuf[2];
- if (to <= pos)
- return;
- method = (enum composition_method) (charbuf[3]);
+ method = (enum composition_method) (charbuf[4]);
if (method == COMPOSITION_RELATIVE)
components = Qnil;
- else if (method >= COMPOSITION_WITH_RULE
- && method <= COMPOSITION_WITH_RULE_ALTCHARS)
+ else
{
Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
- int i;
+ int i, j;
- len -= 4;
- charbuf += 4;
- for (i = 0; i < len; i++)
+ if (method == COMPOSITION_WITH_RULE)
+ len = charbuf[2] * 3 - 2;
+ charbuf += MAX_ANNOTATION_LENGTH;
+ /* charbuf = [ CHAR ... CHAR ] or [ CHAR -2 RULE ... CHAR ] */
+ for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
{
- args[i] = make_number (charbuf[i]);
- if (charbuf[i] < 0)
- return;
+ if (charbuf[i] >= 0)
+ args[j] = make_number (charbuf[i]);
+ else
+ {
+ i++;
+ args[j] = make_number (charbuf[i] % 0x100);
+ }
}
- components = (method == COMPOSITION_WITH_ALTCHARS
- ? Fstring (len, args) : Fvector (len, args));
+ components = (i == j ? Fstring (j, args) : Fvector (j, args));
}
- else
- return;
compose_text (pos, to, components, Qnil, coding->dst_object);
}
#define ALLOC_CONVERSION_WORK_AREA(coding) \
do { \
- int size = CHARBUF_SIZE;; \
+ int size = CHARBUF_SIZE; \
\
coding->charbuf = NULL; \
while (size > 1024) \
while (charbuf < charbuf_end)
{
if (*charbuf >= 0)
- pos += *charbuf++;
+ pos++, charbuf++;
else
{
int len = -*charbuf;
- switch (charbuf[1])
- {
- case CODING_ANNOTATE_COMPOSITION_MASK:
- produce_composition (coding, charbuf, pos);
- break;
- case CODING_ANNOTATE_CHARSET_MASK:
- produce_charset (coding, charbuf, pos);
- break;
- default:
- abort ();
- }
+
+ if (len > 2)
+ switch (charbuf[1])
+ {
+ case CODING_ANNOTATE_COMPOSITION_MASK:
+ produce_composition (coding, charbuf, pos);
+ break;
+ case CODING_ANNOTATE_CHARSET_MASK:
+ produce_charset (coding, charbuf, pos);
+ break;
+ }
charbuf += len;
}
}
that the number of data is less than the size of
coding->charbuf. */
coding->charbuf_used = 0;
+ coding->chars_at_source = 0;
+
while (nbytes-- > 0)
{
int c = *src++;
coding->carryover. */
unsigned char *p = coding->carryover;
+ if (nbytes > sizeof coding->carryover)
+ nbytes = sizeof coding->carryover;
coding->carryover_bytes = nbytes;
while (nbytes-- > 0)
*p++ = *src++;
coding->consumed = coding->src_bytes;
}
- if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
+ if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
+ && !inhibit_eol_conversion)
decode_eol (coding);
if (BUFFERP (coding->dst_object))
{
enum composition_method method = COMPOSITION_METHOD (prop);
int nchars = COMPOSITION_LENGTH (prop);
- ADD_COMPOSITION_DATA (buf, nchars, method);
+ ADD_COMPOSITION_DATA (buf, nchars, 0, method);
if (method != COMPOSITION_RELATIVE)
{
Lisp_Object components;
if (! NILP (translation_table))
lookup_buf = alloca (sizeof (int) * max_lookup);
- eol_type = CODING_ID_EOL_TYPE (coding->id);
+ eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
if (VECTORP (eol_type))
eol_type = Qunix;
if (coding->encoder == encode_coding_raw_text)
c = *src++, pos++;
else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
- c = STRING_CHAR_ADVANCE (src), pos += bytes;
+ c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
else
c = BYTE8_TO_CHAR (*src), src++, pos++;
}
else
- c = STRING_CHAR_ADVANCE (src), pos++;
+ c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
c = '\n';
if (! EQ (eol_type, Qunix))
for (i = 1; i < max_lookup && p < src_end; i++)
lookup_buf[i] = STRING_CHAR_ADVANCE (p);
lookup_buf_end = lookup_buf + i;
- trans = get_translation (trans, lookup_buf, lookup_buf_end, 1,
- &from_nchars, &to_nchars);
- if (EQ (trans, Qt)
- || buf + to_nchars > buf_end)
+ trans = get_translation (trans, lookup_buf, lookup_buf_end);
+ if (INTEGERP (trans))
+ c = XINT (trans);
+ else if (CONSP (trans))
+ {
+ from_nchars = ASIZE (XCAR (trans));
+ trans = XCDR (trans);
+ if (INTEGERP (trans))
+ c = XINT (trans);
+ else
+ {
+ to_nchars = ASIZE (trans);
+ if (buf + to_nchars > buf_end)
+ break;
+ c = XINT (AREF (trans, 0));
+ }
+ }
+ else
break;
- *buf++ = *lookup_buf;
+ *buf++ = c;
for (i = 1; i < to_nchars; i++)
*buf++ = XINT (AREF (trans, i));
for (i = 1; i < from_nchars; i++, pos++)
}
else
{
- name = Vcode_conversion_workbuf_name;
- workbuf = Fget_buffer_create (name);
- if (NILP (Vcode_conversion_reused_workbuf))
- Vcode_conversion_reused_workbuf = workbuf;
+ if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
+ Vcode_conversion_reused_workbuf
+ = Fget_buffer_create (Vcode_conversion_workbuf_name);
+ workbuf = Vcode_conversion_reused_workbuf;
}
current = current_buffer;
set_buffer_internal (XBUFFER (workbuf));
- Ferase_buffer ();
+ /* We can't allow modification hooks to run in the work buffer. For
+ instance, directory_files_internal assumes that file decoding
+ doesn't compile new regexps. */
+ Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
+ Ferase_buffer ();
current_buffer->undo_list = Qt;
current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
set_buffer_internal (current);
EMACS_INT chars = to - from;
EMACS_INT bytes = to_byte - from_byte;
Lisp_Object attrs;
- Lisp_Object buffer;
int saved_pt = -1, saved_pt_byte;
int need_marker_adjustment = 0;
+ Lisp_Object old_deactivate_mark;
- buffer = Fcurrent_buffer ();
+ old_deactivate_mark = Vdeactivate_mark;
if (NILP (dst_object))
{
}
saved_pt = PT, saved_pt_byte = PT_BYTE;
TEMP_SET_PT_BOTH (from, from_byte);
+ current_buffer->text->inhibit_shrinking = 1;
del_range_both (from, from_byte, to, to_byte, 1);
coding->src_pos = -chars;
coding->src_pos_byte = -bytes;
|| (! NILP (CODING_ATTR_POST_READ (attrs))
&& NILP (dst_object)))
{
- coding->dst_object = code_conversion_save (1, 1);
+ coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
+ coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
coding->dst_pos = BEG;
coding->dst_pos_byte = BEG_BYTE;
- coding->dst_multibyte = 1;
}
else if (BUFFERP (dst_object))
{
{
code_conversion_save (0, 0);
coding->dst_object = Qnil;
+ /* Most callers presume this will return a multibyte result, and they
+ won't use `binary' or `raw-text' anyway, so let's not worry about
+ CODING_FOR_UNIBYTE. */
coding->dst_multibyte = 1;
}
if (! NILP (CODING_ATTR_POST_READ (attrs)))
{
- struct gcpro gcpro1, gcpro2;
+ struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
Lisp_Object val;
TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
- GCPRO2 (coding->src_object, coding->dst_object);
+ GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
+ old_deactivate_mark);
val = safe_call1 (CODING_ATTR_POST_READ (attrs),
make_number (coding->produced_char));
UNGCPRO;
set_buffer_internal (XBUFFER (coding->dst_object));
if (dst_bytes < coding->produced)
{
- destination
- = (unsigned char *) xrealloc (destination, coding->produced);
+ destination = xrealloc (destination, coding->produced);
if (! destination)
{
record_conversion_result (coding,
As we have moved PT while replacing the original buffer
contents, we must recover it now. */
set_buffer_internal (XBUFFER (src_object));
+ current_buffer->text->inhibit_shrinking = 0;
if (saved_pt < from)
TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
else if (saved_pt < from + chars)
}
}
+ Vdeactivate_mark = old_deactivate_mark;
unbind_to (count, coding->dst_object);
}
EMACS_INT chars = to - from;
EMACS_INT bytes = to_byte - from_byte;
Lisp_Object attrs;
- Lisp_Object buffer;
int saved_pt = -1, saved_pt_byte;
int need_marker_adjustment = 0;
int kill_src_buffer = 0;
+ Lisp_Object old_deactivate_mark;
- buffer = Fcurrent_buffer ();
+ old_deactivate_mark = Vdeactivate_mark;
coding->src_object = src_object;
coding->src_chars = chars;
{
Lisp_Object args[3];
+ struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
+ GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
+ old_deactivate_mark);
args[0] = CODING_ATTR_PRE_WRITE (attrs);
args[1] = make_number (BEG);
args[2] = make_number (Z);
safe_call (3, args);
+ UNGCPRO;
}
if (XBUFFER (coding->src_object) != current_buffer)
kill_src_buffer = 1;
}
else
{
- coding->dst_pos = BUF_PT (XBUFFER (dst_object));
- coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
+ struct buffer *current = current_buffer;
+
+ set_buffer_temp (XBUFFER (dst_object));
+ coding->dst_pos = PT;
+ coding->dst_pos_byte = PT_BYTE;
+ move_gap_both (coding->dst_pos, coding->dst_pos_byte);
+ set_buffer_temp (current);
}
coding->dst_multibyte
= ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
if (kill_src_buffer)
Fkill_buffer (coding->src_object);
+
+ Vdeactivate_mark = old_deactivate_mark;
unbind_to (count, Qnil);
}
doc: /* Return t if OBJECT is nil or a coding-system.
See the documentation of `define-coding-system' for information
about coding-system objects. */)
- (obj)
- Lisp_Object obj;
+ (object)
+ Lisp_Object object;
{
- if (NILP (obj)
- || CODING_SYSTEM_ID (obj) >= 0)
+ if (NILP (object)
+ || CODING_SYSTEM_ID (object) >= 0)
return Qt;
- if (! SYMBOLP (obj)
- || NILP (Fget (obj, Qcoding_system_define_form)))
+ if (! SYMBOLP (object)
+ || NILP (Fget (object, Qcoding_system_define_form)))
return Qnil;
return Qt;
}
detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
coding_system)
const unsigned char *src;
- int src_chars, src_bytes, highest;
+ EMACS_INT src_chars, src_bytes;
+ int highest;
int multibytep;
Lisp_Object coding_system;
{
const unsigned char *src_end = src + src_bytes;
Lisp_Object attrs, eol_type;
- Lisp_Object val;
+ Lisp_Object val = Qnil;
struct coding_system coding;
int id;
struct coding_detection_info detect_info;
enum coding_category base_category;
+ int null_byte_found = 0, eight_bit_found = 0;
if (NILP (coding_system))
coding_system = Qundecided;
coding.src_multibyte = multibytep;
coding.consumed = 0;
coding.mode |= CODING_MODE_LAST_BLOCK;
+ coding.head_ascii = 0;
detect_info.checked = detect_info.found = detect_info.rejected = 0;
int c, i;
/* Skip all ASCII bytes except for a few ISO2022 controls. */
- for (i = 0; src < src_end; i++, src++)
+ for (; src < src_end; src++)
{
c = *src;
if (c & 0x80)
- break;
- if (c < 0x20
- && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
- && ! inhibit_iso_escape_detection)
{
- coding.head_ascii = src - coding.source;
- if (detect_coding_iso_2022 (&coding, &detect_info))
+ eight_bit_found = 1;
+ if (null_byte_found)
+ break;
+ }
+ else if (c < 0x20)
+ {
+ if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
+ && ! inhibit_iso_escape_detection
+ && ! detect_info.checked)
{
- /* We have scanned the whole data. */
- if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
- /* We didn't find an 8-bit code. */
- src = src_end;
- break;
+ if (detect_coding_iso_2022 (&coding, &detect_info))
+ {
+ /* We have scanned the whole data. */
+ if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
+ {
+ /* We didn't find an 8-bit code. We may
+ have found a null-byte, but it's very
+ rare that a binary file conforms to
+ ISO-2022. */
+ src = src_end;
+ coding.head_ascii = src - coding.source;
+ }
+ detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
+ break;
+ }
+ }
+ else if (! c && !inhibit_null_byte_detection)
+ {
+ null_byte_found = 1;
+ if (eight_bit_found)
+ break;
}
+ if (! eight_bit_found)
+ coding.head_ascii++;
}
+ else if (! eight_bit_found)
+ coding.head_ascii++;
}
- coding.head_ascii = src - coding.source;
- if (src < src_end
+ if (null_byte_found || eight_bit_found
+ || coding.head_ascii < coding.src_bytes
|| detect_info.found)
{
- if (src == src_end)
+ if (coding.head_ascii == coding.src_bytes)
/* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
for (i = 0; i < coding_category_raw_text; i++)
{
break;
}
else
- for (i = 0; i < coding_category_raw_text; i++)
- {
- category = coding_priorities[i];
- this = coding_categories + category;
+ {
+ if (null_byte_found)
+ {
+ detect_info.checked |= ~CATEGORY_MASK_UTF_16;
+ detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
+ }
+ for (i = 0; i < coding_category_raw_text; i++)
+ {
+ category = coding_priorities[i];
+ this = coding_categories + category;
- if (this->id < 0)
- {
- /* No coding system of this category is defined. */
- detect_info.rejected |= (1 << category);
- }
- else if (category >= coding_category_raw_text)
- continue;
- else if (detect_info.checked & (1 << category))
- {
- if (highest
- && (detect_info.found & (1 << category)))
- break;
- }
- else
- {
- if ((*(this->detector)) (&coding, &detect_info)
- && highest
- && (detect_info.found & (1 << category)))
- {
- if (category == coding_category_utf_16_auto)
- {
- if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
- category = coding_category_utf_16_le;
- else
- category = coding_category_utf_16_be;
- }
+ if (this->id < 0)
+ {
+ /* No coding system of this category is defined. */
+ detect_info.rejected |= (1 << category);
+ }
+ else if (category >= coding_category_raw_text)
+ continue;
+ else if (detect_info.checked & (1 << category))
+ {
+ if (highest
+ && (detect_info.found & (1 << category)))
break;
- }
- }
- }
+ }
+ else if ((*(this->detector)) (&coding, &detect_info)
+ && highest
+ && (detect_info.found & (1 << category)))
+ {
+ if (category == coding_category_utf_16_auto)
+ {
+ if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
+ category = coding_category_utf_16_le;
+ else
+ category = coding_category_utf_16_be;
+ }
+ break;
+ }
+ }
+ }
}
- if (detect_info.rejected == CATEGORY_MASK_ANY)
+ if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
+ || null_byte_found)
{
detect_info.found = CATEGORY_MASK_RAW_TEXT;
- id = coding_categories[coding_category_raw_text].id;
+ id = CODING_SYSTEM_ID (Qno_conversion);
val = Fcons (make_number (id), Qnil);
}
else if (! detect_info.rejected && ! detect_info.found)
{
int mask = detect_info.rejected | detect_info.found;
int found = 0;
- val = Qnil;
for (i = coding_category_raw_text - 1; i >= 0; i--)
{
detect_info.found |= found;
}
}
+ else if (base_category == coding_category_utf_8_auto)
+ {
+ if (detect_coding_utf_8 (&coding, &detect_info))
+ {
+ struct coding_system *this;
+
+ if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
+ this = coding_categories + coding_category_utf_8_sig;
+ else
+ this = coding_categories + coding_category_utf_8_nosig;
+ val = Fcons (make_number (this->id), Qnil);
+ }
+ }
else if (base_category == coding_category_utf_16_auto)
{
if (detect_coding_utf_16 (&coding, &detect_info))
/* Then, detect eol-format if necessary. */
{
- int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
+ int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
Lisp_Object tail;
if (VECTORP (eol_type))
{
if (detect_info.found & ~CATEGORY_MASK_UTF_16)
- normal_eol = detect_eol (coding.source, src_bytes,
- coding_category_raw_text);
+ {
+ if (null_byte_found)
+ normal_eol = EOL_SEEN_LF;
+ else
+ normal_eol = detect_eol (coding.source, src_bytes,
+ coding_category_raw_text);
+ }
if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
| CATEGORY_MASK_UTF_16_BE_NOSIG))
utf_16_be_eol = detect_eol (coding.source, src_bytes,
}
}
- return (highest ? XCAR (val) : val);
+ return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
}
2, 3, 0,
doc: /* Detect coding system of the text in the region between START and END.
Return a list of possible coding systems ordered by priority.
+The coding systems to try and their priorities follow what
+the function `coding-system-priority-list' (which see) returns.
If only ASCII characters are found (except for such ISO-2022 control
-characters ISO-2022 as ESC), it returns a list of single element
-`undecided' or its subsidiary coding system according to a detected
-end-of-line format.
+characters as ESC), it returns a list of single element `undecided'
+or its subsidiary coding system according to a detected end-of-line
+format.
If optional argument HIGHEST is non-nil, return the coding system of
highest priority. */)
Return a list of possible coding systems ordered by priority.
If only ASCII characters are found (except for such ISO-2022 control
-characters ISO-2022 as ESC), it returns a list of single element
-`undecided' or its subsidiary coding system according to a detected
-end-of-line format.
+characters as ESC), it returns a list of single element `undecided'
+or its subsidiary coding system according to a detected end-of-line
+format.
If optional argument HIGHEST is non-nil, return the coding system of
highest priority. */)
Sunencodable_char_position, 3, 5, 0,
doc: /*
Return position of first un-encodable character in a region.
-START and END specfiy the region and CODING-SYSTEM specifies the
+START and END specify the region and CODING-SYSTEM specifies the
encoding to check. Return nil if CODING-SYSTEM does encode the region.
If optional 4th argument COUNT is non-nil, it specifies at most how
CODING-SYSTEM-LIST is a list of coding systems to check.
The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
-CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
+CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
whole region, POS0, POS1, ... are buffer positions where non-encodable
characters are found.
START may be a string. In that case, check if the string is
encodable, and the value contains indices to the string instead of
-buffer positions. END is ignored. */)
+buffer positions. END is ignored.
+
+If the current buffer (or START if it is a string) is unibyte, the value
+is nil. */)
(start, end, coding_system_list)
Lisp_Object start, end, coding_system_list;
{
if (STRINGP (start))
{
if (!STRING_MULTIBYTE (start)
- && SCHARS (start) != SBYTES (start))
+ || SCHARS (start) == SBYTES (start))
return Qnil;
start_byte = 0;
end_byte = SBYTES (start);
start_byte = CHAR_TO_BYTE (XINT (start));
end_byte = CHAR_TO_BYTE (XINT (end));
if (XINT (end) - XINT (start) == end_byte - start_byte)
- return Qt;
+ return Qnil;
if (XINT (start) < GPT && XINT (end) > GPT)
{
Optional 4th arguments DESTINATION specifies where the decoded text goes.
If nil, the region between START and END is replaced by the decoded text.
-If buffer, the decoded text is inserted in the buffer.
-If t, the decoded text is returned.
+If buffer, the decoded text is inserted in that buffer after point (point
+does not move).
+In those cases, the length of the decoded text is returned.
+If DESTINATION is t, the decoded text is returned.
This function sets `last-coding-system-used' to the precise coding system
used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
-not fully specified.)
-It returns the length of the decoded text. */)
+not fully specified.) */)
(start, end, coding_system, destination)
Lisp_Object start, end, coding_system, destination;
{
DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
3, 4, "r\nzCoding system: ",
doc: /* Encode the current region by specified coding system.
-When called from a program, takes three arguments:
-START, END, and CODING-SYSTEM. START and END are buffer positions.
+When called from a program, takes four arguments:
+ START, END, CODING-SYSTEM and DESTINATION.
+START and END are buffer positions.
Optional 4th arguments DESTINATION specifies where the encoded text goes.
If nil, the region between START and END is replace by the encoded text.
-If buffer, the encoded text is inserted in the buffer.
-If t, the encoded text is returned.
+If buffer, the encoded text is inserted in that buffer after point (point
+does not move).
+In those cases, the length of the encoded text is returned.
+If DESTINATION is t, the encoded text is returned.
This function sets `last-coding-system-used' to the precise coding system
used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
-not fully specified.)
-It returns the length of the encoded text. */)
+not fully specified.) */)
(start, end, coding_system, destination)
Lisp_Object start, end, coding_system, destination;
{
Optional third arg NOCOPY non-nil means it is OK to return STRING itself
if the decoding operation is trivial.
-Optional fourth arg BUFFER non-nil meant that the decoded text is
-inserted in BUFFER instead of returned as a string. In this case,
-the return value is BUFFER.
+Optional fourth arg BUFFER non-nil means that the decoded text is
+inserted in that buffer after point (point does not move). In this
+case, the return value is the length of the decoded text.
This function sets `last-coding-system-used' to the precise coding system
used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
-not fully specified. */)
+not fully specified.) */)
(string, coding_system, nocopy, buffer)
Lisp_Object string, coding_system, nocopy, buffer;
{
Optional third arg NOCOPY non-nil means it is OK to return STRING
itself if the encoding operation is trivial.
-Optional fourth arg BUFFER non-nil meant that the encoded text is
-inserted in BUFFER instead of returned as a string. In this case,
-the return value is BUFFER.
+Optional fourth arg BUFFER non-nil means that the encoded text is
+inserted in that buffer after point (point does not move). In this
+case, the return value is the length of the encoded text.
This function sets `last-coding-system-used' to the precise coding system
used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
TARGET has a meaning which depends on OPERATION:
For file I/O, TARGET is a file name (except for the special case below).
For process I/O, TARGET is a process name.
- For network I/O, TARGET is a service name or a port number
+ For network I/O, TARGET is a service name or a port number.
-This function looks up what specified for TARGET in,
+This function looks up what is specified for TARGET in
`file-coding-system-alist', `process-coding-system-alist',
or `network-coding-system-alist' depending on OPERATION.
They may specify a coding system, a cons of coding systems,
operation = args[0];
if (!SYMBOLP (operation)
|| !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
- error ("Invalid first arguement");
+ error ("Invalid first argument");
if (nargs < 1 + XINT (target_idx))
error ("Too few arguments for operation: %s",
SDATA (SYMBOL_NAME (operation)));
DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
Sset_coding_system_priority, 0, MANY, 0,
doc: /* Assign higher priority to the coding systems given as arguments.
-If multiple coding systems belongs to the same category,
+If multiple coding systems belong to the same category,
all but the first one are ignored.
-usage: (set-coding-system-priority ...) */)
+usage: (set-coding-system-priority &rest coding-systems) */)
(nargs, args)
int nargs;
Lisp_Object *args;
DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
Scoding_system_priority_list, 0, 1, 0,
doc: /* Return a list of coding systems ordered by their priorities.
+The list contains a subset of coding systems; i.e. coding systems
+assigned to each coding category (see `coding-category-list').
+
HIGHESTP non-nil means just return the highest priority one. */)
(highestp)
Lisp_Object highestp;
}
CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
- safe_charsets = Fmake_string (make_number (max_charset_id + 1),
- make_number (255));
+ safe_charsets = make_uninit_string (max_charset_id + 1);
+ memset (SDATA (safe_charsets), 255, max_charset_id + 1);
for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
val = XCDR (bom);
CHECK_CODING_SYSTEM (val);
}
- ASET (attrs, coding_attr_utf_16_bom, bom);
+ ASET (attrs, coding_attr_utf_bom, bom);
endian = args[coding_arg_utf16_endian];
CHECK_SYMBOL (endian);
}
else if (EQ (coding_type, Qutf_8))
{
- category = coding_category_utf_8;
+ Lisp_Object bom;
+
CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
+
+ if (nargs < coding_arg_utf8_max)
+ goto short_args;
+
+ bom = args[coding_arg_utf8_bom];
+ if (! NILP (bom) && ! EQ (bom, Qt))
+ {
+ CHECK_CONS (bom);
+ val = XCAR (bom);
+ CHECK_CODING_SYSTEM (val);
+ val = XCDR (bom);
+ CHECK_CODING_SYSTEM (val);
+ }
+ ASET (attrs, coding_attr_utf_bom, bom);
+
+ category = (CONSP (bom) ? coding_category_utf_8_auto
+ : NILP (bom) ? coding_category_utf_8_nosig
+ : coding_category_utf_8_sig);
}
else if (EQ (coding_type, Qundecided))
category = coding_category_undecided;
= Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
CODING_ATTR_PLIST (attrs)));
CODING_ATTR_PLIST (attrs)
- = Fcons (QCascii_compatible_p,
+ = Fcons (QCascii_compatible_p,
Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
CODING_ATTR_PLIST (attrs)));
CHECK_CHARACTER (val);
CODING_ATTR_MNEMONIC (attrs) = val;
}
- else if (EQ (prop, QCdefalut_char))
+ else if (EQ (prop, QCdefault_char))
{
if (NILP (val))
val = make_number (' ');
CHECK_SYMBOL (alias);
CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
aliases = AREF (spec, 1);
- /* ALISES should be a list of length more than zero, and the first
+ /* ALIASES should be a list of length more than zero, and the first
element is a base coding system. Append ALIAS at the tail of the
list. */
while (!NILP (XCDR (aliases)))
DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
Scoding_system_eol_type, 1, 1, 0,
doc: /* Return eol-type of CODING-SYSTEM.
-An eol-type is integer 0, 1, 2, or a vector of coding systems.
+An eol-type is an integer 0, 1, 2, or a vector of coding systems.
Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
and CR respectively.
DEFSYM (QCcategory, ":category");
DEFSYM (QCmnemonic, ":mnemonic");
- DEFSYM (QCdefalut_char, ":default-char");
+ DEFSYM (QCdefault_char, ":default-char");
DEFSYM (QCdecode_translation_table, ":decode-translation-table");
DEFSYM (QCencode_translation_table, ":encode-translation-table");
DEFSYM (QCpost_read_conversion, ":post-read-conversion");
intern ("coding-category-iso-7-else"));
ASET (Vcoding_category_table, coding_category_iso_8_else,
intern ("coding-category-iso-8-else"));
- ASET (Vcoding_category_table, coding_category_utf_8,
+ ASET (Vcoding_category_table, coding_category_utf_8_auto,
+ intern ("coding-category-utf-8-auto"));
+ ASET (Vcoding_category_table, coding_category_utf_8_nosig,
intern ("coding-category-utf-8"));
+ ASET (Vcoding_category_table, coding_category_utf_8_sig,
+ intern ("coding-category-utf-8-sig"));
ASET (Vcoding_category_table, coding_category_utf_16_be,
intern ("coding-category-utf-16-be"));
ASET (Vcoding_category_table, coding_category_utf_16_auto,
DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
doc: /* Alist of coding system names.
Each element is one element list of coding system name.
-This variable is given to `completing-read' as TABLE argument.
+This variable is given to `completing-read' as COLLECTION argument.
Do not alter the value of this variable manually. This variable should be
updated by the functions `make-coding-system' and
doc: /* Specify the coding system for read operations.
It is useful to bind this variable with `let', but do not set it globally.
If the value is a coding system, it is used for decoding on read operation.
-If not, an appropriate element is used from one of the coding system alists:
-There are three such tables, `file-coding-system-alist',
+If not, an appropriate element is used from one of the coding system alists.
+There are three such tables: `file-coding-system-alist',
`process-coding-system-alist', and `network-coding-system-alist'. */);
Vcoding_system_for_read = Qnil;
when writing it to a file and when sending it to a file or subprocess.
If this does not specify a coding system, an appropriate element
-is used from one of the coding system alists:
-There are three such tables, `file-coding-system-alist',
+is used from one of the coding system alists.
+There are three such tables: `file-coding-system-alist',
`process-coding-system-alist', and `network-coding-system-alist'.
For output to files, if the above procedure does not specify a coding system,
the value of `buffer-file-coding-system' is used. */);
a coding system of ISO 2022 variant which has a flag
`accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
or reading output of a subprocess.
-Only 128th through 159th elements has a meaning. */);
+Only 128th through 159th elements have a meaning. */);
Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
DEFVAR_LISP ("select-safe-coding-system-function",
DEFVAR_BOOL ("inhibit-iso-escape-detection",
&inhibit_iso_escape_detection,
doc: /*
-If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
+If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
-By default, on reading a file, Emacs tries to detect how the text is
-encoded. This code detection is sensitive to escape sequences. If
-the sequence is valid as ISO2022, the code is determined as one of
-the ISO2022 encodings, and the file is decoded by the corresponding
-coding system (e.g. `iso-2022-7bit').
+When Emacs reads text, it tries to detect how the text is encoded.
+This code detection is sensitive to escape sequences. If Emacs sees
+a valid ISO-2022 escape sequence, it assumes the text is encoded in one
+of the ISO-2022 encodings, and decodes text by the corresponding coding
+system (e.g. `iso-2022-7bit').
However, there may be a case that you want to read escape sequences in
a file as is. In such a case, you can set this variable to non-nil.
-Then, as the code detection ignores any escape sequences, no file is
-detected as encoded in some ISO2022 encoding. The result is that all
+Then the code detection will ignore any escape sequences, and no text is
+detected as encoded in some ISO-2022 encoding. The result is that all
escape sequences become visible in a buffer.
The default value is nil, and it is strongly recommended not to change
reading if you suppress escape sequence detection.
The other way to read escape sequences in a file without decoding is
-to explicitly specify some coding system that doesn't use ISO2022's
+to explicitly specify some coding system that doesn't use ISO-2022
escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
inhibit_iso_escape_detection = 0;
+ DEFVAR_BOOL ("inhibit-null-byte-detection",
+ &inhibit_null_byte_detection,
+ doc: /* If non-nil, Emacs ignores null bytes on code detection.
+By default, Emacs treats text that contains null bytes as binary data,
+and does not attempt to decode it.  The effect is as if you specified
+`no-conversion' for reading that text.
+
+Set this to non-nil when a regular text happens to include null bytes.
+Examples are Index nodes of Info files and null-byte delimited output
+from GNU Find and GNU Grep. Emacs will then ignore the null bytes and
+decode text as usual. */);
+ inhibit_null_byte_detection = 0;
+
DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
doc: /* Char table for translating self-inserting characters.
-This is applied to the result of input methods, not their input. See also
-`keyboard-translate-table'. */);
+This is applied to the result of input methods, not their input.
+See also `keyboard-translate-table'.
+
+Use of this variable for character code unification was rendered
+obsolete in Emacs 23.1 and later, since Unicode is now the basis of
+internal character representation. */);
Vtranslation_table_for_input = Qnil;
{