/*** TABLE OF CONTENTS ***
+ 0. General comments
1. Preamble
2. Emacs' internal format (emacs-mule) handlers
3. ISO2022 handlers
*/
+/*** 0. General comments ***/
+
+
/*** GENERAL NOTE on CODING SYSTEM ***
Coding system is an encoding mechanism of one or more character
/*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
- These functions decode SRC_BYTES length text at SOURCE encoded in
- CODING to Emacs' internal format (emacs-mule). The resulting text
- goes to a place pointed to by DESTINATION, the length of which
- should not exceed DST_BYTES. These functions set the information of
- original and decoded texts in the members produced, produced_char,
- consumed, and consumed_char of the structure *CODING.
+ These functions decode SRC_BYTES length of unibyte text at SOURCE
+ encoded in CODING to Emacs' internal format. The resulting
+ multibyte text goes to a place pointed to by DESTINATION, the length
+ of which should not exceed DST_BYTES.
- The return value is an integer (CODING_FINISH_XXX) indicating how
- the decoding finished.
+ These functions set the information of original and decoded texts in
+ the members produced, produced_char, consumed, and consumed_char of
+ the structure *CODING. They also set the member result to one of
+ CODING_FINISH_XXX indicating how the decoding finished.
DST_BYTES zero means that source area and destination area are
overlapped, which means that we can produce a decoded text until it
Below is a template of these functions. */
#if 0
+static void
decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
struct coding_system *coding;
unsigned char *source, *destination;
/*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
These functions encode SRC_BYTES length text at SOURCE of Emacs'
- internal format (emacs-mule) to CODING. The resulting text goes to
- a place pointed to by DESTINATION, the length of which should not
- exceed DST_BYTES. These functions set the information of
- original and encoded texts in the members produced, produced_char,
- consumed, and consumed_char of the structure *CODING.
+ internal multibyte format to CODING. The resulting unibyte text
+ goes to a place pointed to by DESTINATION, the length of which
+ should not exceed DST_BYTES.
- The return value is an integer (CODING_FINISH_XXX) indicating how
- the encoding finished.
+ These functions set the information of original and encoded texts in
+ the members produced, produced_char, consumed, and consumed_char of
+ the structure *CODING. They also set the member result to one of
+ CODING_FINISH_XXX indicating how the encoding finished.
DST_BYTES zero means that source area and destination area are
- overlapped, which means that we can produce a decoded text until it
- reaches at the head of not-yet-decoded source text.
+ overlapped, which means that we can produce an encoded text until it
+ reaches at the head of not-yet-encoded source text.
Below is a template of these functions. */
#if 0
+static void
encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
struct coding_system *coding;
unsigned char *source, *destination;
/*** COMMONLY USED MACROS ***/
-/* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
- THREE_MORE_BYTES safely get one, two, and three bytes from the
- source text respectively. If there are not enough bytes in the
- source, they jump to `label_end_of_loop'. The caller should set
- variables `src' and `src_end' to appropriate areas in advance. */
-
-#define ONE_MORE_BYTE(c1) \
- do { \
- if (src < src_end) \
- c1 = *src++; \
- else \
- goto label_end_of_loop; \
- } while (0)
+/* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
+ get one and two bytes from the source text respectively. If there
+ are not enough bytes in the source, they set `coding->result' to
+ CODING_FINISH_INSUFFICIENT_SRC and jump to `label_end_of_loop'
+ without advancing `src'. The caller should set variables `coding',
+ `src' and `src_end' to appropriate pointers in advance. These
+ macros are called from decoding routines `decode_coding_XXX', thus
+ it is assumed that the source text is unibyte. */
-#define TWO_MORE_BYTES(c1, c2) \
- do { \
- if (src + 1 < src_end) \
- c1 = *src++, c2 = *src++; \
- else \
- goto label_end_of_loop; \
+#define ONE_MORE_BYTE(c1) \
+ do { \
+ if (src >= src_end) \
+ { \
+ coding->result = CODING_FINISH_INSUFFICIENT_SRC; \
+ goto label_end_of_loop; \
+ } \
+ c1 = *src++; \
} while (0)
-#define THREE_MORE_BYTES(c1, c2, c3) \
- do { \
- if (src + 2 < src_end) \
- c1 = *src++, c2 = *src++, c3 = *src++; \
- else \
- goto label_end_of_loop; \
+#define TWO_MORE_BYTES(c1, c2) \
+ do { \
+ if (src + 1 >= src_end) \
+ { \
+ coding->result = CODING_FINISH_INSUFFICIENT_SRC; \
+ goto label_end_of_loop; \
+ } \
+ c1 = *src++; \
+ c2 = *src++; \
} while (0)
-/* The following three macros DECODE_CHARACTER_ASCII,
- DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
- the multi-byte form of a character of each class at the place
- pointed by `dst'. The caller should set the variable `dst' to
- point to an appropriate area and the variable `coding' to point to
- the coding-system of the currently decoding text in advance. */
-/* Decode one ASCII character C. */
+/* Set C to the next character at the source text pointed by `src',
+ and advance `src' past it. If there are no more bytes in the
+ source, set `coding->result' to CODING_FINISH_INSUFFICIENT_SRC and
+ jump to `label_end_of_loop'. The caller should set variables
+ `coding', `src', `src_end', and `translation_table' to appropriate
+ values in advance; if `translation_table' is non-nil, C is
+ translated through it. This macro is used in encoding routines
+ `encode_coding_XXX', thus it assumes that the source text is in
+ multibyte form except for 8-bit characters. 8-bit characters are
+ in multibyte form if coding->src_multibyte is nonzero, else they
+ are represented by a single byte. */
-#define DECODE_CHARACTER_ASCII(c) \
- do { \
- if (COMPOSING_P (coding->composing)) \
- { \
- *dst++ = 0xA0, *dst++ = (c) | 0x80; \
- coding->composed_chars++; \
- if (((c) | 0x80) < 0xA0) \
- coding->fake_multibyte = 1; \
- } \
- else \
- { \
- *dst++ = (c); \
- coding->produced_char++; \
- if ((c) >= 0x80) \
- coding->fake_multibyte = 1; \
- } \
+#define ONE_MORE_CHAR(c) \
+ do { \
+ int len = src_end - src; \
+ int bytes; \
+ if (len <= 0) \
+ { \
+ coding->result = CODING_FINISH_INSUFFICIENT_SRC; \
+ goto label_end_of_loop; \
+ } \
+ if (coding->src_multibyte \
+ || UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes)) \
+ c = STRING_CHAR_AND_LENGTH (src, len, bytes); \
+ else \
+ c = *src, bytes = 1; \
+ if (!NILP (translation_table)) \
+ c = translate_char (translation_table, c, 0, 0, 0); \
+ src += bytes; \
} while (0)
-/* Decode one DIMENSION1 character whose charset is CHARSET and whose
- position-code is C. */
-#define DECODE_CHARACTER_DIMENSION1(charset, c) \
+/* Produce a multibyte form of character C to `dst'. If there's not
+ enough space at `dst', set `coding->result' to
+ CODING_FINISH_INSUFFICIENT_DST and jump to `label_end_of_loop'.
+ The space limit is `dst_end' when `dst_bytes' is nonzero, else
+ `src' (source and destination areas overlap).
+
+ If we are now in the middle of composition sequence, the decoded
+ character may be ALTCHAR (for the current composition). In that
+ case, the character goes to coding->cmp_data->data instead of
+ `dst'. Whenever a composition other than COMPOSITION_RELATIVE is
+ in progress, C is also recorded as a composition component, and
+ coding->composition_rule_follows is set to nonzero unless the
+ method is COMPOSITION_WITH_ALTCHARS (in which case it is cleared).
+
+ This macro is used in decoding routines. */
+
+#define EMIT_CHAR(c) \
do { \
- unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset); \
- if (COMPOSING_P (coding->composing)) \
+ if (! COMPOSING_P (coding) \
+ || coding->composing == COMPOSITION_RELATIVE \
+ || coding->composing == COMPOSITION_WITH_RULE) \
{ \
- *dst++ = leading_code + 0x20; \
- coding->composed_chars++; \
+ int bytes = CHAR_BYTES (c); \
+ if ((dst + bytes) > (dst_bytes ? dst_end : src)) \
+ { \
+ coding->result = CODING_FINISH_INSUFFICIENT_DST; \
+ goto label_end_of_loop; \
+ } \
+ dst += CHAR_STRING (c, dst); \
+ coding->produced_char++; \
} \
- else \
+ \
+ if (COMPOSING_P (coding) \
+ && coding->composing != COMPOSITION_RELATIVE) \
{ \
- *dst++ = leading_code; \
- coding->produced_char++; \
+ CODING_ADD_COMPOSITION_COMPONENT (coding, c); \
+ coding->composition_rule_follows \
+ = coding->composing != COMPOSITION_WITH_ALTCHARS; \
} \
- if (leading_code = CHARSET_LEADING_CODE_EXT (charset)) \
- *dst++ = leading_code; \
- *dst++ = (c) | 0x80; \
- if (((c) | 0x80) < 0xA0) \
- coding->fake_multibyte = 1; \
} while (0)
-/* Decode one DIMENSION2 character whose charset is CHARSET and whose
- position-codes are C1 and C2. */
-#define DECODE_CHARACTER_DIMENSION2(charset, c1, c2) \
- do { \
- DECODE_CHARACTER_DIMENSION1 (charset, c1); \
- *dst++ = (c2) | 0x80; \
- if (((c2) | 0x80) < 0xA0) \
- coding->fake_multibyte = 1; \
+/* The following three macros store raw bytes at `dst' and advance
+   it.  If the bytes do not fit, they set `coding->result' to
+   CODING_FINISH_INSUFFICIENT_DST and jump to `label_end_of_loop'
+   without writing anything.  The space limit is `dst_end' when
+   `dst_bytes' is nonzero, else `src' (source and destination areas
+   overlap).  The caller should set variables `coding', `src', `dst',
+   `dst_end', and `dst_bytes' in advance.  */
+
+/* Store the single byte C.  */
+
+#define EMIT_ONE_BYTE(c)                                        \
+  do {                                                          \
+    if (dst >= (dst_bytes ? dst_end : src))                     \
+      {                                                         \
+        coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
+        goto label_end_of_loop;                                 \
+      }                                                         \
+    *dst++ = (c);                                               \
+  } while (0)
+
+/* Store the two bytes C1 and C2, in this order.  */
+
+#define EMIT_TWO_BYTES(c1, c2)                                  \
+  do {                                                          \
+    if (dst + 2 > (dst_bytes ? dst_end : src))                  \
+      {                                                         \
+        coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
+        goto label_end_of_loop;                                 \
+      }                                                         \
+    *dst++ = (c1);                                              \
+    *dst++ = (c2);                                              \
+  } while (0)
+
+/* Store the bytes in the range FROM (inclusive) to TO (exclusive).
+   FROM must be a modifiable pointer variable; it is advanced up to
+   TO as the bytes are copied.  */
+
+#define EMIT_BYTES(from, to)                                    \
+  do {                                                          \
+    if (dst + ((to) - (from)) > (dst_bytes ? dst_end : src))    \
+      {                                                         \
+        coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
+        goto label_end_of_loop;                                 \
+      }                                                         \
+    while ((from) < (to))                                       \
+      *dst++ = *(from)++;                                       \
  } while (0)
\f
/*** 1. Preamble ***/
+#ifdef emacs
+#include <config.h>
+#endif
+
#include <stdio.h>
#ifdef emacs
-#include <config.h>
#include "lisp.h"
#include "buffer.h"
#include "charset.h"
+#include "composite.h"
#include "ccl.h"
#include "coding.h"
#include "window.h"
Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
Lisp_Object Qno_conversion, Qundecided;
Lisp_Object Qcoding_system_history;
-Lisp_Object Qsafe_charsets;
+Lisp_Object Qsafe_chars;
Lisp_Object Qvalid_codes;
extern Lisp_Object Qinsert_file_contents, Qwrite_region;
/* Flag to inhibit code conversion of end-of-line format. */
int inhibit_eol_conversion;
+/* Flag to inhibit ISO2022 escape sequence detection. */
+int inhibit_iso_escape_detection;
+
/* Flag to make buffer-file-coding-system inherit from process-coding. */
int inherit_process_coding_system;
Lisp_Object Vprocess_coding_system_alist;
Lisp_Object Vnetwork_coding_system_alist;
+Lisp_Object Vlocale_coding_system;
+
#endif /* emacs */
Lisp_Object Qcoding_category, Qcoding_category_index;
"coding-category-iso-8-else",
"coding-category-ccl",
"coding-category-big5",
+ "coding-category-utf-8",
+ "coding-category-utf-16-be",
+ "coding-category-utf-16-le",
"coding-category-raw-text",
"coding-category-binary"
};
/* Default coding systems used for process I/O. */
Lisp_Object Vdefault_process_coding_system;
+/* Global flag to tell that we can't call post-read-conversion and
+ pre-write-conversion functions. Usually the value is zero, but it
+ is set to 1 temporarily while such functions are running. This is
+ to avoid infinite recursive call. */
+static int inhibit_pre_post_conversion;
+
+/* Char-table containing safe coding systems of each character. */
+Lisp_Object Vchar_coding_system_table;
+Lisp_Object Qchar_coding_system;
+
+/* Return the `safe-chars' property of coding system CODING.  Don't
+   check validity of CODING.
+
+   The property is looked up in the property list stored in slot 3 of
+   the vector found on the Qcoding_system property of CODING->symbol.
+   If the property value is a char-table it is returned; otherwise Qt
+   is returned, which CODING_SAFE_CHAR_P treats as "every character
+   is safe".  */
+
+Lisp_Object
+coding_safe_chars (coding)
+     struct coding_system *coding;
+{
+  Lisp_Object coding_spec, plist, safe_chars;
+
+  coding_spec = Fget (coding->symbol, Qcoding_system);
+  plist = XVECTOR (coding_spec)->contents[3];
+  safe_chars = Fplist_get (plist, Qsafe_chars);
+  return (CHAR_TABLE_P (safe_chars) ? safe_chars : Qt);
+}
+
+/* Nonzero if character C is safe according to SAFE_CHARS, which is
+   either Qt (every character is safe) or a char-table whose non-nil
+   entries mark the safe characters.  SAFE_CHARS may be evaluated
+   twice.  */
+
+#define CODING_SAFE_CHAR_P(safe_chars, c) \
+  (EQ (safe_chars, Qt) || !NILP (CHAR_TABLE_REF (safe_chars, c)))
+
\f
/*** 2. Emacs internal format (emacs-mule) handlers ***/
/* Emacs' internal format for encoding multiple character sets is a
kind of multi-byte encoding, i.e. characters are encoded by
- variable-length sequences of one-byte codes. ASCII characters
- and control characters (e.g. `tab', `newline') are represented by
- one-byte sequences which are their ASCII codes, in the range 0x00
- through 0x7F. The other characters are represented by a sequence
- of `base leading-code', optional `extended leading-code', and one
- or two `position-code's. The length of the sequence is determined
- by the base leading-code. Leading-code takes the range 0x80
- through 0x9F, whereas extended leading-code and position-code take
- the range 0xA0 through 0xFF. See `charset.h' for more details
- about leading-code and position-code.
-
- There's one exception to this rule. Special leading-code
- `leading-code-composition' denotes that the following several
- characters should be composed into one character. Leading-codes of
- components (except for ASCII) are added 0x20. An ASCII character
- component is represented by a 2-byte sequence of `0xA0' and
- `ASCII-code + 0x80'. See also the comments in `charset.h' for the
- details of composite character. Hence, we can summarize the code
- range as follows:
+ variable-length sequences of one-byte codes.
+
+ ASCII characters and control characters (e.g. `tab', `newline') are
+ represented by one-byte sequences which are their ASCII codes, in
+ the range 0x00 through 0x7F.
+
+ 8-bit characters of the range 0x80..0x9F are represented by
+ two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
+ code + 0x20).
+
+ 8-bit characters of the range 0xA0..0xFF are represented by
+ one-byte sequences which are their 8-bit code.
+
+ The other characters are represented by a sequence of `base
+ leading-code', optional `extended leading-code', and one or two
+ `position-code's. The length of the sequence is determined by the
+ base leading-code. Leading-code takes the range 0x80 through 0x9F,
+ whereas extended leading-code and position-code take the range 0xA0
+ through 0xFF. See `charset.h' for more details about leading-code
+ and position-code.
--- CODE RANGE of Emacs' internal format ---
- (character set) (range)
- ASCII 0x00 .. 0x7F
- ELSE (1st byte) 0x80 .. 0x9F
- (rest bytes) 0xA0 .. 0xFF
+ character set range
+ ------------- -----
+ ascii 0x00..0x7F
+ eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
+ eight-bit-graphic 0xA0..0xFF
+ ELSE 0x81..0x9F + [0xA0..0xFF]+
---------------------------------------------
*/
enum emacs_code_class_type emacs_code_class[256];
-/* Go to the next statement only if *SRC is accessible and the code is
- greater than 0xA0. */
-#define CHECK_CODE_RANGE_A0_FF \
- do { \
- if (src >= src_end) \
- goto label_end_of_switch; \
- else if (*src++ < 0xA0) \
- return 0; \
- } while (0)
-
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
Check if a text is encoded in Emacs' internal format. If it is,
return CODING_CATEGORY_MASK_EMACS_MULE, else return 0.

The text is rejected (0 is returned) if it contains ESC, SI, or SO,
or a byte in the range 0x81..0x9F that does not start a valid
multibyte sequence according to UNIBYTE_STR_AS_MULTIBYTE_P. The
byte 0x80 (old leading code for a composite character) switches to
composing mode, in which each following byte of 0xA0 or above is
adjusted before being classified (0xA0 prefixes a byte whose high
bit is then cleared; any other byte has 0x20 subtracted); a byte
below 0xA0 ends composing mode. Running out of source bytes is not
an error: ONE_MORE_BYTE then jumps to `label_end_of_loop' and the
text is accepted. */
int
detect_coding_emacs_mule (src, src_end)
- unsigned char *src, *src_end;
+ unsigned char *src, *src_end;
{
unsigned char c;
int composing = 0;
+ /* Dummy for ONE_MORE_BYTE, which stores a result code through the
+ variable `coding'. The stored value is not used here. */
+ struct coding_system dummy_coding;
+ struct coding_system *coding = &dummy_coding;
- while (src < src_end)
+ while (1)
{
- c = *src++;
+ ONE_MORE_BYTE (c);
if (composing)
{
if (c < 0xA0)
composing = 0;
+ else if (c == 0xA0)
+ {
+ ONE_MORE_BYTE (c);
+ c &= 0x7F;
+ }
else
c -= 0x20;
}
- switch (emacs_code_class[c])
+ if (c < 0x20)
{
- case EMACS_ascii_code:
- case EMACS_linefeed_code:
- break;
-
- case EMACS_control_code:
if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
return 0;
- break;
+ }
+ else if (c >= 0x80 && c < 0xA0)
+ {
+ if (c == 0x80)
+ /* Old leading code for a composite character. */
+ composing = 1;
+ else
+ {
+ unsigned char *src_base = src - 1;
+ int bytes;
- case EMACS_invalid_code:
- return 0;
+ if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
+ bytes))
+ return 0;
+ src = src_base + bytes;
+ }
+ }
+ }
+ label_end_of_loop:
+ return CODING_CATEGORY_MASK_EMACS_MULE;
+}
- case EMACS_leading_code_composition: /* c == 0x80 */
- if (composing)
- CHECK_CODE_RANGE_A0_FF;
- else
- composing = 1;
- break;
- case EMACS_leading_code_4:
- CHECK_CODE_RANGE_A0_FF;
- /* fall down to check it two more times ... */
+/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
+
+ Each well-formed multibyte sequence in the source (as judged by
+ UNIBYTE_STR_AS_MULTIBYTE_P) is copied to the destination as is;
+ any other byte is converted to its multibyte form with
+ CHAR_STRING. On return, coding->consumed and
+ coding->consumed_char hold the number of source bytes processed,
+ coding->produced the number of bytes written, and
+ coding->produced_char the number of characters written.
+ coding->result is assigned only when the destination runs short
+ (CODING_FINISH_INSUFFICIENT_DST); NOTE(review): the caller
+ presumably initializes it to CODING_FINISH_NORMAL -- confirm. */
- case EMACS_leading_code_3:
- CHECK_CODE_RANGE_A0_FF;
- /* fall down to check it one more time ... */
+static void
+decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
+ struct coding_system *coding;
+ unsigned char *source, *destination;
+ int src_bytes, dst_bytes;
+{
+ unsigned char *src = source;
+ unsigned char *src_end = source + src_bytes;
+ unsigned char *dst = destination;
+ unsigned char *dst_end = destination + dst_bytes;
+ /* SRC_BASE remembers the start position in source in each loop.
+ The loop will be exited when there's not enough source code, or
+ when there's not enough destination area to produce a
+ character. In the latter case SRC_BASE still points at the
+ character that did not fit, so it is not counted as consumed.
+ NOTE(review): the space check below uses `>=' whereas EMIT_CHAR
+ uses `>', so a character that would exactly fill the destination
+ is refused here -- confirm this is intended. */
+ unsigned char *src_base;
- case EMACS_leading_code_2:
- CHECK_CODE_RANGE_A0_FF;
- break;
+ coding->produced_char = 0;
+ while ((src_base = src) < src_end)
+ {
+ unsigned char tmp[MAX_MULTIBYTE_LENGTH], *p;
+ int bytes;
- default:
- label_end_of_switch:
+ if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes))
+ {
+ p = src;
+ src += bytes;
+ }
+ else
+ {
+ bytes = CHAR_STRING (*src, tmp);
+ p = tmp;
+ src++;
+ }
+ if (dst + bytes >= (dst_bytes ? dst_end : src))
+ {
+ coding->result = CODING_FINISH_INSUFFICIENT_DST;
break;
}
+ while (bytes--) *dst++ = *p++;
+ coding->produced_char++;
}
- return CODING_CATEGORY_MASK_EMACS_MULE;
+ coding->consumed = coding->consumed_char = src_base - source;
+ coding->produced = dst - destination;
}
+/* Encoding to Emacs' internal format is delegated unchanged to
+ encode_eol; presumably only end-of-line conversion is needed
+ because the source text is already in internal format. */
+#define encode_coding_emacs_mule(coding, source, destination, src_bytes, dst_bytes) \
+ encode_eol (coding, source, destination, src_bytes, dst_bytes)
+
+
\f
/*** 3. ISO2022 handlers ***/
is encoded using bytes less than 128. This may make the encoded
text a little bit longer, but the text passes more easily through
several gateways, some of which strip off MSB (Most Significant Bit).
-
+
There are two kinds of character sets: control character set and
graphic character set. The former contains control characters such
as `newline' and `escape' to provide control functions (control
abbreviated to the escape sequence ESC '[' in a 7-bit environment.
Character composition specification takes the following form:
- o ESC '0' -- start character composition
- o ESC '1' -- end character composition
- Since these are not standard escape sequences of any ISO standard,
- the use of them for these meaning is restricted to Emacs only. */
+ o ESC '0' -- start relative composition
+ o ESC '1' -- end composition
+ o ESC '2' -- start rule-base composition (*)
+ o ESC '3' -- start relative composition with alternate chars (**)
+ o ESC '4' -- start rule-base composition with alternate chars (**)
+ Since these are not standard escape sequences of any ISO standard,
+ the use of them for these meaning is restricted to Emacs only.
+
+ (*) This form is used only in Emacs 20.5 and the older versions,
+ but the newer versions can safely decode it.
+ (**) This form is used only in Emacs 21.1 and the newer versions,
+ and the older versions can't decode it.
+
+ Here's a list of example usages of these composition escape
+ sequences (categorized by `enum composition_method').
+
+ COMPOSITION_RELATIVE:
+ ESC 0 CHAR [ CHAR ] ESC 1
+ COMPOSITION_WITH_RULE:
+ ESC 2 CHAR [ RULE CHAR ] ESC 1
+ COMPOSITION_WITH_ALTCHARS:
+ ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
+ COMPOSITION_WITH_RULE_ALTCHARS:
+ ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
enum iso_code_class_type iso_code_class[256];
-#define CHARSET_OK(idx, charset) \
- (coding_system_table[idx] \
- && (coding_system_table[idx]->safe_charsets[charset] \
- || (CODING_SPEC_ISO_REQUESTED_DESIGNATION \
- (coding_system_table[idx], charset) \
- != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)))
+/* Nonzero if the coding system in coding_system_table[IDX] exists,
+   can handle character C of CHARSET (CHARSET is ASCII, or C is safe
+   according to the `safe-chars' of that coding system), and has a
+   requested designation for CHARSET.
+
+   As a side effect, when CHARSET is not ASCII, the variable
+   `safe_chars' of the enclosing scope is assigned the safe-chars
+   value of that coding system.  IDX and CHARSET are evaluated more
+   than once.  */
+
+#define CHARSET_OK(idx, charset, c)                                       \
+  (coding_system_table[idx]                                               \
+   && ((charset) == CHARSET_ASCII                                         \
+       || (safe_chars = coding_safe_chars (coding_system_table[idx]),     \
+           CODING_SAFE_CHAR_P (safe_chars, c)))                           \
+   && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx],   \
+                                              charset)                    \
+       != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
/* Nonzero if the initial designation for register 1 of the coding
system in coding_system_table[IDX] is non-negative (presumably
meaning that some charset is designated to G1, so that shift-out is
usable -- confirm). No check is made here that the table entry is
non-null. */
#define SHIFT_OUT_OK(idx) \
(CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
int mask_found = 0;
int reg[4], shift_out = 0, single_shifting = 0;
int c, c1, i, charset;
+ /* Dummy for ONE_MORE_BYTE. */
+ struct coding_system dummy_coding;
+ struct coding_system *coding = &dummy_coding;
+ Lisp_Object safe_chars;
reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
while (mask && src < src_end)
{
- c = *src++;
+ ONE_MORE_BYTE (c);
switch (c)
{
case ISO_CODE_ESC:
- single_shifting = 0;
- if (src >= src_end)
+ if (inhibit_iso_escape_detection)
break;
- c = *src++;
+ single_shifting = 0;
+ ONE_MORE_BYTE (c);
if (c >= '(' && c <= '/')
{
/* Designation sequence for a charset of dimension 1. */
- if (src >= src_end)
- break;
- c1 = *src++;
+ ONE_MORE_BYTE (c1);
if (c1 < ' ' || c1 >= 0x80
|| (charset = iso_charset_table[0][c >= ','][c1]) < 0)
/* Invalid designation sequence. Just ignore. */
else if (c == '$')
{
/* Designation sequence for a charset of dimension 2. */
- if (src >= src_end)
- break;
- c = *src++;
+ ONE_MORE_BYTE (c);
if (c >= '@' && c <= 'B')
/* Designation for JISX0208.1978, GB2312, or JISX0208. */
reg[0] = charset = iso_charset_table[1][0][c];
else if (c >= '(' && c <= '/')
{
- if (src >= src_end)
- break;
- c1 = *src++;
+ ONE_MORE_BYTE (c1);
if (c1 < ' ' || c1 >= 0x80
|| (charset = iso_charset_table[1][c >= ','][c1]) < 0)
/* Invalid designation sequence. Just ignore. */
mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
break;
}
- else if (c == '0' || c == '1' || c == '2')
- /* ESC <Fp> for start/end composition. Just ignore. */
- break;
+ else if (c >= '0' && c <= '4')
+ {
+ /* ESC <Fp> for start/end composition. */
+ mask_found |= CODING_CATEGORY_MASK_ISO;
+ break;
+ }
else
/* Invalid escape sequence. Just ignore. */
break;
/* We found a valid designation sequence for CHARSET. */
mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
- if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset))
+ c = MAKE_CHAR (charset, 0, 0);
+ if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset, c))
mask_found |= CODING_CATEGORY_MASK_ISO_7;
else
mask &= ~CODING_CATEGORY_MASK_ISO_7;
- if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset))
+ if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset, c))
mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
else
mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
- if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset))
+ if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset, c))
mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
else
mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
- if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset))
+ if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset, c))
mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
else
mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
break;
case ISO_CODE_SO:
+ if (inhibit_iso_escape_detection)
+ break;
single_shifting = 0;
if (shift_out == 0
&& (reg[1] >= 0
break;
case ISO_CODE_SI:
+ if (inhibit_iso_escape_detection)
+ break;
single_shifting = 0;
if (shift_out == 1)
{
{
int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
+ if (inhibit_iso_escape_detection)
+ break;
if (c != ISO_CODE_CSI)
{
if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
}
else
{
- unsigned char *src_begin = src;
-
mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
| CODING_CATEGORY_MASK_ISO_7_ELSE);
mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
0xA0..0xFF. If the byte length is odd, we exclude
CODING_CATEGORY_MASK_ISO_8_2. We can check this only
when we are not single shifting. */
- if (!single_shifting)
+ if (!single_shifting
+ && mask & CODING_CATEGORY_MASK_ISO_8_2)
{
- while (src < src_end && *src >= 0xA0)
- src++;
- if ((src - src_begin - 1) & 1 && src < src_end)
+ int i = 1;
+ while (src < src_end)
+ {
+ ONE_MORE_BYTE (c);
+ if (c < 0xA0)
+ break;
+ i++;
+ }
+
+ if (i & 1 && src < src_end)
mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
else
mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
break;
}
}
-
+ label_end_of_loop:
return (mask & mask_found);
}
-/* Decode a character of which charset is CHARSET and the 1st position
- code is C1. If dimension of CHARSET is 2, the 2nd position code is
- fetched from SRC and set to C2. If CHARSET is negative, it means
- that we are decoding ill formed text, and what we can do is just to
- read C1 as is. */
+/* Decode a character of which charset is CHARSET, the 1st position
+ code is C1, the 2nd position code is C2, and return the decoded
+ character code. If the variable `translation_table' is non-nil,
+ return the translated code. This macro is an expression and
+ reads `translation_table' from the enclosing scope. */
-#define DECODE_ISO_CHARACTER(charset, c1) \
- do { \
- int c_alt, charset_alt = (charset); \
- if (COMPOSING_HEAD_P (coding->composing)) \
- { \
- *dst++ = LEADING_CODE_COMPOSITION; \
- if (COMPOSING_WITH_RULE_P (coding->composing)) \
- /* To tell composition rules are embeded. */ \
- *dst++ = 0xFF; \
- coding->composing += 2; \
- } \
- if (charset_alt >= 0) \
- { \
- if (CHARSET_DIMENSION (charset_alt) == 2) \
- { \
- ONE_MORE_BYTE (c2); \
- if (iso_code_class[(c2) & 0x7F] != ISO_0x20_or_0x7F \
- && iso_code_class[(c2) & 0x7F] != ISO_graphic_plane_0) \
- { \
- src--; \
- charset_alt = CHARSET_ASCII; \
- } \
- } \
- if (!NILP (translation_table) \
- && ((c_alt = translate_char (translation_table, \
- -1, charset_alt, c1, c2)) >= 0)) \
- SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
- } \
- if (charset_alt == CHARSET_ASCII || charset_alt < 0) \
- DECODE_CHARACTER_ASCII (c1); \
- else if (CHARSET_DIMENSION (charset_alt) == 1) \
- DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
- else \
- DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
- if (COMPOSING_WITH_RULE_P (coding->composing)) \
- /* To tell a composition rule follows. */ \
- coding->composing = COMPOSING_WITH_RULE_RULE; \
- } while (0)
+#define DECODE_ISO_CHARACTER(charset, c1, c2) \
+ (NILP (translation_table) \
+ ? MAKE_CHAR (charset, c1, c2) \
+ : translate_char (translation_table, -1, charset, c1, c2))
/* Set designation state into CODING. */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
do { \
- int charset; \
+ int charset, c; \
\
if (final_char < '0' || final_char >= 128) \
goto label_invalid_code; \
charset = ISO_CHARSET_TABLE (make_number (dimension), \
make_number (chars), \
make_number (final_char)); \
+ c = MAKE_CHAR (charset, 0, 0); \
if (charset >= 0 \
&& (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == reg \
- || coding->safe_charsets[charset])) \
+ || CODING_SAFE_CHAR_P (safe_chars, c))) \
{ \
if (coding->spec.iso2022.last_invalid_designation_register == 0 \
&& reg == 0 \
} \
} while (0)
-/* Return 0 if there's a valid composing sequence starting at SRC and
- ending before SRC_END, else return -1. */
+/* Allocate a memory block for storing information about compositions.
+ The block is chained to the already allocated blocks.
+
+ The new block is obtained with xmalloc, starts empty (`used' is 0)
+ and records CHAR_OFFSET as its `char_offset'. It is linked after
+ the block currently pointed to by CODING->cmp_data (if any) and
+ then becomes CODING->cmp_data itself; CODING->cmp_data_start is
+ reset to 0. */
-int
-check_composing_code (coding, src, src_end)
+void
+coding_allocate_composition_data (coding, char_offset)
struct coding_system *coding;
- unsigned char *src, *src_end;
+ int char_offset;
{
- int charset, c, c1, dim;
+ struct composition_data *cmp_data
+ = (struct composition_data *) xmalloc (sizeof *cmp_data);
+
+ cmp_data->char_offset = char_offset;
+ cmp_data->used = 0;
+ cmp_data->prev = coding->cmp_data;
+ cmp_data->next = NULL;
+ if (coding->cmp_data)
+ coding->cmp_data->next = cmp_data;
+ coding->cmp_data = cmp_data;
+ coding->cmp_data_start = 0;
+}
- while (src < src_end)
- {
- c = *src++;
- if (c >= 0x20)
- continue;
- if (c != ISO_CODE_ESC || src >= src_end)
- return -1;
- c = *src++;
- if (c == '1') /* end of compsition */
- return 0;
- if (src + 2 >= src_end
- || !coding->flags & CODING_FLAG_ISO_DESIGNATION)
- return -1;
-
- dim = (c == '$');
- if (dim == 1)
- c = (*src >= '@' && *src <= 'B') ? '(' : *src++;
- if (c >= '(' && c <= '/')
- {
- c1 = *src++;
- if ((c1 < ' ' || c1 >= 0x80)
- || (charset = iso_charset_table[dim][c >= ','][c1]) < 0
- || ! coding->safe_charsets[charset]
- || (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
- == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
- return -1;
- }
- else
- return -1;
- }
+/* Record the starting position START and METHOD of one composition.
+ A four-element record is opened at the current end of
+ CODING->cmp_data->data: element 0 is set to -1 as a placeholder
+ (CODING_ADD_COMPOSITION_END later stores the record length there),
+ element 1 is START plus the block's char_offset, element 2 is left
+ for the ending position, and element 3 is METHOD. The index of
+ the record is remembered in CODING->cmp_data_start. No check is
+ made here that the block has room for the record. */
+
+#define CODING_ADD_COMPOSITION_START(coding, start, method) \
+ do { \
+ struct composition_data *cmp_data = coding->cmp_data; \
+ int *data = cmp_data->data + cmp_data->used; \
+ coding->cmp_data_start = cmp_data->used; \
+ data[0] = -1; \
+ data[1] = cmp_data->char_offset + start; \
+ data[3] = (int) method; \
+ cmp_data->used += 4; \
+ } while (0)
+
+/* Record the ending position END of the current composition. The
+ record that starts at CODING->cmp_data_start gets its length (the
+ number of elements used since the record was opened) in element 0
+ and END plus the block's char_offset in element 2. */
+
+#define CODING_ADD_COMPOSITION_END(coding, end) \
+ do { \
+ struct composition_data *cmp_data = coding->cmp_data; \
+ int *data = cmp_data->data + coding->cmp_data_start; \
+ data[0] = cmp_data->used - coding->cmp_data_start; \
+ data[2] = cmp_data->char_offset + end; \
+ } while (0)
+
+/* Record one COMPONENT (alternate character or composition rule).
+ The value is appended at the current end of
+ CODING->cmp_data->data; no bounds check is made here. */
+
+#define CODING_ADD_COMPOSITION_COMPONENT(coding, component) \
+ (coding->cmp_data->data[coding->cmp_data->used++] = component)
+
+/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
+ C1 is the byte following ESC.
+
+ If composition is disabled, ESC and the low 7 bits of C1 are
+ written to `dst' as is. NOTE(review): that write is not
+ bounds-checked here the way EMIT_TWO_BYTES is -- confirm the
+ caller guarantees room.
+ If no composition is in progress, start one with the method
+ selected by C1, or set coding->result to
+ CODING_FINISH_INSUFFICIENT_CMP and jump to `label_end_of_loop' when
+ coding->cmp_data can't hold one more composition.
+ If a composition with alternate chars is already in progress, the
+ sequence ends the alternate chars; continue as
+ COMPOSITION_RELATIVE. */
+
+#define DECODE_COMPOSITION_START(c1) \
+ do { \
+ if (coding->composing == COMPOSITION_DISABLED) \
+ { \
+ *dst++ = ISO_CODE_ESC; \
+ *dst++ = c1 & 0x7f; \
+ coding->produced_char += 2; \
+ } \
+ else if (!COMPOSING_P (coding)) \
+ { \
+ /* This is surely the start of a composition. We must be sure \
+ that coding->cmp_data has enough space to store the \
+ information about the composition. If not, terminate the \
+ current decoding loop, allocate one more memory block for \
+ coding->cmp_data in the caller, then start the decoding \
+ loop again. We can't allocate memory here directly because \
+ it may cause buffer/string relocation. */ \
+ if (!coding->cmp_data \
+ || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
+ >= COMPOSITION_DATA_SIZE)) \
+ { \
+ coding->result = CODING_FINISH_INSUFFICIENT_CMP; \
+ goto label_end_of_loop; \
+ } \
+ coding->composing = (c1 == '0' ? COMPOSITION_RELATIVE \
+ : c1 == '2' ? COMPOSITION_WITH_RULE \
+ : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
+ : COMPOSITION_WITH_RULE_ALTCHARS); \
+ CODING_ADD_COMPOSITION_START (coding, coding->produced_char, \
+ coding->composing); \
+ coding->composition_rule_follows = 0; \
+ } \
+ else \
+ { \
+ /* We are already handling a composition. If the method is \
+ the following two, the codes following the current escape \
+ sequence are actual characters stored in a buffer. */ \
+ if (coding->composing == COMPOSITION_WITH_ALTCHARS \
+ || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS) \
+ { \
+ coding->composing = COMPOSITION_RELATIVE; \
+ coding->composition_rule_follows = 0; \
+ } \
+ } \
+ } while (0)
+
+/* Handle composition end sequence ESC 1.  C1 is the byte following
+   ESC.
+
+   If no composition is in progress (composition is disabled, or the
+   sequence appears without a preceding start sequence), ESC and C1
+   are written to `dst' as is, because there is no composition record
+   to close; closing one anyway would access coding->cmp_data, which
+   may be null or may still describe the previous composition.
+   Otherwise the end position of the current composition is recorded
+   and the composing state is reset to COMPOSITION_NO.
+
+   NOTE(review): the literal copy is not bounds-checked against the
+   destination limit -- confirm the caller guarantees room.  */
+
+#define DECODE_COMPOSITION_END(c1)                                      \
+  do {                                                                  \
+    if (! COMPOSING_P (coding))                                         \
+      {                                                                 \
+        *dst++ = ISO_CODE_ESC;                                          \
+        *dst++ = c1;                                                    \
+        coding->produced_char += 2;                                     \
+      }                                                                 \
+    else                                                                \
+      {                                                                 \
+        CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
+        coding->composing = COMPOSITION_NO;                             \
+      }                                                                 \
+  } while (0)
+
+/* Decode a composition rule from the byte C1 (and maybe one more byte
+ from SRC) and store one encoded composition rule in
+ coding->cmp_data. C1 must be a modifiable variable; 32 is
+ subtracted from it in place. The result is then interpreted as
+ follows:
+ less than 81 -- old format: C1 / 9 and C1 % 9 are the two
+ arguments (gref, nref) given to COMPOSITION_ENCODE_RULE, where
+ the value 4 is replaced by 10;
+ 81 to 92 -- new format: one more byte is read into the variable
+ `c2' of the enclosing scope, and the arguments are C1 - 81 and
+ C2 - 32;
+ otherwise -- the rule value 0 is stored.
+ Afterwards coding->composition_rule_follows is cleared. */
+
+#define DECODE_COMPOSITION_RULE(c1) \
+ do { \
+ int rule = 0; \
+ (c1) -= 32; \
+ if (c1 < 81) /* old format (before ver.21) */ \
+ { \
+ int gref = (c1) / 9; \
+ int nref = (c1) % 9; \
+ if (gref == 4) gref = 10; \
+ if (nref == 4) nref = 10; \
+ rule = COMPOSITION_ENCODE_RULE (gref, nref); \
+ } \
+ else if (c1 < 93) /* new format (after ver.21) */ \
+ { \
+ ONE_MORE_BYTE (c2); \
+ rule = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
+ } \
+ CODING_ADD_COMPOSITION_COMPONENT (coding, rule); \
+ coding->composition_rule_follows = 0; \
+ } while (0)
- /* We have not found the sequence "ESC 1". */
- return -1;
-}
/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
-int
+static void
decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
struct coding_system *coding;
unsigned char *source, *destination;
unsigned char *src_end = source + src_bytes;
unsigned char *dst = destination;
unsigned char *dst_end = destination + dst_bytes;
- /* Since the maximum bytes produced by each loop is 7, we subtract 6
- from DST_END to assure that overflow checking is necessary only
- at the head of loop. */
- unsigned char *adjusted_dst_end = dst_end - 6;
- int charset;
/* Charsets invoked to graphic plane 0 and 1 respectively. */
int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
- Lisp_Object translation_table
- = coding->translation_table_for_decode;
- int result = CODING_FINISH_NORMAL;
+ /* SRC_BASE remembers the start position in source in each loop.
+ The loop will be exited when there's not enough source code
+ (within macro ONE_MORE_BYTE), or when there's not enough
+ destination area to produce a character (within macro
+ EMIT_CHAR). */
+ unsigned char *src_base;
+ int c, charset;
+ Lisp_Object translation_table;
+ Lisp_Object safe_chars;
- if (!NILP (Venable_character_translation) && NILP (translation_table))
- translation_table = Vstandard_translation_table_for_decode;
+ safe_chars = coding_safe_chars (coding);
- coding->produced_char = 0;
- coding->fake_multibyte = 0;
- while (src < src_end && (dst_bytes
- ? (dst < adjusted_dst_end)
- : (dst < src - 6)))
+ if (NILP (Venable_character_translation))
+ translation_table = Qnil;
+ else
+ {
+ translation_table = coding->translation_table_for_decode;
+ if (NILP (translation_table))
+ translation_table = Vstandard_translation_table_for_decode;
+ }
+
+ coding->result = CODING_FINISH_NORMAL;
+
+ while (1)
{
- /* SRC_BASE remembers the start position in source in each loop.
- The loop will be exited when there's not enough source text
- to analyze long escape sequence or 2-byte code (within macros
- ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
- to SRC_BASE before exiting. */
- unsigned char *src_base = src;
- int c1 = *src++, c2;
+ int c1, c2;
+
+ src_base = src;
+ ONE_MORE_BYTE (c1);
+ /* We produce no character or one character. */
switch (iso_code_class [c1])
{
case ISO_0x20_or_0x7F:
- if (!coding->composing
- && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
+ if (COMPOSING_P (coding) && coding->composition_rule_follows)
+ {
+ DECODE_COMPOSITION_RULE (c1);
+ continue;
+ }
+ if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
{
/* This is SPACE or DEL. */
- *dst++ = c1;
- coding->produced_char++;
+ charset = CHARSET_ASCII;
break;
}
/* This is a graphic character, we fall down ... */
case ISO_graphic_plane_0:
- if (coding->composing == COMPOSING_WITH_RULE_RULE)
+ if (COMPOSING_P (coding) && coding->composition_rule_follows)
{
- /* This is a composition rule. */
- *dst++ = c1 | 0x80;
- coding->composing = COMPOSING_WITH_RULE_TAIL;
+ DECODE_COMPOSITION_RULE (c1);
+ continue;
}
- else
- DECODE_ISO_CHARACTER (charset0, c1);
+ charset = charset0;
break;
case ISO_0xA0_or_0xFF:
/* This is a graphic character, we fall down ... */
case ISO_graphic_plane_1:
- if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
+ if (charset1 < 0)
goto label_invalid_code;
- else
- DECODE_ISO_CHARACTER (charset1, c1);
+ charset = charset1;
break;
- case ISO_control_code:
+ case ISO_control_0:
+ if (COMPOSING_P (coding))
+ DECODE_COMPOSITION_END ('1');
+
/* All ISO2022 control characters in this class have the
same representation in Emacs internal format. */
if (c1 == '\n'
&& (coding->eol_type == CODING_EOL_CR
|| coding->eol_type == CODING_EOL_CRLF))
{
- result = CODING_FINISH_INCONSISTENT_EOL;
- goto label_end_of_loop_2;
+ coding->result = CODING_FINISH_INCONSISTENT_EOL;
+ goto label_end_of_loop;
}
- *dst++ = c1;
- coding->produced_char++;
- if (c1 >= 0x80)
- coding->fake_multibyte = 1;
+ charset = CHARSET_ASCII;
break;
+ case ISO_control_1:
+ if (COMPOSING_P (coding))
+ DECODE_COMPOSITION_END ('1');
+ goto label_invalid_code;
+
case ISO_carriage_return:
+ if (COMPOSING_P (coding))
+ DECODE_COMPOSITION_END ('1');
+
if (coding->eol_type == CODING_EOL_CR)
- *dst++ = '\n';
+ c1 = '\n';
else if (coding->eol_type == CODING_EOL_CRLF)
{
ONE_MORE_BYTE (c1);
- if (c1 == ISO_CODE_LF)
- *dst++ = '\n';
- else
+ if (c1 != ISO_CODE_LF)
{
if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
{
- result = CODING_FINISH_INCONSISTENT_EOL;
- goto label_end_of_loop_2;
+ coding->result = CODING_FINISH_INCONSISTENT_EOL;
+ goto label_end_of_loop;
}
src--;
- *dst++ = '\r';
+ c1 = '\r';
}
}
- else
- *dst++ = c1;
- coding->produced_char++;
+ charset = CHARSET_ASCII;
break;
case ISO_shift_out:
goto label_invalid_code;
CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
- break;
+ continue;
case ISO_shift_in:
if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
goto label_invalid_code;
CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
- break;
+ continue;
case ISO_single_shift_2_7:
case ISO_single_shift_2:
}
else
goto label_invalid_code;
- break;
+ /* We must update these variables now. */
+ charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
+ charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
+ continue;
case 'n': /* invocation of locking-shift-2 */
if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
goto label_invalid_code;
CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
- break;
+ continue;
case 'o': /* invocation of locking-shift-3 */
if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
goto label_invalid_code;
CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
- break;
+ continue;
case 'N': /* invocation of single-shift-2 */
if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
|| CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
goto label_invalid_code;
- ONE_MORE_BYTE (c1);
charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
- DECODE_ISO_CHARACTER (charset, c1);
+ ONE_MORE_BYTE (c1);
+ if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
+ goto label_invalid_code;
break;
case 'O': /* invocation of single-shift-3 */
if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
|| CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
goto label_invalid_code;
- ONE_MORE_BYTE (c1);
charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
- DECODE_ISO_CHARACTER (charset, c1);
+ ONE_MORE_BYTE (c1);
+ if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
+ goto label_invalid_code;
break;
- case '0': case '2': /* start composing */
- /* Before processing composing, we must be sure that all
- characters being composed are supported by CODING.
- If not, we must give up composing. */
- if (check_composing_code (coding, src, src_end) == 0)
- {
- /* We are looking at a valid composition sequence. */
- coding->composing = (c1 == '0'
- ? COMPOSING_NO_RULE_HEAD
- : COMPOSING_WITH_RULE_HEAD);
- coding->composed_chars = 0;
- }
- else
- {
- *dst++ = ISO_CODE_ESC;
- *dst++ = c1;
- coding->produced_char += 2;
- }
- break;
+ case '0': case '2': case '3': case '4': /* start composition */
+ DECODE_COMPOSITION_START (c1);
+ continue;
- case '1': /* end composing */
- if (!coding->composing)
- {
- *dst++ = ISO_CODE_ESC;
- *dst++ = c1;
- coding->produced_char += 2;
- break;
- }
-
- if (coding->composed_chars > 0)
- {
- if (coding->composed_chars == 1)
- {
- unsigned char *this_char_start = dst;
- int this_bytes;
-
- /* Only one character is in the composing
- sequence. Make it a normal character. */
- while (*--this_char_start != LEADING_CODE_COMPOSITION);
- dst = (this_char_start
- + (coding->composing == COMPOSING_NO_RULE_TAIL
- ? 1 : 2));
- *dst -= 0x20;
- if (*dst == 0x80)
- *++dst &= 0x7F;
- this_bytes = BYTES_BY_CHAR_HEAD (*dst);
- while (this_bytes--) *this_char_start++ = *dst++;
- dst = this_char_start;
- }
- coding->produced_char++;
- }
- coding->composing = COMPOSING_NO;
- break;
+ case '1': /* end composition */
+ DECODE_COMPOSITION_END (c1);
+ continue;
case '[': /* specification of direction */
if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
default:
goto label_invalid_code;
}
- break;
+ continue;
default:
if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
}
else
- {
- goto label_invalid_code;
- }
+ goto label_invalid_code;
+ /* We must update these variables now. */
+ charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
+ charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
+ continue;
}
- /* We must update these variables now. */
- charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
- charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
- break;
+ }
- label_invalid_code:
- while (src_base < src)
- *dst++ = *src_base++;
- coding->fake_multibyte = 1;
+ /* Now we know CHARSET and 1st position code C1 of a character.
+ Produce a multibyte sequence for that character while getting
+ 2nd position code C2 if necessary. */
+ if (CHARSET_DIMENSION (charset) == 2)
+ {
+ ONE_MORE_BYTE (c2);
+ if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
+ /* C2 is not in a valid range. */
+ goto label_invalid_code;
}
+ c = DECODE_ISO_CHARACTER (charset, c1, c2);
+ EMIT_CHAR (c);
continue;
- label_end_of_loop:
- result = CODING_FINISH_INSUFFICIENT_SRC;
- label_end_of_loop_2:
+ label_invalid_code:
+ coding->errors++;
+ if (COMPOSING_P (coding))
+ DECODE_COMPOSITION_END ('1');
src = src_base;
- break;
- }
-
- if (src < src_end)
- {
- if (result == CODING_FINISH_NORMAL)
- result = CODING_FINISH_INSUFFICIENT_DST;
- else if (result != CODING_FINISH_INCONSISTENT_EOL
- && coding->mode & CODING_MODE_LAST_BLOCK)
- {
- /* This is the last block of the text to be decoded. We had
- better just flush out all remaining codes in the text
- although they are not valid characters. */
- src_bytes = src_end - src;
- if (dst_bytes && (dst_end - dst < src_bytes))
- src_bytes = dst_end - dst;
- bcopy (src, dst, src_bytes);
- dst += src_bytes;
- src += src_bytes;
- coding->fake_multibyte = 1;
- }
+ c = *src++;
+ EMIT_CHAR (c);
}
- coding->consumed = coding->consumed_char = src - source;
+ label_end_of_loop:
+ coding->consumed = coding->consumed_char = src_base - source;
coding->produced = dst - destination;
- return result;
+ return;
}
+
/* ISO2022 encoding stuff. */
/*
*/
/* Produce codes (escape sequence) for designating CHARSET to graphic
- register REG. If <final-char> of CHARSET is '@', 'A', or 'B' and
- the coding system CODING allows, produce designation sequence of
- short-form. */
+ register REG at DST, and increment DST. If <final-char> of CHARSET is
+ '@', 'A', or 'B' and the coding system CODING allows, produce
+ designation sequence of short-form. */
#define ENCODE_DESIGNATION(charset, reg, coding) \
do { \
char *intermediate_char_94 = "()*+"; \
char *intermediate_char_96 = ",-./"; \
int revision = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset); \
+ \
if (revision < 255) \
{ \
*dst++ = ISO_CODE_ESC; \
*dst++ = '&'; \
*dst++ = '@' + revision; \
} \
- *dst++ = ISO_CODE_ESC; \
+ *dst++ = ISO_CODE_ESC; \
if (CHARSET_DIMENSION (charset) == 1) \
{ \
if (CHARSET_CHARS (charset) == 94) \
*dst++ = '$'; \
if (CHARSET_CHARS (charset) == 94) \
{ \
- if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM) \
- || reg != 0 \
- || final_char < '@' || final_char > 'B') \
+ if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM) \
+ || reg != 0 \
+ || final_char < '@' || final_char > 'B') \
*dst++ = (unsigned char) (intermediate_char_94[reg]); \
} \
else \
- *dst++ = (unsigned char) (intermediate_char_96[reg]); \
+ *dst++ = (unsigned char) (intermediate_char_96[reg]); \
} \
- *dst++ = final_char; \
+ *dst++ = final_char; \
CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
} while (0)
if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
*dst++ = ISO_CODE_ESC, *dst++ = 'N'; \
else \
- { \
- *dst++ = ISO_CODE_SS2; \
- coding->fake_multibyte = 1; \
- } \
+ *dst++ = ISO_CODE_SS2; \
CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
} while (0)
if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
*dst++ = ISO_CODE_ESC, *dst++ = 'O'; \
else \
- { \
- *dst++ = ISO_CODE_SS3; \
- coding->fake_multibyte = 1; \
- } \
+ *dst++ = ISO_CODE_SS3; \
CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
} while (0)
escape sequence) for ISO2022 locking-shift functions (shift-in,
shift-out, locking-shift-2, and locking-shift-3). */
-#define ENCODE_SHIFT_IN \
- do { \
- *dst++ = ISO_CODE_SI; \
+#define ENCODE_SHIFT_IN \
+ do { \
+ *dst++ = ISO_CODE_SI; \
CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
} while (0)
-#define ENCODE_SHIFT_OUT \
- do { \
- *dst++ = ISO_CODE_SO; \
+#define ENCODE_SHIFT_OUT \
+ do { \
+ *dst++ = ISO_CODE_SO; \
CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
} while (0)
CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
} while (0)
-#define ENCODE_LOCKING_SHIFT_3 \
- do { \
- *dst++ = ISO_CODE_ESC, *dst++ = 'o'; \
+#define ENCODE_LOCKING_SHIFT_3 \
+ do { \
+ *dst++ = ISO_CODE_ESC, *dst++ = 'o'; \
CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
} while (0)
CHARSET and whose position-code is C1. Designation and invocation
sequences are also produced in advance if necessary. */
-
#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
do { \
if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
*dst++ = c1 | 0x80; \
break; \
} \
- else if (coding->flags & CODING_FLAG_ISO_SAFE \
- && !coding->safe_charsets[charset]) \
- { \
- /* We should not encode this character, instead produce one or \
- two `?'s. */ \
- *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
- if (CHARSET_WIDTH (charset) == 2) \
- *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
- break; \
- } \
else \
/* Since CHARSET is not yet invoked to any graphic planes, we \
must invoke it, or, at first, designate it to some graphic \
*dst++ = c1 | 0x80, *dst++= c2 | 0x80; \
break; \
} \
- else if (coding->flags & CODING_FLAG_ISO_SAFE \
- && !coding->safe_charsets[charset]) \
- { \
- /* We should not encode this character, instead produce one or \
- two `?'s. */ \
- *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
- if (CHARSET_WIDTH (charset) == 2) \
- *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
- break; \
- } \
else \
/* Since CHARSET is not yet invoked to any graphic planes, we \
must invoke it, or, at first, designate it to some graphic \
dst = encode_invocation_designation (charset, coding, dst); \
} while (1)
-#define ENCODE_ISO_CHARACTER(charset, c1, c2) \
+#define ENCODE_ISO_CHARACTER(c) \
+ do { \
+ int charset, c1, c2; \
+ \
+ SPLIT_CHAR (c, charset, c1, c2); \
+ if (CHARSET_DEFINED_P (charset)) \
+ { \
+ if (CHARSET_DIMENSION (charset) == 1) \
+ { \
+ if (charset == CHARSET_ASCII \
+ && coding->flags & CODING_FLAG_ISO_USE_ROMAN) \
+ charset = charset_latin_jisx0201; \
+ ENCODE_ISO_CHARACTER_DIMENSION1 (charset, c1); \
+ } \
+ else \
+ { \
+ if (charset == charset_jisx0208 \
+ && coding->flags & CODING_FLAG_ISO_USE_OLDJIS) \
+ charset = charset_jisx0208_1978; \
+ ENCODE_ISO_CHARACTER_DIMENSION2 (charset, c1, c2); \
+ } \
+ } \
+ else \
+ { \
+ *dst++ = c1; \
+ if (c2 >= 0) \
+ *dst++ = c2; \
+ } \
+ } while (0)
+
+
+/* Instead of encoding character C, produce one `?', or two `?'s when
+ the charset of C has a width greater than 1. */
+
+#define ENCODE_UNSAFE_CHARACTER(c) \
do { \
- int c_alt, charset_alt; \
- if (!NILP (translation_table) \
- && ((c_alt = translate_char (translation_table, -1, \
- charset, c1, c2)) \
- >= 0)) \
- SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
- else \
- charset_alt = charset; \
- if (CHARSET_DEFINED_P (charset_alt)) \
- { \
- if (CHARSET_DIMENSION (charset_alt) == 1) \
- { \
- if (charset == CHARSET_ASCII \
- && coding->flags & CODING_FLAG_ISO_USE_ROMAN) \
- charset_alt = charset_latin_jisx0201; \
- ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1); \
- } \
- else \
- { \
- if (charset == charset_jisx0208 \
- && coding->flags & CODING_FLAG_ISO_USE_OLDJIS) \
- charset_alt = charset_jisx0208_1978; \
- ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
- } \
- } \
- else \
- { \
- if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
- { \
- *dst++ = charset & 0x7f; \
- *dst++ = c1 & 0x7f; \
- if (c2) \
- *dst++ = c2 & 0x7f; \
- } \
- else \
- { \
- *dst++ = charset; \
- *dst++ = c1; \
- if (c2) \
- *dst++ = c2; \
- } \
- } \
- if (! COMPOSING_P (coding->composing)) \
- coding->consumed_char++; \
+ ENCODE_ISO_CHARACTER (CODING_INHIBIT_CHARACTER_SUBSTITUTION); \
+ if (CHARSET_WIDTH (CHAR_CHARSET (c)) > 1) \
+ ENCODE_ISO_CHARACTER (CODING_INHIBIT_CHARACTER_SUBSTITUTION); \
} while (0)
+
/* Produce designation and invocation codes at a place pointed by DST
to use CHARSET. The element `spec.iso2022' of *CODING is updated.
Return new DST. */
break;
}
}
+
return dst;
}
-/* The following two macros produce codes for indicating composition. */
-#define ENCODE_COMPOSITION_NO_RULE_START *dst++ = ISO_CODE_ESC, *dst++ = '0'
-#define ENCODE_COMPOSITION_WITH_RULE_START *dst++ = ISO_CODE_ESC, *dst++ = '2'
-#define ENCODE_COMPOSITION_END *dst++ = ISO_CODE_ESC, *dst++ = '1'
+/* Produce 2-byte codes for encoded composition rule RULE.
+ RULE is split into GREF and NREF with COMPOSITION_DECODE_RULE; the
+ bytes 32 + 81 + GREF and 32 + NREF are then stored at DST, which
+ advances by two.  This is the two-byte form that
+ DECODE_COMPOSITION_RULE handles as the new format. */
+
+#define ENCODE_COMPOSITION_RULE(rule) \
+ do { \
+ int gref, nref; \
+ COMPOSITION_DECODE_RULE (rule, gref, nref); \
+ *dst++ = 32 + 81 + gref; \
+ *dst++ = 32 + nref; \
+ } while (0)
+
+/* Produce codes for indicating the start of a composition sequence
+ (ESC 0, ESC 3, or ESC 4). DATA points to an array of integers
+ which specify information about the composition. See the comment
+ in coding.h for the format of DATA.
+
+ DATA[3] is copied to coding->composing and selects the final byte:
+ '0' for COMPOSITION_RELATIVE, '3' for COMPOSITION_WITH_ALTCHARS, and
+ '4' for every other value.  Except in the relative case,
+ coding->cmp_data_index is set to coding->cmp_data_start + 4 and
+ coding->composition_rule_follows is cleared.
+ NOTE(review): COMPOSITION_WITH_RULE would also produce '4' here,
+ while the decoder maps '2' to that method -- confirm that this
+ method never reaches the encoder. */
+
+#define ENCODE_COMPOSITION_START(coding, data) \
+ do { \
+ coding->composing = data[3]; \
+ *dst++ = ISO_CODE_ESC; \
+ if (coding->composing == COMPOSITION_RELATIVE) \
+ *dst++ = '0'; \
+ else \
+ { \
+ *dst++ = (coding->composing == COMPOSITION_WITH_ALTCHARS \
+ ? '3' : '4'); \
+ coding->cmp_data_index = coding->cmp_data_start + 4; \
+ coding->composition_rule_follows = 0; \
+ } \
+ } while (0)
+
+/* Produce codes for indicating the end of the current composition.
+ ESC 1 is stored at DST, coding->cmp_data_start is advanced by
+ DATA[0] and coding->composing becomes COMPOSITION_NO.  If that
+ leaves cmp_data_start equal to coding->cmp_data->used and a next
+ block exists, coding->cmp_data moves on to that block and
+ cmp_data_start restarts at 0. */
+
+#define ENCODE_COMPOSITION_END(coding, data) \
+ do { \
+ *dst++ = ISO_CODE_ESC; \
+ *dst++ = '1'; \
+ coding->cmp_data_start += data[0]; \
+ coding->composing = COMPOSITION_NO; \
+ if (coding->cmp_data_start == coding->cmp_data->used \
+ && coding->cmp_data->next) \
+ { \
+ coding->cmp_data = coding->cmp_data->next; \
+ coding->cmp_data_start = 0; \
+ } \
+ } while (0)
+
+/* Produce composition start sequence ESC 0. Here, this sequence
+ doesn't mean the start of a new composition but means that we have
+ just produced components (alternate chars and composition rules) of
+ the composition and the actual text follows in SRC.
+ ESC 0 is stored at DST and coding->composing is switched to
+ COMPOSITION_RELATIVE. */
+
+#define ENCODE_COMPOSITION_FAKE_START(coding) \
+ do { \
+ *dst++ = ISO_CODE_ESC; \
+ *dst++ = '0'; \
+ coding->composing = COMPOSITION_RELATIVE; \
+ } while (0)
/* The following three macros produce codes for indicating direction
of text.  Each one is a complete statement that writes through DST
of the enclosing scope and reads CODING of the enclosing scope.

ENCODE_CONTROL_SEQUENCE_INTRODUCER stores ESC [ when CODING has
CODING_FLAG_ISO_SEVEN_BITS set, and the single byte ISO_CODE_CSI
otherwise.  The flag is tested with `&', as in the single-shift
macros above; comparing the whole flag word with `==' selected the
7-bit form only when no other flag was set.

ENCODE_DIRECTION_R2L and ENCODE_DIRECTION_L2R store the introducer
followed by the bytes '2' ']' and '0' ']' respectively.  They are
written as do-while statements because the introducer is itself a
do-while statement and cannot be the operand of a comma operator. */
-#define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
- do { \
- if (coding->flags == CODING_FLAG_ISO_SEVEN_BITS) \
- *dst++ = ISO_CODE_ESC, *dst++ = '['; \
- else \
- *dst++ = ISO_CODE_CSI; \
+#define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
+ do { \
+ if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
+ *dst++ = ISO_CODE_ESC, *dst++ = '['; \
+ else \
+ *dst++ = ISO_CODE_CSI; \
} while (0)
#define ENCODE_DIRECTION_R2L \
- ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '2', *dst++ = ']'
+ do { \
+ ENCODE_CONTROL_SEQUENCE_INTRODUCER; \
+ *dst++ = '2', *dst++ = ']'; \
+ } while (0)
#define ENCODE_DIRECTION_L2R \
- ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '0', *dst++ = ']'
+ do { \
+ ENCODE_CONTROL_SEQUENCE_INTRODUCER; \
+ *dst++ = '0', *dst++ = ']'; \
+ } while (0)
/* Produce codes for designation and invocation to reset the graphic
planes and registers to initial state. */
} while (0)
/* Produce designation sequences of charsets in the line started from
- SRC to a place pointed by *DSTP, and update DSTP.
+ SRC to a place pointed by DST, and return updated DST.
+ Each of the four graphic registers takes the first charset in the
+ line that requests it; the scan ends at a newline or as soon as all
+ four registers are taken.  A designation is produced only for a
+ register whose current designation differs from the charset chosen
+ for it.
+ NOTE(review): TRANSLATION_TABLE is not named in the body as added
+ here; presumably ONE_MORE_CHAR consults it -- confirm.
If the current block ends before any end-of-line, we may fail to
find all the necessary designations. */
-void
-encode_designation_at_bol (coding, table, src, src_end, dstp)
+static unsigned char *
+encode_designation_at_bol (coding, translation_table, src, src_end, dst)
struct coding_system *coding;
- Lisp_Object table;
- unsigned char *src, *src_end, **dstp;
+ Lisp_Object translation_table;
+ unsigned char *src, *src_end, *dst;
{
int charset, c, found = 0, reg;
/* Table of charsets to be designated to each graphic register;
-1 marks a register for which no charset has been found yet. */
int r[4];
- unsigned char *dst = *dstp;
for (reg = 0; reg < 4; reg++)
r[reg] = -1;
- while (src < src_end && *src != '\n' && found < 4)
+ while (found < 4)
{
- int bytes = BYTES_BY_CHAR_HEAD (*src);
+ ONE_MORE_CHAR (c);
+ if (c == '\n')
+ break;
- if (NILP (table))
- charset = CHARSET_AT (src);
- else
- {
- int c_alt;
- unsigned char c1, c2;
-
- SPLIT_STRING(src, bytes, charset, c1, c2);
- if ((c_alt = translate_char (table, -1, charset, c1, c2)) >= 0)
- charset = CHAR_CHARSET (c_alt);
- }
-
+ charset = CHAR_CHARSET (c);
reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
{
found++;
r[reg] = charset;
}
-
- src += bytes;
}
+ label_end_of_loop:
if (found)
{
for (reg = 0; reg < 4; reg++)
if (r[reg] >= 0
&& CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
ENCODE_DESIGNATION (r[reg], reg, coding);
- *dstp = dst;
}
+
+ return dst;
}
/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
-int
+static void
encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
struct coding_system *coding;
unsigned char *source, *destination;
from DST_END to assure overflow checking is necessary only at the
head of loop. */
unsigned char *adjusted_dst_end = dst_end - 19;
- Lisp_Object translation_table
- = coding->translation_table_for_encode;
- int result = CODING_FINISH_NORMAL;
+ /* SRC_BASE remembers the start position in source in each loop.
+ The loop will be exited when there's not enough source text to
+ analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
+ there's not enough destination area to produce encoded codes
+ (within macro EMIT_BYTES). */
+ unsigned char *src_base;
+ int c;
+ Lisp_Object translation_table;
+ Lisp_Object safe_chars;
- if (!NILP (Venable_character_translation) && NILP (translation_table))
- translation_table = Vstandard_translation_table_for_encode;
+ safe_chars = coding_safe_chars (coding);
+
+ if (NILP (Venable_character_translation))
+ translation_table = Qnil;
+ else
+ {
+ translation_table = coding->translation_table_for_encode;
+ if (NILP (translation_table))
+ translation_table = Vstandard_translation_table_for_encode;
+ }
coding->consumed_char = 0;
- coding->fake_multibyte = 0;
- while (src < src_end && (dst_bytes
- ? (dst < adjusted_dst_end)
- : (dst < src - 19)))
+ coding->errors = 0;
+ while (1)
{
- /* SRC_BASE remembers the start position in source in each loop.
- The loop will be exited when there's not enough source text
- to analyze multi-byte codes (within macros ONE_MORE_BYTE,
- TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
- reset to SRC_BASE before exiting. */
- unsigned char *src_base = src;
- int charset, c1, c2, c3, c4;
+ src_base = src;
+
+ if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
+ {
+ coding->result = CODING_FINISH_INSUFFICIENT_DST;
+ break;
+ }
if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
&& CODING_SPEC_ISO_BOL (coding))
{
/* We have to produce designation sequences if any now. */
- encode_designation_at_bol (coding, translation_table,
- src, src_end, &dst);
+ dst = encode_designation_at_bol (coding, translation_table,
+ src, src_end, dst);
CODING_SPEC_ISO_BOL (coding) = 0;
}
- c1 = *src++;
- /* If we are seeing a component of a composite character, we are
- seeing a leading-code encoded irregularly for composition, or
- a composition rule if composing with rule. We must set C1 to
- a normal leading-code or an ASCII code. If we are not seeing
- a composite character, we must reset composition,
- designation, and invocation states. */
- if (COMPOSING_P (coding->composing))
+ /* Check composition start and end. */
+ if (coding->composing != COMPOSITION_DISABLED
+ && coding->cmp_data_start < coding->cmp_data->used)
{
- if (c1 < 0xA0)
+ struct composition_data *cmp_data = coding->cmp_data;
+ int *data = cmp_data->data + coding->cmp_data_start;
+ int this_pos = cmp_data->char_offset + coding->consumed_char;
+
+ if (coding->composing == COMPOSITION_RELATIVE)
{
- /* We are not in a composite character any longer. */
- coding->composing = COMPOSING_NO;
- ENCODE_RESET_PLANE_AND_REGISTER;
- ENCODE_COMPOSITION_END;
+ if (this_pos == data[2])
+ {
+ ENCODE_COMPOSITION_END (coding, data);
+ cmp_data = coding->cmp_data;
+ data = cmp_data->data + coding->cmp_data_start;
+ }
}
- else
+ else if (COMPOSING_P (coding))
{
- if (coding->composing == COMPOSING_WITH_RULE_RULE)
+ /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHARS */
+ if (coding->cmp_data_index == coding->cmp_data_start + data[0])
+ /* We have consumed components of the composition.
+ What follows in SRC is the composition's base
+ text. */
+ ENCODE_COMPOSITION_FAKE_START (coding);
+ else
{
- *dst++ = c1 & 0x7F;
- coding->composing = COMPOSING_WITH_RULE_HEAD;
+ int c = cmp_data->data[coding->cmp_data_index++];
+ if (coding->composition_rule_follows)
+ {
+ ENCODE_COMPOSITION_RULE (c);
+ coding->composition_rule_follows = 0;
+ }
+ else
+ {
+ if (coding->flags & CODING_FLAG_ISO_SAFE
+ && ! CODING_SAFE_CHAR_P (safe_chars, c))
+ ENCODE_UNSAFE_CHARACTER (c);
+ else
+ ENCODE_ISO_CHARACTER (c);
+ if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
+ coding->composition_rule_follows = 1;
+ }
continue;
}
- else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
- coding->composing = COMPOSING_WITH_RULE_RULE;
- if (c1 == 0xA0)
+ }
+ if (!COMPOSING_P (coding))
+ {
+ if (this_pos == data[1])
{
- /* This is an ASCII component. */
- ONE_MORE_BYTE (c1);
- c1 &= 0x7F;
+ ENCODE_COMPOSITION_START (coding, data);
+ continue;
}
- else
- /* This is a leading-code of non ASCII component. */
- c1 -= 0x20;
}
}
-
- /* Now encode one character. C1 is a control character, an
- ASCII character, or a leading-code of multi-byte character. */
- switch (emacs_code_class[c1])
- {
- case EMACS_ascii_code:
- c2 = 0;
- ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
- break;
-
- case EMACS_control_code:
- if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
- ENCODE_RESET_PLANE_AND_REGISTER;
- *dst++ = c1;
- coding->consumed_char++;
- break;
-
- case EMACS_carriage_return_code:
- if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
- {
- if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
- ENCODE_RESET_PLANE_AND_REGISTER;
- *dst++ = c1;
- coding->consumed_char++;
- break;
- }
- /* fall down to treat '\r' as '\n' ... */
-
- case EMACS_linefeed_code:
- if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
- ENCODE_RESET_PLANE_AND_REGISTER;
- if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
- bcopy (coding->spec.iso2022.initial_designation,
- coding->spec.iso2022.current_designation,
- sizeof coding->spec.iso2022.initial_designation);
- if (coding->eol_type == CODING_EOL_LF
- || coding->eol_type == CODING_EOL_UNDECIDED)
- *dst++ = ISO_CODE_LF;
- else if (coding->eol_type == CODING_EOL_CRLF)
- *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
- else
- *dst++ = ISO_CODE_CR;
- CODING_SPEC_ISO_BOL (coding) = 1;
- coding->consumed_char++;
- break;
-
- case EMACS_leading_code_2:
- ONE_MORE_BYTE (c2);
- c3 = 0;
- if (c2 < 0xA0)
- {
- /* invalid sequence */
- *dst++ = c1;
- src--;
- coding->consumed_char++;
- }
- else
- ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
- break;
-
- case EMACS_leading_code_3:
- TWO_MORE_BYTES (c2, c3);
- c4 = 0;
- if (c2 < 0xA0 || c3 < 0xA0)
- {
- /* invalid sequence */
- *dst++ = c1;
- src -= 2;
- coding->consumed_char++;
- }
- else if (c1 < LEADING_CODE_PRIVATE_11)
- ENCODE_ISO_CHARACTER (c1, c2, c3);
- else
- ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
- break;
- case EMACS_leading_code_4:
- THREE_MORE_BYTES (c2, c3, c4);
- if (c2 < 0xA0 || c3 < 0xA0 || c4 < 0xA0)
- {
- /* invalid sequence */
- *dst++ = c1;
- src -= 3;
- coding->consumed_char++;
- }
- else
- ENCODE_ISO_CHARACTER (c2, c3, c4);
- break;
+ ONE_MORE_CHAR (c);
- case EMACS_leading_code_composition:
- ONE_MORE_BYTE (c2);
- if (c2 < 0xA0)
+ /* Now encode the character C. */
+ if (c < 0x20 || c == 0x7F)
+ {
+ if (c == '\r')
{
- /* invalid sequence */
- *dst++ = c1;
- src--;
- coding->consumed_char++;
+ if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
+ {
+ if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
+ ENCODE_RESET_PLANE_AND_REGISTER;
+ *dst++ = c;
+ continue;
+ }
+ /* fall down to treat '\r' as '\n' ... */
+ c = '\n';
}
- else if (c2 == 0xFF)
+ if (c == '\n')
{
- ENCODE_RESET_PLANE_AND_REGISTER;
- coding->composing = COMPOSING_WITH_RULE_HEAD;
- ENCODE_COMPOSITION_WITH_RULE_START;
- coding->consumed_char++;
+ if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
+ ENCODE_RESET_PLANE_AND_REGISTER;
+ if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
+ bcopy (coding->spec.iso2022.initial_designation,
+ coding->spec.iso2022.current_designation,
+ sizeof coding->spec.iso2022.initial_designation);
+ if (coding->eol_type == CODING_EOL_LF
+ || coding->eol_type == CODING_EOL_UNDECIDED)
+ *dst++ = ISO_CODE_LF;
+ else if (coding->eol_type == CODING_EOL_CRLF)
+ *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
+ else
+ *dst++ = ISO_CODE_CR;
+ CODING_SPEC_ISO_BOL (coding) = 1;
}
- else
+ else
{
- ENCODE_RESET_PLANE_AND_REGISTER;
- /* Rewind one byte because it is a character code of
- composition elements. */
- src--;
- coding->composing = COMPOSING_NO_RULE_HEAD;
- ENCODE_COMPOSITION_NO_RULE_START;
- coding->consumed_char++;
+ if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
+ ENCODE_RESET_PLANE_AND_REGISTER;
+ *dst++ = c;
}
- break;
-
- case EMACS_invalid_code:
- if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
- ENCODE_RESET_PLANE_AND_REGISTER;
- *dst++ = c1;
- coding->consumed_char++;
- break;
}
- continue;
- label_end_of_loop:
- result = CODING_FINISH_INSUFFICIENT_SRC;
- src = src_base;
- break;
- }
-
- if (src < src_end && result == CODING_FINISH_NORMAL)
- result = CODING_FINISH_INSUFFICIENT_DST;
-
- /* If this is the last block of the text to be encoded, we must
- reset graphic planes and registers to the initial state, and
- flush out the carryover if any. */
- if (coding->mode & CODING_MODE_LAST_BLOCK)
- {
- ENCODE_RESET_PLANE_AND_REGISTER;
- if (COMPOSING_P (coding->composing))
- ENCODE_COMPOSITION_END;
- if (result == CODING_FINISH_INSUFFICIENT_SRC)
+ else if (ASCII_BYTE_P (c))
+ ENCODE_ISO_CHARACTER (c);
+ else if (SINGLE_BYTE_CHAR_P (c))
{
- while (src < src_end && dst < dst_end)
- *dst++ = *src++;
+ *dst++ = c;
+ coding->errors++;
}
+ else if (coding->flags & CODING_FLAG_ISO_SAFE
+ && ! CODING_SAFE_CHAR_P (safe_chars, c))
+ ENCODE_UNSAFE_CHARACTER (c);
+ else
+ ENCODE_ISO_CHARACTER (c);
+
+ coding->consumed_char++;
}
- coding->consumed = src - source;
+
+ label_end_of_loop:
+ coding->consumed = src_base - source;
coding->produced = coding->produced_char = dst - destination;
- return result;
}
\f
b2 += b2 < 0x3F ? 0x40 : 0x62; \
} while (0)
-#define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \
- do { \
- int c_alt, charset_alt = (charset); \
- if (!NILP (translation_table) \
- && ((c_alt = translate_char (translation_table, \
- -1, (charset), c1, c2)) >= 0)) \
- SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
- if (charset_alt == CHARSET_ASCII || charset_alt < 0) \
- DECODE_CHARACTER_ASCII (c1); \
- else if (CHARSET_DIMENSION (charset_alt) == 1) \
- DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
- else \
- DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
- } while (0)
-
-#define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \
- do { \
- int c_alt, charset_alt; \
- if (!NILP (translation_table) \
- && ((c_alt = translate_char (translation_table, -1, \
- charset, c1, c2)) \
- >= 0)) \
- SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
- else \
- charset_alt = charset; \
- if (charset_alt == charset_ascii) \
- *dst++ = c1; \
- else if (CHARSET_DIMENSION (charset_alt) == 1) \
- { \
- if (sjis_p && charset_alt == charset_katakana_jisx0201) \
- *dst++ = c1; \
- else \
- { \
- *dst++ = charset_alt, *dst++ = c1; \
- coding->fake_multibyte = 1; \
- } \
- } \
- else \
- { \
- c1 &= 0x7F, c2 &= 0x7F; \
- if (sjis_p && charset_alt == charset_jisx0208) \
- { \
- unsigned char s1, s2; \
- \
- ENCODE_SJIS (c1, c2, s1, s2); \
- *dst++ = s1, *dst++ = s2; \
- coding->fake_multibyte = 1; \
- } \
- else if (!sjis_p \
- && (charset_alt == charset_big5_1 \
- || charset_alt == charset_big5_2)) \
- { \
- unsigned char b1, b2; \
- \
- ENCODE_BIG5 (charset_alt, c1, c2, b1, b2); \
- *dst++ = b1, *dst++ = b2; \
- } \
- else \
- { \
- *dst++ = charset_alt, *dst++ = c1, *dst++ = c2; \
- coding->fake_multibyte = 1; \
- } \
- } \
- coding->consumed_char++; \
- } while (0);
-
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
Check if a text is encoded in SJIS. If it is, return
CODING_CATEGORY_MASK_SJIS, else return 0.
A byte in the range 0x80..0x9F, or one that is 0xE0 or above, is
taken as the first byte of a two-byte code; the text is rejected
when the byte after it is below 0x40.  Reaching label_end_of_loop
(presumably where ONE_MORE_BYTE jumps once the source is used up)
accepts the text. */
detect_coding_sjis (src, src_end)
unsigned char *src, *src_end;
{
- unsigned char c;
+ int c;
+ /* Dummy for ONE_MORE_BYTE, which presumably refers to a variable
+ named `coding'; nothing else here reads it. */
+ struct coding_system dummy_coding;
+ struct coding_system *coding = &dummy_coding;
- while (src < src_end)
+ while (1)
{
- c = *src++;
+ ONE_MORE_BYTE (c);
if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
{
- if (src < src_end && *src++ < 0x40)
+ ONE_MORE_BYTE (c);
+ if (c < 0x40)
return 0;
}
}
+ label_end_of_loop:
return CODING_CATEGORY_MASK_SJIS;
}
detect_coding_big5 (src, src_end)
unsigned char *src, *src_end;
{
- unsigned char c;
+ int c;
+ /* Dummy for ONE_MORE_BYTE. */
+ struct coding_system dummy_coding;
+ struct coding_system *coding = &dummy_coding;
- while (src < src_end)
+ while (1)
{
- c = *src++;
+ ONE_MORE_BYTE (c);
if (c >= 0xA1)
{
- if (src >= src_end)
- break;
- c = *src++;
+ ONE_MORE_BYTE (c);
if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
return 0;
}
}
+ label_end_of_loop:
return CODING_CATEGORY_MASK_BIG5;
}
-/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
- If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
+/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
+ Check if a text is encoded in UTF-8. If it is, return
+ CODING_CATEGORY_MASK_UTF_8, else return 0. */
+
+#define UTF_8_1_OCTET_P(c) ((c) < 0x80)
+#define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
+#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
+#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
+#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
+#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
+#define UTF_8_6_OCTET_LEADING_P(c) (((c) & 0xFE) == 0xFC)
int
-decode_coding_sjis_big5 (coding, source, destination,
- src_bytes, dst_bytes, sjis_p)
- struct coding_system *coding;
- unsigned char *source, *destination;
- int src_bytes, dst_bytes;
- int sjis_p;
+detect_coding_utf_8 (src, src_end)
+ unsigned char *src, *src_end;
{
- unsigned char *src = source;
- unsigned char *src_end = source + src_bytes;
- unsigned char *dst = destination;
- unsigned char *dst_end = destination + dst_bytes;
- /* Since the maximum bytes produced by each loop is 4, we subtract 3
- from DST_END to assure overflow checking is necessary only at the
- head of loop. */
- unsigned char *adjusted_dst_end = dst_end - 3;
- Lisp_Object translation_table
- = coding->translation_table_for_decode;
- int result = CODING_FINISH_NORMAL;
-
- if (!NILP (Venable_character_translation) && NILP (translation_table))
- translation_table = Vstandard_translation_table_for_decode;
+ unsigned char c;
+ int seq_maybe_bytes;
+ /* Dummy for ONE_MORE_BYTE. */
+ struct coding_system dummy_coding;
+ struct coding_system *coding = &dummy_coding;
- coding->produced_char = 0;
- coding->fake_multibyte = 0;
- while (src < src_end && (dst_bytes
- ? (dst < adjusted_dst_end)
- : (dst < src - 3)))
+ while (1)
{
- /* SRC_BASE remembers the start position in source in each loop.
- The loop will be exited when there's not enough source text
- to analyze two-byte character (within macro ONE_MORE_BYTE).
- In that case, SRC is reset to SRC_BASE before exiting. */
- unsigned char *src_base = src;
- unsigned char c1 = *src++, c2, c3, c4;
-
- if (c1 < 0x20)
+ ONE_MORE_BYTE (c);
+ if (UTF_8_1_OCTET_P (c))
+ continue;
+ else if (UTF_8_2_OCTET_LEADING_P (c))
+ seq_maybe_bytes = 1;
+ else if (UTF_8_3_OCTET_LEADING_P (c))
+ seq_maybe_bytes = 2;
+ else if (UTF_8_4_OCTET_LEADING_P (c))
+ seq_maybe_bytes = 3;
+ else if (UTF_8_5_OCTET_LEADING_P (c))
+ seq_maybe_bytes = 4;
+ else if (UTF_8_6_OCTET_LEADING_P (c))
+ seq_maybe_bytes = 5;
+ else
+ return 0;
+
+ do
{
- if (c1 == '\r')
- {
- if (coding->eol_type == CODING_EOL_CRLF)
- {
- ONE_MORE_BYTE (c2);
- if (c2 == '\n')
- *dst++ = c2;
- else if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
- {
- result = CODING_FINISH_INCONSISTENT_EOL;
- goto label_end_of_loop_2;
+ ONE_MORE_BYTE (c);
+ if (!UTF_8_EXTRA_OCTET_P (c))
+ return 0;
+ seq_maybe_bytes--;
+ }
+ while (seq_maybe_bytes > 0);
+ }
+
+ label_end_of_loop:
+ return CODING_CATEGORY_MASK_UTF_8;
+}
+
+/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
+ Check if a text starts with a UTF-16 signature (byte order mark).
+ If the first two bytes are 0xFE 0xFF, return
+ CODING_CATEGORY_MASK_UTF_16_BE; if they are 0xFF 0xFE, return
+ CODING_CATEGORY_MASK_UTF_16_LE; else return 0. */
+
+/* Nonzero if VAL is 0xFFFE or 0xFFFF (Unicode noncharacters). */
+#define UTF_16_INVALID_P(val) \
+ (((val) == 0xFFFE) \
+ || ((val) == 0xFFFF))
+
+/* Nonzero if VAL is a high (leading) surrogate, 0xD800..0xDBFF.
+ The top six bits identify the range, so mask with 0xFC00; masking
+ with 0xD800 itself would also accept low surrogates and
+ 0xF800..0xFFFF. */
+#define UTF_16_HIGH_SURROGATE_P(val) \
+ (((val) & 0xFC00) == 0xD800)
+
+/* Nonzero if VAL is a low (trailing) surrogate, 0xDC00..0xDFFF. */
+#define UTF_16_LOW_SURROGATE_P(val) \
+ (((val) & 0xFC00) == 0xDC00)
+
+int
+detect_coding_utf_16 (src, src_end)
+ unsigned char *src, *src_end;
+{
+ unsigned char c1, c2;
+ /* Dummy for TWO_MORE_BYTES. */
+ struct coding_system dummy_coding;
+ struct coding_system *coding = &dummy_coding;
+
+ TWO_MORE_BYTES (c1, c2);
+
+ if ((c1 == 0xFF) && (c2 == 0xFE))
+ return CODING_CATEGORY_MASK_UTF_16_LE;
+ else if ((c1 == 0xFE) && (c2 == 0xFF))
+ return CODING_CATEGORY_MASK_UTF_16_BE;
+
+ label_end_of_loop:
+ return 0;
+}
+
+/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
+ If SJIS_P is 1, decode SJIS text, else decode BIG5 text. */
+
+static void
+decode_coding_sjis_big5 (coding, source, destination,
+ src_bytes, dst_bytes, sjis_p)
+ struct coding_system *coding;
+ unsigned char *source, *destination;
+ int src_bytes, dst_bytes;
+ int sjis_p;
+{
+ unsigned char *src = source;
+ unsigned char *src_end = source + src_bytes;
+ unsigned char *dst = destination;
+ unsigned char *dst_end = destination + dst_bytes;
+ /* SRC_BASE remembers the start position in source in each loop.
+ The loop will be exited when there's not enough source code
+ (within macro ONE_MORE_BYTE), or when there's not enough
+ destination area to produce a character (within macro
+ EMIT_CHAR). */
+ unsigned char *src_base;
+ Lisp_Object translation_table;
+
+ if (NILP (Venable_character_translation))
+ translation_table = Qnil;
+ else
+ {
+ translation_table = coding->translation_table_for_decode;
+ if (NILP (translation_table))
+ translation_table = Vstandard_translation_table_for_decode;
+ }
+
+ coding->produced_char = 0;
+ while (1)
+ {
+ int c, charset, c1, c2;
+
+ src_base = src;
+ ONE_MORE_BYTE (c1);
+
+ if (c1 < 0x80)
+ {
+ charset = CHARSET_ASCII;
+ if (c1 < 0x20)
+ {
+ if (c1 == '\r')
+ {
+ if (coding->eol_type == CODING_EOL_CRLF)
+ {
+ ONE_MORE_BYTE (c2);
+ if (c2 == '\n')
+ c1 = c2;
+ else if (coding->mode
+ & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
+ {
+ coding->result = CODING_FINISH_INCONSISTENT_EOL;
+ goto label_end_of_loop;
+ }
+ else
+ /* To process C2 again, SRC is subtracted by 1. */
+ src--;
}
- else
- /* To process C2 again, SRC is subtracted by 1. */
- *dst++ = c1, src--;
+ else if (coding->eol_type == CODING_EOL_CR)
+ c1 = '\n';
+ }
+ else if (c1 == '\n'
+ && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
+ && (coding->eol_type == CODING_EOL_CR
+ || coding->eol_type == CODING_EOL_CRLF))
+ {
+ coding->result = CODING_FINISH_INCONSISTENT_EOL;
+ goto label_end_of_loop;
}
- else if (coding->eol_type == CODING_EOL_CR)
- *dst++ = '\n';
- else
- *dst++ = c1;
- }
- else if (c1 == '\n'
- && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
- && (coding->eol_type == CODING_EOL_CR
- || coding->eol_type == CODING_EOL_CRLF))
- {
- result = CODING_FINISH_INCONSISTENT_EOL;
- goto label_end_of_loop_2;
}
- else
- *dst++ = c1;
- coding->produced_char++;
}
- else if (c1 < 0x80)
- DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
else
- {
+ {
if (sjis_p)
{
- if (c1 < 0xA0 || (c1 >= 0xE0 && c1 < 0xF0))
+ if (c1 >= 0xF0)
+ goto label_invalid_code;
+ if (c1 < 0xA0 || c1 >= 0xE0)
{
/* SJIS -> JISX0208 */
ONE_MORE_BYTE (c2);
- if (c2 >= 0x40 && c2 != 0x7F && c2 <= 0xFC)
- {
- DECODE_SJIS (c1, c2, c3, c4);
- DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
- }
- else
- goto label_invalid_code_2;
+ if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
+ goto label_invalid_code;
+ DECODE_SJIS (c1, c2, c1, c2);
+ charset = charset_jisx0208;
}
- else if (c1 < 0xE0)
- /* SJIS -> JISX0201-Kana */
- DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1,
- /* dummy */ c2);
else
- goto label_invalid_code_1;
+ /* SJIS -> JISX0201-Kana */
+ charset = charset_katakana_jisx0201;
}
else
{
/* BIG5 -> Big5 */
- if (c1 >= 0xA1 && c1 <= 0xFE)
- {
- ONE_MORE_BYTE (c2);
- if ((c2 >= 0x40 && c2 <= 0x7E) || (c2 >= 0xA1 && c2 <= 0xFE))
- {
- int charset;
-
- DECODE_BIG5 (c1, c2, charset, c3, c4);
- DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
- }
- else
- goto label_invalid_code_2;
- }
- else
- goto label_invalid_code_1;
+ if (c1 < 0xA1 || c1 > 0xFE)
+ goto label_invalid_code;
+ ONE_MORE_BYTE (c2);
+ if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
+ goto label_invalid_code;
+ DECODE_BIG5 (c1, c2, charset, c1, c2);
}
}
- continue;
-
- label_invalid_code_1:
- *dst++ = c1;
- coding->produced_char++;
- coding->fake_multibyte = 1;
- continue;
- label_invalid_code_2:
- *dst++ = c1; *dst++= c2;
- coding->produced_char += 2;
- coding->fake_multibyte = 1;
+ c = DECODE_ISO_CHARACTER (charset, c1, c2);
+ EMIT_CHAR (c);
continue;
- label_end_of_loop:
- result = CODING_FINISH_INSUFFICIENT_SRC;
- label_end_of_loop_2:
+ label_invalid_code:
+ coding->errors++;
src = src_base;
- break;
- }
-
- if (src < src_end)
- {
- if (result == CODING_FINISH_NORMAL)
- result = CODING_FINISH_INSUFFICIENT_DST;
- else if (result != CODING_FINISH_INCONSISTENT_EOL
- && coding->mode & CODING_MODE_LAST_BLOCK)
- {
- src_bytes = src_end - src;
- if (dst_bytes && (dst_end - dst < src_bytes))
- src_bytes = dst_end - dst;
- bcopy (dst, src, src_bytes);
- src += src_bytes;
- dst += src_bytes;
- coding->fake_multibyte = 1;
- }
+ c = *src++;
+ EMIT_CHAR (c);
}
- coding->consumed = coding->consumed_char = src - source;
+ label_end_of_loop:
+ coding->consumed = coding->consumed_char = src_base - source;
coding->produced = dst - destination;
- return result;
+ return;
}
/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
- This function can encode `charset_ascii', `charset_katakana_jisx0201',
- `charset_jisx0208', `charset_big5_1', and `charset_big5-2'. We are
- sure that all these charsets are registered as official charset
+ This function can encode charsets `ascii', `katakana-jisx0201',
+ `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
+ are sure that all these charsets are registered as official charset
(i.e. do not have extended leading-codes). Characters of other
charsets are produced without any encoding. If SJIS_P is 1, encode
SJIS text, else encode BIG5 text. */
-int
+static void
encode_coding_sjis_big5 (coding, source, destination,
src_bytes, dst_bytes, sjis_p)
struct coding_system *coding;
unsigned char *src_end = source + src_bytes;
unsigned char *dst = destination;
unsigned char *dst_end = destination + dst_bytes;
- /* Since the maximum bytes produced by each loop is 2, we subtract 1
- from DST_END to assure overflow checking is necessary only at the
- head of loop. */
- unsigned char *adjusted_dst_end = dst_end - 1;
- Lisp_Object translation_table
- = coding->translation_table_for_encode;
- int result = CODING_FINISH_NORMAL;
+ /* SRC_BASE remembers the start position in source in each loop.
+ The loop will be exited when there's not enough source text to
+ analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
+ there's not enough destination area to produce encoded codes
+ (within macro EMIT_BYTES). */
+ unsigned char *src_base;
+ Lisp_Object translation_table;
- if (!NILP (Venable_character_translation) && NILP (translation_table))
- translation_table = Vstandard_translation_table_for_encode;
+ /* This is the encoder, so select the translation tables for
+ encoding: the one given by CODING, or the standard one when
+ CODING has none. No table is used at all when character
+ translation is disabled. */
+ if (NILP (Venable_character_translation))
+ translation_table = Qnil;
+ else
+ {
+ translation_table = coding->translation_table_for_encode;
+ if (NILP (translation_table))
+ translation_table = Vstandard_translation_table_for_encode;
+ }
- coding->consumed_char = 0;
- coding->fake_multibyte = 0;
- while (src < src_end && (dst_bytes
- ? (dst < adjusted_dst_end)
- : (dst < src - 1)))
+ while (1)
{
- /* SRC_BASE remembers the start position in source in each loop.
- The loop will be exited when there's not enough source text
- to analyze multi-byte codes (within macros ONE_MORE_BYTE and
- TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
- before exiting. */
- unsigned char *src_base = src;
- unsigned char c1 = *src++, c2, c3, c4;
-
- if (coding->composing)
+ int c, charset, c1, c2;
+
+ src_base = src;
+ ONE_MORE_CHAR (c);
+
+ /* Now encode the character C. */
+ if (SINGLE_BYTE_CHAR_P (c))
{
- if (c1 == 0xA0)
+ switch (c)
{
- ONE_MORE_BYTE (c1);
- c1 &= 0x7F;
+ case '\r':
+ /* The flag test must be parenthesized: `!' binds tighter
+ than `&', so `!coding->mode & FLAG' does not test FLAG. */
+ if (!(coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
+ {
+ EMIT_ONE_BYTE (c);
+ break;
+ }
+ /* Under selective display, '\r' also means end-of-line. */
+ c = '\n';
+ /* Fall through. */
+ case '\n':
+ if (coding->eol_type == CODING_EOL_CRLF)
+ {
+ EMIT_TWO_BYTES ('\r', c);
+ break;
+ }
+ else if (coding->eol_type == CODING_EOL_CR)
+ c = '\r';
+ /* Fall through. */
+ default:
+ EMIT_ONE_BYTE (c);
}
- else if (c1 >= 0xA0)
- c1 -= 0x20;
- else
- coding->composing = 0;
}
-
- switch (emacs_code_class[c1])
+ else
{
- case EMACS_ascii_code:
- ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
- break;
-
- case EMACS_control_code:
- *dst++ = c1;
- coding->consumed_char++;
- break;
-
- case EMACS_carriage_return_code:
- if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
+ SPLIT_CHAR (c, charset, c1, c2);
+ if (sjis_p)
{
- *dst++ = c1;
- coding->consumed_char++;
- break;
+ if (charset == charset_jisx0208
+ || charset == charset_jisx0208_1978)
+ {
+ ENCODE_SJIS (c1, c2, c1, c2);
+ EMIT_TWO_BYTES (c1, c2);
+ }
+ else if (charset == charset_katakana_jisx0201)
+ /* The decoder maps single bytes 0xA0..0xDF to this
+ charset, so put the high bit back. This assumes
+ SPLIT_CHAR yields 7-bit position codes, as the
+ unmasked ENCODE_SJIS call above also does. */
+ EMIT_ONE_BYTE (c1 | 0x80);
+ else if (charset == charset_latin_jisx0201)
+ EMIT_ONE_BYTE (c1);
+ else
+ /* There's no way other than producing the internal
+ codes as is. */
+ EMIT_BYTES (src_base, src);
}
- /* fall down to treat '\r' as '\n' ... */
-
- case EMACS_linefeed_code:
- if (coding->eol_type == CODING_EOL_LF
- || coding->eol_type == CODING_EOL_UNDECIDED)
- *dst++ = '\n';
- else if (coding->eol_type == CODING_EOL_CRLF)
- *dst++ = '\r', *dst++ = '\n';
else
- *dst++ = '\r';
- coding->consumed_char++;
- break;
-
- case EMACS_leading_code_2:
- ONE_MORE_BYTE (c2);
- ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
- break;
-
- case EMACS_leading_code_3:
- TWO_MORE_BYTES (c2, c3);
- ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
- break;
-
- case EMACS_leading_code_4:
- THREE_MORE_BYTES (c2, c3, c4);
- ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
- break;
-
- case EMACS_leading_code_composition:
- coding->composing = 1;
- break;
-
- default: /* i.e. case EMACS_invalid_code: */
- *dst++ = c1;
- coding->consumed_char++;
+ {
+ if (charset == charset_big5_1 || charset == charset_big5_2)
+ {
+ ENCODE_BIG5 (charset, c1, c2, c1, c2);
+ EMIT_TWO_BYTES (c1, c2);
+ }
+ else
+ /* There's no way other than producing the internal
+ codes as is. */
+ EMIT_BYTES (src_base, src);
+ }
}
- continue;
-
- label_end_of_loop:
- result = CODING_FINISH_INSUFFICIENT_SRC;
- src = src_base;
- break;
+ coding->consumed_char++;
}
- if (result == CODING_FINISH_NORMAL
- && src < src_end)
- result = CODING_FINISH_INSUFFICIENT_DST;
- coding->consumed = src - source;
+ label_end_of_loop:
+ coding->consumed = src_base - source;
coding->produced = coding->produced_char = dst - destination;
- return result;
}
\f
unsigned char *src, *src_end;
{
unsigned char *valid;
+ int c;
+ /* Dummy for ONE_MORE_BYTE. */
+ struct coding_system dummy_coding;
+ struct coding_system *coding = &dummy_coding;
/* No coding system is assigned to coding-category-ccl. */
if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
return 0;
valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
- while (src < src_end)
+ while (1)
{
- if (! valid[*src]) return 0;
- src++;
+ ONE_MORE_BYTE (c);
+ if (! valid[c])
+ return 0;
}
+ label_end_of_loop:
return CODING_CATEGORY_MASK_CCL;
}
\f
/*** 6. End-of-line handlers ***/
-/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
- This function is called only when `coding->eol_type' is
- CODING_EOL_CRLF or CODING_EOL_CR. */
+/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
-int
+static void
decode_eol (coding, source, destination, src_bytes, dst_bytes)
struct coding_system *coding;
unsigned char *source, *destination;
int src_bytes, dst_bytes;
{
unsigned char *src = source;
- unsigned char *src_end = source + src_bytes;
unsigned char *dst = destination;
- unsigned char *dst_end = destination + dst_bytes;
- unsigned char c;
- int result = CODING_FINISH_NORMAL;
-
- coding->fake_multibyte = 0;
-
- if (src_bytes <= 0)
- return result;
-
+ unsigned char *src_end = src + src_bytes;
+ unsigned char *dst_end = dst + dst_bytes;
+ Lisp_Object translation_table;
+ /* SRC_BASE remembers the start position in source in each loop.
+ The loop will be exited when there's not enough source code
+ (within macro ONE_MORE_BYTE), or when there's not enough
+ destination area to produce a character (within macro
+ EMIT_CHAR). */
+ unsigned char *src_base;
+ int c;
+
+ translation_table = Qnil;
switch (coding->eol_type)
{
case CODING_EOL_CRLF:
- {
- /* Since the maximum bytes produced by each loop is 2, we
- subtract 1 from DST_END to assure overflow checking is
- necessary only at the head of loop. */
- unsigned char *adjusted_dst_end = dst_end - 1;
-
- while (src < src_end && (dst_bytes
- ? (dst < adjusted_dst_end)
- : (dst < src - 1)))
- {
- unsigned char *src_base = src;
-
- c = *src++;
- if (c == '\r')
- {
- ONE_MORE_BYTE (c);
- if (c == '\n')
- *dst++ = c;
- else
- {
- if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
- {
- result = CODING_FINISH_INCONSISTENT_EOL;
- goto label_end_of_loop_2;
- }
- src--;
- *dst++ = '\r';
- if (BASE_LEADING_CODE_P (c))
- coding->fake_multibyte = 1;
- }
- }
- else if (c == '\n'
- && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
- {
- result = CODING_FINISH_INCONSISTENT_EOL;
- goto label_end_of_loop_2;
- }
- else
- {
- *dst++ = c;
- if (BASE_LEADING_CODE_P (c))
- coding->fake_multibyte = 1;
- }
- continue;
-
- label_end_of_loop:
- result = CODING_FINISH_INSUFFICIENT_SRC;
- label_end_of_loop_2:
- src = src_base;
- break;
- }
- if (src < src_end)
- {
- if (result == CODING_FINISH_NORMAL)
- result = CODING_FINISH_INSUFFICIENT_DST;
- else if (result != CODING_FINISH_INCONSISTENT_EOL
- && coding->mode & CODING_MODE_LAST_BLOCK)
- {
- /* This is the last block of the text to be decoded.
- We flush out all remaining codes. */
- src_bytes = src_end - src;
- if (dst_bytes && (dst_end - dst < src_bytes))
- src_bytes = dst_end - dst;
- bcopy (src, dst, src_bytes);
- dst += src_bytes;
- src += src_bytes;
- }
- }
- }
- break;
-
- case CODING_EOL_CR:
- if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
+ while (1)
{
- while (src < src_end)
+ src_base = src;
+ ONE_MORE_BYTE (c);
+ if (c == '\r')
{
- if ((c = *src++) == '\n')
- break;
- if (BASE_LEADING_CODE_P (c))
- coding->fake_multibyte = 1;
+ ONE_MORE_BYTE (c);
+ if (c != '\n')
+ {
+ if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
+ {
+ coding->result = CODING_FINISH_INCONSISTENT_EOL;
+ goto label_end_of_loop;
+ }
+ src--;
+ c = '\r';
+ }
}
- if (*--src == '\n')
+ else if (c == '\n'
+ && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
{
- src_bytes = src - source;
- result = CODING_FINISH_INCONSISTENT_EOL;
+ coding->result = CODING_FINISH_INCONSISTENT_EOL;
+ goto label_end_of_loop;
}
+ EMIT_CHAR (c);
}
- if (dst_bytes && src_bytes > dst_bytes)
+ break;
+
+ case CODING_EOL_CR:
+ while (1)
{
- result = CODING_FINISH_INSUFFICIENT_DST;
- src_bytes = dst_bytes;
+ src_base = src;
+ ONE_MORE_BYTE (c);
+ if (c == '\n')
+ {
+ if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
+ {
+ coding->result = CODING_FINISH_INCONSISTENT_EOL;
+ goto label_end_of_loop;
+ }
+ }
+ else if (c == '\r')
+ c = '\n';
+ EMIT_CHAR (c);
}
- if (dst_bytes)
- bcopy (source, destination, src_bytes);
- else
- safe_bcopy (source, destination, src_bytes);
- src = source + src_bytes;
- while (src_bytes--) if (*dst++ == '\r') dst[-1] = '\n';
break;
- default: /* i.e. case: CODING_EOL_LF */
- if (dst_bytes && src_bytes > dst_bytes)
+ default: /* no need for EOL handling */
+ while (1)
{
- result = CODING_FINISH_INSUFFICIENT_DST;
- src_bytes = dst_bytes;
+ src_base = src;
+ ONE_MORE_BYTE (c);
+ EMIT_CHAR (c);
}
- if (dst_bytes)
- bcopy (source, destination, src_bytes);
- else
- safe_bcopy (source, destination, src_bytes);
- src += src_bytes;
- dst += src_bytes;
- coding->fake_multibyte = 1;
- break;
}
- coding->consumed = coding->consumed_char = src - source;
- coding->produced = coding->produced_char = dst - destination;
- return result;
+ label_end_of_loop:
+ coding->consumed = coding->consumed_char = src_base - source;
+ coding->produced = dst - destination;
+ return;
}
/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
- format of end-of-line according to `coding->eol_type'. If
- `coding->mode & CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code
- '\r' in source text also means end-of-line. */
+ format of end-of-line according to `coding->eol_type'. It also
+ converts the multibyte form of 8-bit characters to unibyte if
+ CODING->src_multibyte is nonzero. If `coding->mode &
+ CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
+ also means end-of-line. */
-int
+static void
encode_eol (coding, source, destination, src_bytes, dst_bytes)
struct coding_system *coding;
unsigned char *source, *destination;
{
unsigned char *src = source;
unsigned char *dst = destination;
- int result = CODING_FINISH_NORMAL;
-
- coding->fake_multibyte = 0;
+ unsigned char *src_end = src + src_bytes;
+ unsigned char *dst_end = dst + dst_bytes;
+ Lisp_Object translation_table;
+ /* SRC_BASE remembers the start position in source in each loop.
+ The loop will be exited when there's not enough source text to
+ analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
+ there's not enough destination area to produce encoded codes
+ (within macro EMIT_BYTES). */
+ unsigned char *src_base;
+ int c;
+ int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
+
+ translation_table = Qnil;
+ /* If the multibyte source ends with a lone leading code of an
+ 8-bit control character, exclude that byte from this pass and
+ report insufficient source. SRC_BYTES is checked first so that
+ an empty source never reads the byte before SOURCE. */
+ if (coding->src_multibyte
+ && src_bytes > 0
+ && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
+ {
+ src_end--;
+ src_bytes--;
+ coding->result = CODING_FINISH_INSUFFICIENT_SRC;
+ }
if (coding->eol_type == CODING_EOL_CRLF)
{
- unsigned char c;
- unsigned char *src_end = source + src_bytes;
- unsigned char *dst_end = destination + dst_bytes;
- /* Since the maximum bytes produced by each loop is 2, we
- subtract 1 from DST_END to assure overflow checking is
- necessary only at the head of loop. */
- unsigned char *adjusted_dst_end = dst_end - 1;
-
- while (src < src_end && (dst_bytes
- ? (dst < adjusted_dst_end)
- : (dst < src - 1)))
+ while (src < src_end)
{
+ src_base = src;
c = *src++;
- if (c == '\n'
- || (c == '\r' && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)))
- *dst++ = '\r', *dst++ = '\n';
+ if (c >= 0x20)
+ EMIT_ONE_BYTE (c);
+ else if (c == '\n' || (c == '\r' && selective_display))
+ EMIT_TWO_BYTES ('\r', '\n');
else
- {
- *dst++ = c;
- if (BASE_LEADING_CODE_P (c))
- coding->fake_multibyte = 1;
- }
+ EMIT_ONE_BYTE (c);
}
- if (src < src_end)
- result = CODING_FINISH_INSUFFICIENT_DST;
+ src_base = src;
+ label_end_of_loop:
+ ;
}
else
{
- unsigned char c;
-
- if (dst_bytes && src_bytes > dst_bytes)
+ if (src_bytes <= dst_bytes)
{
- src_bytes = dst_bytes;
- result = CODING_FINISH_INSUFFICIENT_DST;
+ safe_bcopy (src, dst, src_bytes);
+ src_base = src_end;
+ dst += src_bytes;
}
- if (dst_bytes)
- bcopy (source, destination, src_bytes);
else
- safe_bcopy (source, destination, src_bytes);
- dst_bytes = src_bytes;
+ {
+ if (coding->src_multibyte
+ && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
+ dst_bytes--;
+ safe_bcopy (src, dst, dst_bytes);
+ src_base = src + dst_bytes;
+ dst = destination + dst_bytes;
+ coding->result = CODING_FINISH_INSUFFICIENT_DST;
+ }
if (coding->eol_type == CODING_EOL_CR)
{
- while (src_bytes--)
- {
- if ((c = *dst++) == '\n')
- dst[-1] = '\r';
- else if (BASE_LEADING_CODE_P (c))
- coding->fake_multibyte = 1;
- }
+ for (src = destination; src < dst; src++)
+ if (*src == '\n') *src = '\r';
}
- else
+ else if (selective_display)
{
- if (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
- {
- while (src_bytes--)
- if (*dst++ == '\r') dst[-1] = '\n';
- }
- coding->fake_multibyte = 1;
+ for (src = destination; src < dst; src++)
+ if (*src == '\r') *src = '\n';
}
- src = source + dst_bytes;
- dst = destination + dst_bytes;
}
+ if (coding->src_multibyte)
+ dst = destination + str_as_unibyte (destination, dst - destination);
- coding->consumed = coding->consumed_char = src - source;
- coding->produced = coding->produced_char = dst - destination;
- return result;
+ coding->consumed = src_base - source;
+ coding->produced = dst - destination;
}
\f
coding->mode = 0;
coding->heading_ascii = -1;
coding->post_read_conversion = coding->pre_write_conversion = Qnil;
+ coding->composing = COMPOSITION_DISABLED;
+ coding->cmp_data = NULL;
if (NILP (coding_system))
goto label_invalid_coding_system;
return 0;
}
- /* Initialize remaining fields. */
- coding->composing = 0;
- coding->composed_chars = 0;
-
/* Get values of coding system properties:
`post-read-conversion', `pre-write-conversion',
`translation-table-for-decode', `translation-table-for-encode'. */
plist = XVECTOR (coding_spec)->contents[3];
- coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
- coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
+ /* Pre & post conversion functions should be disabled if
+ inhibit_pre_post_conversion is nonzero. This is the case when a
+ code conversion function is called while those functions are
+ running. */
+ if (! inhibit_pre_post_conversion)
+ {
+ coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
+ coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
+ }
val = Fplist_get (plist, Qtranslation_table_for_decode);
if (SYMBOLP (val))
val = Fget (val, Qtranslation_table_for_decode);
else
goto label_invalid_coding_system;
- val = Fplist_get (plist, Qsafe_charsets);
- if (EQ (val, Qt))
- {
- for (i = 0; i <= MAX_CHARSET; i++)
- coding->safe_charsets[i] = 1;
- }
- else
- {
- bzero (coding->safe_charsets, MAX_CHARSET + 1);
- while (CONSP (val))
- {
- if ((i = get_charset_id (XCONS (val)->car)) >= 0)
- coding->safe_charsets[i] = 1;
- val = XCONS (val)->cdr;
- }
- }
+ /* If the coding system has non-nil `composition' property, enable
+ composition handling. */
+ val = Fplist_get (plist, Qcomposition);
+ if (!NILP (val))
+ coding->composing = COMPOSITION_NO;
switch (XFASTINT (coding_type))
{
val = Vcharset_revision_alist;
while (CONSP (val))
{
- charset = get_charset_id (Fcar_safe (XCONS (val)->car));
+ charset = get_charset_id (Fcar_safe (XCAR (val)));
if (charset >= 0
- && (temp = Fcdr_safe (XCONS (val)->car), INTEGERP (temp))
+ && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
&& (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
- val = XCONS (val)->cdr;
+ val = XCDR (val);
}
/* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
tail = flags[i];
coding->flags |= CODING_FLAG_ISO_DESIGNATION;
- if (INTEGERP (XCONS (tail)->car)
- && (charset = XINT (XCONS (tail)->car),
+ if (INTEGERP (XCAR (tail))
+ && (charset = XINT (XCAR (tail)),
CHARSET_VALID_P (charset))
- || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
+ || (charset = get_charset_id (XCAR (tail))) >= 0)
{
CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
}
else
CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
- tail = XCONS (tail)->cdr;
+ tail = XCDR (tail);
while (CONSP (tail))
{
- if (INTEGERP (XCONS (tail)->car)
- && (charset = XINT (XCONS (tail)->car),
+ if (INTEGERP (XCAR (tail))
+ && (charset = XINT (XCAR (tail)),
CHARSET_VALID_P (charset))
- || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
+ || (charset = get_charset_id (XCAR (tail))) >= 0)
CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
= i;
- else if (EQ (XCONS (tail)->car, Qt))
+ else if (EQ (XCAR (tail), Qt))
reg_bits |= 1 << i;
- tail = XCONS (tail)->cdr;
+ tail = XCDR (tail);
}
}
else
if (reg_bits)
for (charset = 0; charset <= MAX_CHARSET; charset++)
{
- if (CHARSET_VALID_P (charset))
+ if (CHARSET_VALID_P (charset)
+ && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
+ == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
{
/* There exist some default graphic registers to be
- used CHARSET. */
+ used by CHARSET. */
/* We had better avoid designating a charset of
CHARS96 to REG 0 as far as possible. */
coding->common_flags
|= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
{
- Lisp_Object val;
- Lisp_Object decoder, encoder;
-
val = XVECTOR (coding_spec)->contents[4];
- if (CONSP (val)
- && SYMBOLP (XCONS (val)->car)
- && !NILP (decoder = Fget (XCONS (val)->car, Qccl_program_idx))
- && !NILP (decoder = Fcdr (Faref (Vccl_program_table, decoder)))
- && SYMBOLP (XCONS (val)->cdr)
- && !NILP (encoder = Fget (XCONS (val)->cdr, Qccl_program_idx))
- && !NILP (encoder = Fcdr (Faref (Vccl_program_table, encoder))))
- {
- setup_ccl_program (&(coding->spec.ccl.decoder), decoder);
- setup_ccl_program (&(coding->spec.ccl.encoder), encoder);
- }
- else
+ if (! CONSP (val)
+ || setup_ccl_program (&(coding->spec.ccl.decoder),
+ XCAR (val)) < 0
+ || setup_ccl_program (&(coding->spec.ccl.encoder),
+ XCDR (val)) < 0)
goto label_invalid_coding_system;
bzero (coding->spec.ccl.valid_codes, 256);
{
Lisp_Object this;
- for (; CONSP (val); val = XCONS (val)->cdr)
+ for (; CONSP (val); val = XCDR (val))
{
- this = XCONS (val)->car;
+ this = XCAR (val);
if (INTEGERP (this)
&& XINT (this) >= 0 && XINT (this) < 256)
coding->spec.ccl.valid_codes[XINT (this)] = 1;
else if (CONSP (this)
- && INTEGERP (XCONS (this)->car)
- && INTEGERP (XCONS (this)->cdr))
+ && INTEGERP (XCAR (this))
+ && INTEGERP (XCDR (this)))
{
- int start = XINT (XCONS (this)->car);
- int end = XINT (XCONS (this)->cdr);
+ int start = XINT (XCAR (this));
+ int end = XINT (XCDR (this));
if (start >= 0 && start <= end && end < 256)
while (start <= end)
}
}
coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
+ coding->spec.ccl.cr_carryover = 0;
break;
case 5:
return -1;
}
-/* Setup raw-text or one of its subsidiaries in the structure
- coding_system CODING according to the already setup value eol_type
- in CODING. CODING should be setup for some coding system in
- advance. */
+/* Free memory blocks allocated for storing composition information. */
void
-setup_raw_text_coding_system (coding)
+coding_free_composition_data (coding)
struct coding_system *coding;
{
- if (coding->type != coding_type_raw_text)
+ struct composition_data *cmp_data = coding->cmp_data, *next;
+
+ if (!cmp_data)
+ return;
+ /* Memory blocks are chained. At first, rewind to the first, then,
+ free blocks one by one. */
+ while (cmp_data->prev)
+ cmp_data = cmp_data->prev;
+ while (cmp_data)
+ {
+ next = cmp_data->next;
+ xfree (cmp_data);
+ cmp_data = next;
+ }
+ coding->cmp_data = NULL;
+}
+
+/* Set `char_offset' member of all memory blocks pointed by
+ coding->cmp_data to POS. */
+
+void
+coding_adjust_composition_offset (coding, pos)
+ struct coding_system *coding;
+ int pos;
+{
+ struct composition_data *block = coding->cmp_data;
+
+ /* Walk the chain forward from the block CODING currently points
+ to, storing POS as the character offset of each block. */
+ while (block)
+ {
+ block->char_offset = pos;
+ block = block->next;
+ }
+}
+
+/* Setup raw-text or one of its subsidiaries in the structure
+ coding_system CODING according to the already setup value eol_type
+ in CODING. CODING should be setup for some coding system in
+ advance. */
+
+void
+setup_raw_text_coding_system (coding)
+ struct coding_system *coding;
+{
+ if (coding->type != coding_type_raw_text)
{
coding->symbol = Qraw_text;
coding->type = coding_type_raw_text;
coding->symbol
= XVECTOR (subsidiaries)->contents[coding->eol_type];
}
+ setup_coding_system (coding->symbol, coding);
}
return;
}
as BIG5. Assigned the coding-system (Lisp symbol)
`cn-big5' by default.
+ o coding-category-utf-8
+
+ The category for a coding system which has the same code range
+ as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
+ symbol) `utf-8' by default.
+
+ o coding-category-utf-16-be
+
+ The category for a coding system in which a text has a
+ Unicode signature (cf. Unicode Standard) in the order of BIG
+ endian at the head. Assigned the coding-system (Lisp symbol)
+ `utf-16-be' by default.
+
+ o coding-category-utf-16-le
+
+ The category for a coding system in which a text has a
+ Unicode signature (cf. Unicode Standard) in the order of
+ LITTLE endian at the head. Assigned the coding-system (Lisp
+ symbol) `utf-16-le' by default.
+
o coding-category-ccl
The category for a coding system of which encoder/decoder is
/* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
If it detects possible coding systems, return an integer in which
appropriate flag bits are set. Flag bits are defined by macros
- CODING_CATEGORY_MASK_XXX in `coding.h'.
+ CODING_CATEGORY_MASK_XXX in `coding.h'. If PRIORITIES is non-NULL,
+ it should point to the table `coding_priorities'. In that case, only
+ the flag bit for a coding system of the highest priority is set in
+ the returned value.
How many ASCII characters are at the head is returned as *SKIP. */
{
register unsigned char c;
unsigned char *src = source, *src_end = source + src_bytes;
- unsigned int mask;
- int i;
+ unsigned int mask, utf16_examined_p, iso2022_examined_p;
+ int i, idx;
/* At first, skip all ASCII characters and control characters except
for three ISO2022 specific control characters. */
goto label_loop_detect_coding;
}
if (priorities)
- goto label_return_highest_only;
+ {
+ for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
+ {
+ if (mask & priorities[i])
+ return priorities[i];
+ }
+ return CODING_CATEGORY_MASK_RAW_TEXT;
+ }
}
else
{
if (c < 0xA0)
{
/* C is the first byte of SJIS character code,
- or a leading-code of Emacs' internal format (emacs-mule). */
- try = CODING_CATEGORY_MASK_SJIS | CODING_CATEGORY_MASK_EMACS_MULE;
+ or a leading-code of Emacs' internal format (emacs-mule),
+ or the first byte of UTF-16. */
+ try = (CODING_CATEGORY_MASK_SJIS
+ | CODING_CATEGORY_MASK_EMACS_MULE
+ | CODING_CATEGORY_MASK_UTF_16_BE
+ | CODING_CATEGORY_MASK_UTF_16_LE);
/* Or, if C is a special latin extra code,
or is an ISO2022 specific control code of C1 (SS2 or SS3),
else
/* C is a character of ISO2022 in graphic plane right,
or a SJIS's 1-byte character code (i.e. JISX0201),
- or the first byte of BIG5's 2-byte code. */
+ or the first byte of BIG5's 2-byte code,
+ or the first byte of UTF-8/16. */
try = (CODING_CATEGORY_MASK_ISO_8_ELSE
| CODING_CATEGORY_MASK_ISO_8BIT
| CODING_CATEGORY_MASK_SJIS
- | CODING_CATEGORY_MASK_BIG5);
+ | CODING_CATEGORY_MASK_BIG5
+ | CODING_CATEGORY_MASK_UTF_8
+ | CODING_CATEGORY_MASK_UTF_16_BE
+ | CODING_CATEGORY_MASK_UTF_16_LE);
/* Or, we may have to consider the possibility of CCL. */
if (coding_system_table[CODING_CATEGORY_IDX_CCL]
try |= CODING_CATEGORY_MASK_CCL;
mask = 0;
+ utf16_examined_p = iso2022_examined_p = 0;
if (priorities)
{
for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
{
- if (priorities[i] & try & CODING_CATEGORY_MASK_ISO)
- mask = detect_coding_iso2022 (src, src_end);
+ if (!iso2022_examined_p
+ && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
+ {
+ mask |= detect_coding_iso2022 (src, src_end);
+ iso2022_examined_p = 1;
+ }
else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
- mask = detect_coding_sjis (src, src_end);
+ mask |= detect_coding_sjis (src, src_end);
+ else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
+ mask |= detect_coding_utf_8 (src, src_end);
+ else if (!utf16_examined_p
+ && (priorities[i] & try &
+ CODING_CATEGORY_MASK_UTF_16_BE_LE))
+ {
+ mask |= detect_coding_utf_16 (src, src_end);
+ utf16_examined_p = 1;
+ }
else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
- mask = detect_coding_big5 (src, src_end);
+ mask |= detect_coding_big5 (src, src_end);
else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
- mask = detect_coding_emacs_mule (src, src_end);
+ mask |= detect_coding_emacs_mule (src, src_end);
else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
- mask = detect_coding_ccl (src, src_end);
+ mask |= detect_coding_ccl (src, src_end);
else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
- mask = CODING_CATEGORY_MASK_RAW_TEXT;
+ mask |= CODING_CATEGORY_MASK_RAW_TEXT;
else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
- mask = CODING_CATEGORY_MASK_BINARY;
- if (mask)
- goto label_return_highest_only;
+ mask |= CODING_CATEGORY_MASK_BINARY;
+ if (mask & priorities[i])
+ return priorities[i];
}
return CODING_CATEGORY_MASK_RAW_TEXT;
}
mask |= detect_coding_sjis (src, src_end);
if (try & CODING_CATEGORY_MASK_BIG5)
mask |= detect_coding_big5 (src, src_end);
+ if (try & CODING_CATEGORY_MASK_UTF_8)
+ mask |= detect_coding_utf_8 (src, src_end);
+ if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
+ mask |= detect_coding_utf_16 (src, src_end);
if (try & CODING_CATEGORY_MASK_EMACS_MULE)
mask |= detect_coding_emacs_mule (src, src_end);
if (try & CODING_CATEGORY_MASK_CCL)
mask |= detect_coding_ccl (src, src_end);
}
return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
-
- label_return_highest_only:
- for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
- {
- if (mask & priorities[i])
- return priorities[i];
- }
- return CODING_CATEGORY_MASK_RAW_TEXT;
}
/* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
if (VECTORP (tmp))
val = XVECTOR (tmp)->contents[coding->eol_type];
}
- setup_coding_system (val, coding);
- /* Set this again because setup_coding_system reset this member. */
- coding->heading_ascii = skip;
+
+ /* Setup this new coding system while preserving some slots. */
+ {
+ int src_multibyte = coding->src_multibyte;
+ int dst_multibyte = coding->dst_multibyte;
+
+ setup_coding_system (val, coding);
+ coding->src_multibyte = src_multibyte;
+ coding->dst_multibyte = dst_multibyte;
+ coding->heading_ascii = skip;
+ }
}
/* Detect how end-of-line of a text of length SRC_BYTES pointed by
return eol_type;
}
+/* Like detect_eol_type, but detect EOL type in 2-octet
+ big-endian/little-endian format for coding systems utf-16-be and
+ utf-16-le.
+
+ The SRC_BYTES bytes at SOURCE are read two bytes at a time; a
+ trailing odd byte is never examined. If BIG_ENDIAN_P is nonzero
+ the first byte of each pair is the high octet, otherwise the
+ second one is. BIG_ENDIAN_P has no declaration in the parameter
+ list below, so it is an implicit int.
+
+ At most MAX_EOL_CHECK_COUNT end-of-lines are examined. The return
+ value is CODING_EOL_LF, CODING_EOL_CRLF or CODING_EOL_CR when all
+ examined end-of-lines agree, CODING_EOL_INCONSISTENT as soon as two
+ of them disagree, and CODING_EOL_UNDECIDED when none is found.
+
+ *SKIP is set to the byte offset of the first end-of-line unit, or
+ to SRC_BYTES when it is still zero at the end of the scan.
+ NOTE(review): zero doubles as the "not set yet" marker for *SKIP,
+ so an end-of-line at offset 0 is not recorded and a later offset
+ (or SRC_BYTES) is stored instead -- confirm this is intended. */
+
+static int
+detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
+ unsigned char *source;
+ int src_bytes, *skip;
+{
+ unsigned char *src = source, *src_end = src + src_bytes;
+ unsigned int c1, c2;
+ int total = 0; /* How many end-of-lines are found so far. */
+ int eol_type = CODING_EOL_UNDECIDED;
+ int this_eol_type;
+ int msb, lsb; /* Index, within a pair, of the high and low octet. */
+
+ if (big_endian_p)
+ msb = 0, lsb = 1;
+ else
+ msb = 1, lsb = 0;
+
+ *skip = 0;
+
+ while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
+ {
+ c1 = (src[msb] << 8) | (src[lsb]); /* Assemble one 16-bit unit. */
+ src += 2;
+
+ if (c1 == '\n' || c1 == '\r')
+ {
+ if (*skip == 0)
+ *skip = src - 2 - source; /* SRC was already advanced past C1. */
+ total++;
+ if (c1 == '\n')
+ {
+ this_eol_type = CODING_EOL_LF;
+ }
+ else
+ {
+ if ((src + 1) >= src_end) /* No complete unit follows the CR. */
+ {
+ this_eol_type = CODING_EOL_CR;
+ }
+ else
+ {
+ c2 = (src[msb] << 8) | (src[lsb]);
+ if (c2 == '\n')
+ this_eol_type = CODING_EOL_CRLF, src += 2; /* Consume the LF too. */
+ else
+ this_eol_type = CODING_EOL_CR;
+ }
+ }
+
+ if (eol_type == CODING_EOL_UNDECIDED)
+ /* This is the first end-of-line. */
+ eol_type = this_eol_type;
+ else if (eol_type != this_eol_type)
+ {
+ /* The found type is different from what was found before. */
+ eol_type = CODING_EOL_INCONSISTENT;
+ break;
+ }
+ }
+ }
+
+ if (*skip == 0)
+ *skip = src_end - source;
+ return eol_type;
+}
+
/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
is encoded. If it detects an appropriate format of end-of-line, it
sets the information in *CODING. */
{
Lisp_Object val;
int skip;
- int eol_type = detect_eol_type (src, src_bytes, &skip);
+ int eol_type;
+
+ switch (coding->category_idx)
+ {
+ case CODING_CATEGORY_IDX_UTF_16_BE:
+ eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
+ break;
+ case CODING_CATEGORY_IDX_UTF_16_LE:
+ eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
+ break;
+ default:
+ eol_type = detect_eol_type (src, src_bytes, &skip);
+ break;
+ }
if (coding->heading_ascii > skip)
coding->heading_ascii = skip;
val = Fget (coding->symbol, Qeol_type);
if (VECTORP (val) && XVECTOR (val)->size == 3)
{
+ int src_multibyte = coding->src_multibyte;
+ int dst_multibyte = coding->dst_multibyte;
+
setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
+ coding->src_multibyte = src_multibyte;
+ coding->dst_multibyte = dst_multibyte;
coding->heading_ascii = skip;
}
}
#define CONVERSION_BUFFER_EXTRA_ROOM 256
-#define DECODING_BUFFER_MAG(coding) \
- (coding->type == coding_type_iso2022 \
- ? 3 \
- : ((coding->type == coding_type_sjis || coding->type == coding_type_big5) \
- ? 2 \
- : (coding->type == coding_type_raw_text \
- ? 1 \
- : (coding->type == coding_type_ccl \
- ? coding->spec.ccl.decoder.buf_magnification \
- : 2))))
+#define DECODING_BUFFER_MAG(coding) /* Per-type decode buffer factor. */ \
+ (coding->type == coding_type_iso2022 \
+ ? 3 /* ISO2022. */ \
+ : (coding->type == coding_type_ccl \
+ ? coding->spec.ccl.decoder.buf_magnification /* Taken from the CCL decoder spec. */ \
+ : 2)) /* Every other coding type. CODING is not parenthesized in
+ the expansion, so pass a plain pointer expression.
+ NOTE(review): presumably the caller multiplies the source byte
+ count by this value -- confirm at the use site. */
/* Return maximum size (bytes) of a buffer enough for decoding
SRC_BYTES of text encoded in CODING. */
if (coding->type == coding_type_ccl)
magnification = coding->spec.ccl.encoder.buf_magnification;
- else
+ else if (CODING_REQUIRE_ENCODING (coding))
magnification = 3;
+ else
+ magnification = 1;
return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
}
int result;
ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
-
+ if (encodep)
+ ccl->eol_type = coding->eol_type;
coding->produced = ccl_driver (ccl, source, destination,
src_bytes, dst_bytes, &(coding->consumed));
- coding->produced_char
- = (encodep
- ? coding->produced
- : multibyte_chars_in_text (destination, coding->produced));
- coding->consumed_char
- = multibyte_chars_in_text (source, coding->consumed);
+ if (encodep)
+ coding->produced_char = coding->produced;
+ else
+ {
+ int bytes
+ = dst_bytes ? dst_bytes : source + coding->consumed - destination;
+ coding->produced = str_as_multibyte (destination, bytes,
+ coding->produced,
+ &(coding->produced_char));
+ }
switch (ccl->status)
{
return result;
}
+/* Decode EOL format of the text at PTR of BYTES length destructively
+ according to CODING->eol_type. This is called after the CCL
+ program produced a decoded text at PTR. If we do CRLF->LF
+ conversion, update CODING->produced and CODING->produced_char.
+
+ CODING->spec.ccl.cr_carryover is cleared on entry. If
+ CODING->eol_type is CODING_EOL_UNDECIDED, it is detected here with
+ detect_eol_type (an inconsistent result is treated as LF); when a
+ type is decided, CODING->symbol is replaced by the matching element
+ of the symbol's Qeol_type vector. In either case
+ CODING_MODE_INHIBIT_INCONSISTENT_EOL is then set in CODING->mode.
+
+ While that mode flag is set, meeting an end-of-line that
+ contradicts CODING->eol_type undoes the conversion done so far,
+ switches CODING->eol_type to CODING_EOL_LF and restores the symbol
+ CODING had on entry.
+
+ NOTE(review): in the CRLF case `*(pend - 1)' is read without
+ checking that BYTES is positive; confirm that callers never pass an
+ empty text while CODING_MODE_LAST_BLOCK is unset. */
+
+static void
+decode_eol_post_ccl (coding, ptr, bytes)
+ struct coding_system *coding;
+ unsigned char *ptr;
+ int bytes;
+{
+ Lisp_Object val, saved_coding_symbol;
+ unsigned char *pend = ptr + bytes;
+ int dummy; /* Receives the skip count of detect_eol_type; unused. */
+
+ /* Remember the current coding system symbol. We set it back when
+ an inconsistent EOL is found so that `last-coding-system-used' is
+ set to the coding system that doesn't specify EOL conversion. */
+ saved_coding_symbol = coding->symbol;
+
+ coding->spec.ccl.cr_carryover = 0;
+ if (coding->eol_type == CODING_EOL_UNDECIDED)
+ {
+ /* Here, to avoid the call of setup_coding_system, we directly
+ call detect_eol_type. */
+ coding->eol_type = detect_eol_type (ptr, bytes, &dummy);
+ if (coding->eol_type == CODING_EOL_INCONSISTENT)
+ coding->eol_type = CODING_EOL_LF;
+ if (coding->eol_type != CODING_EOL_UNDECIDED)
+ {
+ val = Fget (coding->symbol, Qeol_type);
+ if (VECTORP (val) && XVECTOR (val)->size == 3)
+ coding->symbol = XVECTOR (val)->contents[coding->eol_type];
+ }
+ coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
+ }
+
+ if (coding->eol_type == CODING_EOL_LF
+ || coding->eol_type == CODING_EOL_UNDECIDED)
+ {
+ /* We have nothing to do. */
+ ptr = pend;
+ }
+ else if (coding->eol_type == CODING_EOL_CRLF)
+ {
+ unsigned char *pstart = ptr, *p = ptr; /* P is the write position; PTR the read position. */
+
+ if (! (coding->mode & CODING_MODE_LAST_BLOCK)
+ && *(pend - 1) == '\r')
+ {
+ /* If the last character is CR, we can't handle it here
+ because LF will be in the not-yet-decoded source text.
+ Record that the CR is not yet processed. */
+ coding->spec.ccl.cr_carryover = 1;
+ coding->produced--;
+ coding->produced_char--;
+ pend--;
+ }
+ while (ptr < pend)
+ {
+ if (*ptr == '\r')
+ {
+ if (ptr + 1 < pend && *(ptr + 1) == '\n')
+ {
+ *p++ = '\n'; /* Collapse CRLF into a single LF. */
+ ptr += 2;
+ }
+ else
+ {
+ if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
+ goto undo_eol_conversion;
+ *p++ = *ptr++; /* Keep the lone CR as is. */
+ }
+ }
+ else if (*ptr == '\n'
+ && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
+ goto undo_eol_conversion;
+ else
+ *p++ = *ptr++;
+ continue;
+
+ undo_eol_conversion:
+ /* We have met an inconsistent EOL format at PTR.
+ Convert all LFs before PTR back to CRLFs. The copy runs
+ backward from PTR - 1 down to PSTART. */
+ for (p--, ptr--; p >= pstart; p--)
+ {
+ if (*p == '\n')
+ *ptr-- = '\n', *ptr-- = '\r';
+ else
+ *ptr-- = *p;
+ }
+ /* If carryover is recorded, cancel it because we don't
+ convert CRLF anymore. */
+ if (coding->spec.ccl.cr_carryover)
+ {
+ coding->spec.ccl.cr_carryover = 0;
+ coding->produced++;
+ coding->produced_char++;
+ pend++;
+ }
+ p = ptr = pend; /* Ends the loop; makes the size adjustment below a no-op. */
+ coding->eol_type = CODING_EOL_LF;
+ coding->symbol = saved_coding_symbol;
+ }
+ if (p < pend)
+ {
+ /* As each two-byte sequence CRLF was converted to LF, (PEND
+ - P) is the number of deleted characters. */
+ coding->produced -= pend - p;
+ coding->produced_char -= pend - p;
+ }
+ }
+ else /* i.e. coding->eol_type == CODING_EOL_CR */
+ {
+ unsigned char *p = ptr; /* Start of the text, kept for the undo pass. */
+
+ for (; ptr < pend; ptr++)
+ {
+ if (*ptr == '\r')
+ *ptr = '\n';
+ else if (*ptr == '\n'
+ && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
+ {
+ for (; p < ptr; p++) /* Turn the LFs written so far back into CRs. */
+ {
+ if (*p == '\n')
+ *p = '\r';
+ }
+ ptr = pend;
+ coding->eol_type = CODING_EOL_LF;
+ coding->symbol = saved_coding_symbol;
+ }
+ }
+ }
+}
+
/* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
decoding, it may detect coding system and format of end-of-line if
- those are not yet decided.
-
- This function does not make full use of DESTINATION buffer. For
- instance, if coding->type is coding_type_iso2022, it uses only
- (DST_BYTES - 7) bytes of DESTINATION buffer. In the case that
- DST_BYTES is decided by the function decoding_buffer_size, it
- contains extra 256 bytes (defined by CONVERSION_BUFFER_EXTRA_ROOM).
- So, this function can decode the full SOURCE. But, in the other
- case, if you want to avoid carry over, you must supply at least 7
- bytes more area in DESTINATION buffer than expected maximum bytes
- that will be produced by this function. */
+ those are not yet decided. The source should be unibyte, the
+ result is multibyte if CODING->dst_multibyte is nonzero, else
+ unibyte. */
int
decode_coding (coding, source, destination, src_bytes, dst_bytes)
unsigned char *source, *destination;
int src_bytes, dst_bytes;
{
- int result;
-
- if (src_bytes <= 0
- && coding->type != coding_type_ccl
- && ! (coding->mode & CODING_MODE_LAST_BLOCK
- && CODING_REQUIRE_FLUSHING (coding)))
- {
- coding->produced = coding->produced_char = 0;
- coding->consumed = coding->consumed_char = 0;
- coding->fake_multibyte = 0;
- return CODING_FINISH_NORMAL;
- }
-
if (coding->type == coding_type_undecided)
detect_coding (coding, source, src_bytes);
- if (coding->eol_type == CODING_EOL_UNDECIDED)
+ if (coding->eol_type == CODING_EOL_UNDECIDED
+ && coding->type != coding_type_ccl)
detect_eol (coding, source, src_bytes);
+ coding->produced = coding->produced_char = 0;
+ coding->consumed = coding->consumed_char = 0;
+ coding->errors = 0;
+ coding->result = CODING_FINISH_NORMAL;
+
switch (coding->type)
{
- case coding_type_emacs_mule:
- case coding_type_undecided:
- case coding_type_raw_text:
- if (coding->eol_type == CODING_EOL_LF
- || coding->eol_type == CODING_EOL_UNDECIDED)
- goto label_no_conversion;
- result = decode_eol (coding, source, destination, src_bytes, dst_bytes);
- break;
-
case coding_type_sjis:
- result = decode_coding_sjis_big5 (coding, source, destination,
- src_bytes, dst_bytes, 1);
+ decode_coding_sjis_big5 (coding, source, destination,
+ src_bytes, dst_bytes, 1);
break;
case coding_type_iso2022:
- result = decode_coding_iso2022 (coding, source, destination,
- src_bytes, dst_bytes);
+ decode_coding_iso2022 (coding, source, destination,
+ src_bytes, dst_bytes);
break;
case coding_type_big5:
- result = decode_coding_sjis_big5 (coding, source, destination,
- src_bytes, dst_bytes, 0);
+ decode_coding_sjis_big5 (coding, source, destination,
+ src_bytes, dst_bytes, 0);
break;
- case coding_type_ccl:
- result = ccl_coding_driver (coding, source, destination,
- src_bytes, dst_bytes, 0);
+ case coding_type_emacs_mule:
+ decode_coding_emacs_mule (coding, source, destination,
+ src_bytes, dst_bytes);
break;
- default: /* i.e. case coding_type_no_conversion: */
- label_no_conversion:
- if (dst_bytes && src_bytes > dst_bytes)
+ case coding_type_ccl:
+ if (coding->spec.ccl.cr_carryover)
{
- coding->produced = dst_bytes;
- result = CODING_FINISH_INSUFFICIENT_DST;
+ /* Set the CR which is not processed by the previous call of
+ decode_eol_post_ccl in DESTINATION. */
+ *destination = '\r';
+ coding->produced++;
+ coding->produced_char++;
+ dst_bytes--;
}
- else
+ ccl_coding_driver (coding, source,
+ destination + coding->spec.ccl.cr_carryover,
+ src_bytes, dst_bytes, 0);
+ if (coding->eol_type != CODING_EOL_LF)
+ decode_eol_post_ccl (coding, destination, coding->produced);
+ break;
+
+ default:
+ decode_eol (coding, source, destination, src_bytes, dst_bytes);
+ }
+
+ if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
+ && coding->consumed == src_bytes)
+ coding->result = CODING_FINISH_NORMAL;
+
+ if (coding->mode & CODING_MODE_LAST_BLOCK
+ && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
+ {
+ unsigned char *src = source + coding->consumed;
+ unsigned char *dst = destination + coding->produced;
+
+ src_bytes -= coding->consumed;
+ coding->errors++;
+ if (COMPOSING_P (coding))
+ DECODE_COMPOSITION_END ('1');
+ while (src_bytes--)
{
- coding->produced = src_bytes;
- result = CODING_FINISH_NORMAL;
+ int c = *src++;
+ dst += CHAR_STRING (c, dst);
+ coding->produced_char++;
}
- if (dst_bytes)
- bcopy (source, destination, coding->produced);
- else
- safe_bcopy (source, destination, coding->produced);
- coding->fake_multibyte = 1;
- coding->consumed
- = coding->consumed_char = coding->produced_char = coding->produced;
- break;
+ coding->consumed = coding->consumed_char = src - source;
+ coding->produced = dst - destination;
}
- return result;
-}
+ if (!coding->dst_multibyte)
+ {
+ coding->produced = str_as_unibyte (destination, coding->produced);
+ coding->produced_char = coding->produced;
+ }
-/* See "GENERAL NOTES about `encode_coding_XXX ()' functions".
+ return coding->result;
+}
- This function does not make full use of DESTINATION buffer. For
- instance, if coding->type is coding_type_iso2022, it uses only
- (DST_BYTES - 20) bytes of DESTINATION buffer. In the case that
- DST_BYTES is decided by the function encoding_buffer_size, it
- contains extra 256 bytes (defined by CONVERSION_BUFFER_EXTRA_ROOM).
- So, this function can encode the full SOURCE. But, in the other
- case, if you want to avoid carry over, you must supply at least 20
- bytes more area in DESTINATION buffer than expected maximum bytes
- that will be produced by this function. */
+/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". The
+ multibyteness of the source is CODING->src_multibyte, the
+ multibyteness of the result is always unibyte. */
int
encode_coding (coding, source, destination, src_bytes, dst_bytes)
unsigned char *source, *destination;
int src_bytes, dst_bytes;
{
- int result;
-
- if (src_bytes <= 0
- && ! (coding->mode & CODING_MODE_LAST_BLOCK
- && CODING_REQUIRE_FLUSHING (coding)))
- {
- coding->produced = coding->produced_char = 0;
- coding->consumed = coding->consumed_char = 0;
- coding->fake_multibyte = 0;
- return CODING_FINISH_NORMAL;
- }
+ coding->produced = coding->produced_char = 0;
+ coding->consumed = coding->consumed_char = 0;
+ coding->errors = 0;
+ coding->result = CODING_FINISH_NORMAL;
switch (coding->type)
{
- case coding_type_emacs_mule:
- case coding_type_undecided:
- case coding_type_raw_text:
- if (coding->eol_type == CODING_EOL_LF
- || coding->eol_type == CODING_EOL_UNDECIDED)
- goto label_no_conversion;
- result = encode_eol (coding, source, destination, src_bytes, dst_bytes);
- break;
-
case coding_type_sjis:
- result = encode_coding_sjis_big5 (coding, source, destination,
- src_bytes, dst_bytes, 1);
+ encode_coding_sjis_big5 (coding, source, destination,
+ src_bytes, dst_bytes, 1);
break;
case coding_type_iso2022:
- result = encode_coding_iso2022 (coding, source, destination,
- src_bytes, dst_bytes);
+ encode_coding_iso2022 (coding, source, destination,
+ src_bytes, dst_bytes);
break;
case coding_type_big5:
- result = encode_coding_sjis_big5 (coding, source, destination,
- src_bytes, dst_bytes, 0);
+ encode_coding_sjis_big5 (coding, source, destination,
+ src_bytes, dst_bytes, 0);
+ break;
+
+ case coding_type_emacs_mule:
+ encode_coding_emacs_mule (coding, source, destination,
+ src_bytes, dst_bytes);
break;
case coding_type_ccl:
- result = ccl_coding_driver (coding, source, destination,
- src_bytes, dst_bytes, 1);
+ ccl_coding_driver (coding, source, destination,
+ src_bytes, dst_bytes, 1);
break;
- default: /* i.e. case coding_type_no_conversion: */
- label_no_conversion:
- if (dst_bytes && src_bytes > dst_bytes)
- {
- coding->produced = dst_bytes;
- result = CODING_FINISH_INSUFFICIENT_DST;
- }
- else
- {
- coding->produced = src_bytes;
- result = CODING_FINISH_NORMAL;
- }
- if (dst_bytes)
- bcopy (source, destination, coding->produced);
- else
- safe_bcopy (source, destination, coding->produced);
- if (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
+ default:
+ encode_eol (coding, source, destination, src_bytes, dst_bytes);
+ }
+
+ if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
+ && coding->consumed == src_bytes)
+ coding->result = CODING_FINISH_NORMAL;
+
+ if (coding->mode & CODING_MODE_LAST_BLOCK)
+ {
+ unsigned char *src = source + coding->consumed;
+ unsigned char *src_end = src + src_bytes;
+ unsigned char *dst = destination + coding->produced;
+
+ if (coding->type == coding_type_iso2022)
+ ENCODE_RESET_PLANE_AND_REGISTER;
+ if (COMPOSING_P (coding))
+ *dst++ = ISO_CODE_ESC, *dst++ = '1';
+ if (coding->consumed < src_bytes)
{
- unsigned char *p = destination, *pend = p + coding->produced;
- while (p < pend)
- if (*p++ == '\015') p[-1] = '\n';
+ int len = src_bytes - coding->consumed;
+
+ BCOPY_SHORT (source + coding->consumed, dst, len);
+ if (coding->src_multibyte)
+ len = str_as_unibyte (dst, len);
+ dst += len;
+ coding->consumed = src_bytes;
}
- coding->fake_multibyte = 1;
- coding->consumed
- = coding->consumed_char = coding->produced_char = coding->produced;
- break;
+ coding->produced = coding->produced_char = dst - destination;
}
- return result;
+ return coding->result;
}
/* Scan text in the region between *BEG and *END (byte positions),
skip characters which we don't have to decode by coding system
CODING at the head and tail, then set *BEG and *END to the region
of the text we actually have to convert. The caller should move
- the gap out of the region in advance.
+ the gap out of the region in advance if the region is from a
+ buffer.
If STR is not NULL, *BEG and *END are indices into STR. */
if (coding->type == coding_type_ccl
|| coding->type == coding_type_undecided
- || !NILP (coding->post_read_conversion))
+ || coding->eol_type != CODING_EOL_LF
+ || !NILP (coding->post_read_conversion)
+ || coding->composing != COMPOSITION_DISABLED)
{
/* We can't skip any data. */
return;
}
- else if (coding->type == coding_type_no_conversion)
+ if (coding->type == coding_type_no_conversion
+ || coding->type == coding_type_raw_text
+ || coding->type == coding_type_emacs_mule)
{
/* We need no conversion, but don't have to skip any data here.
Decoding routine handles them effectively anyway. */
if (!NILP (CHAR_TABLE_REF (translation_table, i)))
break;
if (i < 128)
- /* Some ASCII character should be tranlsated. We give up
+ /* Some ASCII character should be translated. We give up
shrinking. */
return;
}
- eol_conversion = (coding->eol_type != CODING_EOL_LF);
-
- if ((! eol_conversion) && (coding->heading_ascii >= 0))
+ if (coding->heading_ascii >= 0)
/* Detection routine has already found how much we can skip at the
head. */
*beg += coding->heading_ascii;
endp_orig = endp = begp + *end - *beg;
}
+ eol_conversion = (coding->eol_type == CODING_EOL_CR
+ || coding->eol_type == CODING_EOL_CRLF);
+
switch (coding->type)
{
- case coding_type_emacs_mule:
- case coding_type_raw_text:
- if (eol_conversion)
- {
- if (coding->heading_ascii < 0)
- while (begp < endp && *begp != '\r' && *begp < 0x80) begp++;
- while (begp < endp && endp[-1] != '\r' && endp[-1] < 0x80)
- endp--;
- /* Do not consider LF as ascii if preceded by CR, since that
- confuses eol decoding. */
- if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
- endp++;
- }
- else
- begp = endp;
- break;
-
case coding_type_sjis:
case coding_type_big5:
/* We can skip all ASCII characters at the head. */
endp++;
break;
- default: /* i.e. case coding_type_iso2022: */
+ case coding_type_iso2022:
if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
/* We can't skip any data. */
break;
endp = eight_bit;
}
}
+ break;
+
+ default:
+ abort ();
}
*beg += begp - begp_orig;
*end += endp - endp_orig;
int eol_conversion;
Lisp_Object translation_table;
- if (coding->type == coding_type_ccl)
- /* We can't skip any data. */
- return;
- else if (coding->type == coding_type_no_conversion)
+ if (coding->type == coding_type_ccl
+ || coding->eol_type == CODING_EOL_CRLF
+ || coding->eol_type == CODING_EOL_CR
+ || coding->cmp_data && coding->cmp_data->used > 0)
+ {
+ /* We can't skip any data. */
+ return;
+ }
+ if (coding->type == coding_type_no_conversion
+ || coding->type == coding_type_raw_text
+ || coding->type == coding_type_emacs_mule
+ || coding->type == coding_type_undecided)
{
- /* We need no conversion. */
- *beg = *end;
+ /* We need no conversion, but don't have to skip any data here.
+ Encoding routine handles them effectively anyway. */
return;
}
the caller is expected to have handled it already. */
switch (coding->type)
{
- case coding_type_undecided:
- case coding_type_emacs_mule:
- case coding_type_raw_text:
- if (eol_conversion)
- {
- while (begp < endp && *begp != '\n') begp++;
- while (begp < endp && endp[-1] != '\n') endp--;
- }
- else
- begp = endp;
- break;
-
case coding_type_iso2022:
if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
/* We can't skip any data. */
}
/* fall down ... */
- default:
+ case coding_type_sjis:
+ case coding_type_big5:
/* We can skip all ASCII characters at the head and tail. */
if (eol_conversion)
while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
else
while (begp < endp && *(endp - 1) < 0x80) endp--;
break;
+
+ default:
+ abort ();
+ }
+
+ *beg += begp - begp_orig;
+ *end += endp - endp_orig;
+ return;
+}
+
+/* As shrinking conversion region requires some overhead, we don't try
+ shrinking if the length of conversion region is less than this
+ value. The macro below compares with `>', so a region whose
+ length is exactly this value is not shrunk either. */
+static int shrink_conversion_region_threshhold = 1024;
+
+#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep) \
+ do { /* BEG and END are dereferenced, so they must be pointers. */ \
+ if (*(end) - *(beg) > shrink_conversion_region_threshhold) \
+ { \
+ if (encodep) shrink_encoding_region (beg, end, coding, str); /* ENCODEP nonzero. */ \
+ else shrink_decoding_region (beg, end, coding, str); /* ENCODEP zero. */ \
+ } \
+ } while (0)
+
+static Lisp_Object
+code_convert_region_unwind (dummy) /* Clear the global flag
+ inhibit_pre_post_conversion and return Qnil. DUMMY is ignored.
+ NOTE(review): the name suggests this is registered as an unwind
+ handler around a conversion -- confirm at the call site. */
+ Lisp_Object dummy;
+{
+ inhibit_pre_post_conversion = 0;
+ return Qnil;
+}
+
+/* Store information about all compositions in the range FROM and TO
+ of OBJ in memory blocks pointed by CODING->cmp_data. OBJ is a
+ buffer or a string, defaults to the current buffer.
+
+ Nothing is done when CODING->composing is COMPOSITION_DISABLED.
+ CODING->cmp_data is allocated on demand. The function returns
+ before recording anything when find_composition finds nothing
+ between FROM and TO, or when the composition it finds ends beyond
+ TO. A composition that starts before FROM is skipped and the
+ search restarts at its end.
+
+ Start and end positions are recorded relative to FROM. On normal
+ exit CODING->cmp_data points to the first memory block of the chain
+ and CODING->cmp_data_start is 0. */
+
+void
+coding_save_composition (coding, from, to, obj)
+ struct coding_system *coding;
+ int from, to;
+ Lisp_Object obj;
+{
+ Lisp_Object prop;
+ int start, end;
+
+ if (coding->composing == COMPOSITION_DISABLED)
+ return;
+ if (!coding->cmp_data)
+ coding_allocate_composition_data (coding, from);
+ if (!find_composition (from, to, &start, &end, &prop, obj)
+ || end > to)
+ return;
+ if (start < from
+ && (!find_composition (end, to, &start, &end, &prop, obj)
+ || end > to))
+ return;
+ coding->composing = COMPOSITION_NO;
+ do
+ {
+ if (COMPOSITION_VALID_P (start, end, prop))
+ {
+ enum composition_method method = COMPOSITION_METHOD (prop);
+ if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
+ >= COMPOSITION_DATA_SIZE)
+ coding_allocate_composition_data (coding, from); /* NOTE(review):
+ presumably chains a fresh block and makes CODING->cmp_data
+ point to it -- confirm in that function. */
+ /* For relative composition, we remember start and end
+ positions, for the other compositions, we also remember
+ components. */
+ CODING_ADD_COMPOSITION_START (coding, start - from, method);
+ if (method != COMPOSITION_RELATIVE)
+ {
+ /* Record every component. The components may come as a
+ list, a vector, a string, or a single integer. */
+ Lisp_Object val, ch;
+
+ val = COMPOSITION_COMPONENTS (prop);
+ if (CONSP (val))
+ while (CONSP (val))
+ {
+ ch = XCAR (val), val = XCDR (val);
+ CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
+ }
+ else if (VECTORP (val) || STRINGP (val))
+ {
+ int len = (VECTORP (val)
+ ? XVECTOR (val)->size : XSTRING (val)->size);
+ int i;
+ for (i = 0; i < len; i++)
+ {
+ ch = (STRINGP (val)
+ ? Faref (val, make_number (i))
+ : XVECTOR (val)->contents[i]);
+ CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
+ }
+ }
+ else /* INTEGERP (val) */
+ CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
+ }
+ CODING_ADD_COMPOSITION_END (coding, end - from);
+ }
+ start = end; /* Continue the search after this composition. */
}
+ while (start < to
+ && find_composition (start, to, &start, &end, &prop, obj)
+ && end <= to);
+
+ /* Make coding->cmp_data point to the first memory block. */
+ while (coding->cmp_data->prev)
+ coding->cmp_data = coding->cmp_data->prev;
+ coding->cmp_data_start = 0;
+}
+
+/* Reflect the saved information about compositions to OBJ.
+ CODING->cmp_data points to a memory block for the information. OBJ
+ is a buffer or a string, defaults to the current buffer.
+
+ Nothing is done when CODING->cmp_data is null. Otherwise every
+ block of the chain is visited from the first one. As read here,
+ each record is an int array: element 0 is the record length (also
+ the stride to the next record), elements 1 and 2 are handed to
+ compose_text as its first two arguments, element 3 is the
+ composition method, and the remaining elements are components.
+ NOTE(review): the component count is not checked here against the
+ size of ARGS -- confirm the producer bounds it. */
+
+void
+coding_restore_composition (coding, obj)
+ struct coding_system *coding;
+ Lisp_Object obj;
+{
+ struct composition_data *cmp_data = coding->cmp_data;
+
+ if (!cmp_data)
+ return;
- *beg += begp - begp_orig;
- *end += endp - endp_orig;
- return;
-}
+ while (cmp_data->prev) /* Rewind to the first block. */
+ cmp_data = cmp_data->prev;
-/* As shrinking conversion region requires some overhead, we don't try
- shrinking if the length of conversion region is less than this
- value. */
-static int shrink_conversion_region_threshhold = 1024;
+ while (cmp_data)
+ {
+ int i;
-#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep) \
- do { \
- if (*(end) - *(beg) > shrink_conversion_region_threshhold) \
- { \
- if (encodep) shrink_encoding_region (beg, end, coding, str); \
- else shrink_decoding_region (beg, end, coding, str); \
- } \
- } while (0)
+ for (i = 0; i < cmp_data->used; i += cmp_data->data[i])
+ {
+ int *data = cmp_data->data + i;
+ enum composition_method method = (enum composition_method) data[3];
+ Lisp_Object components;
+
+ if (method == COMPOSITION_RELATIVE)
+ components = Qnil; /* No explicit components are stored. */
+ else
+ {
+ int len = data[0] - 4, j; /* The record has four leading elements. */
+ Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
+
+ for (j = 0; j < len; j++)
+ args[j] = make_number (data[4 + j]);
+ components = (method == COMPOSITION_WITH_ALTCHARS
+ ? Fstring (len, args) : Fvector (len, args));
+ }
+ compose_text (data[1], data[2], components, Qnil, obj);
+ }
+ cmp_data = cmp_data->next;
+ }
+}
/* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
If REPLACE is nonzero, we do various things as if the original text
is deleted and a new text is inserted. See the comments in
- replace_range (insdel.c) to know what we are doing. */
+ replace_range (insdel.c) to know what we are doing.
+
+ If REPLACE is zero, it is assumed that the source text is unibyte.
+ Otherwise, it is assumed that the source text is multibyte. */
int
code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
{
int len = to - from, len_byte = to_byte - from_byte;
int require, inserted, inserted_byte;
- int head_skip, tail_skip, total_skip;
+ int head_skip, tail_skip, total_skip = 0;
Lisp_Object saved_coding_symbol;
- int multibyte = !NILP (current_buffer->enable_multibyte_characters);
int first = 1;
- int fake_multibyte = 0;
unsigned char *src, *dst;
Lisp_Object deletion;
int orig_point = PT, orig_len = len;
int prev_Z;
+ int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
+
+ coding->src_multibyte = replace && multibyte_p;
+ coding->dst_multibyte = multibyte_p;
deletion = Qnil;
saved_coding_symbol = Qnil;
if (replace)
{
int saved_from = from;
+ int saved_inhibit_modification_hooks;
prepare_to_modify_buffer (from, to, &from);
if (saved_from != from)
{
to = from + len;
- if (multibyte)
- from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
- else
- from_byte = from, to_byte = to;
+ from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
len_byte = to_byte - from_byte;
}
+
+ /* The code conversion routine can not preserve text properties
+ for now. So, we must remove all text properties in the
+ region. Here, we must suppress all modification hooks. */
+ saved_inhibit_modification_hooks = inhibit_modification_hooks;
+ inhibit_modification_hooks = 1;
+ Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
+ inhibit_modification_hooks = saved_inhibit_modification_hooks;
}
if (! encodep && CODING_REQUIRE_DETECTION (coding))
encodings again in vain. */
coding->type = coding_type_emacs_mule;
}
- if (coding->eol_type == CODING_EOL_UNDECIDED)
+ if (coding->eol_type == CODING_EOL_UNDECIDED
+ && coding->type != coding_type_ccl)
{
saved_coding_symbol = coding->symbol;
detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
}
}
- coding->consumed_char = len, coding->consumed = len_byte;
-
- if (encodep
- ? ! CODING_REQUIRE_ENCODING (coding)
- : ! CODING_REQUIRE_DECODING (coding))
- {
- coding->produced = len_byte;
- if (multibyte
- && ! replace
- /* See the comment of the member heading_ascii in coding.h. */
- && coding->heading_ascii < len_byte)
- {
- /* We still may have to combine byte at the head and the
- tail of the text in the region. */
- if (from < GPT && GPT < to)
- move_gap_both (to, to_byte);
- len = multibyte_chars_in_text (BYTE_POS_ADDR (from_byte), len_byte);
- adjust_after_insert (from, from_byte, to, to_byte, len);
- coding->produced_char = len;
- }
- else
- {
- if (!replace)
- adjust_after_insert (from, from_byte, to, to_byte, len_byte);
- coding->produced_char = len_byte;
- }
- return 0;
- }
-
/* Now we convert the text. */
/* For encoding, we must process pre-write-conversion in advance. */
- if (encodep
- && ! NILP (coding->pre_write_conversion)
+ if (! inhibit_pre_post_conversion
+ && encodep
&& SYMBOLP (coding->pre_write_conversion)
&& ! NILP (Ffboundp (coding->pre_write_conversion)))
{
new buffer. */
struct buffer *prev = current_buffer;
Lisp_Object new;
+ int count = specpdl_ptr - specpdl;
+ record_unwind_protect (code_convert_region_unwind, Qnil);
+ /* We should not call any more pre-write/post-read-conversion
+ functions while this pre-write-conversion is running. */
+ inhibit_pre_post_conversion = 1;
call2 (coding->pre_write_conversion,
make_number (from), make_number (to));
+ inhibit_pre_post_conversion = 0;
+ /* Discard the unwind protect. */
+ specpdl_ptr--;
+
if (current_buffer != prev)
{
len = ZV - BEGV;
new = Fcurrent_buffer ();
set_buffer_internal_1 (prev);
- del_range_2 (from, from_byte, to, to_byte);
+ del_range_2 (from, from_byte, to, to_byte, 0);
TEMP_SET_PT_BOTH (from, from_byte);
insert_from_buffer (XBUFFER (new), 1, len, 0);
Fkill_buffer (new);
orig_point = from;
orig_len = len;
to = from + len;
- from_byte = multibyte ? CHAR_TO_BYTE (from) : from_byte;
- to_byte = multibyte ? CHAR_TO_BYTE (to) : to;
+ from_byte = CHAR_TO_BYTE (from);
+ to_byte = CHAR_TO_BYTE (to);
len_byte = to_byte - from_byte;
TEMP_SET_PT_BOTH (from, from_byte);
}
if (replace)
deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
+ if (coding->composing != COMPOSITION_DISABLED)
+ {
+ if (encodep)
+ coding_save_composition (coding, from, to, Fcurrent_buffer ());
+ else
+ coding_allocate_composition_data (coding, from);
+ }
+
/* Try to skip the heading and tailing ASCIIs. */
- {
- int from_byte_orig = from_byte, to_byte_orig = to_byte;
-
- if (from < GPT && GPT < to)
- move_gap_both (from, from_byte);
- SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
- if (from_byte == to_byte
- && coding->type != coding_type_ccl
- && ! (coding->mode & CODING_MODE_LAST_BLOCK
- && CODING_REQUIRE_FLUSHING (coding)))
- {
- coding->produced = len_byte;
- coding->produced_char = multibyte ? len : len_byte;
- if (!replace)
- /* We must record and adjust for this new text now. */
- adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
- return 0;
- }
+ if (coding->type != coding_type_ccl)
+ {
+ int from_byte_orig = from_byte, to_byte_orig = to_byte;
- head_skip = from_byte - from_byte_orig;
- tail_skip = to_byte_orig - to_byte;
- total_skip = head_skip + tail_skip;
- from += head_skip;
- to -= tail_skip;
- len -= total_skip; len_byte -= total_skip;
- }
+ if (from < GPT && GPT < to)
+ move_gap_both (from, from_byte);
+ SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
+ if (from_byte == to_byte
+ && (encodep || NILP (coding->post_read_conversion))
+ && ! CODING_REQUIRE_FLUSHING (coding))
+ {
+ coding->produced = len_byte;
+ coding->produced_char = len;
+ if (!replace)
+ /* We must record and adjust for this new text now. */
+ adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
+ return 0;
+ }
- /* The code conversion routine can not preserve text properties for
- now. So, we must remove all text properties in the region.
- Here, we must suppress all modification hooks. */
- if (replace)
- {
- int saved_inhibit_modification_hooks = inhibit_modification_hooks;
- inhibit_modification_hooks = 1;
- Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
- inhibit_modification_hooks = saved_inhibit_modification_hooks;
+ head_skip = from_byte - from_byte_orig;
+ tail_skip = to_byte_orig - to_byte;
+ total_skip = head_skip + tail_skip;
+ from += head_skip;
+ to -= tail_skip;
+ len -= total_skip; len_byte -= total_skip;
}
/* For converion, we must put the gap before the text in addition to
move_gap_both (from, from_byte);
inserted = inserted_byte = 0;
- src = GAP_END_ADDR, dst = GPT_ADDR;
GAP_SIZE += len_byte;
ZV -= len;
ZV_BYTE -= len_byte;
Z_BYTE -= len_byte;
- if (GPT - BEG < beg_unchanged)
- beg_unchanged = GPT - BEG;
- if (Z - GPT < end_unchanged)
- end_unchanged = Z - GPT;
+ if (GPT - BEG < BEG_UNCHANGED)
+ BEG_UNCHANGED = GPT - BEG;
+ if (Z - GPT < END_UNCHANGED)
+ END_UNCHANGED = Z - GPT;
+
+ if (!encodep && coding->src_multibyte)
+ {
+ /* Decoding routines expect that the source text is unibyte.
+ We must convert 8-bit characters of multibyte form to
+ unibyte. */
+ int len_byte_orig = len_byte;
+ len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
+ if (len_byte < len_byte_orig)
+ safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
+ len_byte);
+ coding->src_multibyte = 0;
+ }
for (;;)
{
int result;
- /* The buffer memory is changed from:
- +--------+converted-text+---------+-------original-text------+---+
- |<-from->|<--inserted-->|---------|<-----------len---------->|---|
- |<------------------- GAP_SIZE -------------------->| */
+ /* The buffer memory is now:
+ +--------+converted-text+---------+-------original-text-------+---+
+ |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
+ |<---------------------- GAP ----------------------->| */
+ src = GAP_END_ADDR - len_byte;
+ dst = GPT_ADDR + inserted_byte;
+
if (encodep)
result = encode_coding (coding, src, dst, len_byte, 0);
else
result = decode_coding (coding, src, dst, len_byte, 0);
- /* to:
- +--------+-------converted-text--------+--+---original-text--+---+
- |<-from->|<--inserted-->|<--produced-->|--|<-(len-consumed)->|---|
- |<------------------- GAP_SIZE -------------------->| */
- if (coding->fake_multibyte)
- fake_multibyte = 1;
-
- if (!encodep && !multibyte)
- coding->produced_char = coding->produced;
+
+ /* The buffer memory is now:
+ +--------+-------converted-text----+--+------original-text----+---+
+ |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
+ |<---------------------- GAP ----------------------->| */
+
inserted += coding->produced_char;
inserted_byte += coding->produced;
len_byte -= coding->consumed;
+
+ if (result == CODING_FINISH_INSUFFICIENT_CMP)
+ {
+ coding_allocate_composition_data (coding, from + inserted);
+ continue;
+ }
+
src += coding->consumed;
- dst += inserted_byte;
+ dst += coding->produced;
if (result == CODING_FINISH_NORMAL)
{
if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
{
unsigned char *pend = dst, *p = pend - inserted_byte;
+ Lisp_Object eol_type;
/* Encode LFs back to the original eol format (CR or CRLF). */
if (coding->eol_type == CODING_EOL_CR)
while (p < pend) if (*p++ == '\n') count++;
if (src - dst < count)
{
- /* We don't have sufficient room for putting LFs
+ /* We don't have sufficient room for encoding LFs
back to CRLF. We must record converted and
not-yet-converted text back to the buffer
content, enlarge the gap, then record them out of
/* Suppress eol-format conversion in the further conversion. */
coding->eol_type = CODING_EOL_LF;
- /* Restore the original symbol. */
- coding->symbol = saved_coding_symbol;
+ /* Set the coding system symbol to that for Unix-like EOL. */
+ eol_type = Fget (saved_coding_symbol, Qeol_type);
+ if (VECTORP (eol_type)
+ && XVECTOR (eol_type)->size == 3
+ && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
+ coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
+ else
+ coding->symbol = saved_coding_symbol;
continue;
}
inserted_byte += len_byte;
while (len_byte--)
*dst++ = *src++;
- fake_multibyte = 1;
break;
}
if (result == CODING_FINISH_INTERRUPT)
{
/* The conversion procedure was interrupted by a user. */
- fake_multibyte = 1;
break;
}
/* Now RESULT == CODING_FINISH_INSUFFICIENT_DST */
{
/* It's quite strange to require more memory without
consuming any bytes. Perhaps CCL program bug. */
- fake_multibyte = 1;
break;
}
if (first)
GAP_SIZE += add;
ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
- /* Don't forget to update SRC, DST. */
- src = GAP_END_ADDR - len_byte;
- dst = GPT_ADDR + inserted_byte;
}
}
if (src - dst > 0) *dst = 0; /* Put an anchor. */
- if (multibyte
- && (encodep
- || fake_multibyte
- || (to - from) != (to_byte - from_byte)))
- inserted = multibyte_chars_in_text (GPT_ADDR, inserted_byte);
+ if (encodep && coding->dst_multibyte)
+ {
+ /* The output is unibyte. We must convert 8-bit characters to
+ multibyte form. */
+ if (inserted_byte * 2 > GAP_SIZE)
+ {
+ GAP_SIZE -= inserted_byte;
+ ZV += inserted_byte; Z += inserted_byte;
+ ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
+ GPT += inserted_byte; GPT_BYTE += inserted_byte;
+ make_gap (inserted_byte - GAP_SIZE);
+ GAP_SIZE += inserted_byte;
+ ZV -= inserted_byte; Z -= inserted_byte;
+ ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
+ GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
+ }
+ inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
+ }
/* If we have shrinked the conversion area, adjust it now. */
if (total_skip > 0)
adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
inserted = Z - prev_Z;
- if (! encodep && ! NILP (coding->post_read_conversion))
+ if (!encodep && coding->cmp_data && coding->cmp_data->used)
+ coding_restore_composition (coding, Fcurrent_buffer ());
+ coding_free_composition_data (coding);
+
+ if (! inhibit_pre_post_conversion
+ && ! encodep && ! NILP (coding->post_read_conversion))
{
Lisp_Object val;
+ int count = specpdl_ptr - specpdl;
if (from != PT)
TEMP_SET_PT_BOTH (from, from_byte);
prev_Z = Z;
+ record_unwind_protect (code_convert_region_unwind, Qnil);
+ /* We should not call any more pre-write/post-read-conversion
+ functions while this post-read-conversion is running. */
+ inhibit_pre_post_conversion = 1;
val = call1 (coding->post_read_conversion, make_number (inserted));
+ inhibit_pre_post_conversion = 0;
+ /* Discard the unwind protect. */
+ specpdl_ptr--;
CHECK_NUMBER (val, 0);
inserted += Z - prev_Z;
}
TEMP_SET_PT (orig_point);
}
- signal_after_change (from, to - from, inserted);
+ if (replace)
+ {
+ signal_after_change (from, to - from, inserted);
+ update_compositions (from, from + inserted, CHECK_BORDER);
+ }
{
coding->consumed = to_byte - from_byte;
}
+/* Run, on the text of STR, the pre-write-conversion function of CODING
+   (if ENCODEP is nonzero) or its post-read-conversion function (if
+   ENCODEP is zero), and return the resulting text as a string.
+
+   The text of STR is inserted as is into the work buffer
+   " *code-converting-work*", whose multibyteness is first set to that
+   of STR.  pre-write-conversion is called with the bounds BEG and Z of
+   that buffer; post-read-conversion is called with point at BEG and
+   the number of inserted characters.  inhibit_pre_post_conversion is
+   set to 1 around the call.
+
+   This function does not itself check that the conversion function is
+   defined.  The unwind-protect entries recorded here are run by
+   unbind_to on exit; the first one makes the buffer that was current
+   on entry current again.  */
+
Lisp_Object
-code_convert_string (str, coding, encodep, nocopy)
+run_pre_post_conversion_on_str (str, coding, encodep)
+     Lisp_Object str;
+     struct coding_system *coding;
+     int encodep;
+{
+  int count = specpdl_ptr - specpdl;
+  struct gcpro gcpro1;
+  int multibyte = STRING_MULTIBYTE (str);
+
+  record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
+  /* NOTE(review): code_convert_region_unwind presumably resets
+     inhibit_pre_post_conversion on a non-local exit -- confirm
+     against its definition.  */
+  record_unwind_protect (code_convert_region_unwind, Qnil);
+  GCPRO1 (str);
+  temp_output_buffer_setup (" *code-converting-work*");
+  set_buffer_internal (XBUFFER (Vstandard_output));
+  /* We must insert the contents of STR as is without
+     unibyte<->multibyte conversion.  For that, we adjust the
+     multibyteness of the working buffer to that of STR.  */
+  Ferase_buffer ();
+  current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
+  insert_from_string (str, 0, 0,
+                      XSTRING (str)->size, STRING_BYTES (XSTRING (str)), 0);
+  UNGCPRO;
+  /* No further pre-write/post-read-conversion functions should run
+     while this one is running.  */
+  inhibit_pre_post_conversion = 1;
+  if (encodep)
+    call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
+  else
+    {
+      TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
+      call1 (coding->post_read_conversion, make_number (Z - BEG));
+    }
+  inhibit_pre_post_conversion = 0;
+  /* Whatever the conversion function left in the work buffer is the
+     result.  */
+  str = make_buffer_string (BEG, Z, 0);
+  return unbind_to (count, str);
+}
+
+Lisp_Object
+decode_coding_string (str, coding, nocopy)
Lisp_Object str;
struct coding_system *coding;
- int encodep, nocopy;
+ int nocopy;
{
int len;
char *buf;
- int from = 0, to = XSTRING (str)->size;
- int to_byte = STRING_BYTES (XSTRING (str));
+ int from, to, to_byte;
struct gcpro gcpro1;
Lisp_Object saved_coding_symbol;
int result;
- saved_coding_symbol = Qnil;
- if (encodep && !NILP (coding->pre_write_conversion)
- || !encodep && !NILP (coding->post_read_conversion))
- {
- /* Since we have to call Lisp functions which assume target text
- is in a buffer, after setting a temporary buffer, call
- code_convert_region. */
- int count = specpdl_ptr - specpdl;
- struct buffer *prev = current_buffer;
-
- record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
- temp_output_buffer_setup (" *code-converting-work*");
- set_buffer_internal (XBUFFER (Vstandard_output));
- if (encodep)
- insert_from_string (str, 0, 0, to, to_byte, 0);
- else
- {
- /* We must insert the contents of STR as is without
- unibyte<->multibyte conversion. */
- current_buffer->enable_multibyte_characters = Qnil;
- insert_from_string (str, 0, 0, to_byte, to_byte, 0);
- current_buffer->enable_multibyte_characters = Qt;
- }
- code_convert_region (BEGV, BEGV_BYTE, ZV, ZV_BYTE, coding, encodep, 1);
- if (encodep)
- /* We must return the buffer contents as unibyte string. */
- current_buffer->enable_multibyte_characters = Qnil;
- str = make_buffer_string (BEGV, ZV, 0);
- set_buffer_internal (prev);
- return unbind_to (count, str);
- }
+ from = 0;
+ to = XSTRING (str)->size;
+ to_byte = STRING_BYTES (XSTRING (str));
- if (! encodep && CODING_REQUIRE_DETECTION (coding))
+ saved_coding_symbol = Qnil;
+ if (CODING_REQUIRE_DETECTION (coding))
{
/* See the comments in code_convert_region. */
if (coding->type == coding_type_undecided)
if (coding->type == coding_type_undecided)
coding->type = coding_type_emacs_mule;
}
- if (coding->eol_type == CODING_EOL_UNDECIDED)
+ if (coding->eol_type == CODING_EOL_UNDECIDED
+ && coding->type != coding_type_ccl)
{
saved_coding_symbol = coding->symbol;
detect_eol (coding, XSTRING (str)->data, to_byte);
}
}
- if (encodep
- ? ! CODING_REQUIRE_ENCODING (coding)
- : ! CODING_REQUIRE_DECODING (coding))
- from = to_byte;
- else
+ if (! CODING_REQUIRE_DECODING (coding))
+ {
+ if (!STRING_MULTIBYTE (str))
+ {
+ str = Fstring_as_multibyte (str);
+ nocopy = 1;
+ }
+ return (nocopy ? str : Fcopy_sequence (str));
+ }
+
+ if (STRING_MULTIBYTE (str))
+ {
+ /* Decoding routines expect the source text to be unibyte. */
+ str = Fstring_as_unibyte (str);
+ to_byte = STRING_BYTES (XSTRING (str));
+ nocopy = 1;
+ coding->src_multibyte = 0;
+ }
+ coding->dst_multibyte = 1;
+
+ if (coding->composing != COMPOSITION_DISABLED)
+ coding_allocate_composition_data (coding, from);
+
+ /* Try to skip the heading and tailing ASCIIs. */
+ if (coding->type != coding_type_ccl)
{
- /* Try to skip the heading and tailing ASCIIs. */
+ int from_orig = from;
+
SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
- encodep);
+ 0);
+ if (from == to_byte)
+ return (nocopy ? str : Fcopy_sequence (str));
}
- if (from == to_byte
- && coding->type != coding_type_ccl)
- return (nocopy ? str : Fcopy_sequence (str));
- if (encodep)
- len = encoding_buffer_size (coding, to_byte - from);
- else
- len = decoding_buffer_size (coding, to_byte - from);
+ len = decoding_buffer_size (coding, to_byte - from);
len += from + STRING_BYTES (XSTRING (str)) - to_byte;
GCPRO1 (str);
buf = get_conversion_buffer (len);
if (from > 0)
bcopy (XSTRING (str)->data, buf, from);
- result = (encodep
- ? encode_coding (coding, XSTRING (str)->data + from,
- buf + from, to_byte - from, len)
- : decode_coding (coding, XSTRING (str)->data + from,
- buf + from, to_byte - from, len));
- if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
+ result = decode_coding (coding, XSTRING (str)->data + from,
+ buf + from, to_byte - from, len);
+ if (result == CODING_FINISH_INCONSISTENT_EOL)
{
- /* We simple try to decode the whole string again but without
+ /* We simply try to decode the whole string again but without
eol-conversion this time. */
coding->eol_type = CODING_EOL_LF;
coding->symbol = saved_coding_symbol;
- return code_convert_string (str, coding, encodep, nocopy);
+ coding_free_composition_data (coding);
+ return decode_coding_string (str, coding, nocopy);
}
bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
STRING_BYTES (XSTRING (str)) - to_byte);
len = from + STRING_BYTES (XSTRING (str)) - to_byte;
- if (encodep)
- str = make_unibyte_string (buf, len + coding->produced);
- else
+ str = make_multibyte_string (buf, len + coding->produced_char,
+ len + coding->produced);
+
+ if (coding->cmp_data && coding->cmp_data->used)
+ coding_restore_composition (coding, str);
+ coding_free_composition_data (coding);
+
+ if (SYMBOLP (coding->post_read_conversion)
+ && !NILP (Ffboundp (coding->post_read_conversion)))
+ str = run_pre_post_conversion_on_str (str, coding, 0);
+
+ return str;
+}
+
+/* Encode the text of the string STR by the coding system CODING and
+   return the result as a unibyte string.
+
+   If the pre-write-conversion of CODING is a symbol with a function
+   definition, it is run on STR first.  If CODING requires no encoding,
+   or nothing remains to encode after the heading and tailing ASCII
+   have been skipped, STR itself is returned when NOCOPY is nonzero and
+   a copy of it otherwise.  */
+
+Lisp_Object
+encode_coding_string (str, coding, nocopy)
+ Lisp_Object str;
+ struct coding_system *coding;
+ int nocopy;
+{
+ int len;
+ char *buf;
+ int from, to, to_byte;
+ struct gcpro gcpro1;
+ Lisp_Object saved_coding_symbol;
+ int result;
+
+ /* Run pre-write-conversion only when it names a defined function. */
+ if (SYMBOLP (coding->pre_write_conversion)
+ && !NILP (Ffboundp (coding->pre_write_conversion)))
+ str = run_pre_post_conversion_on_str (str, coding, 1);
+
+ /* FROM and TO_BYTE bound, in bytes, the part of STR still to be
+ encoded; TO is the length of STR in characters. */
+ from = 0;
+ to = XSTRING (str)->size;
+ to_byte = STRING_BYTES (XSTRING (str));
+
+ saved_coding_symbol = Qnil;
+ if (! CODING_REQUIRE_ENCODING (coding))
+ {
+ /* Nothing to encode; just make sure the result is unibyte. */
+ if (STRING_MULTIBYTE (str))
+ {
+ str = Fstring_as_unibyte (str);
+ nocopy = 1;
+ }
+ return (nocopy ? str : Fcopy_sequence (str));
+ }
+
+ /* Encoding routines determine the multibyteness of the source text
+ by coding->src_multibyte. */
+ coding->src_multibyte = STRING_MULTIBYTE (str);
+ coding->dst_multibyte = 0;
+
+ if (coding->composing != COMPOSITION_DISABLED)
+ coding_save_composition (coding, from, to, str);
+
+ /* Try to skip the heading and tailing ASCIIs. */
+ if (coding->type != coding_type_ccl)
{
- int chars= (coding->fake_multibyte
- ? multibyte_chars_in_text (buf + from, coding->produced)
- : coding->produced_char);
- str = make_multibyte_string (buf, len + chars, len + coding->produced);
+ int from_orig = from;
+
+ SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
+ 1);
+ /* NOTE(review): this early return does not call
+ coding_free_composition_data, unlike the normal exit below --
+ confirm whether composition data saved above must be freed. */
+ if (from == to_byte)
+ return (nocopy ? str : Fcopy_sequence (str));
}
+ /* Size the work buffer for the encoded middle part plus the skipped
+ head (FROM bytes) and tail, which are copied unchanged. */
+ len = encoding_buffer_size (coding, to_byte - from);
+ len += from + STRING_BYTES (XSTRING (str)) - to_byte;
+ GCPRO1 (str);
+ buf = get_conversion_buffer (len);
+ UNGCPRO;
+
+ /* Copy the skipped head, encode the middle, then copy the skipped
+ tail right after the bytes the encoder produced. */
+ if (from > 0)
+ bcopy (XSTRING (str)->data, buf, from);
+ result = encode_coding (coding, XSTRING (str)->data + from,
+ buf + from, to_byte - from, len);
+ bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
+ STRING_BYTES (XSTRING (str)) - to_byte);
+
+ /* LEN is now the number of bytes copied unchanged (head + tail). */
+ len = from + STRING_BYTES (XSTRING (str)) - to_byte;
+ str = make_unibyte_string (buf, len + coding->produced);
+ coding_free_composition_data (coding);
+
return str;
}
/* At first, gather possible coding systems in VAL. */
val = Qnil;
- for (tmp = Vcoding_category_list; !NILP (tmp); tmp = XCONS (tmp)->cdr)
+ for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
{
- int idx
- = XFASTINT (Fget (XCONS (tmp)->car, Qcoding_category_index));
- if (coding_mask & (1 << idx))
+ Lisp_Object category_val, category_index;
+
+ category_index = Fget (XCAR (tmp), Qcoding_category_index);
+ category_val = Fsymbol_value (XCAR (tmp));
+ if (!NILP (category_val)
+ && NATNUMP (category_index)
+ && (coding_mask & (1 << XFASTINT (category_index))))
{
- val = Fcons (Fsymbol_value (XCONS (tmp)->car), val);
+ val = Fcons (category_val, val);
if (highest)
break;
}
val = Fnreverse (val);
/* Then, replace the elements with subsidiary coding systems. */
- for (tmp = val; !NILP (tmp); tmp = XCONS (tmp)->cdr)
+ for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
{
if (eol_type != CODING_EOL_UNDECIDED
&& eol_type != CODING_EOL_INCONSISTENT)
{
Lisp_Object eol;
- eol = Fget (XCONS (tmp)->car, Qeol_type);
+ eol = Fget (XCAR (tmp), Qeol_type);
if (VECTORP (eol))
- XCONS (tmp)->car = XVECTOR (eol)->contents[eol_type];
+ XCAR (tmp) = XVECTOR (eol)->contents[eol_type];
}
}
- return (highest ? XCONS (val)->car : val);
+ return (highest ? XCAR (val) : val);
}
DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
!NILP (highest));
}
+/* Return a fresh list of those elements of list L1 that Fmemq also
+   finds in list L2.  Each common element is consed onto the front of
+   the result, so the result holds them in the reverse of their order
+   in L1.  The result is nil if there is no common element.  */
+
+static Lisp_Object
+intersection (l1, l2)
+     Lisp_Object l1, l2;
+{
+  Lisp_Object common, tail, elt;
+
+  common = Qnil;
+  tail = l1;
+  while (CONSP (tail))
+    {
+      elt = XCAR (tail);
+      if (!NILP (Fmemq (elt, l2)))
+        common = Fcons (elt, common);
+      tail = XCDR (tail);
+    }
+  return common;
+}
+
+
+/* Subroutine for Ffind_coding_systems_region_internal.
+
+   Scan the multibyte text between P and PEND and return the coding
+   systems that safely encode all of it, narrowing down SAFE_CODINGS.
+
+   SAFE_CODINGS is t while no non-ASCII character has constrained the
+   result yet, a list of the candidate coding systems afterwards, and
+   nil once no candidate is left; when it is nil no more table lookup
+   is done.
+
+   WORK_TABLE is a copy of the char-table Vchar_coding_system_table.
+   An element of WORK_TABLE is set to t once it has been looked up, so
+   that each element contributes to the result only once.
+
+   If a non-ASCII single byte char is found, set
+   *single_byte_char_found to 1.  */
+
+static Lisp_Object
+find_safe_codings (p, pend, safe_codings, work_table, single_byte_char_found)
+     unsigned char *p, *pend;
+     Lisp_Object safe_codings, work_table;
+     int *single_byte_char_found;
+{
+  int ch, nbytes, slot;
+  Lisp_Object entry;
+
+  while (p < pend)
+    {
+      ch = STRING_CHAR_AND_LENGTH (p, pend - p, nbytes);
+      p += nbytes;
+
+      /* ASCII characters put no constraint on the result.  */
+      if (! ASCII_BYTE_P (ch))
+        {
+          if (SINGLE_BYTE_CHAR_P (ch))
+            *single_byte_char_found = 1;
+
+          if (! NILP (safe_codings))
+            {
+              /* Look up the safe coding systems for CH.  */
+              entry = char_table_ref_and_index (work_table, ch, &slot);
+              if (! EQ (entry, Qt))
+                {
+                  /* First visit of this element; remember that.  */
+                  CHAR_TABLE_SET (work_table, slot, Qt);
+
+                  /* Take the element as is when it is nil or when
+                     there is no earlier set to combine it with;
+                     otherwise keep only what both sets share.  */
+                  if (NILP (entry) || EQ (safe_codings, Qt))
+                    safe_codings = entry;
+                  else
+                    safe_codings = intersection (safe_codings, entry);
+                }
+            }
+        }
+    }
+  return safe_codings;
+}
+
+
+/* Return a list of coding systems that safely encode the text between
+   START and END.  If START is a string, the text of that string is
+   examined instead and END is not used.  If the text contains only
+   ASCII or is unibyte, return t.
+
+   Otherwise the candidates are collected by find_safe_codings using a
+   scratch copy of Vchar_coding_system_table.  If no non-ASCII single
+   byte character was found, the contents of extra slot 0 of
+   Vchar_coding_system_table (the generic coding systems) are appended
+   to the result; if one was found, raw-text and emacs-mule are put in
+   front of the result instead.  */
+
+DEFUN ("find-coding-systems-region-internal",
+  Ffind_coding_systems_region_internal,
+  Sfind_coding_systems_region_internal, 2, 2, 0,
+  "Internal use only.")
+  (start, end)
+     Lisp_Object start, end;
+{
+  Lisp_Object work_table, safe_codings;
+  int non_ascii_p = 0;
+  int single_byte_char_found = 0;
+  /* The text is scanned as up to two pieces, [P1, P1END) and
+     [P2, P2END); the second piece is empty unless a buffer region is
+     split at GPT_BYTE.  */
+  unsigned char *p1, *p1end, *p2, *p2end, *p;
+
+  if (STRINGP (start))
+    {
+      if (!STRING_MULTIBYTE (start))
+        return Qt;
+      p1 = XSTRING (start)->data, p1end = p1 + STRING_BYTES (XSTRING (start));
+      p2 = p2end = p1end;
+      /* More bytes than characters means some character is multibyte.  */
+      if (XSTRING (start)->size != STRING_BYTES (XSTRING (start)))
+        non_ascii_p = 1;
+    }
+  else
+    {
+      int from, to, stop;
+
+      CHECK_NUMBER_COERCE_MARKER (start, 0);
+      CHECK_NUMBER_COERCE_MARKER (end, 1);
+      if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
+        args_out_of_range (start, end);
+      if (NILP (current_buffer->enable_multibyte_characters))
+        return Qt;
+      from = CHAR_TO_BYTE (XINT (start));
+      to = CHAR_TO_BYTE (XINT (end));
+      /* Split the region at GPT_BYTE when that lies strictly inside.  */
+      stop = from < GPT_BYTE && GPT_BYTE < to ? GPT_BYTE : to;
+      p1 = BYTE_POS_ADDR (from), p1end = p1 + (stop - from);
+      if (stop == to)
+        p2 = p2end = p1end;
+      else
+        p2 = BYTE_POS_ADDR (stop), p2end = p2 + (to - stop);
+      if (XINT (end) - XINT (start) != to - from)
+        non_ascii_p = 1;
+    }
+
+  if (!non_ascii_p)
+    {
+      /* We are sure that the text contains no multibyte character.
+         Check if it contains eight-bit-graphic.  */
+      for (p = p1; p < p1end && ASCII_BYTE_P (*p); p++)
+        ;
+      if (p == p1end)
+        {
+          for (p = p2; p < p2end && ASCII_BYTE_P (*p); p++)
+            ;
+          if (p == p2end)
+            return Qt;
+        }
+    }
+
+  /* The text contains non-ASCII characters.  */
+  work_table = Fcopy_sequence (Vchar_coding_system_table);
+  safe_codings = find_safe_codings (p1, p1end, Qt, work_table,
+                                    &single_byte_char_found);
+  if (p2 < p2end)
+    safe_codings = find_safe_codings (p2, p2end, safe_codings, work_table,
+                                      &single_byte_char_found);
+
+  if (!single_byte_char_found)
+    {
+      /* Append generic coding systems.  */
+      Lisp_Object args[2];
+      args[0] = safe_codings;
+      args[1] = Fchar_table_extra_slot (Vchar_coding_system_table,
+                                        make_number (0));
+      safe_codings = Fappend (make_number (2), args);
+    }
+  else
+    safe_codings = Fcons (Qraw_text, Fcons (Qemacs_mule, safe_codings));
+  return safe_codings;
+}
+
+
Lisp_Object
code_convert_region1 (start, end, coding_system, encodep)
Lisp_Object start, end, coding_system;
error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
coding.mode |= CODING_MODE_LAST_BLOCK;
+ coding.src_multibyte = coding.dst_multibyte
+ = !NILP (current_buffer->enable_multibyte_characters);
code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
&coding, encodep, 1);
Vlast_coding_system_used = coding.symbol;
error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
coding.mode |= CODING_MODE_LAST_BLOCK;
+ string = (encodep
+ ? encode_coding_string (string, &coding, !NILP (nocopy))
+ : decode_coding_string (string, &coding, !NILP (nocopy)));
Vlast_coding_system_used = coding.symbol;
- return code_convert_string (string, &coding, encodep, !NILP (nocopy));
+
+ return string;
}
DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
}
/* Encode or decode STRING according to CODING_SYSTEM.
- Do not set Vlast_coding_system_used. */
+ Do not set Vlast_coding_system_used.
+
+ This function is called only from macros DECODE_FILE and
+ ENCODE_FILE, thus we ignore character composition. */
Lisp_Object
code_convert_string_norecord (string, coding_system, encodep)
if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
+ coding.composing = COMPOSITION_DISABLED;
coding.mode |= CODING_MODE_LAST_BLOCK;
- return code_convert_string (string, &coding, encodep, Qt);
+ return (encodep
+ ? encode_coding_string (string, &coding, 1)
+ : decode_coding_string (string, &coding, 1));
}
\f
DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
if (s2 < 0x80)
XSETFASTINT (val, s2);
else if (s2 >= 0xA0 || s2 <= 0xDF)
- XSETFASTINT (val,
- MAKE_NON_ASCII_CHAR (charset_katakana_jisx0201, s2, 0));
+ XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
else
error ("Invalid Shift JIS code: %x", XFASTINT (code));
}
|| (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
error ("Invalid Shift JIS code: %x", XFASTINT (code));
DECODE_SJIS (s1, s2, c1, c2);
- XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
+ XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
}
return val;
}
|| (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
error ("Invalid BIG5 code: %x", XFASTINT (code));
DECODE_BIG5 (b1, b2, charset, c1, c2);
- XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
+ XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
}
return val;
}
setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
/* We had better not send unsafe characters to terminal. */
terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
-
+ /* Character composition should be disabled. */
+ terminal_coding.composing = COMPOSITION_DISABLED;
+ terminal_coding.src_multibyte = 1;
+ terminal_coding.dst_multibyte = 0;
return Qnil;
}
CHECK_SYMBOL (coding_system, 0);
setup_coding_system (Fcheck_coding_system (coding_system),
&safe_terminal_coding);
+ /* Character composition should be disabled. */
+ safe_terminal_coding.composing = COMPOSITION_DISABLED;
+ safe_terminal_coding.src_multibyte = 1;
+ safe_terminal_coding.dst_multibyte = 0;
return Qnil;
}
{
CHECK_SYMBOL (coding_system, 0);
setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
+ /* Character composition should be disabled. */
+ keyboard_coding.composing = COMPOSITION_DISABLED;
return Qnil;
}
if (NILP (chain))
return Qnil;
- for (; CONSP (chain); chain = XCONS (chain)->cdr)
+ for (; CONSP (chain); chain = XCDR (chain))
{
Lisp_Object elt;
- elt = XCONS (chain)->car;
+ elt = XCAR (chain);
if (CONSP (elt)
&& ((STRINGP (target)
- && STRINGP (XCONS (elt)->car)
- && fast_string_match (XCONS (elt)->car, target) >= 0)
- || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
+ && STRINGP (XCAR (elt))
+ && fast_string_match (XCAR (elt), target) >= 0)
+ || (INTEGERP (target) && EQ (target, XCAR (elt)))))
{
- val = XCONS (elt)->cdr;
+ val = XCDR (elt);
/* Here, if VAL is both a valid coding system and a valid
function symbol, we return VAL as a coding system. */
if (CONSP (val))
DEFUN ("update-coding-systems-internal", Fupdate_coding_systems_internal,
Supdate_coding_systems_internal, 0, 0, 0,
"Update internal database for ISO2022 and CCL based coding systems.\n\
-When values of the following coding categories are changed, you must\n\
-call this function:\n\
- coding-category-iso-7, coding-category-iso-7-tight,\n\
- coding-category-iso-8-1, coding-category-iso-8-2,\n\
- coding-category-iso-7-else, coding-category-iso-8-else,\n\
- coding-category-ccl")
+When values of any coding categories are changed, you must\n\
+call this function")
()
{
int i;
- for (i = CODING_CATEGORY_IDX_ISO_7; i <= CODING_CATEGORY_IDX_CCL; i++)
+ for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
{
Lisp_Object val;
while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
{
- if (! SYMBOLP (XCONS (val)->car))
+ if (! SYMBOLP (XCAR (val)))
break;
- idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
+ idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
if (idx >= CODING_CATEGORY_IDX_MAX)
break;
coding_priorities[i++] = (1 << idx);
- val = XCONS (val)->cdr;
+ val = XCDR (val);
}
/* If coding-category-list is valid and contains all coding
categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not,
- the following code saves Emacs from craching. */
+ the following code saves Emacs from crashing. */
while (i < CODING_CATEGORY_IDX_MAX)
coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
for (i = 0x21 ; i < 0x7F; i++)
emacs_code_class[i] = EMACS_ascii_code;
emacs_code_class[0x7F] = EMACS_control_code;
- emacs_code_class[0x80] = EMACS_leading_code_composition;
- for (i = 0x81; i < 0xFF; i++)
+ for (i = 0x80; i < 0xFF; i++)
emacs_code_class[i] = EMACS_invalid_code;
emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
/* ISO2022 specific initialize routine. */
for (i = 0; i < 0x20; i++)
- iso_code_class[i] = ISO_control_code;
+ iso_code_class[i] = ISO_control_0;
for (i = 0x21; i < 0x7F; i++)
iso_code_class[i] = ISO_graphic_plane_0;
for (i = 0x80; i < 0xA0; i++)
- iso_code_class[i] = ISO_control_code;
+ iso_code_class[i] = ISO_control_1;
for (i = 0xA1; i < 0xFF; i++)
iso_code_class[i] = ISO_graphic_plane_1;
iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
#else
system_eol_type = CODING_EOL_LF;
#endif
+
+ inhibit_pre_post_conversion = 0;
}
#ifdef emacs
Qtranslation_table_for_encode = intern ("translation-table-for-encode");
staticpro (&Qtranslation_table_for_encode);
- Qsafe_charsets = intern ("safe-charsets");
- staticpro (&Qsafe_charsets);
+ Qsafe_chars = intern ("safe-chars");
+ staticpro (&Qsafe_chars);
+
+ Qchar_coding_system = intern ("char-coding-system");
+ staticpro (&Qchar_coding_system);
+
+ /* Intern this now in case it isn't already done.
+ Setting this variable twice is harmless.
+ But don't staticpro it here--that is done in alloc.c. */
+ Qchar_table_extra_slots = intern ("char-table-extra-slots");
+ Fput (Qsafe_chars, Qchar_table_extra_slots, make_number (0));
+ Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (1));
Qvalid_codes = intern ("valid-codes");
staticpro (&Qvalid_codes);
defsubr (&Scheck_coding_system);
defsubr (&Sdetect_coding_region);
defsubr (&Sdetect_coding_string);
+ defsubr (&Sfind_coding_systems_region_internal);
defsubr (&Sdecode_coding_region);
defsubr (&Sencode_coding_region);
defsubr (&Sdecode_coding_string);
DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
"Specify the coding system for write operations.\n\
-It is useful to bind this variable with `let', but do not set it globally.\n\
-If the value is a coding system, it is used for encoding on write operation.\n\
-If not, an appropriate element is used from one of the coding system alists:\n\
+Programs bind this variable with `let', but you should not set it globally.\n\
+If the value is a coding system, it is used for encoding of output,\n\
+when writing it to a file and when sending it to a file or subprocess.\n\
+\n\
+If this does not specify a coding system, an appropriate element\n\
+is used from one of the coding system alists:\n\
There are three such tables, `file-coding-system-alist',\n\
-`process-coding-system-alist', and `network-coding-system-alist'.");
+`process-coding-system-alist', and `network-coding-system-alist'.\n\
+For output to files, if the above procedure does not specify a coding system,\n\
+the value of `buffer-file-coding-system' is used.");
Vcoding_system_for_write = Qnil;
DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
See also the function `find-operation-coding-system'.");
Vnetwork_coding_system_alist = Qnil;
+ DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
+ "Coding system to use with system messages.");
+ Vlocale_coding_system = Qnil;
+
+ /* The eol mnemonics are reset in startup.el system-dependently. */
DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
"*String displayed in mode line for UNIX-like (LF) end-of-line format.");
eol_mnemonic_unix = build_string (":");
The default value is `select-safe-coding-system' (which see).");
Vselect_safe_coding_system_function = Qnil;
+ DEFVAR_LISP ("char-coding-system-table", &Vchar_coding_system_table,
+ "Char-table containing safe coding systems of each characters.\n\
+Each element doesn't include such generic coding systems that can\n\
+encode any characters. They are in the first extra slot.");
+ Vchar_coding_system_table = Fmake_char_table (Qchar_coding_system, Qnil);
+
+ DEFVAR_BOOL ("inhibit-iso-escape-detection",
+ &inhibit_iso_escape_detection,
+ "If non-nil, Emacs ignores ISO2022's escape sequence on code detection.\n\
+\n\
+By default, on reading a file, Emacs tries to detect how the text is\n\
+encoded. This code detection is sensitive to escape sequences. If\n\
+the sequence is valid as ISO2022, the code is determined as one of\n\
+the ISO2022 encodings, and the file is decoded by the corresponding\n\
+coding system (e.g. `iso-2022-7bit').\n\
+\n\
+However, there may be a case that you want to read escape sequences in\n\
+a file as is. In such a case, you can set this variable to non-nil.\n\
+Then, as the code detection ignores any escape sequences, no file is\n\
+detected as encoded in some ISO2022 encoding. The result is that all\n\
+escape sequences become visible in a buffer.\n\
+\n\
+The default value is nil, and it is strongly recommended not to change\n\
+it. That is because many Emacs Lisp source files that contain\n\
+non-ASCII characters are encoded by the coding system `iso-2022-7bit'\n\
+in Emacs's distribution, and they won't be decoded correctly on\n\
+reading if you suppress escape sequence detection.\n\
+\n\
+The other way to read escape sequences in a file without decoding is\n\
+to explicitly specify some coding system that doesn't use ISO2022's\n\
+escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].");
+ inhibit_iso_escape_detection = 0;
+}
+
+/* Return the system's message text for ERROR_NUMBER.
+
+   The message is obtained from strerror after
+   synchronize_system_messages_locale has been called.  If
+   `locale-coding-system' is non-nil, the message is first turned
+   into a Lisp string and passed through
+   code_convert_string_norecord with that coding system, and the
+   data of the converted string is returned instead.
+
+   NOTE(review): the final argument 0 presumably requests decoding
+   rather than encoding -- confirm against
+   code_convert_string_norecord.
+
+   NOTE(review): in the converted case the result points into a Lisp
+   string that nothing here keeps referenced after return; confirm
+   that callers consume the text before garbage collection can
+   run.  */
+char *
+emacs_strerror (error_number)
+     int error_number;
+{
+  Lisp_Object decoded;
+  char *message;
+
+  synchronize_system_messages_locale ();
+  message = strerror (error_number);
+
+  /* Without a locale coding system the raw strerror text is used
+     as is.  */
+  if (NILP (Vlocale_coding_system))
+    return message;
+
+  decoded = code_convert_string_norecord (build_string (message),
+					  Vlocale_coding_system,
+					  0);
+  return (char *) XSTRING (decoded)->data;
 }
#endif /* emacs */
+