From df7492f9702e8e5e563699d8201fac5b54ab92b7 Mon Sep 17 00:00:00 2001 From: Kenichi Handa Date: Fri, 1 Mar 2002 01:17:24 +0000 Subject: [PATCH] Completely re-written. --- src/coding.c | 15589 ++++++++++++++++++++++++++----------------------- src/coding.h | 1381 +++-- 2 files changed, 8838 insertions(+), 8132 deletions(-) rewrite src/coding.c (66%) rewrite src/coding.h (69%) diff --git a/src/coding.c b/src/coding.c dissimilarity index 66% index 37a5e88654..a5104c5909 100644 --- a/src/coding.c +++ b/src/coding.c @@ -1,7414 +1,8175 @@ -/* Coding system handler (conversion, detection, and etc). - Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN. - Licensed to the Free Software Foundation. - Copyright (C) 2001 Free Software Foundation, Inc. - -This file is part of GNU Emacs. - -GNU Emacs is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2, or (at your option) -any later version. - -GNU Emacs is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with GNU Emacs; see the file COPYING. If not, write to -the Free Software Foundation, Inc., 59 Temple Place - Suite 330, -Boston, MA 02111-1307, USA. */ - -/*** TABLE OF CONTENTS *** - - 0. General comments - 1. Preamble - 2. Emacs' internal format (emacs-mule) handlers - 3. ISO2022 handlers - 4. Shift-JIS and BIG5 handlers - 5. CCL handlers - 6. End-of-line handlers - 7. C library functions - 8. Emacs Lisp library functions - 9. Post-amble - -*/ - -/*** 0. General comments ***/ - - -/*** GENERAL NOTE on CODING SYSTEMS *** - - A coding system is an encoding mechanism for one or more character - sets. 
Here's a list of coding systems which Emacs can handle. When - we say "decode", it means converting some other coding system to - Emacs' internal format (emacs-mule), and when we say "encode", - it means converting the coding system emacs-mule to some other - coding system. - - 0. Emacs' internal format (emacs-mule) - - Emacs itself holds a multi-lingual character in buffers and strings - in a special format. Details are described in section 2. - - 1. ISO2022 - - The most famous coding system for multiple character sets. X's - Compound Text, various EUCs (Extended Unix Code), and coding - systems used in Internet communication such as ISO-2022-JP are - all variants of ISO2022. Details are described in section 3. - - 2. SJIS (or Shift-JIS or MS-Kanji-Code) - - A coding system to encode character sets: ASCII, JISX0201, and - JISX0208. Widely used for PC's in Japan. Details are described in - section 4. - - 3. BIG5 - - A coding system to encode the character sets ASCII and Big5. Widely - used for Chinese (mainly in Taiwan and Hong Kong). Details are - described in section 4. In this file, when we write "BIG5" - (all uppercase), we mean the coding system, and when we write - "Big5" (capitalized), we mean the character set. - - 4. Raw text - - A coding system for text containing random 8-bit code. Emacs does - no code conversion on such text except for end-of-line format. - - 5. Other - - If a user wants to read/write text encoded in a coding system not - listed above, he can supply a decoder and an encoder for it as CCL - (Code Conversion Language) programs. Emacs executes the CCL program - while reading/writing. - - Emacs represents a coding system by a Lisp symbol that has a property - `coding-system'. But, before actually using the coding system, the - information about it is set in a structure of type `struct - coding_system' for rapid processing. See section 6 for more details. 
- -*/ - -/*** GENERAL NOTES on END-OF-LINE FORMAT *** - - How end-of-line of text is encoded depends on the operating system. - For instance, Unix's format is just one byte of `line-feed' code, - whereas DOS's format is two-byte sequence of `carriage-return' and - `line-feed' codes. MacOS's format is usually one byte of - `carriage-return'. - - Since text character encoding and end-of-line encoding are - independent, any coding system described above can have any - end-of-line format. So Emacs has information about end-of-line - format in each coding-system. See section 6 for more details. - -*/ - -/*** GENERAL NOTES on `detect_coding_XXX ()' functions *** - - These functions check if a text between SRC and SRC_END is encoded - in the coding system category XXX. Each returns an integer value in - which appropriate flag bits for the category XXX are set. The flag - bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the - template for these functions. If MULTIBYTEP is nonzero, 8-bit codes - of the range 0x80..0x9F are in multibyte form. */ -#if 0 -int -detect_coding_emacs_mule (src, src_end, multibytep) - unsigned char *src, *src_end; - int multibytep; -{ - ... -} -#endif - -/*** GENERAL NOTES on `decode_coding_XXX ()' functions *** - - These functions decode SRC_BYTES length of unibyte text at SOURCE - encoded in CODING to Emacs' internal format. The resulting - multibyte text goes to a place pointed to by DESTINATION, the length - of which should not exceed DST_BYTES. - - These functions set the information about original and decoded texts - in the members `produced', `produced_char', `consumed', and - `consumed_char' of the structure *CODING. They also set the member - `result' to one of CODING_FINISH_XXX indicating how the decoding - finished. - - DST_BYTES zero means that the source area and destination area are - overlapped, which means that we can produce a decoded text until it - reaches the head of the not-yet-decoded source text. 
- - Below is a template for these functions. */ -#if 0 -static void -decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes) - struct coding_system *coding; - unsigned char *source, *destination; - int src_bytes, dst_bytes; -{ - ... -} -#endif - -/*** GENERAL NOTES on `encode_coding_XXX ()' functions *** - - These functions encode SRC_BYTES length text at SOURCE from Emacs' - internal multibyte format to CODING. The resulting unibyte text - goes to a place pointed to by DESTINATION, the length of which - should not exceed DST_BYTES. - - These functions set the information about original and encoded texts - in the members `produced', `produced_char', `consumed', and - `consumed_char' of the structure *CODING. They also set the member - `result' to one of CODING_FINISH_XXX indicating how the encoding - finished. - - DST_BYTES zero means that the source area and destination area are - overlapped, which means that we can produce encoded text until it - reaches at the head of the not-yet-encoded source text. - - Below is a template for these functions. */ -#if 0 -static void -encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes) - struct coding_system *coding; - unsigned char *source, *destination; - int src_bytes, dst_bytes; -{ - ... -} -#endif - -/*** COMMONLY USED MACROS ***/ - -/* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely - get one, two, and three bytes from the source text respectively. - If there are not enough bytes in the source, they jump to - `label_end_of_loop'. The caller should set variables `coding', - `src' and `src_end' to appropriate pointer in advance. These - macros are called from decoding routines `decode_coding_XXX', thus - it is assumed that the source text is unibyte. 
*/ - -#define ONE_MORE_BYTE(c1) \ - do { \ - if (src >= src_end) \ - { \ - coding->result = CODING_FINISH_INSUFFICIENT_SRC; \ - goto label_end_of_loop; \ - } \ - c1 = *src++; \ - } while (0) - -#define TWO_MORE_BYTES(c1, c2) \ - do { \ - if (src + 1 >= src_end) \ - { \ - coding->result = CODING_FINISH_INSUFFICIENT_SRC; \ - goto label_end_of_loop; \ - } \ - c1 = *src++; \ - c2 = *src++; \ - } while (0) - - -/* Like ONE_MORE_BYTE, but 8-bit bytes of data at SRC are in multibyte - form if MULTIBYTEP is nonzero. */ - -#define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep) \ - do { \ - if (src >= src_end) \ - { \ - coding->result = CODING_FINISH_INSUFFICIENT_SRC; \ - goto label_end_of_loop; \ - } \ - c1 = *src++; \ - if (multibytep && c1 == LEADING_CODE_8_BIT_CONTROL) \ - c1 = *src++ - 0x20; \ - } while (0) - -/* Set C to the next character at the source text pointed by `src'. - If there are not enough characters in the source, jump to - `label_end_of_loop'. The caller should set variables `coding' - `src', `src_end', and `translation_table' to appropriate pointers - in advance. This macro is used in encoding routines - `encode_coding_XXX', thus it assumes that the source text is in - multibyte form except for 8-bit characters. 8-bit characters are - in multibyte form if coding->src_multibyte is nonzero, else they - are represented by a single byte. */ - -#define ONE_MORE_CHAR(c) \ - do { \ - int len = src_end - src; \ - int bytes; \ - if (len <= 0) \ - { \ - coding->result = CODING_FINISH_INSUFFICIENT_SRC; \ - goto label_end_of_loop; \ - } \ - if (coding->src_multibyte \ - || UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes)) \ - c = STRING_CHAR_AND_LENGTH (src, len, bytes); \ - else \ - c = *src, bytes = 1; \ - if (!NILP (translation_table)) \ - c = translate_char (translation_table, c, -1, 0, 0); \ - src += bytes; \ - } while (0) - - -/* Produce a multibyte form of character C to `dst'. Jump to - `label_end_of_loop' if there's not enough space at `dst'. 
- - If we are now in the middle of a composition sequence, the decoded - character may be ALTCHAR (for the current composition). In that - case, the character goes to coding->cmp_data->data instead of - `dst'. - - This macro is used in decoding routines. */ - -#define EMIT_CHAR(c) \ - do { \ - if (! COMPOSING_P (coding) \ - || coding->composing == COMPOSITION_RELATIVE \ - || coding->composing == COMPOSITION_WITH_RULE) \ - { \ - int bytes = CHAR_BYTES (c); \ - if ((dst + bytes) > (dst_bytes ? dst_end : src)) \ - { \ - coding->result = CODING_FINISH_INSUFFICIENT_DST; \ - goto label_end_of_loop; \ - } \ - dst += CHAR_STRING (c, dst); \ - coding->produced_char++; \ - } \ - \ - if (COMPOSING_P (coding) \ - && coding->composing != COMPOSITION_RELATIVE) \ - { \ - CODING_ADD_COMPOSITION_COMPONENT (coding, c); \ - coding->composition_rule_follows \ - = coding->composing != COMPOSITION_WITH_ALTCHARS; \ - } \ - } while (0) - - -#define EMIT_ONE_BYTE(c) \ - do { \ - if (dst >= (dst_bytes ? dst_end : src)) \ - { \ - coding->result = CODING_FINISH_INSUFFICIENT_DST; \ - goto label_end_of_loop; \ - } \ - *dst++ = c; \ - } while (0) - -#define EMIT_TWO_BYTES(c1, c2) \ - do { \ - if (dst + 2 > (dst_bytes ? dst_end : src)) \ - { \ - coding->result = CODING_FINISH_INSUFFICIENT_DST; \ - goto label_end_of_loop; \ - } \ - *dst++ = c1, *dst++ = c2; \ - } while (0) - -#define EMIT_BYTES(from, to) \ - do { \ - if (dst + (to - from) > (dst_bytes ? dst_end : src)) \ - { \ - coding->result = CODING_FINISH_INSUFFICIENT_DST; \ - goto label_end_of_loop; \ - } \ - while (from < to) \ - *dst++ = *from++; \ - } while (0) - - -/*** 1. 
Preamble ***/ - -#ifdef emacs -#include <config.h> -#endif - -#include <stdio.h> - -#ifdef emacs - -#include "lisp.h" -#include "buffer.h" -#include "charset.h" -#include "composite.h" -#include "ccl.h" -#include "coding.h" -#include "window.h" - -#else /* not emacs */ - -#include "mulelib.h" - -#endif /* not emacs */ - -Lisp_Object Qcoding_system, Qeol_type; -Lisp_Object Qbuffer_file_coding_system; -Lisp_Object Qpost_read_conversion, Qpre_write_conversion; -Lisp_Object Qno_conversion, Qundecided; -Lisp_Object Qcoding_system_history; -Lisp_Object Qsafe_chars; -Lisp_Object Qvalid_codes; - -extern Lisp_Object Qinsert_file_contents, Qwrite_region; -Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument; -Lisp_Object Qstart_process, Qopen_network_stream; -Lisp_Object Qtarget_idx; - -Lisp_Object Vselect_safe_coding_system_function; - -/* Mnemonic string for each format of end-of-line. */ -Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac; -/* Mnemonic string to indicate format of end-of-line is not yet - decided. */ -Lisp_Object eol_mnemonic_undecided; - -/* Format of end-of-line decided by system. This is CODING_EOL_LF on - Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */ -int system_eol_type; - -#ifdef emacs - -Lisp_Object Vcoding_system_list, Vcoding_system_alist; - -Lisp_Object Qcoding_system_p, Qcoding_system_error; - -/* Coding system emacs-mule and raw-text are for converting only - end-of-line format. */ -Lisp_Object Qemacs_mule, Qraw_text; - -/* Coding-systems are handed between Emacs Lisp programs and C internal - routines by the following three variables. */ -/* Coding-system for reading files and receiving data from process. */ -Lisp_Object Vcoding_system_for_read; -/* Coding-system for writing files and sending data to process. */ -Lisp_Object Vcoding_system_for_write; -/* Coding-system actually used in the latest I/O. 
*/ -Lisp_Object Vlast_coding_system_used; - -/* A vector of length 256 which contains information about special - Latin codes (especially for dealing with Microsoft codes). */ -Lisp_Object Vlatin_extra_code_table; - -/* Flag to inhibit code conversion of end-of-line format. */ -int inhibit_eol_conversion; - -/* Flag to inhibit ISO2022 escape sequence detection. */ -int inhibit_iso_escape_detection; - -/* Flag to make buffer-file-coding-system inherit from process-coding. */ -int inherit_process_coding_system; - -/* Coding system to be used to encode text for terminal display. */ -struct coding_system terminal_coding; - -/* Coding system to be used to encode text for terminal display when - terminal coding system is nil. */ -struct coding_system safe_terminal_coding; - -/* Coding system of what is sent from terminal keyboard. */ -struct coding_system keyboard_coding; - -/* Default coding system to be used to write a file. */ -struct coding_system default_buffer_file_coding; - -Lisp_Object Vfile_coding_system_alist; -Lisp_Object Vprocess_coding_system_alist; -Lisp_Object Vnetwork_coding_system_alist; - -Lisp_Object Vlocale_coding_system; - -#endif /* emacs */ - -Lisp_Object Qcoding_category, Qcoding_category_index; - -/* List of symbols `coding-category-xxx' ordered by priority. */ -Lisp_Object Vcoding_category_list; - -/* Table of coding categories (Lisp symbols). */ -Lisp_Object Vcoding_category_table; - -/* Table of names of symbol for each coding-category. 
*/ -char *coding_category_name[CODING_CATEGORY_IDX_MAX] = { - "coding-category-emacs-mule", - "coding-category-sjis", - "coding-category-iso-7", - "coding-category-iso-7-tight", - "coding-category-iso-8-1", - "coding-category-iso-8-2", - "coding-category-iso-7-else", - "coding-category-iso-8-else", - "coding-category-ccl", - "coding-category-big5", - "coding-category-utf-8", - "coding-category-utf-16-be", - "coding-category-utf-16-le", - "coding-category-raw-text", - "coding-category-binary" -}; - -/* Table of pointers to coding systems corresponding to each coding - categories. */ -struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX]; - -/* Table of coding category masks. Nth element is a mask for a coding - category of which priority is Nth. */ -static -int coding_priorities[CODING_CATEGORY_IDX_MAX]; - -/* Flag to tell if we look up translation table on character code - conversion. */ -Lisp_Object Venable_character_translation; -/* Standard translation table to look up on decoding (reading). */ -Lisp_Object Vstandard_translation_table_for_decode; -/* Standard translation table to look up on encoding (writing). */ -Lisp_Object Vstandard_translation_table_for_encode; - -Lisp_Object Qtranslation_table; -Lisp_Object Qtranslation_table_id; -Lisp_Object Qtranslation_table_for_decode; -Lisp_Object Qtranslation_table_for_encode; - -/* Alist of charsets vs revision number. */ -Lisp_Object Vcharset_revision_alist; - -/* Default coding systems used for process I/O. */ -Lisp_Object Vdefault_process_coding_system; - -/* Global flag to tell that we can't call post-read-conversion and - pre-write-conversion functions. Usually the value is zero, but it - is set to 1 temporarily while such functions are running. This is - to avoid infinite recursive call. */ -static int inhibit_pre_post_conversion; - -/* Char-table containing safe coding systems of each character. 
*/ -Lisp_Object Vchar_coding_system_table; -Lisp_Object Qchar_coding_system; - -/* Return `safe-chars' property of coding system CODING. Don't check - validity of CODING. */ - -Lisp_Object -coding_safe_chars (coding) - struct coding_system *coding; -{ - Lisp_Object coding_spec, plist, safe_chars; - - coding_spec = Fget (coding->symbol, Qcoding_system); - plist = XVECTOR (coding_spec)->contents[3]; - safe_chars = Fplist_get (XVECTOR (coding_spec)->contents[3], Qsafe_chars); - return (CHAR_TABLE_P (safe_chars) ? safe_chars : Qt); -} - -#define CODING_SAFE_CHAR_P(safe_chars, c) \ - (EQ (safe_chars, Qt) || !NILP (CHAR_TABLE_REF (safe_chars, c))) - - -/*** 2. Emacs internal format (emacs-mule) handlers ***/ - -/* Emacs' internal format for representation of multiple character - sets is a kind of multi-byte encoding, i.e. characters are - represented by variable-length sequences of one-byte codes. - - ASCII characters and control characters (e.g. `tab', `newline') are - represented by one-byte sequences which are their ASCII codes, in - the range 0x00 through 0x7F. - - 8-bit characters of the range 0x80..0x9F are represented by - two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit - code + 0x20). - - 8-bit characters of the range 0xA0..0xFF are represented by - one-byte sequences which are their 8-bit code. - - The other characters are represented by a sequence of `base - leading-code', optional `extended leading-code', and one or two - `position-code's. The length of the sequence is determined by the - base leading-code. Leading-code takes the range 0x81 through 0x9D, - whereas extended leading-code and position-code take the range 0xA0 - through 0xFF. See `charset.h' for more details about leading-code - and position-code. 
- - --- CODE RANGE of Emacs' internal format --- - character set range - ------------- ----- - ascii 0x00..0x7F - eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF - eight-bit-graphic 0xA0..0xBF - ELSE 0x81..0x9D + [0xA0..0xFF]+ - --------------------------------------------- - - As this is the internal character representation, the format is - usually not used externally (i.e. in a file or in a data sent to a - process). But, it is possible to have a text externally in this - format (i.e. by encoding by the coding system `emacs-mule'). - - In that case, a sequence of one-byte codes has a slightly different - form. - - Firstly, all characters in eight-bit-control are represented by - one-byte sequences which are their 8-bit code. - - Next, character composition data are represented by the byte - sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ..., - where, - METHOD is 0xF0 plus one of composition method (enum - composition_method), - - BYTES is 0xA0 plus the byte length of these composition data, - - CHARS is 0xA0 plus the number of characters composed by these - data, - - COMPONENTs are characters of multibyte form or composition - rules encoded by two-byte of ASCII codes. - - In addition, for backward compatibility, the following formats are - also recognized as composition data on decoding. - - 0x80 MSEQ ... - 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ - - Here, - MSEQ is a multibyte form but in these special format: - ASCII: 0xA0 ASCII_CODE+0x80, - other: LEADING_CODE+0x20 FOLLOWING-BYTE ..., - RULE is a one byte code of the range 0xA0..0xF0 that - represents a composition rule. - */ - -enum emacs_code_class_type emacs_code_class[256]; - -/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". - Check if a text is encoded in Emacs' internal format. If it is, - return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. 
*/ - -static int -detect_coding_emacs_mule (src, src_end, multibytep) - unsigned char *src, *src_end; - int multibytep; -{ - unsigned char c; - int composing = 0; - /* Dummy for ONE_MORE_BYTE. */ - struct coding_system dummy_coding; - struct coding_system *coding = &dummy_coding; - - while (1) - { - ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); - - if (composing) - { - if (c < 0xA0) - composing = 0; - else if (c == 0xA0) - { - ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); - c &= 0x7F; - } - else - c -= 0x20; - } - - if (c < 0x20) - { - if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) - return 0; - } - else if (c >= 0x80 && c < 0xA0) - { - if (c == 0x80) - /* Old leading code for a composite character. */ - composing = 1; - else - { - unsigned char *src_base = src - 1; - int bytes; - - if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base, - bytes)) - return 0; - src = src_base + bytes; - } - } - } - label_end_of_loop: - return CODING_CATEGORY_MASK_EMACS_MULE; -} - - -/* Record the starting position START and METHOD of one composition. */ - -#define CODING_ADD_COMPOSITION_START(coding, start, method) \ - do { \ - struct composition_data *cmp_data = coding->cmp_data; \ - int *data = cmp_data->data + cmp_data->used; \ - coding->cmp_data_start = cmp_data->used; \ - data[0] = -1; \ - data[1] = cmp_data->char_offset + start; \ - data[3] = (int) method; \ - cmp_data->used += 4; \ - } while (0) - -/* Record the ending position END of the current composition. */ - -#define CODING_ADD_COMPOSITION_END(coding, end) \ - do { \ - struct composition_data *cmp_data = coding->cmp_data; \ - int *data = cmp_data->data + coding->cmp_data_start; \ - data[0] = cmp_data->used - coding->cmp_data_start; \ - data[2] = cmp_data->char_offset + end; \ - } while (0) - -/* Record one COMPONENT (alternate character or composition rule). 
*/ - -#define CODING_ADD_COMPOSITION_COMPONENT(coding, component) \ - (coding->cmp_data->data[coding->cmp_data->used++] = component) - - -/* Get one byte from a data pointed by SRC and increment SRC. If SRC - is not less than SRC_END, return -1 without incrementing Src. */ - -#define SAFE_ONE_MORE_BYTE() (src >= src_end ? -1 : *src++) - - -/* Decode a character represented as a component of composition - sequence of Emacs 20 style at SRC. Set C to that character, store - its multibyte form sequence at P, and set P to the end of that - sequence. If no valid character is found, set C to -1. */ - -#define DECODE_EMACS_MULE_COMPOSITION_CHAR(c, p) \ - do { \ - int bytes; \ - \ - c = SAFE_ONE_MORE_BYTE (); \ - if (c < 0) \ - break; \ - if (CHAR_HEAD_P (c)) \ - c = -1; \ - else if (c == 0xA0) \ - { \ - c = SAFE_ONE_MORE_BYTE (); \ - if (c < 0xA0) \ - c = -1; \ - else \ - { \ - c -= 0xA0; \ - *p++ = c; \ - } \ - } \ - else if (BASE_LEADING_CODE_P (c - 0x20)) \ - { \ - unsigned char *p0 = p; \ - \ - c -= 0x20; \ - *p++ = c; \ - bytes = BYTES_BY_CHAR_HEAD (c); \ - while (--bytes) \ - { \ - c = SAFE_ONE_MORE_BYTE (); \ - if (c < 0) \ - break; \ - *p++ = c; \ - } \ - if (UNIBYTE_STR_AS_MULTIBYTE_P (p0, p - p0, bytes)) \ - c = STRING_CHAR (p0, bytes); \ - else \ - c = -1; \ - } \ - else \ - c = -1; \ - } while (0) - - -/* Decode a composition rule represented as a component of composition - sequence of Emacs 20 style at SRC. Set C to the rule. If not - valid rule is found, set C to -1. */ - -#define DECODE_EMACS_MULE_COMPOSITION_RULE(c) \ - do { \ - c = SAFE_ONE_MORE_BYTE (); \ - c -= 0xA0; \ - if (c < 0 || c >= 81) \ - c = -1; \ - else \ - { \ - gref = c / 9, nref = c % 9; \ - c = COMPOSITION_ENCODE_RULE (gref, nref); \ - } \ - } while (0) - - -/* Decode composition sequence encoded by `emacs-mule' at the source - pointed by SRC. SRC_END is the end of source. Store information - of the composition in CODING->cmp_data. 
- - For backward compatibility, decode also a composition sequence of - Emacs 20 style. In that case, the composition sequence contains - characters that should be extracted into a buffer or string. Store - those characters at *DESTINATION in multibyte form. - - If we encounter an invalid byte sequence, return 0. - If we encounter an insufficient source or destination, or - insufficient space in CODING->cmp_data, return 1. - Otherwise, return consumed bytes in the source. - -*/ -static INLINE int -decode_composition_emacs_mule (coding, src, src_end, - destination, dst_end, dst_bytes) - struct coding_system *coding; - unsigned char *src, *src_end, **destination, *dst_end; - int dst_bytes; -{ - unsigned char *dst = *destination; - int method, data_len, nchars; - unsigned char *src_base = src++; - /* Store components of composition. */ - int component[COMPOSITION_DATA_MAX_BUNCH_LENGTH]; - int ncomponent; - /* Store multibyte form of characters to be composed. This is for - Emacs 20 style composition sequence. */ - unsigned char buf[MAX_COMPOSITION_COMPONENTS * MAX_MULTIBYTE_LENGTH]; - unsigned char *bufp = buf; - int c, i, gref, nref; - - if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH - >= COMPOSITION_DATA_SIZE) - { - coding->result = CODING_FINISH_INSUFFICIENT_CMP; - return -1; - } - - ONE_MORE_BYTE (c); - if (c - 0xF0 >= COMPOSITION_RELATIVE - && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS) - { - int with_rule; - - method = c - 0xF0; - with_rule = (method == COMPOSITION_WITH_RULE - || method == COMPOSITION_WITH_RULE_ALTCHARS); - ONE_MORE_BYTE (c); - data_len = c - 0xA0; - if (data_len < 4 - || src_base + data_len > src_end) - return 0; - ONE_MORE_BYTE (c); - nchars = c - 0xA0; - if (c < 1) - return 0; - for (ncomponent = 0; src < src_base + data_len; ncomponent++) - { - /* If it is longer than this, it can't be valid. 
*/ - if (ncomponent >= COMPOSITION_DATA_MAX_BUNCH_LENGTH) - return 0; - - if (ncomponent % 2 && with_rule) - { - ONE_MORE_BYTE (gref); - gref -= 32; - ONE_MORE_BYTE (nref); - nref -= 32; - c = COMPOSITION_ENCODE_RULE (gref, nref); - } - else - { - int bytes; - if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)) - c = STRING_CHAR (src, bytes); - else - c = *src, bytes = 1; - src += bytes; - } - component[ncomponent] = c; - } - } - else - { - /* This may be an old Emacs 20 style format. See the comment at - the section 2 of this file. */ - while (src < src_end && !CHAR_HEAD_P (*src)) src++; - if (src == src_end - && !(coding->mode & CODING_MODE_LAST_BLOCK)) - goto label_end_of_loop; - - src_end = src; - src = src_base + 1; - if (c < 0xC0) - { - method = COMPOSITION_RELATIVE; - for (ncomponent = 0; ncomponent < MAX_COMPOSITION_COMPONENTS;) - { - DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp); - if (c < 0) - break; - component[ncomponent++] = c; - } - if (ncomponent < 2) - return 0; - nchars = ncomponent; - } - else if (c == 0xFF) - { - method = COMPOSITION_WITH_RULE; - src++; - DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp); - if (c < 0) - return 0; - component[0] = c; - for (ncomponent = 1; - ncomponent < MAX_COMPOSITION_COMPONENTS * 2 - 1;) - { - DECODE_EMACS_MULE_COMPOSITION_RULE (c); - if (c < 0) - break; - component[ncomponent++] = c; - DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp); - if (c < 0) - break; - component[ncomponent++] = c; - } - if (ncomponent < 3) - return 0; - nchars = (ncomponent + 1) / 2; - } - else - return 0; - } - - if (buf == bufp || dst + (bufp - buf) <= (dst_bytes ? 
dst_end : src)) - { - CODING_ADD_COMPOSITION_START (coding, coding->produced_char, method); - for (i = 0; i < ncomponent; i++) - CODING_ADD_COMPOSITION_COMPONENT (coding, component[i]); - CODING_ADD_COMPOSITION_END (coding, coding->produced_char + nchars); - if (buf < bufp) - { - unsigned char *p = buf; - EMIT_BYTES (p, bufp); - *destination += bufp - buf; - coding->produced_char += nchars; - } - return (src - src_base); - } - label_end_of_loop: - return -1; -} - -/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */ - -static void -decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes) - struct coding_system *coding; - unsigned char *source, *destination; - int src_bytes, dst_bytes; -{ - unsigned char *src = source; - unsigned char *src_end = source + src_bytes; - unsigned char *dst = destination; - unsigned char *dst_end = destination + dst_bytes; - /* SRC_BASE remembers the start position in source in each loop. - The loop will be exited when there's not enough source code, or - when there's not enough destination area to produce a - character. 
*/ - unsigned char *src_base; - - coding->produced_char = 0; - while ((src_base = src) < src_end) - { - unsigned char tmp[MAX_MULTIBYTE_LENGTH], *p; - int bytes; - - if (*src == '\r') - { - int c = *src++; - - if (coding->eol_type == CODING_EOL_CR) - c = '\n'; - else if (coding->eol_type == CODING_EOL_CRLF) - { - ONE_MORE_BYTE (c); - if (c != '\n') - { - if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL) - { - coding->result = CODING_FINISH_INCONSISTENT_EOL; - goto label_end_of_loop; - } - src--; - c = '\r'; - } - } - *dst++ = c; - coding->produced_char++; - continue; - } - else if (*src == '\n') - { - if ((coding->eol_type == CODING_EOL_CR - || coding->eol_type == CODING_EOL_CRLF) - && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL) - { - coding->result = CODING_FINISH_INCONSISTENT_EOL; - goto label_end_of_loop; - } - *dst++ = *src++; - coding->produced_char++; - continue; - } - else if (*src == 0x80) - { - /* Start of composition data. */ - int consumed = decode_composition_emacs_mule (coding, src, src_end, - &dst, dst_end, - dst_bytes); - if (consumed < 0) - goto label_end_of_loop; - else if (consumed > 0) - { - src += consumed; - continue; - } - bytes = CHAR_STRING (*src, tmp); - p = tmp; - src++; - } - else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)) - { - p = src; - src += bytes; - } - else - { - bytes = CHAR_STRING (*src, tmp); - p = tmp; - src++; - } - if (dst + bytes >= (dst_bytes ? dst_end : src)) - { - coding->result = CODING_FINISH_INSUFFICIENT_DST; - break; - } - while (bytes--) *dst++ = *p++; - coding->produced_char++; - } - label_end_of_loop: - coding->consumed = coding->consumed_char = src_base - source; - coding->produced = dst - destination; -} - - -/* Encode composition data stored at DATA into a special byte sequence - starting by 0x80. Update CODING->cmp_data_start and maybe - CODING->cmp_data for the next call. 
*/ - -#define ENCODE_COMPOSITION_EMACS_MULE(coding, data) \ - do { \ - unsigned char buf[1024], *p0 = buf, *p; \ - int len = data[0]; \ - int i; \ - \ - buf[0] = 0x80; \ - buf[1] = 0xF0 + data[3]; /* METHOD */ \ - buf[3] = 0xA0 + (data[2] - data[1]); /* COMPOSED-CHARS */ \ - p = buf + 4; \ - if (data[3] == COMPOSITION_WITH_RULE \ - || data[3] == COMPOSITION_WITH_RULE_ALTCHARS) \ - { \ - p += CHAR_STRING (data[4], p); \ - for (i = 5; i < len; i += 2) \ - { \ - int gref, nref; \ - COMPOSITION_DECODE_RULE (data[i], gref, nref); \ - *p++ = 0x20 + gref; \ - *p++ = 0x20 + nref; \ - p += CHAR_STRING (data[i + 1], p); \ - } \ - } \ - else \ - { \ - for (i = 4; i < len; i++) \ - p += CHAR_STRING (data[i], p); \ - } \ - buf[2] = 0xA0 + (p - buf); /* COMPONENTS-BYTES */ \ - \ - if (dst + (p - buf) + 4 > (dst_bytes ? dst_end : src)) \ - { \ - coding->result = CODING_FINISH_INSUFFICIENT_DST; \ - goto label_end_of_loop; \ - } \ - while (p0 < p) \ - *dst++ = *p0++; \ - coding->cmp_data_start += data[0]; \ - if (coding->cmp_data_start == coding->cmp_data->used \ - && coding->cmp_data->next) \ - { \ - coding->cmp_data = coding->cmp_data->next; \ - coding->cmp_data_start = 0; \ - } \ - } while (0) - - -static void encode_eol P_ ((struct coding_system *, unsigned char *, - unsigned char *, int, int)); - -static void -encode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes) - struct coding_system *coding; - unsigned char *source, *destination; - int src_bytes, dst_bytes; -{ - unsigned char *src = source; - unsigned char *src_end = source + src_bytes; - unsigned char *dst = destination; - unsigned char *dst_end = destination + dst_bytes; - unsigned char *src_base; - int c; - int char_offset; - int *data; - - Lisp_Object translation_table; - - translation_table = Qnil; - - /* Optimization for the case that there's no composition. 
*/ - if (!coding->cmp_data || coding->cmp_data->used == 0) - { - encode_eol (coding, source, destination, src_bytes, dst_bytes); - return; - } - - char_offset = coding->cmp_data->char_offset; - data = coding->cmp_data->data + coding->cmp_data_start; - while (1) - { - src_base = src; - - /* If SRC starts a composition, encode the information about the - composition in advance. */ - if (coding->cmp_data_start < coding->cmp_data->used - && char_offset + coding->consumed_char == data[1]) - { - ENCODE_COMPOSITION_EMACS_MULE (coding, data); - char_offset = coding->cmp_data->char_offset; - data = coding->cmp_data->data + coding->cmp_data_start; - } - - ONE_MORE_CHAR (c); - if (c == '\n' && (coding->eol_type == CODING_EOL_CRLF - || coding->eol_type == CODING_EOL_CR)) - { - if (coding->eol_type == CODING_EOL_CRLF) - EMIT_TWO_BYTES ('\r', c); - else - EMIT_ONE_BYTE ('\r'); - } - else if (SINGLE_BYTE_CHAR_P (c)) - EMIT_ONE_BYTE (c); - else - EMIT_BYTES (src_base, src); - coding->consumed_char++; - } - label_end_of_loop: - coding->consumed = src_base - source; - coding->produced = coding->produced_char = dst - destination; - return; -} - - -/*** 3. ISO2022 handlers ***/ - -/* The following note describes the coding system ISO2022 briefly. - Since the intention of this note is to help understand the - functions in this file, some parts are NOT ACCURATE or are OVERLY - SIMPLIFIED. For thorough understanding, please refer to the - original document of ISO2022. This is equivalent to the standard - ECMA-35, obtainable from (*). - - ISO2022 provides many mechanisms to encode several character sets - in 7-bit and 8-bit environments. For 7-bit environments, all text - is encoded using bytes less than 128. This may make the encoded - text a little bit longer, but the text passes more easily through - several types of gateway, some of which strip off the MSB (Most - Significant Bit). - - There are two kinds of character sets: control character sets and - graphic character sets. 
The former contain control characters such - as `newline' and `escape' to provide control functions (control - functions are also provided by escape sequences). The latter - contain graphic characters such as 'A' and '-'. Emacs recognizes - two control character sets and many graphic character sets. - - Graphic character sets are classified into one of the following - four classes, according to the number of bytes (DIMENSION) and - number of characters in one dimension (CHARS) of the set: - - DIMENSION1_CHARS94 - - DIMENSION1_CHARS96 - - DIMENSION2_CHARS94 - - DIMENSION2_CHARS96 - - In addition, each character set is assigned an identification tag, - unique for each set, called the "final character" (denoted as <F> - hereafter). The <F> of each character set is decided by ECMA(*) - when it is registered in ISO. The code range of <F> is 0x30..0x7F - (0x30..0x3F are for private use only). - - Note (*): ECMA = European Computer Manufacturers Association - - Here are examples of graphic character sets [NAME(<F>)]: - o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ... - o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ... - o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ... - o DIMENSION2_CHARS96 -- none for the moment - - A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR. - C0 [0x00..0x1F] -- control character plane 0 - GL [0x20..0x7F] -- graphic character plane 0 - C1 [0x80..0x9F] -- control character plane 1 - GR [0xA0..0xFF] -- graphic character plane 1 - - A control character set is directly designated and invoked to C0 or - C1 by an escape sequence. The most common case is that: - - ISO646's control character set is designated/invoked to C0, and - - ISO6429's control character set is designated/invoked to C1, - and usually these designations/invocations are omitted in encoded - text.
In a 7-bit environment, only C0 can be used, and a control - character for C1 is encoded by an appropriate escape sequence to - fit into the environment. All control characters for C1 are - defined to have corresponding escape sequences. - - A graphic character set is at first designated to one of four - graphic registers (G0 through G3), then these graphic registers are - invoked to GL or GR. These designations and invocations can be - done independently. The most common case is that G0 is invoked to - GL, G1 is invoked to GR, and ASCII is designated to G0. Usually - these invocations and designations are omitted in encoded text. - In a 7-bit environment, only GL can be used. - - When a graphic character set of CHARS94 is invoked to GL, codes - 0x20 and 0x7F of the GL area work as control characters SPACE and - DEL respectively, and codes 0xA0 and 0xFF of the GR area should not - be used. - - There are two ways of invocation: locking-shift and single-shift. - With locking-shift, the invocation lasts until the next different - invocation, whereas with single-shift, the invocation affects the - following character only and doesn't affect the locking-shift - state. 
Invocations are done by the following control characters or - escape sequences: - - ---------------------------------------------------------------------- - abbrev function cntrl escape seq description - ---------------------------------------------------------------------- - SI/LS0 (shift-in) 0x0F none invoke G0 into GL - SO/LS1 (shift-out) 0x0E none invoke G1 into GL - LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL - LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL - LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*) - LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*) - LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*) - SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char - SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char - ---------------------------------------------------------------------- - (*) These are not used by any known coding system. - - Control characters for these functions are defined by macros - ISO_CODE_XXX in `coding.h'. - - Designations are done by the following escape sequences: - ---------------------------------------------------------------------- - escape sequence description - ---------------------------------------------------------------------- - ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0 - ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1 - ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2 - ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3 - ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*) - ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1 - ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2 - ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3 - ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**) - ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1 - ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2 - ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3 - ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*) - ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1 - ESC '$' '.' <F>
designate DIMENSION2_CHARS96<F> to G2 - ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3 - ---------------------------------------------------------------------- - - In this list, "DIMENSION1_CHARS94<F>" means a graphic character set - of dimension 1, chars 94, and final character <F>, etc... - - Note (*): Although these designations are not allowed in ISO2022, - Emacs accepts them on decoding, and produces them on encoding - CHARS96 character sets in a coding system which is characterized as - 7-bit environment, non-locking-shift, and non-single-shift. - - Note (**): If <F> is '@', 'A', or 'B', the intermediate character - '(' can be omitted. We refer to this as "short-form" hereafter. - - Now you may notice that there are a lot of ways of encoding the - same multilingual text in ISO2022. Actually, there exist many - coding systems such as Compound Text (used in X11's inter client - communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR - (used in Korean Internet), EUC (Extended UNIX Code, used in Asian - localized platforms), and all of these are variants of ISO2022. - - In addition to the above, Emacs handles two more kinds of escape - sequences: ISO6429's direction specification and Emacs' private - sequence for specifying character composition. - - ISO6429's direction specification takes the following form: - o CSI ']' -- end of the current direction - o CSI '0' ']' -- end of the current direction - o CSI '1' ']' -- start of left-to-right text - o CSI '2' ']' -- start of right-to-left text - The control character CSI (0x9B: control sequence introducer) is - abbreviated to the escape sequence ESC '[' in a 7-bit environment.
- - Character composition specification takes the following form: - o ESC '0' -- start relative composition - o ESC '1' -- end composition - o ESC '2' -- start rule-base composition (*) - o ESC '3' -- start relative composition with alternate chars (**) - o ESC '4' -- start rule-base composition with alternate chars (**) - Since these are not standard escape sequences of any ISO standard, - the use of them with these meanings is restricted to Emacs only. - - (*) This form is used only in Emacs 20.5 and older versions, - but the newer versions can safely decode it. - (**) This form is used only in Emacs 21.1 and newer versions, - and the older versions can't decode it. - - Here's a list of example usages of these composition escape - sequences (categorized by `enum composition_method'). - - COMPOSITION_RELATIVE: - ESC 0 CHAR [ CHAR ] ESC 1 - COMPOSITION_WITH_RULE: - ESC 2 CHAR [ RULE CHAR ] ESC 1 - COMPOSITION_WITH_ALTCHARS: - ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 - COMPOSITION_WITH_RULE_ALTCHARS: - ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */ - -enum iso_code_class_type iso_code_class[256]; - -#define CHARSET_OK(idx, charset, c) \ - (coding_system_table[idx] \ - && (charset == CHARSET_ASCII \ - || (safe_chars = coding_safe_chars (coding_system_table[idx]), \ - CODING_SAFE_CHAR_P (safe_chars, c))) \ - && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx], \ - charset) \ - != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)) - -#define SHIFT_OUT_OK(idx) \ - (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0) - -/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". - Check if a text is encoded in ISO2022. If it is, return an - integer in which appropriate flag bits any of: - CODING_CATEGORY_MASK_ISO_7 - CODING_CATEGORY_MASK_ISO_7_TIGHT - CODING_CATEGORY_MASK_ISO_8_1 - CODING_CATEGORY_MASK_ISO_8_2 - CODING_CATEGORY_MASK_ISO_7_ELSE - CODING_CATEGORY_MASK_ISO_8_ELSE - are set. 
If a code which should never appear in ISO2022 is found, - returns 0. */ - -static int -detect_coding_iso2022 (src, src_end, multibytep) - unsigned char *src, *src_end; - int multibytep; -{ - int mask = CODING_CATEGORY_MASK_ISO; - int mask_found = 0; - int reg[4], shift_out = 0, single_shifting = 0; - int c, c1, charset; - /* Dummy for ONE_MORE_BYTE. */ - struct coding_system dummy_coding; - struct coding_system *coding = &dummy_coding; - Lisp_Object safe_chars; - - reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1; - while (mask && src < src_end) - { - ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); - switch (c) - { - case ISO_CODE_ESC: - if (inhibit_iso_escape_detection) - break; - single_shifting = 0; - ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); - if (c >= '(' && c <= '/') - { - /* Designation sequence for a charset of dimension 1. */ - ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep); - if (c1 < ' ' || c1 >= 0x80 - || (charset = iso_charset_table[0][c >= ','][c1]) < 0) - /* Invalid designation sequence. Just ignore. */ - break; - reg[(c - '(') % 4] = charset; - } - else if (c == '$') - { - /* Designation sequence for a charset of dimension 2. */ - ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); - if (c >= '@' && c <= 'B') - /* Designation for JISX0208.1978, GB2312, or JISX0208. */ - reg[0] = charset = iso_charset_table[1][0][c]; - else if (c >= '(' && c <= '/') - { - ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep); - if (c1 < ' ' || c1 >= 0x80 - || (charset = iso_charset_table[1][c >= ','][c1]) < 0) - /* Invalid designation sequence. Just ignore. */ - break; - reg[(c - '(') % 4] = charset; - } - else - /* Invalid designation sequence. Just ignore. */ - break; - } - else if (c == 'N' || c == 'O') - { - /* ESC for SS2 or SS3. */ - mask &= CODING_CATEGORY_MASK_ISO_7_ELSE; - break; - } - else if (c >= '0' && c <= '4') - { - /* ESC for start/end composition. */ - mask_found |= CODING_CATEGORY_MASK_ISO; - break; - } - else - /* Invalid escape sequence. 
Just ignore. */ - break; - - /* We found a valid designation sequence for CHARSET. */ - mask &= ~CODING_CATEGORY_MASK_ISO_8BIT; - c = MAKE_CHAR (charset, 0, 0); - if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset, c)) - mask_found |= CODING_CATEGORY_MASK_ISO_7; - else - mask &= ~CODING_CATEGORY_MASK_ISO_7; - if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset, c)) - mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT; - else - mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT; - if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset, c)) - mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE; - else - mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE; - if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset, c)) - mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE; - else - mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE; - break; - - case ISO_CODE_SO: - if (inhibit_iso_escape_detection) - break; - single_shifting = 0; - if (shift_out == 0 - && (reg[1] >= 0 - || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE) - || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE))) - { - /* Locking shift out. */ - mask &= ~CODING_CATEGORY_MASK_ISO_7BIT; - mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT; - } - break; - - case ISO_CODE_SI: - if (inhibit_iso_escape_detection) - break; - single_shifting = 0; - if (shift_out == 1) - { - /* Locking shift in. 
*/ - mask &= ~CODING_CATEGORY_MASK_ISO_7BIT; - mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT; - } - break; - - case ISO_CODE_CSI: - single_shifting = 0; - case ISO_CODE_SS2: - case ISO_CODE_SS3: - { - int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE; - - if (inhibit_iso_escape_detection) - break; - if (c != ISO_CODE_CSI) - { - if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags - & CODING_FLAG_ISO_SINGLE_SHIFT) - newmask |= CODING_CATEGORY_MASK_ISO_8_1; - if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags - & CODING_FLAG_ISO_SINGLE_SHIFT) - newmask |= CODING_CATEGORY_MASK_ISO_8_2; - single_shifting = 1; - } - if (VECTORP (Vlatin_extra_code_table) - && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])) - { - if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags - & CODING_FLAG_ISO_LATIN_EXTRA) - newmask |= CODING_CATEGORY_MASK_ISO_8_1; - if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags - & CODING_FLAG_ISO_LATIN_EXTRA) - newmask |= CODING_CATEGORY_MASK_ISO_8_2; - } - mask &= newmask; - mask_found |= newmask; - } - break; - - default: - if (c < 0x80) - { - single_shifting = 0; - break; - } - else if (c < 0xA0) - { - single_shifting = 0; - if (VECTORP (Vlatin_extra_code_table) - && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])) - { - int newmask = 0; - - if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags - & CODING_FLAG_ISO_LATIN_EXTRA) - newmask |= CODING_CATEGORY_MASK_ISO_8_1; - if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags - & CODING_FLAG_ISO_LATIN_EXTRA) - newmask |= CODING_CATEGORY_MASK_ISO_8_2; - mask &= newmask; - mask_found |= newmask; - } - else - return 0; - } - else - { - mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT - | CODING_CATEGORY_MASK_ISO_7_ELSE); - mask_found |= CODING_CATEGORY_MASK_ISO_8_1; - /* Check the length of succeeding codes of the range - 0xA0..0FF. If the byte length is odd, we exclude - CODING_CATEGORY_MASK_ISO_8_2. We can check this only - when we are not single shifting. 
*/ - if (!single_shifting - && mask & CODING_CATEGORY_MASK_ISO_8_2) - { - int i = 1; - while (src < src_end) - { - ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); - if (c < 0xA0) - break; - i++; - } - - if (i & 1 && src < src_end) - mask &= ~CODING_CATEGORY_MASK_ISO_8_2; - else - mask_found |= CODING_CATEGORY_MASK_ISO_8_2; - } - } - break; - } - } - label_end_of_loop: - return (mask & mask_found); -} - -/* Decode a character of which charset is CHARSET, the 1st position - code is C1, the 2nd position code is C2, and return the decoded - character code. If the variable `translation_table' is non-nil, - returned the translated code. */ - -#define DECODE_ISO_CHARACTER(charset, c1, c2) \ - (NILP (translation_table) \ - ? MAKE_CHAR (charset, c1, c2) \ - : translate_char (translation_table, -1, charset, c1, c2)) - -/* Set designation state into CODING. */ -#define DECODE_DESIGNATION(reg, dimension, chars, final_char) \ - do { \ - int charset, c; \ - \ - if (final_char < '0' || final_char >= 128) \ - goto label_invalid_code; \ - charset = ISO_CHARSET_TABLE (make_number (dimension), \ - make_number (chars), \ - make_number (final_char)); \ - c = MAKE_CHAR (charset, 0, 0); \ - if (charset >= 0 \ - && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == reg \ - || CODING_SAFE_CHAR_P (safe_chars, c))) \ - { \ - if (coding->spec.iso2022.last_invalid_designation_register == 0 \ - && reg == 0 \ - && charset == CHARSET_ASCII) \ - { \ - /* We should insert this designation sequence as is so \ - that it is surely written back to a file. 
*/ \ - coding->spec.iso2022.last_invalid_designation_register = -1; \ - goto label_invalid_code; \ - } \ - coding->spec.iso2022.last_invalid_designation_register = -1; \ - if ((coding->mode & CODING_MODE_DIRECTION) \ - && CHARSET_REVERSE_CHARSET (charset) >= 0) \ - charset = CHARSET_REVERSE_CHARSET (charset); \ - CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \ - } \ - else \ - { \ - coding->spec.iso2022.last_invalid_designation_register = reg; \ - goto label_invalid_code; \ - } \ - } while (0) - -/* Allocate a memory block for storing information about compositions. - The block is chained to the already allocated blocks. */ - -void -coding_allocate_composition_data (coding, char_offset) - struct coding_system *coding; - int char_offset; -{ - struct composition_data *cmp_data - = (struct composition_data *) xmalloc (sizeof *cmp_data); - - cmp_data->char_offset = char_offset; - cmp_data->used = 0; - cmp_data->prev = coding->cmp_data; - cmp_data->next = NULL; - if (coding->cmp_data) - coding->cmp_data->next = cmp_data; - coding->cmp_data = cmp_data; - coding->cmp_data_start = 0; -} - -/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4. - ESC 0 : relative composition : ESC 0 CHAR ... ESC 1 - ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1 - ESC 3 : altchar composition : ESC 3 ALT ... ESC 0 CHAR ... ESC 1 - ESC 4 : alt&rule composition : ESC 4 ALT RULE .. ALT ESC 0 CHAR ... ESC 1 - */ - -#define DECODE_COMPOSITION_START(c1) \ - do { \ - if (coding->composing == COMPOSITION_DISABLED) \ - { \ - *dst++ = ISO_CODE_ESC; \ - *dst++ = c1 & 0x7f; \ - coding->produced_char += 2; \ - } \ - else if (!COMPOSING_P (coding)) \ - { \ - /* This is surely the start of a composition. We must be sure \ - that coding->cmp_data has enough space to store the \ - information about the composition. 
If not, terminate the \ - current decoding loop, allocate one more memory block for \ - coding->cmp_data in the caller, then start the decoding \ - loop again. We can't allocate memory here directly because \ - it may cause buffer/string relocation. */ \ - if (!coding->cmp_data \ - || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \ - >= COMPOSITION_DATA_SIZE)) \ - { \ - coding->result = CODING_FINISH_INSUFFICIENT_CMP; \ - goto label_end_of_loop; \ - } \ - coding->composing = (c1 == '0' ? COMPOSITION_RELATIVE \ - : c1 == '2' ? COMPOSITION_WITH_RULE \ - : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \ - : COMPOSITION_WITH_RULE_ALTCHARS); \ - CODING_ADD_COMPOSITION_START (coding, coding->produced_char, \ - coding->composing); \ - coding->composition_rule_follows = 0; \ - } \ - else \ - { \ - /* We are already handling a composition. If the method is \ - the following two, the codes following the current escape \ - sequence are actual characters stored in a buffer. */ \ - if (coding->composing == COMPOSITION_WITH_ALTCHARS \ - || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS) \ - { \ - coding->composing = COMPOSITION_RELATIVE; \ - coding->composition_rule_follows = 0; \ - } \ - } \ - } while (0) - -/* Handle composition end sequence ESC 1. */ - -#define DECODE_COMPOSITION_END(c1) \ - do { \ - if (! COMPOSING_P (coding)) \ - { \ - *dst++ = ISO_CODE_ESC; \ - *dst++ = c1; \ - coding->produced_char += 2; \ - } \ - else \ - { \ - CODING_ADD_COMPOSITION_END (coding, coding->produced_char); \ - coding->composing = COMPOSITION_NO; \ - } \ - } while (0) - -/* Decode a composition rule from the byte C1 (and maybe one more byte - from SRC) and store one encoded composition rule in - coding->cmp_data. 
*/ - -#define DECODE_COMPOSITION_RULE(c1) \ - do { \ - int rule = 0; \ - (c1) -= 32; \ - if (c1 < 81) /* old format (before ver.21) */ \ - { \ - int gref = (c1) / 9; \ - int nref = (c1) % 9; \ - if (gref == 4) gref = 10; \ - if (nref == 4) nref = 10; \ - rule = COMPOSITION_ENCODE_RULE (gref, nref); \ - } \ - else if (c1 < 93) /* new format (after ver.21) */ \ - { \ - ONE_MORE_BYTE (c2); \ - rule = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \ - } \ - CODING_ADD_COMPOSITION_COMPONENT (coding, rule); \ - coding->composition_rule_follows = 0; \ - } while (0) - - -/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */ - -static void -decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes) - struct coding_system *coding; - unsigned char *source, *destination; - int src_bytes, dst_bytes; -{ - unsigned char *src = source; - unsigned char *src_end = source + src_bytes; - unsigned char *dst = destination; - unsigned char *dst_end = destination + dst_bytes; - /* Charsets invoked to graphic plane 0 and 1 respectively. */ - int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0); - int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1); - /* SRC_BASE remembers the start position in source in each loop. - The loop will be exited when there's not enough source code - (within macro ONE_MORE_BYTE), or when there's not enough - destination area to produce a character (within macro - EMIT_CHAR). 
*/ - unsigned char *src_base; - int c, charset; - Lisp_Object translation_table; - Lisp_Object safe_chars; - - safe_chars = coding_safe_chars (coding); - - if (NILP (Venable_character_translation)) - translation_table = Qnil; - else - { - translation_table = coding->translation_table_for_decode; - if (NILP (translation_table)) - translation_table = Vstandard_translation_table_for_decode; - } - - coding->result = CODING_FINISH_NORMAL; - - while (1) - { - int c1, c2; - - src_base = src; - ONE_MORE_BYTE (c1); - - /* We produce no character or one character. */ - switch (iso_code_class [c1]) - { - case ISO_0x20_or_0x7F: - if (COMPOSING_P (coding) && coding->composition_rule_follows) - { - DECODE_COMPOSITION_RULE (c1); - continue; - } - if (charset0 < 0 || CHARSET_CHARS (charset0) == 94) - { - /* This is SPACE or DEL. */ - charset = CHARSET_ASCII; - break; - } - /* This is a graphic character, we fall down ... */ - - case ISO_graphic_plane_0: - if (COMPOSING_P (coding) && coding->composition_rule_follows) - { - DECODE_COMPOSITION_RULE (c1); - continue; - } - charset = charset0; - break; - - case ISO_0xA0_or_0xFF: - if (charset1 < 0 || CHARSET_CHARS (charset1) == 94 - || coding->flags & CODING_FLAG_ISO_SEVEN_BITS) - goto label_invalid_code; - /* This is a graphic character, we fall down ... */ - - case ISO_graphic_plane_1: - if (charset1 < 0) - goto label_invalid_code; - charset = charset1; - break; - - case ISO_control_0: - if (COMPOSING_P (coding)) - DECODE_COMPOSITION_END ('1'); - - /* All ISO2022 control characters in this class have the - same representation in Emacs internal format. 
*/ - if (c1 == '\n' - && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL) - && (coding->eol_type == CODING_EOL_CR - || coding->eol_type == CODING_EOL_CRLF)) - { - coding->result = CODING_FINISH_INCONSISTENT_EOL; - goto label_end_of_loop; - } - charset = CHARSET_ASCII; - break; - - case ISO_control_1: - if (COMPOSING_P (coding)) - DECODE_COMPOSITION_END ('1'); - goto label_invalid_code; - - case ISO_carriage_return: - if (COMPOSING_P (coding)) - DECODE_COMPOSITION_END ('1'); - - if (coding->eol_type == CODING_EOL_CR) - c1 = '\n'; - else if (coding->eol_type == CODING_EOL_CRLF) - { - ONE_MORE_BYTE (c1); - if (c1 != ISO_CODE_LF) - { - if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL) - { - coding->result = CODING_FINISH_INCONSISTENT_EOL; - goto label_end_of_loop; - } - src--; - c1 = '\r'; - } - } - charset = CHARSET_ASCII; - break; - - case ISO_shift_out: - if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT) - || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0) - goto label_invalid_code; - CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; - charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0); - continue; - - case ISO_shift_in: - if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)) - goto label_invalid_code; - CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; - charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0); - continue; - - case ISO_single_shift_2_7: - case ISO_single_shift_2: - if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)) - goto label_invalid_code; - /* SS2 is handled as an escape sequence of ESC 'N' */ - c1 = 'N'; - goto label_escape_sequence; - - case ISO_single_shift_3: - if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)) - goto label_invalid_code; - /* SS2 is handled as an escape sequence of ESC 'O' */ - c1 = 'O'; - goto label_escape_sequence; - - case ISO_control_sequence_introducer: - /* CSI is handled as an escape sequence of ESC '[' ... 
*/ - c1 = '['; - goto label_escape_sequence; - - case ISO_escape: - ONE_MORE_BYTE (c1); - label_escape_sequence: - /* Escape sequences handled by Emacs are invocation, - designation, direction specification, and character - composition specification. */ - switch (c1) - { - case '&': /* revision of following character set */ - ONE_MORE_BYTE (c1); - if (!(c1 >= '@' && c1 <= '~')) - goto label_invalid_code; - ONE_MORE_BYTE (c1); - if (c1 != ISO_CODE_ESC) - goto label_invalid_code; - ONE_MORE_BYTE (c1); - goto label_escape_sequence; - - case '$': /* designation of 2-byte character set */ - if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION)) - goto label_invalid_code; - ONE_MORE_BYTE (c1); - if (c1 >= '@' && c1 <= 'B') - { /* designation of JISX0208.1978, GB2312.1980, - or JISX0208.1980 */ - DECODE_DESIGNATION (0, 2, 94, c1); - } - else if (c1 >= 0x28 && c1 <= 0x2B) - { /* designation of DIMENSION2_CHARS94 character set */ - ONE_MORE_BYTE (c2); - DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2); - } - else if (c1 >= 0x2C && c1 <= 0x2F) - { /* designation of DIMENSION2_CHARS96 character set */ - ONE_MORE_BYTE (c2); - DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2); - } - else - goto label_invalid_code; - /* We must update these variables now. */ - charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0); - charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1); - continue; - - case 'n': /* invocation of locking-shift-2 */ - if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT) - || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0) - goto label_invalid_code; - CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; - charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0); - continue; - - case 'o': /* invocation of locking-shift-3 */ - if (! 
(coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT) - || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0) - goto label_invalid_code; - CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; - charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0); - continue; - - case 'N': /* invocation of single-shift-2 */ - if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT) - || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0) - goto label_invalid_code; - charset = CODING_SPEC_ISO_DESIGNATION (coding, 2); - ONE_MORE_BYTE (c1); - if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)) - goto label_invalid_code; - break; - - case 'O': /* invocation of single-shift-3 */ - if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT) - || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0) - goto label_invalid_code; - charset = CODING_SPEC_ISO_DESIGNATION (coding, 3); - ONE_MORE_BYTE (c1); - if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)) - goto label_invalid_code; - break; - - case '0': case '2': case '3': case '4': /* start composition */ - DECODE_COMPOSITION_START (c1); - continue; - - case '1': /* end composition */ - DECODE_COMPOSITION_END (c1); - continue; - - case '[': /* specification of direction */ - if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION) - goto label_invalid_code; - /* For the moment, nested direction is not supported. - So, `coding->mode & CODING_MODE_DIRECTION' zero means - left-to-right, and nonzero means right-to-left. 
*/ - ONE_MORE_BYTE (c1); - switch (c1) - { - case ']': /* end of the current direction */ - coding->mode &= ~CODING_MODE_DIRECTION; - - case '0': /* end of the current direction */ - case '1': /* start of left-to-right direction */ - ONE_MORE_BYTE (c1); - if (c1 == ']') - coding->mode &= ~CODING_MODE_DIRECTION; - else - goto label_invalid_code; - break; - - case '2': /* start of right-to-left direction */ - ONE_MORE_BYTE (c1); - if (c1 == ']') - coding->mode |= CODING_MODE_DIRECTION; - else - goto label_invalid_code; - break; - - default: - goto label_invalid_code; - } - continue; - - default: - if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION)) - goto label_invalid_code; - if (c1 >= 0x28 && c1 <= 0x2B) - { /* designation of DIMENSION1_CHARS94 character set */ - ONE_MORE_BYTE (c2); - DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2); - } - else if (c1 >= 0x2C && c1 <= 0x2F) - { /* designation of DIMENSION1_CHARS96 character set */ - ONE_MORE_BYTE (c2); - DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2); - } - else - goto label_invalid_code; - /* We must update these variables now. */ - charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0); - charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1); - continue; - } - } - - /* Now we know CHARSET and 1st position code C1 of a character. - Produce a multibyte sequence for that character while getting - 2nd position code C2 if necessary. */ - if (CHARSET_DIMENSION (charset) == 2) - { - ONE_MORE_BYTE (c2); - if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0) - /* C2 is not in a valid range. */ - goto label_invalid_code; - } - c = DECODE_ISO_CHARACTER (charset, c1, c2); - EMIT_CHAR (c); - continue; - - label_invalid_code: - coding->errors++; - if (COMPOSING_P (coding)) - DECODE_COMPOSITION_END ('1'); - src = src_base; - c = *src++; - EMIT_CHAR (c); - } - - label_end_of_loop: - coding->consumed = coding->consumed_char = src_base - source; - coding->produced = dst - destination; - return; -} - - -/* ISO2022 encoding stuff. 
*/ - -/* - It is not enough to say just "ISO2022" on encoding, we have to - specify more details. In Emacs, each ISO2022 coding system - variant has the following specifications: - 1. Initial designation to G0 through G3. - 2. Allows short-form designation? - 3. ASCII should be designated to G0 before control characters? - 4. ASCII should be designated to G0 at end of line? - 5. 7-bit environment or 8-bit environment? - 6. Use locking-shift? - 7. Use Single-shift? - And the following two are only for Japanese: - 8. Use ASCII in place of JIS0201-1976-Roman? - 9. Use JISX0208-1983 in place of JISX0208-1978? - These specifications are encoded in `coding->flags' as flag bits - defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more - details. -*/ - -/* Produce codes (escape sequence) for designating CHARSET to graphic - register REG at DST, and increment DST. If of CHARSET is - '@', 'A', or 'B' and the coding system CODING allows, produce - designation sequence of short-form. */ - -#define ENCODE_DESIGNATION(charset, reg, coding) \ - do { \ - unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset); \ - char *intermediate_char_94 = "()*+"; \ - char *intermediate_char_96 = ",-./"; \ - int revision = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset); \ - \ - if (revision < 255) \ - { \ - *dst++ = ISO_CODE_ESC; \ - *dst++ = '&'; \ - *dst++ = '@' + revision; \ - } \ - *dst++ = ISO_CODE_ESC; \ - if (CHARSET_DIMENSION (charset) == 1) \ - { \ - if (CHARSET_CHARS (charset) == 94) \ - *dst++ = (unsigned char) (intermediate_char_94[reg]); \ - else \ - *dst++ = (unsigned char) (intermediate_char_96[reg]); \ - } \ - else \ - { \ - *dst++ = '$'; \ - if (CHARSET_CHARS (charset) == 94) \ - { \ - if (! 
(coding->flags & CODING_FLAG_ISO_SHORT_FORM) \ - || reg != 0 \ - || final_char < '@' || final_char > 'B') \ - *dst++ = (unsigned char) (intermediate_char_94[reg]); \ - } \ - else \ - *dst++ = (unsigned char) (intermediate_char_96[reg]); \ - } \ - *dst++ = final_char; \ - CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \ - } while (0) - -/* The following two macros produce codes (control character or escape - sequence) for ISO2022 single-shift functions (single-shift-2 and - single-shift-3). */ - -#define ENCODE_SINGLE_SHIFT_2 \ - do { \ - if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \ - *dst++ = ISO_CODE_ESC, *dst++ = 'N'; \ - else \ - *dst++ = ISO_CODE_SS2; \ - CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \ - } while (0) - -#define ENCODE_SINGLE_SHIFT_3 \ - do { \ - if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \ - *dst++ = ISO_CODE_ESC, *dst++ = 'O'; \ - else \ - *dst++ = ISO_CODE_SS3; \ - CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \ - } while (0) - -/* The following four macros produce codes (control character or - escape sequence) for ISO2022 locking-shift functions (shift-in, - shift-out, locking-shift-2, and locking-shift-3). */ - -#define ENCODE_SHIFT_IN \ - do { \ - *dst++ = ISO_CODE_SI; \ - CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \ - } while (0) - -#define ENCODE_SHIFT_OUT \ - do { \ - *dst++ = ISO_CODE_SO; \ - CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \ - } while (0) - -#define ENCODE_LOCKING_SHIFT_2 \ - do { \ - *dst++ = ISO_CODE_ESC, *dst++ = 'n'; \ - CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \ - } while (0) - -#define ENCODE_LOCKING_SHIFT_3 \ - do { \ - *dst++ = ISO_CODE_ESC, *dst++ = 'o'; \ - CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \ - } while (0) - -/* Produce codes for a DIMENSION1 character whose character set is - CHARSET and whose position-code is C1. Designation and invocation - sequences are also produced in advance if necessary. 
*/ - -#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \ - do { \ - if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \ - { \ - if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \ - *dst++ = c1 & 0x7F; \ - else \ - *dst++ = c1 | 0x80; \ - CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \ - break; \ - } \ - else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \ - { \ - *dst++ = c1 & 0x7F; \ - break; \ - } \ - else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \ - { \ - *dst++ = c1 | 0x80; \ - break; \ - } \ - else \ - /* Since CHARSET is not yet invoked to any graphic planes, we \ - must invoke it, or, at first, designate it to some graphic \ - register. Then repeat the loop to actually produce the \ - character. */ \ - dst = encode_invocation_designation (charset, coding, dst); \ - } while (1) - -/* Produce codes for a DIMENSION2 character whose character set is - CHARSET and whose position-codes are C1 and C2. Designation and - invocation codes are also produced in advance if necessary. */ - -#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \ - do { \ - if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \ - { \ - if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \ - *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F; \ - else \ - *dst++ = c1 | 0x80, *dst++ = c2 | 0x80; \ - CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \ - break; \ - } \ - else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \ - { \ - *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F; \ - break; \ - } \ - else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \ - { \ - *dst++ = c1 | 0x80, *dst++= c2 | 0x80; \ - break; \ - } \ - else \ - /* Since CHARSET is not yet invoked to any graphic planes, we \ - must invoke it, or, at first, designate it to some graphic \ - register. Then repeat the loop to actually produce the \ - character. 
*/ \ - dst = encode_invocation_designation (charset, coding, dst); \ - } while (1) - -#define ENCODE_ISO_CHARACTER(c) \ - do { \ - int charset, c1, c2; \ - \ - SPLIT_CHAR (c, charset, c1, c2); \ - if (CHARSET_DEFINED_P (charset)) \ - { \ - if (CHARSET_DIMENSION (charset) == 1) \ - { \ - if (charset == CHARSET_ASCII \ - && coding->flags & CODING_FLAG_ISO_USE_ROMAN) \ - charset = charset_latin_jisx0201; \ - ENCODE_ISO_CHARACTER_DIMENSION1 (charset, c1); \ - } \ - else \ - { \ - if (charset == charset_jisx0208 \ - && coding->flags & CODING_FLAG_ISO_USE_OLDJIS) \ - charset = charset_jisx0208_1978; \ - ENCODE_ISO_CHARACTER_DIMENSION2 (charset, c1, c2); \ - } \ - } \ - else \ - { \ - *dst++ = c1; \ - if (c2 >= 0) \ - *dst++ = c2; \ - } \ - } while (0) - - -/* Instead of encoding character C, produce one or two `?'s. */ - -#define ENCODE_UNSAFE_CHARACTER(c) \ - do { \ - ENCODE_ISO_CHARACTER (CODING_INHIBIT_CHARACTER_SUBSTITUTION); \ - if (CHARSET_WIDTH (CHAR_CHARSET (c)) > 1) \ - ENCODE_ISO_CHARACTER (CODING_INHIBIT_CHARACTER_SUBSTITUTION); \ - } while (0) - - -/* Produce designation and invocation codes at a place pointed by DST - to use CHARSET. The element `spec.iso2022' of *CODING is updated. - Return new DST. */ - -unsigned char * -encode_invocation_designation (charset, coding, dst) - int charset; - struct coding_system *coding; - unsigned char *dst; -{ - int reg; /* graphic register number */ - - /* At first, check designations. */ - for (reg = 0; reg < 4; reg++) - if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg)) - break; - - if (reg >= 4) - { - /* CHARSET is not yet designated to any graphic registers. */ - /* At first check the requested designation. */ - reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset); - if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION) - /* Since CHARSET requests no special designation, designate it - to graphic register 0. 
*/ - reg = 0; - - ENCODE_DESIGNATION (charset, reg, coding); - } - - if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg - && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg) - { - /* Since the graphic register REG is not invoked to any graphic - planes, invoke it to graphic plane 0. */ - switch (reg) - { - case 0: /* graphic register 0 */ - ENCODE_SHIFT_IN; - break; - - case 1: /* graphic register 1 */ - ENCODE_SHIFT_OUT; - break; - - case 2: /* graphic register 2 */ - if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT) - ENCODE_SINGLE_SHIFT_2; - else - ENCODE_LOCKING_SHIFT_2; - break; - - case 3: /* graphic register 3 */ - if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT) - ENCODE_SINGLE_SHIFT_3; - else - ENCODE_LOCKING_SHIFT_3; - break; - } - } - - return dst; -} - -/* Produce 2-byte codes for encoded composition rule RULE. */ - -#define ENCODE_COMPOSITION_RULE(rule) \ - do { \ - int gref, nref; \ - COMPOSITION_DECODE_RULE (rule, gref, nref); \ - *dst++ = 32 + 81 + gref; \ - *dst++ = 32 + nref; \ - } while (0) - -/* Produce codes for indicating the start of a composition sequence - (ESC 0, ESC 3, or ESC 4). DATA points to an array of integers - which specify information about the composition. See the comment - in coding.h for the format of DATA. */ - -#define ENCODE_COMPOSITION_START(coding, data) \ - do { \ - coding->composing = data[3]; \ - *dst++ = ISO_CODE_ESC; \ - if (coding->composing == COMPOSITION_RELATIVE) \ - *dst++ = '0'; \ - else \ - { \ - *dst++ = (coding->composing == COMPOSITION_WITH_ALTCHARS \ - ? '3' : '4'); \ - coding->cmp_data_index = coding->cmp_data_start + 4; \ - coding->composition_rule_follows = 0; \ - } \ - } while (0) - -/* Produce codes for indicating the end of the current composition. 
*/ - -#define ENCODE_COMPOSITION_END(coding, data) \ - do { \ - *dst++ = ISO_CODE_ESC; \ - *dst++ = '1'; \ - coding->cmp_data_start += data[0]; \ - coding->composing = COMPOSITION_NO; \ - if (coding->cmp_data_start == coding->cmp_data->used \ - && coding->cmp_data->next) \ - { \ - coding->cmp_data = coding->cmp_data->next; \ - coding->cmp_data_start = 0; \ - } \ - } while (0) - -/* Produce composition start sequence ESC 0. Here, this sequence - doesn't mean the start of a new composition but means that we have - just produced components (alternate chars and composition rules) of - the composition and the actual text follows in SRC. */ - -#define ENCODE_COMPOSITION_FAKE_START(coding) \ - do { \ - *dst++ = ISO_CODE_ESC; \ - *dst++ = '0'; \ - coding->composing = COMPOSITION_RELATIVE; \ - } while (0) - -/* The following three macros produce codes for indicating direction - of text. */ -#define ENCODE_CONTROL_SEQUENCE_INTRODUCER \ - do { \ - if (coding->flags == CODING_FLAG_ISO_SEVEN_BITS) \ - *dst++ = ISO_CODE_ESC, *dst++ = '['; \ - else \ - *dst++ = ISO_CODE_CSI; \ - } while (0) - -#define ENCODE_DIRECTION_R2L \ - ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst), *dst++ = '2', *dst++ = ']' - -#define ENCODE_DIRECTION_L2R \ - ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst), *dst++ = '0', *dst++ = ']' - -/* Produce codes for designation and invocation to reset the graphic - planes and registers to initial state. 
*/ -#define ENCODE_RESET_PLANE_AND_REGISTER \ - do { \ - int reg; \ - if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0) \ - ENCODE_SHIFT_IN; \ - for (reg = 0; reg < 4; reg++) \ - if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0 \ - && (CODING_SPEC_ISO_DESIGNATION (coding, reg) \ - != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))) \ - ENCODE_DESIGNATION \ - (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \ - } while (0) - -/* Produce designation sequences of charsets in the line started from - SRC to a place pointed by DST, and return updated DST. - - If the current block ends before any end-of-line, we may fail to - find all the necessary designations. */ - -static unsigned char * -encode_designation_at_bol (coding, translation_table, src, src_end, dst) - struct coding_system *coding; - Lisp_Object translation_table; - unsigned char *src, *src_end, *dst; -{ - int charset, c, found = 0, reg; - /* Table of charsets to be designated to each graphic register. */ - int r[4]; - - for (reg = 0; reg < 4; reg++) - r[reg] = -1; - - while (found < 4) - { - ONE_MORE_CHAR (c); - if (c == '\n') - break; - - charset = CHAR_CHARSET (c); - reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset); - if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0) - { - found++; - r[reg] = charset; - } - } - - label_end_of_loop: - if (found) - { - for (reg = 0; reg < 4; reg++) - if (r[reg] >= 0 - && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg]) - ENCODE_DESIGNATION (r[reg], reg, coding); - } - - return dst; -} - -/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". 
*/ - -static void -encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes) - struct coding_system *coding; - unsigned char *source, *destination; - int src_bytes, dst_bytes; -{ - unsigned char *src = source; - unsigned char *src_end = source + src_bytes; - unsigned char *dst = destination; - unsigned char *dst_end = destination + dst_bytes; - /* Since the maximum bytes produced by each loop is 20, we subtract 19 - from DST_END to assure overflow checking is necessary only at the - head of loop. */ - unsigned char *adjusted_dst_end = dst_end - 19; - /* SRC_BASE remembers the start position in source in each loop. - The loop will be exited when there's not enough source text to - analyze multi-byte codes (within macro ONE_MORE_CHAR), or when - there's not enough destination area to produce encoded codes - (within macro EMIT_BYTES). */ - unsigned char *src_base; - int c; - Lisp_Object translation_table; - Lisp_Object safe_chars; - - safe_chars = coding_safe_chars (coding); - - if (NILP (Venable_character_translation)) - translation_table = Qnil; - else - { - translation_table = coding->translation_table_for_encode; - if (NILP (translation_table)) - translation_table = Vstandard_translation_table_for_encode; - } - - coding->consumed_char = 0; - coding->errors = 0; - while (1) - { - src_base = src; - - if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19))) - { - coding->result = CODING_FINISH_INSUFFICIENT_DST; - break; - } - - if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL - && CODING_SPEC_ISO_BOL (coding)) - { - /* We have to produce designation sequences if any now. */ - dst = encode_designation_at_bol (coding, translation_table, - src, src_end, dst); - CODING_SPEC_ISO_BOL (coding) = 0; - } - - /* Check composition start and end. 
*/ - if (coding->composing != COMPOSITION_DISABLED - && coding->cmp_data_start < coding->cmp_data->used) - { - struct composition_data *cmp_data = coding->cmp_data; - int *data = cmp_data->data + coding->cmp_data_start; - int this_pos = cmp_data->char_offset + coding->consumed_char; - - if (coding->composing == COMPOSITION_RELATIVE) - { - if (this_pos == data[2]) - { - ENCODE_COMPOSITION_END (coding, data); - cmp_data = coding->cmp_data; - data = cmp_data->data + coding->cmp_data_start; - } - } - else if (COMPOSING_P (coding)) - { - /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR */ - if (coding->cmp_data_index == coding->cmp_data_start + data[0]) - /* We have consumed components of the composition. - What follows in SRC is the composition's base - text. */ - ENCODE_COMPOSITION_FAKE_START (coding); - else - { - int c = cmp_data->data[coding->cmp_data_index++]; - if (coding->composition_rule_follows) - { - ENCODE_COMPOSITION_RULE (c); - coding->composition_rule_follows = 0; - } - else - { - if (coding->flags & CODING_FLAG_ISO_SAFE - && ! CODING_SAFE_CHAR_P (safe_chars, c)) - ENCODE_UNSAFE_CHARACTER (c); - else - ENCODE_ISO_CHARACTER (c); - if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS) - coding->composition_rule_follows = 1; - } - continue; - } - } - if (!COMPOSING_P (coding)) - { - if (this_pos == data[1]) - { - ENCODE_COMPOSITION_START (coding, data); - continue; - } - } - } - - ONE_MORE_CHAR (c); - - /* Now encode the character C. */ - if (c < 0x20 || c == 0x7F) - { - if (c == '\r') - { - if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)) - { - if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL) - ENCODE_RESET_PLANE_AND_REGISTER; - *dst++ = c; - continue; - } - /* fall down to treat '\r' as '\n' ... 
*/ - c = '\n'; - } - if (c == '\n') - { - if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL) - ENCODE_RESET_PLANE_AND_REGISTER; - if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL) - bcopy (coding->spec.iso2022.initial_designation, - coding->spec.iso2022.current_designation, - sizeof coding->spec.iso2022.initial_designation); - if (coding->eol_type == CODING_EOL_LF - || coding->eol_type == CODING_EOL_UNDECIDED) - *dst++ = ISO_CODE_LF; - else if (coding->eol_type == CODING_EOL_CRLF) - *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF; - else - *dst++ = ISO_CODE_CR; - CODING_SPEC_ISO_BOL (coding) = 1; - } - else - { - if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL) - ENCODE_RESET_PLANE_AND_REGISTER; - *dst++ = c; - } - } - else if (ASCII_BYTE_P (c)) - ENCODE_ISO_CHARACTER (c); - else if (SINGLE_BYTE_CHAR_P (c)) - { - *dst++ = c; - coding->errors++; - } - else if (coding->flags & CODING_FLAG_ISO_SAFE - && ! CODING_SAFE_CHAR_P (safe_chars, c)) - ENCODE_UNSAFE_CHARACTER (c); - else - ENCODE_ISO_CHARACTER (c); - - coding->consumed_char++; - } - - label_end_of_loop: - coding->consumed = src_base - source; - coding->produced = coding->produced_char = dst - destination; -} - - -/*** 4. SJIS and BIG5 handlers ***/ - -/* Although SJIS and BIG5 are not ISO coding systems, they are used - quite widely. So, for the moment, Emacs supports them in the bare - C code. But, in the future, they may be supported only by CCL. */ - -/* SJIS is a coding system encoding three character sets: ASCII, right - half of JISX0201-Kana, and JISX0208. An ASCII character is encoded - as is. A character of charset katakana-jisx0201 is encoded by - "position-code + 0x80". A character of charset japanese-jisx0208 - is encoded in 2-byte but two position-codes are divided and shifted - so that it fits in the range below. - - --- CODE RANGE of SJIS --- - (character set) (range) - ASCII 0x00 .. 0x7F - KATAKANA-JISX0201 0xA1 .. 0xDF - JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF - (2nd byte) 0x40 .. 
0x7E and 0x80 .. 0xFC - ------------------------------- - -*/ - -/* BIG5 is a coding system encoding two character sets: ASCII and - Big5. An ASCII character is encoded as is. Big5 is a two-byte - character set and is encoded in two bytes. - - --- CODE RANGE of BIG5 --- - (character set) (range) - ASCII 0x00 .. 0x7F - Big5 (1st byte) 0xA1 .. 0xFE - (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE - -------------------------- - - Since the number of characters in Big5 is larger than maximum - characters in Emacs' charset (96x96), it can't be handled as one - charset. So, in Emacs, Big5 is divided into two: `charset-big5-1' - and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former - contains frequently used characters and the latter contains less - frequently used characters. */ - -/* Macros to decode or encode a character of Big5 in BIG5. B1 and B2 - are the 1st and 2nd position-codes of Big5 in BIG5 coding system. - C1 and C2 are the 1st and 2nd position-codes of of Emacs' internal - format. CHARSET is `charset_big5_1' or `charset_big5_2'. */ - -/* Number of Big5 characters which have the same code in 1st byte. */ -#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40) - -#define DECODE_BIG5(b1, b2, charset, c1, c2) \ - do { \ - unsigned int temp \ - = (b1 - 0xA1) * BIG5_SAME_ROW + b2 - (b2 < 0x7F ? 0x40 : 0x62); \ - if (b1 < 0xC9) \ - charset = charset_big5_1; \ - else \ - { \ - charset = charset_big5_2; \ - temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW; \ - } \ - c1 = temp / (0xFF - 0xA1) + 0x21; \ - c2 = temp % (0xFF - 0xA1) + 0x21; \ - } while (0) - -#define ENCODE_BIG5(charset, c1, c2, b1, b2) \ - do { \ - unsigned int temp = (c1 - 0x21) * (0xFF - 0xA1) + (c2 - 0x21); \ - if (charset == charset_big5_2) \ - temp += BIG5_SAME_ROW * (0xC9 - 0xA1); \ - b1 = temp / BIG5_SAME_ROW + 0xA1; \ - b2 = temp % BIG5_SAME_ROW; \ - b2 += b2 < 0x3F ? 0x40 : 0x62; \ - } while (0) - -/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". - Check if a text is encoded in SJIS. 
If it is, return - CODING_CATEGORY_MASK_SJIS, else return 0. */ - -static int -detect_coding_sjis (src, src_end, multibytep) - unsigned char *src, *src_end; - int multibytep; -{ - int c; - /* Dummy for ONE_MORE_BYTE. */ - struct coding_system dummy_coding; - struct coding_system *coding = &dummy_coding; - - while (1) - { - ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); - if (c < 0x80) - continue; - if (c == 0x80 || c == 0xA0 || c > 0xEF) - return 0; - if (c <= 0x9F || c >= 0xE0) - { - ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); - if (c < 0x40 || c == 0x7F || c > 0xFC) - return 0; - } - } - label_end_of_loop: - return CODING_CATEGORY_MASK_SJIS; -} - -/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". - Check if a text is encoded in BIG5. If it is, return - CODING_CATEGORY_MASK_BIG5, else return 0. */ - -static int -detect_coding_big5 (src, src_end, multibytep) - unsigned char *src, *src_end; - int multibytep; -{ - int c; - /* Dummy for ONE_MORE_BYTE. */ - struct coding_system dummy_coding; - struct coding_system *coding = &dummy_coding; - - while (1) - { - ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); - if (c < 0x80) - continue; - if (c < 0xA1 || c > 0xFE) - return 0; - ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); - if (c < 0x40 || (c > 0x7F && c < 0xA1) || c > 0xFE) - return 0; - } - label_end_of_loop: - return CODING_CATEGORY_MASK_BIG5; -} - -/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". - Check if a text is encoded in UTF-8. If it is, return - CODING_CATEGORY_MASK_UTF_8, else return 0. 
*/ - -#define UTF_8_1_OCTET_P(c) ((c) < 0x80) -#define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80) -#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0) -#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0) -#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0) -#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8) -#define UTF_8_6_OCTET_LEADING_P(c) (((c) & 0xFE) == 0xFC) - -static int -detect_coding_utf_8 (src, src_end, multibytep) - unsigned char *src, *src_end; - int multibytep; -{ - unsigned char c; - int seq_maybe_bytes; - /* Dummy for ONE_MORE_BYTE. */ - struct coding_system dummy_coding; - struct coding_system *coding = &dummy_coding; - - while (1) - { - ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); - if (UTF_8_1_OCTET_P (c)) - continue; - else if (UTF_8_2_OCTET_LEADING_P (c)) - seq_maybe_bytes = 1; - else if (UTF_8_3_OCTET_LEADING_P (c)) - seq_maybe_bytes = 2; - else if (UTF_8_4_OCTET_LEADING_P (c)) - seq_maybe_bytes = 3; - else if (UTF_8_5_OCTET_LEADING_P (c)) - seq_maybe_bytes = 4; - else if (UTF_8_6_OCTET_LEADING_P (c)) - seq_maybe_bytes = 5; - else - return 0; - - do - { - ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); - if (!UTF_8_EXTRA_OCTET_P (c)) - return 0; - seq_maybe_bytes--; - } - while (seq_maybe_bytes > 0); - } - - label_end_of_loop: - return CODING_CATEGORY_MASK_UTF_8; -} - -/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". - Check if a text is encoded in UTF-16 Big Endian (endian == 1) or - Little Endian (otherwise). If it is, return - CODING_CATEGORY_MASK_UTF_16_BE or CODING_CATEGORY_MASK_UTF_16_LE, - else return 0. 
*/ - -#define UTF_16_INVALID_P(val) \ - (((val) == 0xFFFE) \ - || ((val) == 0xFFFF)) - -#define UTF_16_HIGH_SURROGATE_P(val) \ - (((val) & 0xD800) == 0xD800) - -#define UTF_16_LOW_SURROGATE_P(val) \ - (((val) & 0xDC00) == 0xDC00) - -static int -detect_coding_utf_16 (src, src_end, multibytep) - unsigned char *src, *src_end; - int multibytep; -{ - unsigned char c1, c2; - /* Dummy for TWO_MORE_BYTES. */ - struct coding_system dummy_coding; - struct coding_system *coding = &dummy_coding; - - ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep); - ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep); - - if ((c1 == 0xFF) && (c2 == 0xFE)) - return CODING_CATEGORY_MASK_UTF_16_LE; - else if ((c1 == 0xFE) && (c2 == 0xFF)) - return CODING_CATEGORY_MASK_UTF_16_BE; - - label_end_of_loop: - return 0; -} - -/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". - If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */ - -static void -decode_coding_sjis_big5 (coding, source, destination, - src_bytes, dst_bytes, sjis_p) - struct coding_system *coding; - unsigned char *source, *destination; - int src_bytes, dst_bytes; - int sjis_p; -{ - unsigned char *src = source; - unsigned char *src_end = source + src_bytes; - unsigned char *dst = destination; - unsigned char *dst_end = destination + dst_bytes; - /* SRC_BASE remembers the start position in source in each loop. - The loop will be exited when there's not enough source code - (within macro ONE_MORE_BYTE), or when there's not enough - destination area to produce a character (within macro - EMIT_CHAR). 
*/ - unsigned char *src_base; - Lisp_Object translation_table; - - if (NILP (Venable_character_translation)) - translation_table = Qnil; - else - { - translation_table = coding->translation_table_for_decode; - if (NILP (translation_table)) - translation_table = Vstandard_translation_table_for_decode; - } - - coding->produced_char = 0; - while (1) - { - int c, charset, c1, c2; - - src_base = src; - ONE_MORE_BYTE (c1); - - if (c1 < 0x80) - { - charset = CHARSET_ASCII; - if (c1 < 0x20) - { - if (c1 == '\r') - { - if (coding->eol_type == CODING_EOL_CRLF) - { - ONE_MORE_BYTE (c2); - if (c2 == '\n') - c1 = c2; - else if (coding->mode - & CODING_MODE_INHIBIT_INCONSISTENT_EOL) - { - coding->result = CODING_FINISH_INCONSISTENT_EOL; - goto label_end_of_loop; - } - else - /* To process C2 again, SRC is subtracted by 1. */ - src--; - } - else if (coding->eol_type == CODING_EOL_CR) - c1 = '\n'; - } - else if (c1 == '\n' - && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL) - && (coding->eol_type == CODING_EOL_CR - || coding->eol_type == CODING_EOL_CRLF)) - { - coding->result = CODING_FINISH_INCONSISTENT_EOL; - goto label_end_of_loop; - } - } - } - else - { - if (sjis_p) - { - if (c1 == 0x80 || c1 == 0xA0 || c1 > 0xEF) - goto label_invalid_code; - if (c1 <= 0x9F || c1 >= 0xE0) - { - /* SJIS -> JISX0208 */ - ONE_MORE_BYTE (c2); - if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC) - goto label_invalid_code; - DECODE_SJIS (c1, c2, c1, c2); - charset = charset_jisx0208; - } - else - /* SJIS -> JISX0201-Kana */ - charset = charset_katakana_jisx0201; - } - else - { - /* BIG5 -> Big5 */ - if (c1 < 0xA0 || c1 > 0xFE) - goto label_invalid_code; - ONE_MORE_BYTE (c2); - if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE) - goto label_invalid_code; - DECODE_BIG5 (c1, c2, charset, c1, c2); - } - } - - c = DECODE_ISO_CHARACTER (charset, c1, c2); - EMIT_CHAR (c); - continue; - - label_invalid_code: - coding->errors++; - src = src_base; - c = *src++; - EMIT_CHAR (c); - } - - label_end_of_loop: 
- coding->consumed = coding->consumed_char = src_base - source; - coding->produced = dst - destination; - return; -} - -/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". - This function can encode charsets `ascii', `katakana-jisx0201', - `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We - are sure that all these charsets are registered as official charset - (i.e. do not have extended leading-codes). Characters of other - charsets are produced without any encoding. If SJIS_P is 1, encode - SJIS text, else encode BIG5 text. */ - -static void -encode_coding_sjis_big5 (coding, source, destination, - src_bytes, dst_bytes, sjis_p) - struct coding_system *coding; - unsigned char *source, *destination; - int src_bytes, dst_bytes; - int sjis_p; -{ - unsigned char *src = source; - unsigned char *src_end = source + src_bytes; - unsigned char *dst = destination; - unsigned char *dst_end = destination + dst_bytes; - /* SRC_BASE remembers the start position in source in each loop. - The loop will be exited when there's not enough source text to - analyze multi-byte codes (within macro ONE_MORE_CHAR), or when - there's not enough destination area to produce encoded codes - (within macro EMIT_BYTES). */ - unsigned char *src_base; - Lisp_Object translation_table; - - if (NILP (Venable_character_translation)) - translation_table = Qnil; - else - { - translation_table = coding->translation_table_for_encode; - if (NILP (translation_table)) - translation_table = Vstandard_translation_table_for_encode; - } - - while (1) - { - int c, charset, c1, c2; - - src_base = src; - ONE_MORE_CHAR (c); - - /* Now encode the character C. 
*/ - if (SINGLE_BYTE_CHAR_P (c)) - { - switch (c) - { - case '\r': - if (!coding->mode & CODING_MODE_SELECTIVE_DISPLAY) - { - EMIT_ONE_BYTE (c); - break; - } - c = '\n'; - case '\n': - if (coding->eol_type == CODING_EOL_CRLF) - { - EMIT_TWO_BYTES ('\r', c); - break; - } - else if (coding->eol_type == CODING_EOL_CR) - c = '\r'; - default: - EMIT_ONE_BYTE (c); - } - } - else - { - SPLIT_CHAR (c, charset, c1, c2); - if (sjis_p) - { - if (charset == charset_jisx0208 - || charset == charset_jisx0208_1978) - { - ENCODE_SJIS (c1, c2, c1, c2); - EMIT_TWO_BYTES (c1, c2); - } - else if (charset == charset_katakana_jisx0201) - EMIT_ONE_BYTE (c1 | 0x80); - else if (charset == charset_latin_jisx0201) - EMIT_ONE_BYTE (c1); - else - /* There's no way other than producing the internal - codes as is. */ - EMIT_BYTES (src_base, src); - } - else - { - if (charset == charset_big5_1 || charset == charset_big5_2) - { - ENCODE_BIG5 (charset, c1, c2, c1, c2); - EMIT_TWO_BYTES (c1, c2); - } - else - /* There's no way other than producing the internal - codes as is. */ - EMIT_BYTES (src_base, src); - } - } - coding->consumed_char++; - } - - label_end_of_loop: - coding->consumed = src_base - source; - coding->produced = coding->produced_char = dst - destination; -} - - -/*** 5. CCL handlers ***/ - -/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". - Check if a text is encoded in a coding system of which - encoder/decoder are written in CCL program. If it is, return - CODING_CATEGORY_MASK_CCL, else return 0. */ - -static int -detect_coding_ccl (src, src_end, multibytep) - unsigned char *src, *src_end; - int multibytep; -{ - unsigned char *valid; - int c; - /* Dummy for ONE_MORE_BYTE. */ - struct coding_system dummy_coding; - struct coding_system *coding = &dummy_coding; - - /* No coding system is assigned to coding-category-ccl. 
*/ - if (!coding_system_table[CODING_CATEGORY_IDX_CCL]) - return 0; - - valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes; - while (1) - { - ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); - if (! valid[c]) - return 0; - } - label_end_of_loop: - return CODING_CATEGORY_MASK_CCL; -} - - -/*** 6. End-of-line handlers ***/ - -/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */ - -static void -decode_eol (coding, source, destination, src_bytes, dst_bytes) - struct coding_system *coding; - unsigned char *source, *destination; - int src_bytes, dst_bytes; -{ - unsigned char *src = source; - unsigned char *dst = destination; - unsigned char *src_end = src + src_bytes; - unsigned char *dst_end = dst + dst_bytes; - Lisp_Object translation_table; - /* SRC_BASE remembers the start position in source in each loop. - The loop will be exited when there's not enough source code - (within macro ONE_MORE_BYTE), or when there's not enough - destination area to produce a character (within macro - EMIT_CHAR). 
*/ - unsigned char *src_base; - int c; - - translation_table = Qnil; - switch (coding->eol_type) - { - case CODING_EOL_CRLF: - while (1) - { - src_base = src; - ONE_MORE_BYTE (c); - if (c == '\r') - { - ONE_MORE_BYTE (c); - if (c != '\n') - { - if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL) - { - coding->result = CODING_FINISH_INCONSISTENT_EOL; - goto label_end_of_loop; - } - src--; - c = '\r'; - } - } - else if (c == '\n' - && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)) - { - coding->result = CODING_FINISH_INCONSISTENT_EOL; - goto label_end_of_loop; - } - EMIT_CHAR (c); - } - break; - - case CODING_EOL_CR: - while (1) - { - src_base = src; - ONE_MORE_BYTE (c); - if (c == '\n') - { - if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL) - { - coding->result = CODING_FINISH_INCONSISTENT_EOL; - goto label_end_of_loop; - } - } - else if (c == '\r') - c = '\n'; - EMIT_CHAR (c); - } - break; - - default: /* no need for EOL handling */ - while (1) - { - src_base = src; - ONE_MORE_BYTE (c); - EMIT_CHAR (c); - } - } - - label_end_of_loop: - coding->consumed = coding->consumed_char = src_base - source; - coding->produced = dst - destination; - return; -} - -/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode - format of end-of-line according to `coding->eol_type'. It also - convert multibyte form 8-bit characters to unibyte if - CODING->src_multibyte is nonzero. If `coding->mode & - CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text - also means end-of-line. */ - -static void -encode_eol (coding, source, destination, src_bytes, dst_bytes) - struct coding_system *coding; - unsigned char *source, *destination; - int src_bytes, dst_bytes; -{ - unsigned char *src = source; - unsigned char *dst = destination; - unsigned char *src_end = src + src_bytes; - unsigned char *dst_end = dst + dst_bytes; - Lisp_Object translation_table; - /* SRC_BASE remembers the start position in source in each loop. 
- The loop will be exited when there's not enough source text to - analyze multi-byte codes (within macro ONE_MORE_CHAR), or when - there's not enough destination area to produce encoded codes - (within macro EMIT_BYTES). */ - unsigned char *src_base; - int c; - int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY; - - translation_table = Qnil; - if (coding->src_multibyte - && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL) - { - src_end--; - src_bytes--; - coding->result = CODING_FINISH_INSUFFICIENT_SRC; - } - - if (coding->eol_type == CODING_EOL_CRLF) - { - while (src < src_end) - { - src_base = src; - c = *src++; - if (c >= 0x20) - EMIT_ONE_BYTE (c); - else if (c == '\n' || (c == '\r' && selective_display)) - EMIT_TWO_BYTES ('\r', '\n'); - else - EMIT_ONE_BYTE (c); - } - src_base = src; - label_end_of_loop: - ; - } - else - { - if (!dst_bytes || src_bytes <= dst_bytes) - { - safe_bcopy (src, dst, src_bytes); - src_base = src_end; - dst += src_bytes; - } - else - { - if (coding->src_multibyte - && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL) - dst_bytes--; - safe_bcopy (src, dst, dst_bytes); - src_base = src + dst_bytes; - dst = destination + dst_bytes; - coding->result = CODING_FINISH_INSUFFICIENT_DST; - } - if (coding->eol_type == CODING_EOL_CR) - { - for (src = destination; src < dst; src++) - if (*src == '\n') *src = '\r'; - } - else if (selective_display) - { - for (src = destination; src < dst; src++) - if (*src == '\r') *src = '\n'; - } - } - if (coding->src_multibyte) - dst = destination + str_as_unibyte (destination, dst - destination); - - coding->consumed = src_base - source; - coding->produced = dst - destination; - coding->produced_char = coding->produced; -} - - -/*** 7. C library functions ***/ - -/* In Emacs Lisp, a coding system is represented by a Lisp symbol which - has a property `coding-system'. The value of this property is a - vector of length 5 (called the coding-vector). 
Among elements of - this vector, the first (element[0]) and the fifth (element[4]) - carry important information for decoding/encoding. Before - decoding/encoding, this information should be set in fields of a - structure of type `coding_system'. - - The value of the property `coding-system' can be a symbol of another - subsidiary coding-system. In that case, Emacs gets coding-vector - from that symbol. - - `element[0]' contains information to be set in `coding->type'. The - value and its meaning is as follows: - - 0 -- coding_type_emacs_mule - 1 -- coding_type_sjis - 2 -- coding_type_iso2022 - 3 -- coding_type_big5 - 4 -- coding_type_ccl encoder/decoder written in CCL - nil -- coding_type_no_conversion - t -- coding_type_undecided (automatic conversion on decoding, - no-conversion on encoding) - - `element[4]' contains information to be set in `coding->flags' and - `coding->spec'. The meaning varies by `coding->type'. - - If `coding->type' is `coding_type_iso2022', element[4] is a vector - of length 32 (of which the first 13 sub-elements are used now). - Meanings of these sub-elements are: - - sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022' - If the value is an integer of valid charset, the charset is - assumed to be designated to graphic register N initially. - - If the value is minus, it is a minus value of charset which - reserves graphic register N, which means that the charset is - not designated initially but should be designated to graphic - register N just before encoding a character in that charset. - - If the value is nil, graphic register N is never used on - encoding. - - sub-element[N] where N is 4 through 11: to be set in `coding->flags' - Each value takes t or nil. See the section ISO2022 of - `coding.h' for more information. - - If `coding->type' is `coding_type_big5', element[4] is t to denote - BIG5-ETen or nil to denote BIG5-HKU. - - If `coding->type' takes the other value, element[4] is ignored. 
- - Emacs Lisp's coding systems also carry information about format of - end-of-line in a value of property `eol-type'. If the value is - integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2 - means CODING_EOL_CR. If it is not integer, it should be a vector - of subsidiary coding systems of which property `eol-type' has one - of the above values. - -*/ - -/* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL - and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING - is setup so that no conversion is necessary and return -1, else - return 0. */ - -int -setup_coding_system (coding_system, coding) - Lisp_Object coding_system; - struct coding_system *coding; -{ - Lisp_Object coding_spec, coding_type, eol_type, plist; - Lisp_Object val; - - /* At first, zero clear all members. */ - bzero (coding, sizeof (struct coding_system)); - - /* Initialize some fields required for all kinds of coding systems. */ - coding->symbol = coding_system; - coding->heading_ascii = -1; - coding->post_read_conversion = coding->pre_write_conversion = Qnil; - coding->composing = COMPOSITION_DISABLED; - coding->cmp_data = NULL; - - if (NILP (coding_system)) - goto label_invalid_coding_system; - - coding_spec = Fget (coding_system, Qcoding_system); - - if (!VECTORP (coding_spec) - || XVECTOR (coding_spec)->size != 5 - || !CONSP (XVECTOR (coding_spec)->contents[3])) - goto label_invalid_coding_system; - - eol_type = inhibit_eol_conversion ? 
Qnil : Fget (coding_system, Qeol_type); - if (VECTORP (eol_type)) - { - coding->eol_type = CODING_EOL_UNDECIDED; - coding->common_flags = CODING_REQUIRE_DETECTION_MASK; - } - else if (XFASTINT (eol_type) == 1) - { - coding->eol_type = CODING_EOL_CRLF; - coding->common_flags - = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK; - } - else if (XFASTINT (eol_type) == 2) - { - coding->eol_type = CODING_EOL_CR; - coding->common_flags - = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK; - } - else - coding->eol_type = CODING_EOL_LF; - - coding_type = XVECTOR (coding_spec)->contents[0]; - /* Try short cut. */ - if (SYMBOLP (coding_type)) - { - if (EQ (coding_type, Qt)) - { - coding->type = coding_type_undecided; - coding->common_flags |= CODING_REQUIRE_DETECTION_MASK; - } - else - coding->type = coding_type_no_conversion; - /* Initialize this member. Any thing other than - CODING_CATEGORY_IDX_UTF_16_BE and - CODING_CATEGORY_IDX_UTF_16_LE are ok because they have - special treatment in detect_eol. */ - coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE; - - return 0; - } - - /* Get values of coding system properties: - `post-read-conversion', `pre-write-conversion', - `translation-table-for-decode', `translation-table-for-encode'. */ - plist = XVECTOR (coding_spec)->contents[3]; - /* Pre & post conversion functions should be disabled if - inhibit_eol_conversion is nonzero. This is the case that a code - conversion function is called while those functions are running. */ - if (! inhibit_pre_post_conversion) - { - coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion); - coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion); - } - val = Fplist_get (plist, Qtranslation_table_for_decode); - if (SYMBOLP (val)) - val = Fget (val, Qtranslation_table_for_decode); - coding->translation_table_for_decode = CHAR_TABLE_P (val) ? 
val : Qnil; - val = Fplist_get (plist, Qtranslation_table_for_encode); - if (SYMBOLP (val)) - val = Fget (val, Qtranslation_table_for_encode); - coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil; - val = Fplist_get (plist, Qcoding_category); - if (!NILP (val)) - { - val = Fget (val, Qcoding_category_index); - if (INTEGERP (val)) - coding->category_idx = XINT (val); - else - goto label_invalid_coding_system; - } - else - goto label_invalid_coding_system; - - /* If the coding system has non-nil `composition' property, enable - composition handling. */ - val = Fplist_get (plist, Qcomposition); - if (!NILP (val)) - coding->composing = COMPOSITION_NO; - - switch (XFASTINT (coding_type)) - { - case 0: - coding->type = coding_type_emacs_mule; - coding->common_flags - |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK; - coding->composing = COMPOSITION_NO; - if (!NILP (coding->post_read_conversion)) - coding->common_flags |= CODING_REQUIRE_DECODING_MASK; - if (!NILP (coding->pre_write_conversion)) - coding->common_flags |= CODING_REQUIRE_ENCODING_MASK; - break; - - case 1: - coding->type = coding_type_sjis; - coding->common_flags - |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK; - break; - - case 2: - coding->type = coding_type_iso2022; - coding->common_flags - |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK; - { - Lisp_Object val, temp; - Lisp_Object *flags; - int i, charset, reg_bits = 0; - - val = XVECTOR (coding_spec)->contents[4]; - - if (!VECTORP (val) || XVECTOR (val)->size != 32) - goto label_invalid_coding_system; - - flags = XVECTOR (val)->contents; - coding->flags - = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM) - | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL) - | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL) - | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS) - | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT) - | (NILP (flags[9]) ? 
0 : CODING_FLAG_ISO_SINGLE_SHIFT) - | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN) - | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS) - | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION) - | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL) - | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL) - | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE) - | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA) - ); - - /* Invoke graphic register 0 to plane 0. */ - CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; - /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */ - CODING_SPEC_ISO_INVOCATION (coding, 1) - = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1); - /* Not single shifting at first. */ - CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; - /* Beginning of buffer should also be regarded as bol. */ - CODING_SPEC_ISO_BOL (coding) = 1; - - for (charset = 0; charset <= MAX_CHARSET; charset++) - CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255; - val = Vcharset_revision_alist; - while (CONSP (val)) - { - charset = get_charset_id (Fcar_safe (XCAR (val))); - if (charset >= 0 - && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp)) - && (i = XINT (temp), (i >= 0 && (i + '@') < 128))) - CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i; - val = XCDR (val); - } - - /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations. - FLAGS[REG] can be one of below: - integer CHARSET: CHARSET occupies register I, - t: designate nothing to REG initially, but can be used - by any charsets, - list of integer, nil, or t: designate the first - element (if integer) to REG initially, the remaining - elements (if integer) is designated to REG on request, - if an element is t, REG can be used by any charsets, - nil: REG is never used. 
*/ - for (charset = 0; charset <= MAX_CHARSET; charset++) - CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) - = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION; - for (i = 0; i < 4; i++) - { - if ((INTEGERP (flags[i]) - && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))) - || (charset = get_charset_id (flags[i])) >= 0) - { - CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset; - CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i; - } - else if (EQ (flags[i], Qt)) - { - CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1; - reg_bits |= 1 << i; - coding->flags |= CODING_FLAG_ISO_DESIGNATION; - } - else if (CONSP (flags[i])) - { - Lisp_Object tail; - tail = flags[i]; - - coding->flags |= CODING_FLAG_ISO_DESIGNATION; - if ((INTEGERP (XCAR (tail)) - && (charset = XINT (XCAR (tail)), - CHARSET_VALID_P (charset))) - || (charset = get_charset_id (XCAR (tail))) >= 0) - { - CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset; - CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i; - } - else - CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1; - tail = XCDR (tail); - while (CONSP (tail)) - { - if ((INTEGERP (XCAR (tail)) - && (charset = XINT (XCAR (tail)), - CHARSET_VALID_P (charset))) - || (charset = get_charset_id (XCAR (tail))) >= 0) - CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) - = i; - else if (EQ (XCAR (tail), Qt)) - reg_bits |= 1 << i; - tail = XCDR (tail); - } - } - else - CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1; - - CODING_SPEC_ISO_DESIGNATION (coding, i) - = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i); - } - - if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)) - { - /* REG 1 can be used only by locking shift in 7-bit env. */ - if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) - reg_bits &= ~2; - if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)) - /* Without any shifting, only REG 0 and 1 can be used. 
*/ - reg_bits &= 3; - } - - if (reg_bits) - for (charset = 0; charset <= MAX_CHARSET; charset++) - { - if (CHARSET_DEFINED_P (charset) - && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) - == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)) - { - /* There exist some default graphic registers to be - used by CHARSET. */ - - /* We had better avoid designating a charset of - CHARS96 to REG 0 as far as possible. */ - if (CHARSET_CHARS (charset) == 96) - CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) - = (reg_bits & 2 - ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0))); - else - CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) - = (reg_bits & 1 - ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3))); - } - } - } - coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK; - coding->spec.iso2022.last_invalid_designation_register = -1; - break; - - case 3: - coding->type = coding_type_big5; - coding->common_flags - |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK; - coding->flags - = (NILP (XVECTOR (coding_spec)->contents[4]) - ? CODING_FLAG_BIG5_HKU - : CODING_FLAG_BIG5_ETEN); - break; - - case 4: - coding->type = coding_type_ccl; - coding->common_flags - |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK; - { - val = XVECTOR (coding_spec)->contents[4]; - if (! 
CONSP (val) - || setup_ccl_program (&(coding->spec.ccl.decoder), - XCAR (val)) < 0 - || setup_ccl_program (&(coding->spec.ccl.encoder), - XCDR (val)) < 0) - goto label_invalid_coding_system; - - bzero (coding->spec.ccl.valid_codes, 256); - val = Fplist_get (plist, Qvalid_codes); - if (CONSP (val)) - { - Lisp_Object this; - - for (; CONSP (val); val = XCDR (val)) - { - this = XCAR (val); - if (INTEGERP (this) - && XINT (this) >= 0 && XINT (this) < 256) - coding->spec.ccl.valid_codes[XINT (this)] = 1; - else if (CONSP (this) - && INTEGERP (XCAR (this)) - && INTEGERP (XCDR (this))) - { - int start = XINT (XCAR (this)); - int end = XINT (XCDR (this)); - - if (start >= 0 && start <= end && end < 256) - while (start <= end) - coding->spec.ccl.valid_codes[start++] = 1; - } - } - } - } - coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK; - coding->spec.ccl.cr_carryover = 0; - coding->spec.ccl.eight_bit_carryover[0] = 0; - break; - - case 5: - coding->type = coding_type_raw_text; - break; - - default: - goto label_invalid_coding_system; - } - return 0; - - label_invalid_coding_system: - coding->type = coding_type_no_conversion; - coding->category_idx = CODING_CATEGORY_IDX_BINARY; - coding->common_flags = 0; - coding->eol_type = CODING_EOL_LF; - coding->pre_write_conversion = coding->post_read_conversion = Qnil; - return -1; -} - -/* Free memory blocks allocated for storing composition information. */ - -void -coding_free_composition_data (coding) - struct coding_system *coding; -{ - struct composition_data *cmp_data = coding->cmp_data, *next; - - if (!cmp_data) - return; - /* Memory blocks are chained. At first, rewind to the first, then, - free blocks one by one. */ - while (cmp_data->prev) - cmp_data = cmp_data->prev; - while (cmp_data) - { - next = cmp_data->next; - xfree (cmp_data); - cmp_data = next; - } - coding->cmp_data = NULL; -} - -/* Set `char_offset' member of all memory blocks pointed by - coding->cmp_data to POS. 
*/ - -void -coding_adjust_composition_offset (coding, pos) - struct coding_system *coding; - int pos; -{ - struct composition_data *cmp_data; - - for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next) - cmp_data->char_offset = pos; -} - -/* Setup raw-text or one of its subsidiaries in the structure - coding_system CODING according to the already setup value eol_type - in CODING. CODING should be setup for some coding system in - advance. */ - -void -setup_raw_text_coding_system (coding) - struct coding_system *coding; -{ - if (coding->type != coding_type_raw_text) - { - coding->symbol = Qraw_text; - coding->type = coding_type_raw_text; - if (coding->eol_type != CODING_EOL_UNDECIDED) - { - Lisp_Object subsidiaries; - subsidiaries = Fget (Qraw_text, Qeol_type); - - if (VECTORP (subsidiaries) - && XVECTOR (subsidiaries)->size == 3) - coding->symbol - = XVECTOR (subsidiaries)->contents[coding->eol_type]; - } - setup_coding_system (coding->symbol, coding); - } - return; -} - -/* Emacs has a mechanism to automatically detect a coding system if it - is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But, - it's impossible to distinguish some coding systems accurately - because they use the same range of codes. So, at first, coding - systems are categorized into 7, those are: - - o coding-category-emacs-mule - - The category for a coding system which has the same code range - as Emacs' internal format. Assigned the coding-system (Lisp - symbol) `emacs-mule' by default. - - o coding-category-sjis - - The category for a coding system which has the same code range - as SJIS. Assigned the coding-system (Lisp - symbol) `japanese-shift-jis' by default. - - o coding-category-iso-7 - - The category for a coding system which has the same code range - as ISO2022 of 7-bit environment. This doesn't use any locking - shift and single shift functions. This can encode/decode all - charsets. Assigned the coding-system (Lisp symbol) - `iso-2022-7bit' by default. 
- - o coding-category-iso-7-tight - - Same as coding-category-iso-7 except that this can - encode/decode only the specified charsets. - - o coding-category-iso-8-1 - - The category for a coding system which has the same code range - as ISO2022 of 8-bit environment and graphic plane 1 used only - for DIMENSION1 charset. This doesn't use any locking shift - and single shift functions. Assigned the coding-system (Lisp - symbol) `iso-latin-1' by default. - - o coding-category-iso-8-2 - - The category for a coding system which has the same code range - as ISO2022 of 8-bit environment and graphic plane 1 used only - for DIMENSION2 charset. This doesn't use any locking shift - and single shift functions. Assigned the coding-system (Lisp - symbol) `japanese-iso-8bit' by default. - - o coding-category-iso-7-else - - The category for a coding system which has the same code range - as ISO2022 of 7-bit environment but uses locking shift or - single shift functions. Assigned the coding-system (Lisp - symbol) `iso-2022-7bit-lock' by default. - - o coding-category-iso-8-else - - The category for a coding system which has the same code range - as ISO2022 of 8-bit environment but uses locking shift or - single shift functions. Assigned the coding-system (Lisp - symbol) `iso-2022-8bit-ss2' by default. - - o coding-category-big5 - - The category for a coding system which has the same code range - as BIG5. Assigned the coding-system (Lisp symbol) - `cn-big5' by default. - - o coding-category-utf-8 - - The category for a coding system which has the same code range - as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp - symbol) `utf-8' by default. - - o coding-category-utf-16-be - - The category for a coding system in which a text has an - Unicode signature (cf. Unicode Standard) in the order of BIG - endian at the head. Assigned the coding-system (Lisp symbol) - `utf-16-be' by default. 
- - o coding-category-utf-16-le - - The category for a coding system in which a text has an - Unicode signature (cf. Unicode Standard) in the order of - LITTLE endian at the head. Assigned the coding-system (Lisp - symbol) `utf-16-le' by default. - - o coding-category-ccl - - The category for a coding system of which encoder/decoder is - written in CCL programs. The default value is nil, i.e., no - coding system is assigned. - - o coding-category-binary - - The category for a coding system not categorized in any of the - above. Assigned the coding-system (Lisp symbol) - `no-conversion' by default. - - Each of them is a Lisp symbol and the value is an actual - `coding-system' (this is also a Lisp symbol) assigned by a user. - What Emacs does actually is to detect a category of coding system. - Then, it uses a `coding-system' assigned to it. If Emacs can't - decide a single possible category, it selects a category of the - highest priority. Priorities of categories are also specified by a - user in a Lisp variable `coding-category-list'. - -*/ - -static -int ascii_skip_code[256]; - -/* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded. - If it detects possible coding systems, return an integer in which - appropriate flag bits are set. Flag bits are defined by macros - CODING_CATEGORY_MASK_XXX in `coding.h'. If PRIORITIES is non-NULL, - it should point the table `coding_priorities'. In that case, only - the flag bit for a coding system of the highest priority is set in - the returned value. If MULTIBYTEP is nonzero, 8-bit codes of the - range 0x80..0x9F are in multibyte form. - - How many ASCII characters are at the head is returned as *SKIP. 
*/ - -static int -detect_coding_mask (source, src_bytes, priorities, skip, multibytep) - unsigned char *source; - int src_bytes, *priorities, *skip; - int multibytep; -{ - register unsigned char c; - unsigned char *src = source, *src_end = source + src_bytes; - unsigned int mask, utf16_examined_p, iso2022_examined_p; - int i; - - /* At first, skip all ASCII characters and control characters except - for three ISO2022 specific control characters. */ - ascii_skip_code[ISO_CODE_SO] = 0; - ascii_skip_code[ISO_CODE_SI] = 0; - ascii_skip_code[ISO_CODE_ESC] = 0; - - label_loop_detect_coding: - while (src < src_end && ascii_skip_code[*src]) src++; - *skip = src - source; - - if (src >= src_end) - /* We found nothing other than ASCII. There's nothing to do. */ - return 0; - - c = *src; - /* The text seems to be encoded in some multilingual coding system. - Now, try to find in which coding system the text is encoded. */ - if (c < 0x80) - { - /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */ - /* C is an ISO2022 specific control code of C0. */ - mask = detect_coding_iso2022 (src, src_end, multibytep); - if (mask == 0) - { - /* No valid ISO2022 code follows C. Try again. */ - src++; - if (c == ISO_CODE_ESC) - ascii_skip_code[ISO_CODE_ESC] = 1; - else - ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1; - goto label_loop_detect_coding; - } - if (priorities) - { - for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++) - { - if (mask & priorities[i]) - return priorities[i]; - } - return CODING_CATEGORY_MASK_RAW_TEXT; - } - } - else - { - int try; - - if (multibytep && c == LEADING_CODE_8_BIT_CONTROL) - c = src[1] - 0x20; - - if (c < 0xA0) - { - /* C is the first byte of SJIS character code, - or a leading-code of Emacs' internal format (emacs-mule), - or the first byte of UTF-16. 
*/ - try = (CODING_CATEGORY_MASK_SJIS - | CODING_CATEGORY_MASK_EMACS_MULE - | CODING_CATEGORY_MASK_UTF_16_BE - | CODING_CATEGORY_MASK_UTF_16_LE); - - /* Or, if C is a special latin extra code, - or is an ISO2022 specific control code of C1 (SS2 or SS3), - or is an ISO2022 control-sequence-introducer (CSI), - we should also consider the possibility of ISO2022 codings. */ - if ((VECTORP (Vlatin_extra_code_table) - && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])) - || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3) - || (c == ISO_CODE_CSI - && (src < src_end - && (*src == ']' - || ((*src == '0' || *src == '1' || *src == '2') - && src + 1 < src_end - && src[1] == ']'))))) - try |= (CODING_CATEGORY_MASK_ISO_8_ELSE - | CODING_CATEGORY_MASK_ISO_8BIT); - } - else - /* C is a character of ISO2022 in graphic plane right, - or a SJIS's 1-byte character code (i.e. JISX0201), - or the first byte of BIG5's 2-byte code, - or the first byte of UTF-8/16. */ - try = (CODING_CATEGORY_MASK_ISO_8_ELSE - | CODING_CATEGORY_MASK_ISO_8BIT - | CODING_CATEGORY_MASK_SJIS - | CODING_CATEGORY_MASK_BIG5 - | CODING_CATEGORY_MASK_UTF_8 - | CODING_CATEGORY_MASK_UTF_16_BE - | CODING_CATEGORY_MASK_UTF_16_LE); - - /* Or, we may have to consider the possibility of CCL. 
*/ - if (coding_system_table[CODING_CATEGORY_IDX_CCL] - && (coding_system_table[CODING_CATEGORY_IDX_CCL] - ->spec.ccl.valid_codes)[c]) - try |= CODING_CATEGORY_MASK_CCL; - - mask = 0; - utf16_examined_p = iso2022_examined_p = 0; - if (priorities) - { - for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++) - { - if (!iso2022_examined_p - && (priorities[i] & try & CODING_CATEGORY_MASK_ISO)) - { - mask |= detect_coding_iso2022 (src, src_end, multibytep); - iso2022_examined_p = 1; - } - else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS) - mask |= detect_coding_sjis (src, src_end, multibytep); - else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8) - mask |= detect_coding_utf_8 (src, src_end, multibytep); - else if (!utf16_examined_p - && (priorities[i] & try & - CODING_CATEGORY_MASK_UTF_16_BE_LE)) - { - mask |= detect_coding_utf_16 (src, src_end, multibytep); - utf16_examined_p = 1; - } - else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5) - mask |= detect_coding_big5 (src, src_end, multibytep); - else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE) - mask |= detect_coding_emacs_mule (src, src_end, multibytep); - else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL) - mask |= detect_coding_ccl (src, src_end, multibytep); - else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT) - mask |= CODING_CATEGORY_MASK_RAW_TEXT; - else if (priorities[i] & CODING_CATEGORY_MASK_BINARY) - mask |= CODING_CATEGORY_MASK_BINARY; - if (mask & priorities[i]) - return priorities[i]; - } - return CODING_CATEGORY_MASK_RAW_TEXT; - } - if (try & CODING_CATEGORY_MASK_ISO) - mask |= detect_coding_iso2022 (src, src_end, multibytep); - if (try & CODING_CATEGORY_MASK_SJIS) - mask |= detect_coding_sjis (src, src_end, multibytep); - if (try & CODING_CATEGORY_MASK_BIG5) - mask |= detect_coding_big5 (src, src_end, multibytep); - if (try & CODING_CATEGORY_MASK_UTF_8) - mask |= detect_coding_utf_8 (src, src_end, multibytep); - if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE) - mask |= 
detect_coding_utf_16 (src, src_end, multibytep); - if (try & CODING_CATEGORY_MASK_EMACS_MULE) - mask |= detect_coding_emacs_mule (src, src_end, multibytep); - if (try & CODING_CATEGORY_MASK_CCL) - mask |= detect_coding_ccl (src, src_end, multibytep); - } - return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY); -} - -/* Detect how a text of length SRC_BYTES pointed by SRC is encoded. - The information of the detected coding system is set in CODING. */ - -void -detect_coding (coding, src, src_bytes) - struct coding_system *coding; - unsigned char *src; - int src_bytes; -{ - unsigned int idx; - int skip, mask; - Lisp_Object val; - - val = Vcoding_category_list; - mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip, - coding->src_multibyte); - coding->heading_ascii = skip; - - if (!mask) return; - - /* We found a single coding system of the highest priority in MASK. */ - idx = 0; - while (mask && ! (mask & 1)) mask >>= 1, idx++; - if (! mask) - idx = CODING_CATEGORY_IDX_RAW_TEXT; - - val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[idx]); - - if (coding->eol_type != CODING_EOL_UNDECIDED) - { - Lisp_Object tmp; - - tmp = Fget (val, Qeol_type); - if (VECTORP (tmp)) - val = XVECTOR (tmp)->contents[coding->eol_type]; - } - - /* Setup this new coding system while preserving some slots. */ - { - int src_multibyte = coding->src_multibyte; - int dst_multibyte = coding->dst_multibyte; - - setup_coding_system (val, coding); - coding->src_multibyte = src_multibyte; - coding->dst_multibyte = dst_multibyte; - coding->heading_ascii = skip; - } -} - -/* Detect how end-of-line of a text of length SRC_BYTES pointed by - SOURCE is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF, - CODING_EOL_CR, and CODING_EOL_UNDECIDED. - - How many non-eol characters are at the head is returned as *SKIP. 
*/ - -#define MAX_EOL_CHECK_COUNT 3 - -static int -detect_eol_type (source, src_bytes, skip) - unsigned char *source; - int src_bytes, *skip; -{ - unsigned char *src = source, *src_end = src + src_bytes; - unsigned char c; - int total = 0; /* How many end-of-lines are found so far. */ - int eol_type = CODING_EOL_UNDECIDED; - int this_eol_type; - - *skip = 0; - - while (src < src_end && total < MAX_EOL_CHECK_COUNT) - { - c = *src++; - if (c == '\n' || c == '\r') - { - if (*skip == 0) - *skip = src - 1 - source; - total++; - if (c == '\n') - this_eol_type = CODING_EOL_LF; - else if (src >= src_end || *src != '\n') - this_eol_type = CODING_EOL_CR; - else - this_eol_type = CODING_EOL_CRLF, src++; - - if (eol_type == CODING_EOL_UNDECIDED) - /* This is the first end-of-line. */ - eol_type = this_eol_type; - else if (eol_type != this_eol_type) - { - /* The found type is different from what found before. */ - eol_type = CODING_EOL_INCONSISTENT; - break; - } - } - } - - if (*skip == 0) - *skip = src_end - source; - return eol_type; -} - -/* Like detect_eol_type, but detect EOL type in 2-octet - big-endian/little-endian format for coding systems utf-16-be and - utf-16-le. */ - -static int -detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p) - unsigned char *source; - int src_bytes, *skip, big_endian_p; -{ - unsigned char *src = source, *src_end = src + src_bytes; - unsigned int c1, c2; - int total = 0; /* How many end-of-lines are found so far. 
*/ - int eol_type = CODING_EOL_UNDECIDED; - int this_eol_type; - int msb, lsb; - - if (big_endian_p) - msb = 0, lsb = 1; - else - msb = 1, lsb = 0; - - *skip = 0; - - while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT) - { - c1 = (src[msb] << 8) | (src[lsb]); - src += 2; - - if (c1 == '\n' || c1 == '\r') - { - if (*skip == 0) - *skip = src - 2 - source; - total++; - if (c1 == '\n') - { - this_eol_type = CODING_EOL_LF; - } - else - { - if ((src + 1) >= src_end) - { - this_eol_type = CODING_EOL_CR; - } - else - { - c2 = (src[msb] << 8) | (src[lsb]); - if (c2 == '\n') - this_eol_type = CODING_EOL_CRLF, src += 2; - else - this_eol_type = CODING_EOL_CR; - } - } - - if (eol_type == CODING_EOL_UNDECIDED) - /* This is the first end-of-line. */ - eol_type = this_eol_type; - else if (eol_type != this_eol_type) - { - /* The found type is different from what found before. */ - eol_type = CODING_EOL_INCONSISTENT; - break; - } - } - } - - if (*skip == 0) - *skip = src_end - source; - return eol_type; -} - -/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC - is encoded. If it detects an appropriate format of end-of-line, it - sets the information in *CODING. 
*/ - -void -detect_eol (coding, src, src_bytes) - struct coding_system *coding; - unsigned char *src; - int src_bytes; -{ - Lisp_Object val; - int skip; - int eol_type; - - switch (coding->category_idx) - { - case CODING_CATEGORY_IDX_UTF_16_BE: - eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1); - break; - case CODING_CATEGORY_IDX_UTF_16_LE: - eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0); - break; - default: - eol_type = detect_eol_type (src, src_bytes, &skip); - break; - } - - if (coding->heading_ascii > skip) - coding->heading_ascii = skip; - else - skip = coding->heading_ascii; - - if (eol_type == CODING_EOL_UNDECIDED) - return; - if (eol_type == CODING_EOL_INCONSISTENT) - { -#if 0 - /* This code is suppressed until we find a better way to - distinguish raw text file and binary file. */ - - /* If we have already detected that the coding is raw-text, the - coding should actually be no-conversion. */ - if (coding->type == coding_type_raw_text) - { - setup_coding_system (Qno_conversion, coding); - return; - } - /* Else, let's decode only text code anyway. */ -#endif /* 0 */ - eol_type = CODING_EOL_LF; - } - - val = Fget (coding->symbol, Qeol_type); - if (VECTORP (val) && XVECTOR (val)->size == 3) - { - int src_multibyte = coding->src_multibyte; - int dst_multibyte = coding->dst_multibyte; - - setup_coding_system (XVECTOR (val)->contents[eol_type], coding); - coding->src_multibyte = src_multibyte; - coding->dst_multibyte = dst_multibyte; - coding->heading_ascii = skip; - } -} - -#define CONVERSION_BUFFER_EXTRA_ROOM 256 - -#define DECODING_BUFFER_MAG(coding) \ - (coding->type == coding_type_iso2022 \ - ? 3 \ - : (coding->type == coding_type_ccl \ - ? coding->spec.ccl.decoder.buf_magnification \ - : 2)) - -/* Return maximum size (bytes) of a buffer enough for decoding - SRC_BYTES of text encoded in CODING. 
*/ - -int -decoding_buffer_size (coding, src_bytes) - struct coding_system *coding; - int src_bytes; -{ - return (src_bytes * DECODING_BUFFER_MAG (coding) - + CONVERSION_BUFFER_EXTRA_ROOM); -} - -/* Return maximum size (bytes) of a buffer enough for encoding - SRC_BYTES of text to CODING. */ - -int -encoding_buffer_size (coding, src_bytes) - struct coding_system *coding; - int src_bytes; -{ - int magnification; - - if (coding->type == coding_type_ccl) - magnification = coding->spec.ccl.encoder.buf_magnification; - else if (CODING_REQUIRE_ENCODING (coding)) - magnification = 3; - else - magnification = 1; - - return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM); -} - -/* Working buffer for code conversion. */ -struct conversion_buffer -{ - int size; /* size of data. */ - int on_stack; /* 1 if allocated by alloca. */ - unsigned char *data; -}; - -/* Don't use alloca for allocating memory space larger than this, lest - we overflow their stack. */ -#define MAX_ALLOCA 16*1024 - -/* Allocate LEN bytes of memory for BUF (struct conversion_buffer). */ -#define allocate_conversion_buffer(buf, len) \ - do { \ - if (len < MAX_ALLOCA) \ - { \ - buf.data = (unsigned char *) alloca (len); \ - buf.on_stack = 1; \ - } \ - else \ - { \ - buf.data = (unsigned char *) xmalloc (len); \ - buf.on_stack = 0; \ - } \ - buf.size = len; \ - } while (0) - -/* Double the allocated memory for *BUF. */ -static void -extend_conversion_buffer (buf) - struct conversion_buffer *buf; -{ - if (buf->on_stack) - { - unsigned char *save = buf->data; - buf->data = (unsigned char *) xmalloc (buf->size * 2); - bcopy (save, buf->data, buf->size); - buf->on_stack = 0; - } - else - { - buf->data = (unsigned char *) xrealloc (buf->data, buf->size * 2); - } - buf->size *= 2; -} - -/* Free the allocated memory for BUF if it is not on stack. 
*/ -static void -free_conversion_buffer (buf) - struct conversion_buffer *buf; -{ - if (!buf->on_stack) - xfree (buf->data); -} - -int -ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep) - struct coding_system *coding; - unsigned char *source, *destination; - int src_bytes, dst_bytes, encodep; -{ - struct ccl_program *ccl - = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder; - unsigned char *dst = destination; - - ccl->suppress_error = coding->suppress_error; - ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK; - if (encodep) - { - /* On encoding, EOL format is converted within ccl_driver. For - that, setup proper information in the structure CCL. */ - ccl->eol_type = coding->eol_type; - if (ccl->eol_type ==CODING_EOL_UNDECIDED) - ccl->eol_type = CODING_EOL_LF; - ccl->cr_consumed = coding->spec.ccl.cr_carryover; - } - ccl->multibyte = coding->src_multibyte; - if (coding->spec.ccl.eight_bit_carryover[0] != 0) - { - /* Move carryover bytes to DESTINATION. */ - unsigned char *p = coding->spec.ccl.eight_bit_carryover; - while (*p) - *dst++ = *p++; - coding->spec.ccl.eight_bit_carryover[0] = 0; - if (dst_bytes) - dst_bytes -= dst - destination; - } - - coding->produced = (ccl_driver (ccl, source, dst, src_bytes, dst_bytes, - &(coding->consumed)) - + dst - destination); - - if (encodep) - { - coding->produced_char = coding->produced; - coding->spec.ccl.cr_carryover = ccl->cr_consumed; - } - else if (!ccl->eight_bit_control) - { - /* The produced bytes forms a valid multibyte sequence. */ - coding->produced_char - = multibyte_chars_in_text (destination, coding->produced); - coding->spec.ccl.eight_bit_carryover[0] = 0; - } - else - { - /* On decoding, the destination should always multibyte. But, - CCL program might have been generated an invalid multibyte - sequence. Here we make such a sequence valid as - multibyte. */ - int bytes - = dst_bytes ? 
dst_bytes : source + coding->consumed - destination; - - if ((coding->consumed < src_bytes - || !ccl->last_block) - && coding->produced >= 1 - && destination[coding->produced - 1] >= 0x80) - { - /* We should not convert the tailing 8-bit codes to - multibyte form even if they doesn't form a valid - multibyte sequence. They may form a valid sequence in - the next call. */ - int carryover = 0; - - if (destination[coding->produced - 1] < 0xA0) - carryover = 1; - else if (coding->produced >= 2) - { - if (destination[coding->produced - 2] >= 0x80) - { - if (destination[coding->produced - 2] < 0xA0) - carryover = 2; - else if (coding->produced >= 3 - && destination[coding->produced - 3] >= 0x80 - && destination[coding->produced - 3] < 0xA0) - carryover = 3; - } - } - if (carryover > 0) - { - BCOPY_SHORT (destination + coding->produced - carryover, - coding->spec.ccl.eight_bit_carryover, - carryover); - coding->spec.ccl.eight_bit_carryover[carryover] = 0; - coding->produced -= carryover; - } - } - coding->produced = str_as_multibyte (destination, bytes, - coding->produced, - &(coding->produced_char)); - } - - switch (ccl->status) - { - case CCL_STAT_SUSPEND_BY_SRC: - coding->result = CODING_FINISH_INSUFFICIENT_SRC; - break; - case CCL_STAT_SUSPEND_BY_DST: - coding->result = CODING_FINISH_INSUFFICIENT_DST; - break; - case CCL_STAT_QUIT: - case CCL_STAT_INVALID_CMD: - coding->result = CODING_FINISH_INTERRUPT; - break; - default: - coding->result = CODING_FINISH_NORMAL; - break; - } - return coding->result; -} - -/* Decode EOL format of the text at PTR of BYTES length destructively - according to CODING->eol_type. This is called after the CCL - program produced a decoded text at PTR. If we do CRLF->LF - conversion, update CODING->produced and CODING->produced_char. 
*/ - -static void -decode_eol_post_ccl (coding, ptr, bytes) - struct coding_system *coding; - unsigned char *ptr; - int bytes; -{ - Lisp_Object val, saved_coding_symbol; - unsigned char *pend = ptr + bytes; - int dummy; - - /* Remember the current coding system symbol. We set it back when - an inconsistent EOL is found so that `last-coding-system-used' is - set to the coding system that doesn't specify EOL conversion. */ - saved_coding_symbol = coding->symbol; - - coding->spec.ccl.cr_carryover = 0; - if (coding->eol_type == CODING_EOL_UNDECIDED) - { - /* Here, to avoid the call of setup_coding_system, we directly - call detect_eol_type. */ - coding->eol_type = detect_eol_type (ptr, bytes, &dummy); - if (coding->eol_type == CODING_EOL_INCONSISTENT) - coding->eol_type = CODING_EOL_LF; - if (coding->eol_type != CODING_EOL_UNDECIDED) - { - val = Fget (coding->symbol, Qeol_type); - if (VECTORP (val) && XVECTOR (val)->size == 3) - coding->symbol = XVECTOR (val)->contents[coding->eol_type]; - } - coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL; - } - - if (coding->eol_type == CODING_EOL_LF - || coding->eol_type == CODING_EOL_UNDECIDED) - { - /* We have nothing to do. */ - ptr = pend; - } - else if (coding->eol_type == CODING_EOL_CRLF) - { - unsigned char *pstart = ptr, *p = ptr; - - if (! (coding->mode & CODING_MODE_LAST_BLOCK) - && *(pend - 1) == '\r') - { - /* If the last character is CR, we can't handle it here - because LF will be in the not-yet-decoded source text. - Recorded that the CR is not yet processed. 
*/ - coding->spec.ccl.cr_carryover = 1; - coding->produced--; - coding->produced_char--; - pend--; - } - while (ptr < pend) - { - if (*ptr == '\r') - { - if (ptr + 1 < pend && *(ptr + 1) == '\n') - { - *p++ = '\n'; - ptr += 2; - } - else - { - if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL) - goto undo_eol_conversion; - *p++ = *ptr++; - } - } - else if (*ptr == '\n' - && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL) - goto undo_eol_conversion; - else - *p++ = *ptr++; - continue; - - undo_eol_conversion: - /* We have faced with inconsistent EOL format at PTR. - Convert all LFs before PTR back to CRLFs. */ - for (p--, ptr--; p >= pstart; p--) - { - if (*p == '\n') - *ptr-- = '\n', *ptr-- = '\r'; - else - *ptr-- = *p; - } - /* If carryover is recorded, cancel it because we don't - convert CRLF anymore. */ - if (coding->spec.ccl.cr_carryover) - { - coding->spec.ccl.cr_carryover = 0; - coding->produced++; - coding->produced_char++; - pend++; - } - p = ptr = pend; - coding->eol_type = CODING_EOL_LF; - coding->symbol = saved_coding_symbol; - } - if (p < pend) - { - /* As each two-byte sequence CRLF was converted to LF, (PEND - - P) is the number of deleted characters. */ - coding->produced -= pend - p; - coding->produced_char -= pend - p; - } - } - else /* i.e. coding->eol_type == CODING_EOL_CR */ - { - unsigned char *p = ptr; - - for (; ptr < pend; ptr++) - { - if (*ptr == '\r') - *ptr = '\n'; - else if (*ptr == '\n' - && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL) - { - for (; p < ptr; p++) - { - if (*p == '\n') - *p = '\r'; - } - ptr = pend; - coding->eol_type = CODING_EOL_LF; - coding->symbol = saved_coding_symbol; - } - } - } -} - -/* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before - decoding, it may detect coding system and format of end-of-line if - those are not yet decided. The source should be unibyte, the - result is multibyte if CODING->dst_multibyte is nonzero, else - unibyte. 
*/ - -int -decode_coding (coding, source, destination, src_bytes, dst_bytes) - struct coding_system *coding; - unsigned char *source, *destination; - int src_bytes, dst_bytes; -{ - if (coding->type == coding_type_undecided) - detect_coding (coding, source, src_bytes); - - if (coding->eol_type == CODING_EOL_UNDECIDED - && coding->type != coding_type_ccl) - { - detect_eol (coding, source, src_bytes); - /* We had better recover the original eol format if we - encounter an inconsistent eol format while decoding. */ - coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL; - } - - coding->produced = coding->produced_char = 0; - coding->consumed = coding->consumed_char = 0; - coding->errors = 0; - coding->result = CODING_FINISH_NORMAL; - - switch (coding->type) - { - case coding_type_sjis: - decode_coding_sjis_big5 (coding, source, destination, - src_bytes, dst_bytes, 1); - break; - - case coding_type_iso2022: - decode_coding_iso2022 (coding, source, destination, - src_bytes, dst_bytes); - break; - - case coding_type_big5: - decode_coding_sjis_big5 (coding, source, destination, - src_bytes, dst_bytes, 0); - break; - - case coding_type_emacs_mule: - decode_coding_emacs_mule (coding, source, destination, - src_bytes, dst_bytes); - break; - - case coding_type_ccl: - if (coding->spec.ccl.cr_carryover) - { - /* Set the CR which is not processed by the previous call of - decode_eol_post_ccl in DESTINATION. 
*/ - *destination = '\r'; - coding->produced++; - coding->produced_char++; - dst_bytes--; - } - ccl_coding_driver (coding, source, - destination + coding->spec.ccl.cr_carryover, - src_bytes, dst_bytes, 0); - if (coding->eol_type != CODING_EOL_LF) - decode_eol_post_ccl (coding, destination, coding->produced); - break; - - default: - decode_eol (coding, source, destination, src_bytes, dst_bytes); - } - - if (coding->result == CODING_FINISH_INSUFFICIENT_SRC - && coding->mode & CODING_MODE_LAST_BLOCK - && coding->consumed == src_bytes) - coding->result = CODING_FINISH_NORMAL; - - if (coding->mode & CODING_MODE_LAST_BLOCK - && coding->result == CODING_FINISH_INSUFFICIENT_SRC) - { - unsigned char *src = source + coding->consumed; - unsigned char *dst = destination + coding->produced; - - src_bytes -= coding->consumed; - coding->errors++; - if (COMPOSING_P (coding)) - DECODE_COMPOSITION_END ('1'); - while (src_bytes--) - { - int c = *src++; - dst += CHAR_STRING (c, dst); - coding->produced_char++; - } - coding->consumed = coding->consumed_char = src - source; - coding->produced = dst - destination; - coding->result = CODING_FINISH_NORMAL; - } - - if (!coding->dst_multibyte) - { - coding->produced = str_as_unibyte (destination, coding->produced); - coding->produced_char = coding->produced; - } - - return coding->result; -} - -/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". The - multibyteness of the source is CODING->src_multibyte, the - multibyteness of the result is always unibyte. 
*/ - -int -encode_coding (coding, source, destination, src_bytes, dst_bytes) - struct coding_system *coding; - unsigned char *source, *destination; - int src_bytes, dst_bytes; -{ - coding->produced = coding->produced_char = 0; - coding->consumed = coding->consumed_char = 0; - coding->errors = 0; - coding->result = CODING_FINISH_NORMAL; - - switch (coding->type) - { - case coding_type_sjis: - encode_coding_sjis_big5 (coding, source, destination, - src_bytes, dst_bytes, 1); - break; - - case coding_type_iso2022: - encode_coding_iso2022 (coding, source, destination, - src_bytes, dst_bytes); - break; - - case coding_type_big5: - encode_coding_sjis_big5 (coding, source, destination, - src_bytes, dst_bytes, 0); - break; - - case coding_type_emacs_mule: - encode_coding_emacs_mule (coding, source, destination, - src_bytes, dst_bytes); - break; - - case coding_type_ccl: - ccl_coding_driver (coding, source, destination, - src_bytes, dst_bytes, 1); - break; - - default: - encode_eol (coding, source, destination, src_bytes, dst_bytes); - } - - if (coding->mode & CODING_MODE_LAST_BLOCK - && coding->result == CODING_FINISH_INSUFFICIENT_SRC) - { - unsigned char *src = source + coding->consumed; - unsigned char *dst = destination + coding->produced; - - if (coding->type == coding_type_iso2022) - ENCODE_RESET_PLANE_AND_REGISTER; - if (COMPOSING_P (coding)) - *dst++ = ISO_CODE_ESC, *dst++ = '1'; - if (coding->consumed < src_bytes) - { - int len = src_bytes - coding->consumed; - - BCOPY_SHORT (src, dst, len); - if (coding->src_multibyte) - len = str_as_unibyte (dst, len); - dst += len; - coding->consumed = src_bytes; - } - coding->produced = coding->produced_char = dst - destination; - coding->result = CODING_FINISH_NORMAL; - } - - if (coding->result == CODING_FINISH_INSUFFICIENT_SRC - && coding->consumed == src_bytes) - coding->result = CODING_FINISH_NORMAL; - - return coding->result; -} - -/* Scan text in the region between *BEG and *END (byte positions), - skip characters which we 
don't have to decode by coding system - CODING at the head and tail, then set *BEG and *END to the region - of the text we actually have to convert. The caller should move - the gap out of the region in advance if the region is from a - buffer. - - If STR is not NULL, *BEG and *END are indices into STR. */ - -static void -shrink_decoding_region (beg, end, coding, str) - int *beg, *end; - struct coding_system *coding; - unsigned char *str; -{ - unsigned char *begp_orig, *begp, *endp_orig, *endp, c; - int eol_conversion; - Lisp_Object translation_table; - - if (coding->type == coding_type_ccl - || coding->type == coding_type_undecided - || coding->eol_type != CODING_EOL_LF - || !NILP (coding->post_read_conversion) - || coding->composing != COMPOSITION_DISABLED) - { - /* We can't skip any data. */ - return; - } - if (coding->type == coding_type_no_conversion - || coding->type == coding_type_raw_text - || coding->type == coding_type_emacs_mule) - { - /* We need no conversion, but don't have to skip any data here. - Decoding routine handles them effectively anyway. */ - return; - } - - translation_table = coding->translation_table_for_decode; - if (NILP (translation_table) && !NILP (Venable_character_translation)) - translation_table = Vstandard_translation_table_for_decode; - if (CHAR_TABLE_P (translation_table)) - { - int i; - for (i = 0; i < 128; i++) - if (!NILP (CHAR_TABLE_REF (translation_table, i))) - break; - if (i < 128) - /* Some ASCII character should be translated. We give up - shrinking. */ - return; - } - - if (coding->heading_ascii >= 0) - /* Detection routine has already found how much we can skip at the - head. 
*/ - *beg += coding->heading_ascii; - - if (str) - { - begp_orig = begp = str + *beg; - endp_orig = endp = str + *end; - } - else - { - begp_orig = begp = BYTE_POS_ADDR (*beg); - endp_orig = endp = begp + *end - *beg; - } - - eol_conversion = (coding->eol_type == CODING_EOL_CR - || coding->eol_type == CODING_EOL_CRLF); - - switch (coding->type) - { - case coding_type_sjis: - case coding_type_big5: - /* We can skip all ASCII characters at the head. */ - if (coding->heading_ascii < 0) - { - if (eol_conversion) - while (begp < endp && *begp < 0x80 && *begp != '\r') begp++; - else - while (begp < endp && *begp < 0x80) begp++; - } - /* We can skip all ASCII characters at the tail except for the - second byte of SJIS or BIG5 code. */ - if (eol_conversion) - while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--; - else - while (begp < endp && endp[-1] < 0x80) endp--; - /* Do not consider LF as ascii if preceded by CR, since that - confuses eol decoding. */ - if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n') - endp++; - if (begp < endp && endp < endp_orig && endp[-1] >= 0x80) - endp++; - break; - - case coding_type_iso2022: - if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII) - /* We can't skip any data. */ - break; - if (coding->heading_ascii < 0) - { - /* We can skip all ASCII characters at the head except for a - few control codes. */ - while (begp < endp && (c = *begp) < 0x80 - && c != ISO_CODE_CR && c != ISO_CODE_SO - && c != ISO_CODE_SI && c != ISO_CODE_ESC - && (!eol_conversion || c != ISO_CODE_LF)) - begp++; - } - switch (coding->category_idx) - { - case CODING_CATEGORY_IDX_ISO_8_1: - case CODING_CATEGORY_IDX_ISO_8_2: - /* We can skip all ASCII characters at the tail. */ - if (eol_conversion) - while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--; - else - while (begp < endp && endp[-1] < 0x80) endp--; - /* Do not consider LF as ascii if preceded by CR, since that - confuses eol decoding. 
*/ - if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n') - endp++; - break; - - case CODING_CATEGORY_IDX_ISO_7: - case CODING_CATEGORY_IDX_ISO_7_TIGHT: - { - /* We can skip all characters at the tail except for 8-bit - codes and ESC and the following 2-byte at the tail. */ - unsigned char *eight_bit = NULL; - - if (eol_conversion) - while (begp < endp - && (c = endp[-1]) != ISO_CODE_ESC && c != '\r') - { - if (!eight_bit && c & 0x80) eight_bit = endp; - endp--; - } - else - while (begp < endp - && (c = endp[-1]) != ISO_CODE_ESC) - { - if (!eight_bit && c & 0x80) eight_bit = endp; - endp--; - } - /* Do not consider LF as ascii if preceded by CR, since that - confuses eol decoding. */ - if (begp < endp && endp < endp_orig - && endp[-1] == '\r' && endp[0] == '\n') - endp++; - if (begp < endp && endp[-1] == ISO_CODE_ESC) - { - if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B') - /* This is an ASCII designation sequence. We can - surely skip the tail. But, if we have - encountered an 8-bit code, skip only the codes - after that. */ - endp = eight_bit ? eight_bit : endp + 2; - else - /* Hmmm, we can't skip the tail. */ - endp = endp_orig; - } - else if (eight_bit) - endp = eight_bit; - } - } - break; - - default: - abort (); - } - *beg += begp - begp_orig; - *end += endp - endp_orig; - return; -} - -/* Like shrink_decoding_region but for encoding. */ - -static void -shrink_encoding_region (beg, end, coding, str) - int *beg, *end; - struct coding_system *coding; - unsigned char *str; -{ - unsigned char *begp_orig, *begp, *endp_orig, *endp; - int eol_conversion; - Lisp_Object translation_table; - - if (coding->type == coding_type_ccl - || coding->eol_type == CODING_EOL_CRLF - || coding->eol_type == CODING_EOL_CR - || (coding->cmp_data && coding->cmp_data->used > 0)) - { - /* We can't skip any data. 
*/ - return; - } - if (coding->type == coding_type_no_conversion - || coding->type == coding_type_raw_text - || coding->type == coding_type_emacs_mule - || coding->type == coding_type_undecided) - { - /* We need no conversion, but don't have to skip any data here. - Encoding routine handles them effectively anyway. */ - return; - } - - translation_table = coding->translation_table_for_encode; - if (NILP (translation_table) && !NILP (Venable_character_translation)) - translation_table = Vstandard_translation_table_for_encode; - if (CHAR_TABLE_P (translation_table)) - { - int i; - for (i = 0; i < 128; i++) - if (!NILP (CHAR_TABLE_REF (translation_table, i))) - break; - if (i < 128) - /* Some ASCII character should be translated. We give up - shrinking. */ - return; - } - - if (str) - { - begp_orig = begp = str + *beg; - endp_orig = endp = str + *end; - } - else - { - begp_orig = begp = BYTE_POS_ADDR (*beg); - endp_orig = endp = begp + *end - *beg; - } - - eol_conversion = (coding->eol_type == CODING_EOL_CR - || coding->eol_type == CODING_EOL_CRLF); - - /* Here, we don't have to check coding->pre_write_conversion because - the caller is expected to have handled it already. */ - switch (coding->type) - { - case coding_type_iso2022: - if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII) - /* We can't skip any data. */ - break; - if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL) - { - unsigned char *bol = begp; - while (begp < endp && *begp < 0x80) - { - begp++; - if (begp[-1] == '\n') - bol = begp; - } - begp = bol; - goto label_skip_tail; - } - /* fall down ... */ - - case coding_type_sjis: - case coding_type_big5: - /* We can skip all ASCII characters at the head and tail. 
*/ - if (eol_conversion) - while (begp < endp && *begp < 0x80 && *begp != '\n') begp++; - else - while (begp < endp && *begp < 0x80) begp++; - label_skip_tail: - if (eol_conversion) - while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--; - else - while (begp < endp && *(endp - 1) < 0x80) endp--; - break; - - default: - abort (); - } - - *beg += begp - begp_orig; - *end += endp - endp_orig; - return; -} - -/* As shrinking conversion region requires some overhead, we don't try - shrinking if the length of conversion region is less than this - value. */ -static int shrink_conversion_region_threshhold = 1024; - -#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep) \ - do { \ - if (*(end) - *(beg) > shrink_conversion_region_threshhold) \ - { \ - if (encodep) shrink_encoding_region (beg, end, coding, str); \ - else shrink_decoding_region (beg, end, coding, str); \ - } \ - } while (0) - -static Lisp_Object -code_convert_region_unwind (dummy) - Lisp_Object dummy; -{ - inhibit_pre_post_conversion = 0; - return Qnil; -} - -/* Store information about all compositions in the range FROM and TO - of OBJ in memory blocks pointed by CODING->cmp_data. OBJ is a - buffer or a string, defaults to the current buffer. 
*/ - -void -coding_save_composition (coding, from, to, obj) - struct coding_system *coding; - int from, to; - Lisp_Object obj; -{ - Lisp_Object prop; - int start, end; - - if (coding->composing == COMPOSITION_DISABLED) - return; - if (!coding->cmp_data) - coding_allocate_composition_data (coding, from); - if (!find_composition (from, to, &start, &end, &prop, obj) - || end > to) - return; - if (start < from - && (!find_composition (end, to, &start, &end, &prop, obj) - || end > to)) - return; - coding->composing = COMPOSITION_NO; - do - { - if (COMPOSITION_VALID_P (start, end, prop)) - { - enum composition_method method = COMPOSITION_METHOD (prop); - if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH - >= COMPOSITION_DATA_SIZE) - coding_allocate_composition_data (coding, from); - /* For relative composition, we remember start and end - positions, for the other compositions, we also remember - components. */ - CODING_ADD_COMPOSITION_START (coding, start - from, method); - if (method != COMPOSITION_RELATIVE) - { - /* We must store a*/ - Lisp_Object val, ch; - - val = COMPOSITION_COMPONENTS (prop); - if (CONSP (val)) - while (CONSP (val)) - { - ch = XCAR (val), val = XCDR (val); - CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch)); - } - else if (VECTORP (val) || STRINGP (val)) - { - int len = (VECTORP (val) - ? XVECTOR (val)->size : XSTRING (val)->size); - int i; - for (i = 0; i < len; i++) - { - ch = (STRINGP (val) - ? Faref (val, make_number (i)) - : XVECTOR (val)->contents[i]); - CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch)); - } - } - else /* INTEGERP (val) */ - CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val)); - } - CODING_ADD_COMPOSITION_END (coding, end - from); - } - start = end; - } - while (start < to - && find_composition (start, to, &start, &end, &prop, obj) - && end <= to); - - /* Make coding->cmp_data point to the first memory block. 
*/ - while (coding->cmp_data->prev) - coding->cmp_data = coding->cmp_data->prev; - coding->cmp_data_start = 0; -} - -/* Reflect the saved information about compositions to OBJ. - CODING->cmp_data points to a memory block for the information. OBJ - is a buffer or a string, defaults to the current buffer. */ - -void -coding_restore_composition (coding, obj) - struct coding_system *coding; - Lisp_Object obj; -{ - struct composition_data *cmp_data = coding->cmp_data; - - if (!cmp_data) - return; - - while (cmp_data->prev) - cmp_data = cmp_data->prev; - - while (cmp_data) - { - int i; - - for (i = 0; i < cmp_data->used && cmp_data->data[i] > 0; - i += cmp_data->data[i]) - { - int *data = cmp_data->data + i; - enum composition_method method = (enum composition_method) data[3]; - Lisp_Object components; - - if (method == COMPOSITION_RELATIVE) - components = Qnil; - else - { - int len = data[0] - 4, j; - Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1]; - - for (j = 0; j < len; j++) - args[j] = make_number (data[4 + j]); - components = (method == COMPOSITION_WITH_ALTCHARS - ? Fstring (len, args) : Fvector (len, args)); - } - compose_text (data[1], data[2], components, Qnil, obj); - } - cmp_data = cmp_data->next; - } -} - -/* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the - text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by - coding system CODING, and return the status code of code conversion - (currently, this value has no meaning). - - How many characters (and bytes) are converted to how many - characters (and bytes) are recorded in members of the structure - CODING. - - If REPLACE is nonzero, we do various things as if the original text - is deleted and a new text is inserted. See the comments in - replace_range (insdel.c) to know what we are doing. - - If REPLACE is zero, it is assumed that the source text is unibyte. - Otherwise, it is assumed that the source text is multibyte. 
*/ - -int -code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace) - int from, from_byte, to, to_byte, encodep, replace; - struct coding_system *coding; -{ - int len = to - from, len_byte = to_byte - from_byte; - int nchars_del = 0, nbytes_del = 0; - int require, inserted, inserted_byte; - int head_skip, tail_skip, total_skip = 0; - Lisp_Object saved_coding_symbol; - int first = 1; - unsigned char *src, *dst; - Lisp_Object deletion; - int orig_point = PT, orig_len = len; - int prev_Z; - int multibyte_p = !NILP (current_buffer->enable_multibyte_characters); - - deletion = Qnil; - saved_coding_symbol = coding->symbol; - - if (from < PT && PT < to) - { - TEMP_SET_PT_BOTH (from, from_byte); - orig_point = from; - } - - if (replace) - { - int saved_from = from; - int saved_inhibit_modification_hooks; - - prepare_to_modify_buffer (from, to, &from); - if (saved_from != from) - { - to = from + len; - from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to); - len_byte = to_byte - from_byte; - } - - /* The code conversion routine can not preserve text properties - for now. So, we must remove all text properties in the - region. Here, we must suppress all modification hooks. */ - saved_inhibit_modification_hooks = inhibit_modification_hooks; - inhibit_modification_hooks = 1; - Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil); - inhibit_modification_hooks = saved_inhibit_modification_hooks; - } - - if (! encodep && CODING_REQUIRE_DETECTION (coding)) - { - /* We must detect encoding of text and eol format. */ - - if (from < GPT && to > GPT) - move_gap_both (from, from_byte); - if (coding->type == coding_type_undecided) - { - detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte); - if (coding->type == coding_type_undecided) - { - /* It seems that the text contains only ASCII, but we - should not leave it undecided because the deeper - decoding routine (decode_coding) tries to detect the - encodings again in vain. 
*/ - coding->type = coding_type_emacs_mule; - coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE; - /* As emacs-mule decoder will handle composition, we - need this setting to allocate coding->cmp_data - later. */ - coding->composing = COMPOSITION_NO; - } - } - if (coding->eol_type == CODING_EOL_UNDECIDED - && coding->type != coding_type_ccl) - { - detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte); - if (coding->eol_type == CODING_EOL_UNDECIDED) - coding->eol_type = CODING_EOL_LF; - /* We had better recover the original eol format if we - encounter an inconsistent eol format while decoding. */ - coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL; - } - } - - /* Now we convert the text. */ - - /* For encoding, we must process pre-write-conversion in advance. */ - if (! inhibit_pre_post_conversion - && encodep - && SYMBOLP (coding->pre_write_conversion) - && ! NILP (Ffboundp (coding->pre_write_conversion))) - { - /* The function in pre-write-conversion may put a new text in a - new buffer. */ - struct buffer *prev = current_buffer; - Lisp_Object new; - - record_unwind_protect (code_convert_region_unwind, Qnil); - /* We should not call any more pre-write/post-read-conversion - functions while this pre-write-conversion is running. */ - inhibit_pre_post_conversion = 1; - call2 (coding->pre_write_conversion, - make_number (from), make_number (to)); - inhibit_pre_post_conversion = 0; - /* Discard the unwind protect. 
*/ - specpdl_ptr--; - - if (current_buffer != prev) - { - len = ZV - BEGV; - new = Fcurrent_buffer (); - set_buffer_internal_1 (prev); - del_range_2 (from, from_byte, to, to_byte, 0); - TEMP_SET_PT_BOTH (from, from_byte); - insert_from_buffer (XBUFFER (new), 1, len, 0); - Fkill_buffer (new); - if (orig_point >= to) - orig_point += len - orig_len; - else if (orig_point > from) - orig_point = from; - orig_len = len; - to = from + len; - from_byte = CHAR_TO_BYTE (from); - to_byte = CHAR_TO_BYTE (to); - len_byte = to_byte - from_byte; - TEMP_SET_PT_BOTH (from, from_byte); - } - } - - if (replace) - { - if (! EQ (current_buffer->undo_list, Qt)) - deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1); - else - { - nchars_del = to - from; - nbytes_del = to_byte - from_byte; - } - } - - if (coding->composing != COMPOSITION_DISABLED) - { - if (encodep) - coding_save_composition (coding, from, to, Fcurrent_buffer ()); - else - coding_allocate_composition_data (coding, from); - } - - /* Try to skip the heading and tailing ASCIIs. */ - if (coding->type != coding_type_ccl) - { - int from_byte_orig = from_byte, to_byte_orig = to_byte; - - if (from < GPT && GPT < to) - move_gap_both (from, from_byte); - SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep); - if (from_byte == to_byte - && (encodep || NILP (coding->post_read_conversion)) - && ! CODING_REQUIRE_FLUSHING (coding)) - { - coding->produced = len_byte; - coding->produced_char = len; - if (!replace) - /* We must record and adjust for this new text now. */ - adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len); - return 0; - } - - head_skip = from_byte - from_byte_orig; - tail_skip = to_byte_orig - to_byte; - total_skip = head_skip + tail_skip; - from += head_skip; - to -= tail_skip; - len -= total_skip; len_byte -= total_skip; - } - - /* For conversion, we must put the gap before the text in addition to - making the gap larger for efficient decoding. 
The required gap - size starts from 2000 which is the magic number used in make_gap. - But, after one batch of conversion, it will be incremented if we - find that it is not enough . */ - require = 2000; - - if (GAP_SIZE < require) - make_gap (require - GAP_SIZE); - move_gap_both (from, from_byte); - - inserted = inserted_byte = 0; - - GAP_SIZE += len_byte; - ZV -= len; - Z -= len; - ZV_BYTE -= len_byte; - Z_BYTE -= len_byte; - - if (GPT - BEG < BEG_UNCHANGED) - BEG_UNCHANGED = GPT - BEG; - if (Z - GPT < END_UNCHANGED) - END_UNCHANGED = Z - GPT; - - if (!encodep && coding->src_multibyte) - { - /* Decoding routines expects that the source text is unibyte. - We must convert 8-bit characters of multibyte form to - unibyte. */ - int len_byte_orig = len_byte; - len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte); - if (len_byte < len_byte_orig) - safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte, - len_byte); - coding->src_multibyte = 0; - } - - for (;;) - { - int result; - - /* The buffer memory is now: - +--------+converted-text+---------+-------original-text-------+---+ - |<-from->|<--inserted-->|---------|<--------len_byte--------->|---| - |<---------------------- GAP ----------------------->| */ - src = GAP_END_ADDR - len_byte; - dst = GPT_ADDR + inserted_byte; - - if (encodep) - result = encode_coding (coding, src, dst, len_byte, 0); - else - { - if (coding->composing != COMPOSITION_DISABLED) - coding->cmp_data->char_offset = from + inserted; - result = decode_coding (coding, src, dst, len_byte, 0); - } - - /* The buffer memory is now: - +--------+-------converted-text----+--+------original-text----+---+ - |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---| - |<---------------------- GAP ----------------------->| */ - - inserted += coding->produced_char; - inserted_byte += coding->produced; - len_byte -= coding->consumed; - - if (result == CODING_FINISH_INSUFFICIENT_CMP) - { - coding_allocate_composition_data (coding, from 
+ inserted); - continue; - } - - src += coding->consumed; - dst += coding->produced; - - if (result == CODING_FINISH_NORMAL) - { - src += len_byte; - break; - } - if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL) - { - unsigned char *pend = dst, *p = pend - inserted_byte; - Lisp_Object eol_type; - - /* Encode LFs back to the original eol format (CR or CRLF). */ - if (coding->eol_type == CODING_EOL_CR) - { - while (p < pend) if (*p++ == '\n') p[-1] = '\r'; - } - else - { - int count = 0; - - while (p < pend) if (*p++ == '\n') count++; - if (src - dst < count) - { - /* We don't have sufficient room for encoding LFs - back to CRLF. We must record converted and - not-yet-converted text back to the buffer - content, enlarge the gap, then record them out of - the buffer contents again. */ - int add = len_byte + inserted_byte; - - GAP_SIZE -= add; - ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add; - GPT += inserted_byte; GPT_BYTE += inserted_byte; - make_gap (count - GAP_SIZE); - GAP_SIZE += add; - ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add; - GPT -= inserted_byte; GPT_BYTE -= inserted_byte; - /* Don't forget to update SRC, DST, and PEND. */ - src = GAP_END_ADDR - len_byte; - dst = GPT_ADDR + inserted_byte; - pend = dst; - } - inserted += count; - inserted_byte += count; - coding->produced += count; - p = dst = pend + count; - while (count) - { - *--p = *--pend; - if (*p == '\n') count--, *--p = '\r'; - } - } - - /* Suppress eol-format conversion in the further conversion. */ - coding->eol_type = CODING_EOL_LF; - - /* Set the coding system symbol to that for Unix-like EOL. 
*/ - eol_type = Fget (saved_coding_symbol, Qeol_type); - if (VECTORP (eol_type) - && XVECTOR (eol_type)->size == 3 - && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF])) - coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF]; - else - coding->symbol = saved_coding_symbol; - - continue; - } - if (len_byte <= 0) - { - if (coding->type != coding_type_ccl - || coding->mode & CODING_MODE_LAST_BLOCK) - break; - coding->mode |= CODING_MODE_LAST_BLOCK; - continue; - } - if (result == CODING_FINISH_INSUFFICIENT_SRC) - { - /* The source text ends in invalid codes. Let's just - make them valid buffer contents, and finish conversion. */ - if (multibyte_p) - { - unsigned char *start = dst; - - inserted += len_byte; - while (len_byte--) - { - int c = *src++; - dst += CHAR_STRING (c, dst); - } - - inserted_byte += dst - start; - } - else - { - inserted += len_byte; - inserted_byte += len_byte; - while (len_byte--) - *dst++ = *src++; - } - break; - } - if (result == CODING_FINISH_INTERRUPT) - { - /* The conversion procedure was interrupted by a user. */ - break; - } - /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST */ - if (coding->consumed < 1) - { - /* It's quite strange to require more memory without - consuming any bytes. Perhaps CCL program bug. */ - break; - } - if (first) - { - /* We have just done the first batch of conversion which was - stopped because of insufficient gap. Let's reconsider the - required gap size (i.e. SRT - DST) now. - - We have converted ORIG bytes (== coding->consumed) into - NEW bytes (coding->produced). To convert the remaining - LEN bytes, we may need REQUIRE bytes of gap, where: - REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG) - REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG - Here, we are sure that NEW >= ORIG. */ - float ratio = coding->produced - coding->consumed; - ratio /= coding->consumed; - require = len_byte * ratio; - first = 0; - } - if ((src - dst) < (require + 2000)) - { - /* See the comment above the previous call of make_gap. 
*/ - int add = len_byte + inserted_byte; - - GAP_SIZE -= add; - ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add; - GPT += inserted_byte; GPT_BYTE += inserted_byte; - make_gap (require + 2000); - GAP_SIZE += add; - ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add; - GPT -= inserted_byte; GPT_BYTE -= inserted_byte; - } - } - if (src - dst > 0) *dst = 0; /* Put an anchor. */ - - if (encodep && coding->dst_multibyte) - { - /* The output is unibyte. We must convert 8-bit characters to - multibyte form. */ - if (inserted_byte * 2 > GAP_SIZE) - { - GAP_SIZE -= inserted_byte; - ZV += inserted_byte; Z += inserted_byte; - ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte; - GPT += inserted_byte; GPT_BYTE += inserted_byte; - make_gap (inserted_byte - GAP_SIZE); - GAP_SIZE += inserted_byte; - ZV -= inserted_byte; Z -= inserted_byte; - ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte; - GPT -= inserted_byte; GPT_BYTE -= inserted_byte; - } - inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte); - } - - /* If we shrank the conversion area, adjust it now. */ - if (total_skip > 0) - { - if (tail_skip > 0) - safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip); - inserted += total_skip; inserted_byte += total_skip; - GAP_SIZE += total_skip; - GPT -= head_skip; GPT_BYTE -= head_skip; - ZV -= total_skip; ZV_BYTE -= total_skip; - Z -= total_skip; Z_BYTE -= total_skip; - from -= head_skip; from_byte -= head_skip; - to += tail_skip; to_byte += tail_skip; - } - - prev_Z = Z; - if (! EQ (current_buffer->undo_list, Qt)) - adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte); - else - adjust_after_replace_noundo (from, from_byte, nchars_del, nbytes_del, - inserted, inserted_byte); - inserted = Z - prev_Z; - - if (!encodep && coding->cmp_data && coding->cmp_data->used) - coding_restore_composition (coding, Fcurrent_buffer ()); - coding_free_composition_data (coding); - - if (! inhibit_pre_post_conversion - && ! encodep && ! 
NILP (coding->post_read_conversion)) - { - Lisp_Object val; - - if (from != PT) - TEMP_SET_PT_BOTH (from, from_byte); - prev_Z = Z; - record_unwind_protect (code_convert_region_unwind, Qnil); - /* We should not call any more pre-write/post-read-conversion - functions while this post-read-conversion is running. */ - inhibit_pre_post_conversion = 1; - val = call1 (coding->post_read_conversion, make_number (inserted)); - inhibit_pre_post_conversion = 0; - /* Discard the unwind protect. */ - specpdl_ptr--; - CHECK_NUMBER (val); - inserted += Z - prev_Z; - } - - if (orig_point >= from) - { - if (orig_point >= from + orig_len) - orig_point += inserted - orig_len; - else - orig_point = from; - TEMP_SET_PT (orig_point); - } - - if (replace) - { - signal_after_change (from, to - from, inserted); - update_compositions (from, from + inserted, CHECK_BORDER); - } - - { - coding->consumed = to_byte - from_byte; - coding->consumed_char = to - from; - coding->produced = inserted_byte; - coding->produced_char = inserted; - } - - return 0; -} - -Lisp_Object -run_pre_post_conversion_on_str (str, coding, encodep) - Lisp_Object str; - struct coding_system *coding; - int encodep; -{ - int count = specpdl_ptr - specpdl; - struct gcpro gcpro1; - int multibyte = STRING_MULTIBYTE (str); - - record_unwind_protect (Fset_buffer, Fcurrent_buffer ()); - record_unwind_protect (code_convert_region_unwind, Qnil); - GCPRO1 (str); - temp_output_buffer_setup (" *code-converting-work*"); - set_buffer_internal (XBUFFER (Vstandard_output)); - /* We must insert the contents of STR as is without - unibyte<->multibyte conversion. For that, we adjust the - multibyteness of the working buffer to that of STR. */ - Ferase_buffer (); - current_buffer->enable_multibyte_characters = multibyte ? 
Qt : Qnil; - insert_from_string (str, 0, 0, - XSTRING (str)->size, STRING_BYTES (XSTRING (str)), 0); - UNGCPRO; - inhibit_pre_post_conversion = 1; - if (encodep) - call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z)); - else - { - TEMP_SET_PT_BOTH (BEG, BEG_BYTE); - call1 (coding->post_read_conversion, make_number (Z - BEG)); - } - inhibit_pre_post_conversion = 0; - str = make_buffer_string (BEG, Z, 1); - return unbind_to (count, str); -} - -Lisp_Object -decode_coding_string (str, coding, nocopy) - Lisp_Object str; - struct coding_system *coding; - int nocopy; -{ - int len; - struct conversion_buffer buf; - int from, to_byte; - Lisp_Object saved_coding_symbol; - int result; - int require_decoding; - int shrinked_bytes = 0; - Lisp_Object newstr; - int consumed, consumed_char, produced, produced_char; - - from = 0; - to_byte = STRING_BYTES (XSTRING (str)); - - saved_coding_symbol = coding->symbol; - coding->src_multibyte = STRING_MULTIBYTE (str); - coding->dst_multibyte = 1; - if (CODING_REQUIRE_DETECTION (coding)) - { - /* See the comments in code_convert_region. */ - if (coding->type == coding_type_undecided) - { - detect_coding (coding, XSTRING (str)->data, to_byte); - if (coding->type == coding_type_undecided) - { - coding->type = coding_type_emacs_mule; - coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE; - /* As emacs-mule decoder will handle composition, we - need this setting to allocate coding->cmp_data - later. */ - coding->composing = COMPOSITION_NO; - } - } - if (coding->eol_type == CODING_EOL_UNDECIDED - && coding->type != coding_type_ccl) - { - saved_coding_symbol = coding->symbol; - detect_eol (coding, XSTRING (str)->data, to_byte); - if (coding->eol_type == CODING_EOL_UNDECIDED) - coding->eol_type = CODING_EOL_LF; - /* We had better recover the original eol format if we - encounter an inconsistent eol format while decoding. 
*/ - coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL; - } - } - - if (coding->type == coding_type_no_conversion - || coding->type == coding_type_raw_text) - coding->dst_multibyte = 0; - - require_decoding = CODING_REQUIRE_DECODING (coding); - - if (STRING_MULTIBYTE (str)) - { - /* Decoding routines expect the source text to be unibyte. */ - str = Fstring_as_unibyte (str); - to_byte = STRING_BYTES (XSTRING (str)); - nocopy = 1; - coding->src_multibyte = 0; - } - - /* Try to skip the heading and tailing ASCIIs. */ - if (require_decoding && coding->type != coding_type_ccl) - { - SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data, - 0); - if (from == to_byte) - require_decoding = 0; - shrinked_bytes = from + (STRING_BYTES (XSTRING (str)) - to_byte); - } - - if (!require_decoding) - { - coding->consumed = STRING_BYTES (XSTRING (str)); - coding->consumed_char = XSTRING (str)->size; - if (coding->dst_multibyte) - { - str = Fstring_as_multibyte (str); - nocopy = 1; - } - coding->produced = STRING_BYTES (XSTRING (str)); - coding->produced_char = XSTRING (str)->size; - return (nocopy ? 
str : Fcopy_sequence (str)); - } - - if (coding->composing != COMPOSITION_DISABLED) - coding_allocate_composition_data (coding, from); - len = decoding_buffer_size (coding, to_byte - from); - allocate_conversion_buffer (buf, len); - - consumed = consumed_char = produced = produced_char = 0; - while (1) - { - result = decode_coding (coding, XSTRING (str)->data + from + consumed, - buf.data + produced, to_byte - from - consumed, - buf.size - produced); - consumed += coding->consumed; - consumed_char += coding->consumed_char; - produced += coding->produced; - produced_char += coding->produced_char; - if (result == CODING_FINISH_NORMAL - || (result == CODING_FINISH_INSUFFICIENT_SRC - && coding->consumed == 0)) - break; - if (result == CODING_FINISH_INSUFFICIENT_CMP) - coding_allocate_composition_data (coding, from + produced_char); - else if (result == CODING_FINISH_INSUFFICIENT_DST) - extend_conversion_buffer (&buf); - else if (result == CODING_FINISH_INCONSISTENT_EOL) - { - Lisp_Object eol_type; - - /* Recover the original EOL format. */ - if (coding->eol_type == CODING_EOL_CR) - { - unsigned char *p; - for (p = buf.data; p < buf.data + produced; p++) - if (*p == '\n') *p = '\r'; - } - else if (coding->eol_type == CODING_EOL_CRLF) - { - int num_eol = 0; - unsigned char *p0, *p1; - for (p0 = buf.data, p1 = p0 + produced; p0 < p1; p0++) - if (*p0 == '\n') num_eol++; - if (produced + num_eol >= buf.size) - extend_conversion_buffer (&buf); - for (p0 = buf.data + produced, p1 = p0 + num_eol; p0 > buf.data;) - { - *--p1 = *--p0; - if (*p0 == '\n') *--p1 = '\r'; - } - produced += num_eol; - produced_char += num_eol; - } - /* Suppress eol-format conversion in the further conversion. */ - coding->eol_type = CODING_EOL_LF; - - /* Set the coding system symbol to that for Unix-like EOL. 
*/ - eol_type = Fget (saved_coding_symbol, Qeol_type); - if (VECTORP (eol_type) - && XVECTOR (eol_type)->size == 3 - && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF])) - coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF]; - else - coding->symbol = saved_coding_symbol; - - - } - } - - coding->consumed = consumed; - coding->consumed_char = consumed_char; - coding->produced = produced; - coding->produced_char = produced_char; - - if (coding->dst_multibyte) - newstr = make_uninit_multibyte_string (produced_char + shrinked_bytes, - produced + shrinked_bytes); - else - newstr = make_uninit_string (produced + shrinked_bytes); - if (from > 0) - bcopy (XSTRING (str)->data, XSTRING (newstr)->data, from); - bcopy (buf.data, XSTRING (newstr)->data + from, produced); - if (shrinked_bytes > from) - bcopy (XSTRING (str)->data + to_byte, - XSTRING (newstr)->data + from + produced, - shrinked_bytes - from); - free_conversion_buffer (&buf); - - if (coding->cmp_data && coding->cmp_data->used) - coding_restore_composition (coding, newstr); - coding_free_composition_data (coding); - - if (SYMBOLP (coding->post_read_conversion) - && !NILP (Ffboundp (coding->post_read_conversion))) - newstr = run_pre_post_conversion_on_str (newstr, coding, 0); - - return newstr; -} - -Lisp_Object -encode_coding_string (str, coding, nocopy) - Lisp_Object str; - struct coding_system *coding; - int nocopy; -{ - int len; - struct conversion_buffer buf; - int from, to, to_byte; - int result; - int shrinked_bytes = 0; - Lisp_Object newstr; - int consumed, consumed_char, produced, produced_char; - - if (SYMBOLP (coding->pre_write_conversion) - && !NILP (Ffboundp (coding->pre_write_conversion))) - str = run_pre_post_conversion_on_str (str, coding, 1); - - from = 0; - to = XSTRING (str)->size; - to_byte = STRING_BYTES (XSTRING (str)); - - /* Encoding routines determine the multibyteness of the source text - by coding->src_multibyte. 
*/ - coding->src_multibyte = STRING_MULTIBYTE (str); - coding->dst_multibyte = 0; - if (! CODING_REQUIRE_ENCODING (coding)) - { - coding->consumed = STRING_BYTES (XSTRING (str)); - coding->consumed_char = XSTRING (str)->size; - if (STRING_MULTIBYTE (str)) - { - str = Fstring_as_unibyte (str); - nocopy = 1; - } - coding->produced = STRING_BYTES (XSTRING (str)); - coding->produced_char = XSTRING (str)->size; - return (nocopy ? str : Fcopy_sequence (str)); - } - - if (coding->composing != COMPOSITION_DISABLED) - coding_save_composition (coding, from, to, str); - - /* Try to skip the heading and tailing ASCIIs. */ - if (coding->type != coding_type_ccl) - { - SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data, - 1); - if (from == to_byte) - return (nocopy ? str : Fcopy_sequence (str)); - shrinked_bytes = from + (STRING_BYTES (XSTRING (str)) - to_byte); - } - - len = encoding_buffer_size (coding, to_byte - from); - allocate_conversion_buffer (buf, len); - - consumed = consumed_char = produced = produced_char = 0; - while (1) - { - result = encode_coding (coding, XSTRING (str)->data + from + consumed, - buf.data + produced, to_byte - from - consumed, - buf.size - produced); - consumed += coding->consumed; - consumed_char += coding->consumed_char; - produced += coding->produced; - produced_char += coding->produced_char; - if (result == CODING_FINISH_NORMAL - || (result == CODING_FINISH_INSUFFICIENT_SRC - && coding->consumed == 0)) - break; - /* Now result should be CODING_FINISH_INSUFFICIENT_DST. 
*/ - extend_conversion_buffer (&buf); - } - - coding->consumed = consumed; - coding->consumed_char = consumed_char; - coding->produced = produced; - coding->produced_char = produced_char; - - newstr = make_uninit_string (produced + shrinked_bytes); - if (from > 0) - bcopy (XSTRING (str)->data, XSTRING (newstr)->data, from); - bcopy (buf.data, XSTRING (newstr)->data + from, produced); - if (shrinked_bytes > from) - bcopy (XSTRING (str)->data + to_byte, - XSTRING (newstr)->data + from + produced, - shrinked_bytes - from); - - free_conversion_buffer (&buf); - coding_free_composition_data (coding); - - return newstr; -} - - -#ifdef emacs -/*** 8. Emacs Lisp library functions ***/ - -DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0, - doc: /* Return t if OBJECT is nil or a coding-system. -See the documentation of `make-coding-system' for information -about coding-system objects. */) - (obj) - Lisp_Object obj; -{ - if (NILP (obj)) - return Qt; - if (!SYMBOLP (obj)) - return Qnil; - /* Get coding-spec vector for OBJ. */ - obj = Fget (obj, Qcoding_system); - return ((VECTORP (obj) && XVECTOR (obj)->size == 5) - ? Qt : Qnil); -} - -DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system, - Sread_non_nil_coding_system, 1, 1, 0, - doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */) - (prompt) - Lisp_Object prompt; -{ - Lisp_Object val; - do - { - val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil, - Qt, Qnil, Qcoding_system_history, Qnil, Qnil); - } - while (XSTRING (val)->size == 0); - return (Fintern (val, Qnil)); -} - -DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0, - doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. -If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. 
*/) - (prompt, default_coding_system) - Lisp_Object prompt, default_coding_system; -{ - Lisp_Object val; - if (SYMBOLP (default_coding_system)) - XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name); - val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil, - Qt, Qnil, Qcoding_system_history, - default_coding_system, Qnil); - return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil)); -} - -DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system, - 1, 1, 0, - doc: /* Check validity of CODING-SYSTEM. -If valid, return CODING-SYSTEM, else signal a `coding-system-error' error. -It is valid if it is a symbol with a non-nil `coding-system' property. -The value of property should be a vector of length 5. */) - (coding_system) - Lisp_Object coding_system; -{ - CHECK_SYMBOL (coding_system); - if (!NILP (Fcoding_system_p (coding_system))) - return coding_system; - while (1) - Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil)); -} - -Lisp_Object -detect_coding_system (src, src_bytes, highest, multibytep) - unsigned char *src; - int src_bytes, highest; - int multibytep; -{ - int coding_mask, eol_type; - Lisp_Object val, tmp; - int dummy; - - coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy, multibytep); - eol_type = detect_eol_type (src, src_bytes, &dummy); - if (eol_type == CODING_EOL_INCONSISTENT) - eol_type = CODING_EOL_UNDECIDED; - - if (!coding_mask) - { - val = Qundecided; - if (eol_type != CODING_EOL_UNDECIDED) - { - Lisp_Object val2; - val2 = Fget (Qundecided, Qeol_type); - if (VECTORP (val2)) - val = XVECTOR (val2)->contents[eol_type]; - } - return (highest ? val : Fcons (val, Qnil)); - } - - /* At first, gather possible coding systems in VAL. 
*/ - val = Qnil; - for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp)) - { - Lisp_Object category_val, category_index; - - category_index = Fget (XCAR (tmp), Qcoding_category_index); - category_val = Fsymbol_value (XCAR (tmp)); - if (!NILP (category_val) - && NATNUMP (category_index) - && (coding_mask & (1 << XFASTINT (category_index)))) - { - val = Fcons (category_val, val); - if (highest) - break; - } - } - if (!highest) - val = Fnreverse (val); - - /* Then, replace the elements with subsidiary coding systems. */ - for (tmp = val; CONSP (tmp); tmp = XCDR (tmp)) - { - if (eol_type != CODING_EOL_UNDECIDED - && eol_type != CODING_EOL_INCONSISTENT) - { - Lisp_Object eol; - eol = Fget (XCAR (tmp), Qeol_type); - if (VECTORP (eol)) - XSETCAR (tmp, XVECTOR (eol)->contents[eol_type]); - } - } - return (highest ? XCAR (val) : val); -} - -DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region, - 2, 3, 0, - doc: /* Detect coding system of the text in the region between START and END. -Return a list of possible coding systems ordered by priority. - -If only ASCII characters are found, it returns a list of single element -`undecided' or its subsidiary coding system according to a detected -end-of-line format. - -If optional argument HIGHEST is non-nil, return the coding system of -highest priority. */) - (start, end, highest) - Lisp_Object start, end, highest; -{ - int from, to; - int from_byte, to_byte; - int include_anchor_byte = 0; - - CHECK_NUMBER_COERCE_MARKER (start); - CHECK_NUMBER_COERCE_MARKER (end); - - validate_region (&start, &end); - from = XINT (start), to = XINT (end); - from_byte = CHAR_TO_BYTE (from); - to_byte = CHAR_TO_BYTE (to); - - if (from < GPT && to >= GPT) - move_gap_both (to, to_byte); - /* If we an anchor byte `\0' follows the region, we include it in - the detecting source. Then code detectors can handle the tailing - byte sequence more accurately. - - Fix me: This is not an perfect solution. 
It is better that we - add one more argument, say LAST_BLOCK, to all detect_coding_XXX. - */ - if (to == Z || (to == GPT && GAP_SIZE > 0)) - include_anchor_byte = 1; - return detect_coding_system (BYTE_POS_ADDR (from_byte), - to_byte - from_byte + include_anchor_byte, - !NILP (highest), - !NILP (current_buffer - ->enable_multibyte_characters)); -} - -DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string, - 1, 2, 0, - doc: /* Detect coding system of the text in STRING. -Return a list of possible coding systems ordered by priority. - -If only ASCII characters are found, it returns a list of single element -`undecided' or its subsidiary coding system according to a detected -end-of-line format. - -If optional argument HIGHEST is non-nil, return the coding system of -highest priority. */) - (string, highest) - Lisp_Object string, highest; -{ - CHECK_STRING (string); - - return detect_coding_system (XSTRING (string)->data, - /* "+ 1" is to include the anchor byte - `\0'. With this, code detectors can - handle the tailing bytes more - accurately. */ - STRING_BYTES (XSTRING (string)) + 1, - !NILP (highest), - STRING_MULTIBYTE (string)); -} - -/* Return an intersection of lists L1 and L2. */ - -static Lisp_Object -intersection (l1, l2) - Lisp_Object l1, l2; -{ - Lisp_Object val; - - for (val = Qnil; CONSP (l1); l1 = XCDR (l1)) - { - if (!NILP (Fmemq (XCAR (l1), l2))) - val = Fcons (XCAR (l1), val); - } - return val; -} - - -/* Subroutine for Fsafe_coding_systems_region_internal. - - Return a list of coding systems that safely encode the multibyte - text between P and PEND. SAFE_CODINGS, if non-nil, is a list of - possible coding systems. If it is nil, it means that we have not - yet found any coding systems. - - WORK_TABLE is a copy of the char-table Vchar_coding_system_table. An - element of WORK_TABLE is set to t once the element is looked up. - - If a non-ASCII single byte char is found, set - *single_byte_char_found to 1. 
*/ - -static Lisp_Object -find_safe_codings (p, pend, safe_codings, work_table, single_byte_char_found) - unsigned char *p, *pend; - Lisp_Object safe_codings, work_table; - int *single_byte_char_found; -{ - int c, len, idx; - Lisp_Object val; - - while (p < pend) - { - c = STRING_CHAR_AND_LENGTH (p, pend - p, len); - p += len; - if (ASCII_BYTE_P (c)) - /* We can ignore ASCII characters here. */ - continue; - if (SINGLE_BYTE_CHAR_P (c)) - *single_byte_char_found = 1; - if (NILP (safe_codings)) - continue; - /* Check the safe coding systems for C. */ - val = char_table_ref_and_index (work_table, c, &idx); - if (EQ (val, Qt)) - /* This element was already checked. Ignore it. */ - continue; - /* Remember that we checked this element. */ - CHAR_TABLE_SET (work_table, make_number (idx), Qt); - - /* If there are some safe coding systems for C and we have - already found the other set of coding systems for the - different characters, get the intersection of them. */ - if (!EQ (safe_codings, Qt) && !NILP (val)) - val = intersection (safe_codings, val); - safe_codings = val; - } - return safe_codings; -} - - -/* Return a list of coding systems that safely encode the text between - START and END. If the text contains only ASCII or is unibyte, - return t. */ - -DEFUN ("find-coding-systems-region-internal", - Ffind_coding_systems_region_internal, - Sfind_coding_systems_region_internal, 2, 2, 0, - doc: /* Internal use only. 
*/) - (start, end) - Lisp_Object start, end; -{ - Lisp_Object work_table, safe_codings; - int non_ascii_p = 0; - int single_byte_char_found = 0; - unsigned char *p1, *p1end, *p2, *p2end, *p; - - if (STRINGP (start)) - { - if (!STRING_MULTIBYTE (start)) - return Qt; - p1 = XSTRING (start)->data, p1end = p1 + STRING_BYTES (XSTRING (start)); - p2 = p2end = p1end; - if (XSTRING (start)->size != STRING_BYTES (XSTRING (start))) - non_ascii_p = 1; - } - else - { - int from, to, stop; - - CHECK_NUMBER_COERCE_MARKER (start); - CHECK_NUMBER_COERCE_MARKER (end); - if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end)) - args_out_of_range (start, end); - if (NILP (current_buffer->enable_multibyte_characters)) - return Qt; - from = CHAR_TO_BYTE (XINT (start)); - to = CHAR_TO_BYTE (XINT (end)); - stop = from < GPT_BYTE && GPT_BYTE < to ? GPT_BYTE : to; - p1 = BYTE_POS_ADDR (from), p1end = p1 + (stop - from); - if (stop == to) - p2 = p2end = p1end; - else - p2 = BYTE_POS_ADDR (stop), p2end = p2 + (to - stop); - if (XINT (end) - XINT (start) != to - from) - non_ascii_p = 1; - } - - if (!non_ascii_p) - { - /* We are sure that the text contains no multibyte character. - Check if it contains eight-bit-graphic. */ - p = p1; - for (p = p1; p < p1end && ASCII_BYTE_P (*p); p++); - if (p == p1end) - { - for (p = p2; p < p2end && ASCII_BYTE_P (*p); p++); - if (p == p2end) - return Qt; - } - } - - /* The text contains non-ASCII characters. */ - work_table = Fcopy_sequence (Vchar_coding_system_table); - safe_codings = find_safe_codings (p1, p1end, Qt, work_table, - &single_byte_char_found); - if (p2 < p2end) - safe_codings = find_safe_codings (p2, p2end, safe_codings, work_table, - &single_byte_char_found); - - if (EQ (safe_codings, Qt)) - ; /* Nothing to be done. */ - else if (!single_byte_char_found) - { - /* Append generic coding systems. 
*/ - Lisp_Object args[2]; - args[0] = safe_codings; - args[1] = Fchar_table_extra_slot (Vchar_coding_system_table, - make_number (0)); - safe_codings = Fappend (2, args); - } - else - safe_codings = Fcons (Qraw_text, - Fcons (Qemacs_mule, - Fcons (Qno_conversion, safe_codings))); - return safe_codings; -} - - -Lisp_Object -code_convert_region1 (start, end, coding_system, encodep) - Lisp_Object start, end, coding_system; - int encodep; -{ - struct coding_system coding; - int from, to; - - CHECK_NUMBER_COERCE_MARKER (start); - CHECK_NUMBER_COERCE_MARKER (end); - CHECK_SYMBOL (coding_system); - - validate_region (&start, &end); - from = XFASTINT (start); - to = XFASTINT (end); - - if (NILP (coding_system)) - return make_number (to - from); - - if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0) - error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data); - - coding.mode |= CODING_MODE_LAST_BLOCK; - coding.src_multibyte = coding.dst_multibyte - = !NILP (current_buffer->enable_multibyte_characters); - code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to), - &coding, encodep, 1); - Vlast_coding_system_used = coding.symbol; - return make_number (coding.produced_char); -} - -DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region, - 3, 3, "r\nzCoding system: ", - doc: /* Decode the current region from the specified coding system. -When called from a program, takes three arguments: -START, END, and CODING-SYSTEM. START and END are buffer positions. -This function sets `last-coding-system-used' to the precise coding system -used (which may be different from CODING-SYSTEM if CODING-SYSTEM is -not fully specified.) -It returns the length of the decoded text. 
*/) - (start, end, coding_system) - Lisp_Object start, end, coding_system; -{ - return code_convert_region1 (start, end, coding_system, 0); -} - -DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region, - 3, 3, "r\nzCoding system: ", - doc: /* Encode the current region into the specified coding system. -When called from a program, takes three arguments: -START, END, and CODING-SYSTEM. START and END are buffer positions. -This function sets `last-coding-system-used' to the precise coding system -used (which may be different from CODING-SYSTEM if CODING-SYSTEM is -not fully specified.) -It returns the length of the encoded text. */) - (start, end, coding_system) - Lisp_Object start, end, coding_system; -{ - return code_convert_region1 (start, end, coding_system, 1); -} - -Lisp_Object -code_convert_string1 (string, coding_system, nocopy, encodep) - Lisp_Object string, coding_system, nocopy; - int encodep; -{ - struct coding_system coding; - - CHECK_STRING (string); - CHECK_SYMBOL (coding_system); - - if (NILP (coding_system)) - return (NILP (nocopy) ? Fcopy_sequence (string) : string); - - if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0) - error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data); - - coding.mode |= CODING_MODE_LAST_BLOCK; - string = (encodep - ? encode_coding_string (string, &coding, !NILP (nocopy)) - : decode_coding_string (string, &coding, !NILP (nocopy))); - Vlast_coding_system_used = coding.symbol; - - return string; -} - -DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string, - 2, 3, 0, - doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result. -Optional arg NOCOPY non-nil means it is OK to return STRING itself -if the decoding operation is trivial. -This function sets `last-coding-system-used' to the precise coding system -used (which may be different from CODING-SYSTEM if CODING-SYSTEM is -not fully specified.) 
*/) - (string, coding_system, nocopy) - Lisp_Object string, coding_system, nocopy; -{ - return code_convert_string1 (string, coding_system, nocopy, 0); -} - -DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string, - 2, 3, 0, - doc: /* Encode STRING to CODING-SYSTEM, and return the result. -Optional arg NOCOPY non-nil means it is OK to return STRING itself -if the encoding operation is trivial. -This function sets `last-coding-system-used' to the precise coding system -used (which may be different from CODING-SYSTEM if CODING-SYSTEM is -not fully specified.) */) - (string, coding_system, nocopy) - Lisp_Object string, coding_system, nocopy; -{ - return code_convert_string1 (string, coding_system, nocopy, 1); -} - -/* Encode or decode STRING according to CODING_SYSTEM. - Do not set Vlast_coding_system_used. - - This function is called only from macros DECODE_FILE and - ENCODE_FILE, thus we ignore character composition. */ - -Lisp_Object -code_convert_string_norecord (string, coding_system, encodep) - Lisp_Object string, coding_system; - int encodep; -{ - struct coding_system coding; - - CHECK_STRING (string); - CHECK_SYMBOL (coding_system); - - if (NILP (coding_system)) - return string; - - if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0) - error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data); - - coding.composing = COMPOSITION_DISABLED; - coding.mode |= CODING_MODE_LAST_BLOCK; - return (encodep - ? encode_coding_string (string, &coding, 1) - : decode_coding_string (string, &coding, 1)); -} - -DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0, - doc: /* Decode a Japanese character which has CODE in shift_jis encoding. -Return the corresponding character. 
*/) - (code) - Lisp_Object code; -{ - unsigned char c1, c2, s1, s2; - Lisp_Object val; - - CHECK_NUMBER (code); - s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF; - if (s1 == 0) - { - if (s2 < 0x80) - XSETFASTINT (val, s2); - else if (s2 >= 0xA0 || s2 <= 0xDF) - XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0)); - else - error ("Invalid Shift JIS code: %x", XFASTINT (code)); - } - else - { - if ((s1 < 0x80 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF) - || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)) - error ("Invalid Shift JIS code: %x", XFASTINT (code)); - DECODE_SJIS (s1, s2, c1, c2); - XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2)); - } - return val; -} - -DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0, - doc: /* Encode a Japanese character CHAR to shift_jis encoding. -Return the corresponding code in SJIS. */) - (ch) - Lisp_Object ch; -{ - int charset, c1, c2, s1, s2; - Lisp_Object val; - - CHECK_NUMBER (ch); - SPLIT_CHAR (XFASTINT (ch), charset, c1, c2); - if (charset == CHARSET_ASCII) - { - val = ch; - } - else if (charset == charset_jisx0208 - && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F) - { - ENCODE_SJIS (c1, c2, s1, s2); - XSETFASTINT (val, (s1 << 8) | s2); - } - else if (charset == charset_katakana_jisx0201 - && c1 > 0x20 && c2 < 0xE0) - { - XSETFASTINT (val, c1 | 0x80); - } - else - error ("Can't encode to shift_jis: %d", XFASTINT (ch)); - return val; -} - -DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0, - doc: /* Decode a Big5 character which has CODE in BIG5 coding system. -Return the corresponding character. 
*/) - (code) - Lisp_Object code; -{ - int charset; - unsigned char b1, b2, c1, c2; - Lisp_Object val; - - CHECK_NUMBER (code); - b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF; - if (b1 == 0) - { - if (b2 >= 0x80) - error ("Invalid BIG5 code: %x", XFASTINT (code)); - val = code; - } - else - { - if ((b1 < 0xA1 || b1 > 0xFE) - || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)) - error ("Invalid BIG5 code: %x", XFASTINT (code)); - DECODE_BIG5 (b1, b2, charset, c1, c2); - XSETFASTINT (val, MAKE_CHAR (charset, c1, c2)); - } - return val; -} - -DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0, - doc: /* Encode the Big5 character CHAR to BIG5 coding system. -Return the corresponding character code in Big5. */) - (ch) - Lisp_Object ch; -{ - int charset, c1, c2, b1, b2; - Lisp_Object val; - - CHECK_NUMBER (ch); - SPLIT_CHAR (XFASTINT (ch), charset, c1, c2); - if (charset == CHARSET_ASCII) - { - val = ch; - } - else if ((charset == charset_big5_1 - && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec)) - || (charset == charset_big5_2 - && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2)) - { - ENCODE_BIG5 (charset, c1, c2, b1, b2); - XSETFASTINT (val, (b1 << 8) | b2); - } - else - error ("Can't encode to Big5: %d", XFASTINT (ch)); - return val; -} - -DEFUN ("set-terminal-coding-system-internal", - Fset_terminal_coding_system_internal, - Sset_terminal_coding_system_internal, 1, 1, 0, - doc: /* Internal use only. */) - (coding_system) - Lisp_Object coding_system; -{ - CHECK_SYMBOL (coding_system); - setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding); - /* We had better not send unsafe characters to terminal. */ - terminal_coding.flags |= CODING_FLAG_ISO_SAFE; - /* Character composition should be disabled. */ - terminal_coding.composing = COMPOSITION_DISABLED; - /* Error notification should be suppressed. 
*/ - terminal_coding.suppress_error = 1; - terminal_coding.src_multibyte = 1; - terminal_coding.dst_multibyte = 0; - return Qnil; -} - -DEFUN ("set-safe-terminal-coding-system-internal", - Fset_safe_terminal_coding_system_internal, - Sset_safe_terminal_coding_system_internal, 1, 1, 0, - doc: /* Internal use only. */) - (coding_system) - Lisp_Object coding_system; -{ - CHECK_SYMBOL (coding_system); - setup_coding_system (Fcheck_coding_system (coding_system), - &safe_terminal_coding); - /* Character composition should be disabled. */ - safe_terminal_coding.composing = COMPOSITION_DISABLED; - /* Error notification should be suppressed. */ - terminal_coding.suppress_error = 1; - safe_terminal_coding.src_multibyte = 1; - safe_terminal_coding.dst_multibyte = 0; - return Qnil; -} - -DEFUN ("terminal-coding-system", - Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0, - doc: /* Return coding system specified for terminal output. */) - () -{ - return terminal_coding.symbol; -} - -DEFUN ("set-keyboard-coding-system-internal", - Fset_keyboard_coding_system_internal, - Sset_keyboard_coding_system_internal, 1, 1, 0, - doc: /* Internal use only. */) - (coding_system) - Lisp_Object coding_system; -{ - CHECK_SYMBOL (coding_system); - setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding); - /* Character composition should be disabled. */ - keyboard_coding.composing = COMPOSITION_DISABLED; - return Qnil; -} - -DEFUN ("keyboard-coding-system", - Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0, - doc: /* Return coding system specified for decoding keyboard input. */) - () -{ - return keyboard_coding.symbol; -} - - -DEFUN ("find-operation-coding-system", Ffind_operation_coding_system, - Sfind_operation_coding_system, 1, MANY, 0, - doc: /* Choose a coding system for an operation based on the target name. -The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM). 
-DECODING-SYSTEM is the coding system to use for decoding -\(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system -for encoding (in case OPERATION does encoding). - -The first argument OPERATION specifies an I/O primitive: - For file I/O, `insert-file-contents' or `write-region'. - For process I/O, `call-process', `call-process-region', or `start-process'. - For network I/O, `open-network-stream'. - -The remaining arguments should be the same arguments that were passed -to the primitive. Depending on which primitive, one of those arguments -is selected as the TARGET. For example, if OPERATION does file I/O, -whichever argument specifies the file name is TARGET. - -TARGET has a meaning which depends on OPERATION: - For file I/O, TARGET is a file name. - For process I/O, TARGET is a process name. - For network I/O, TARGET is a service name or a port number - -This function looks up what specified for TARGET in, -`file-coding-system-alist', `process-coding-system-alist', -or `network-coding-system-alist' depending on OPERATION. -They may specify a coding system, a cons of coding systems, -or a function symbol to call. -In the last case, we call the function with one argument, -which is a list of all the arguments given to this function. - -usage: (find-operation-coding-system OPERATION ARGUMENTS ...) 
*/) - (nargs, args) - int nargs; - Lisp_Object *args; -{ - Lisp_Object operation, target_idx, target, val; - register Lisp_Object chain; - - if (nargs < 2) - error ("Too few arguments"); - operation = args[0]; - if (!SYMBOLP (operation) - || !INTEGERP (target_idx = Fget (operation, Qtarget_idx))) - error ("Invalid first argument"); - if (nargs < 1 + XINT (target_idx)) - error ("Too few arguments for operation: %s", - XSYMBOL (operation)->name->data); - target = args[XINT (target_idx) + 1]; - if (!(STRINGP (target) - || (EQ (operation, Qopen_network_stream) && INTEGERP (target)))) - error ("Invalid argument %d", XINT (target_idx) + 1); - - chain = ((EQ (operation, Qinsert_file_contents) - || EQ (operation, Qwrite_region)) - ? Vfile_coding_system_alist - : (EQ (operation, Qopen_network_stream) - ? Vnetwork_coding_system_alist - : Vprocess_coding_system_alist)); - if (NILP (chain)) - return Qnil; - - for (; CONSP (chain); chain = XCDR (chain)) - { - Lisp_Object elt; - elt = XCAR (chain); - - if (CONSP (elt) - && ((STRINGP (target) - && STRINGP (XCAR (elt)) - && fast_string_match (XCAR (elt), target) >= 0) - || (INTEGERP (target) && EQ (target, XCAR (elt))))) - { - val = XCDR (elt); - /* Here, if VAL is both a valid coding system and a valid - function symbol, we return VAL as a coding system. */ - if (CONSP (val)) - return val; - if (! SYMBOLP (val)) - return Qnil; - if (! NILP (Fcoding_system_p (val))) - return Fcons (val, val); - if (! NILP (Ffboundp (val))) - { - val = call1 (val, Flist (nargs, args)); - if (CONSP (val)) - return val; - if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val))) - return Fcons (val, val); - } - return Qnil; - } - } - return Qnil; -} - -DEFUN ("update-coding-systems-internal", Fupdate_coding_systems_internal, - Supdate_coding_systems_internal, 0, 0, 0, - doc: /* Update internal database for ISO2022 and CCL based coding systems. -When values of any coding categories are changed, you must -call this function. 
*/) - () -{ - int i; - - for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++) - { - Lisp_Object val; - - val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[i]); - if (!NILP (val)) - { - if (! coding_system_table[i]) - coding_system_table[i] = ((struct coding_system *) - xmalloc (sizeof (struct coding_system))); - setup_coding_system (val, coding_system_table[i]); - } - else if (coding_system_table[i]) - { - xfree (coding_system_table[i]); - coding_system_table[i] = NULL; - } - } - - return Qnil; -} - -DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal, - Sset_coding_priority_internal, 0, 0, 0, - doc: /* Update internal database for the current value of `coding-category-list'. -This function is internal use only. */) - () -{ - int i = 0, idx; - Lisp_Object val; - - val = Vcoding_category_list; - - while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX) - { - if (! SYMBOLP (XCAR (val))) - break; - idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index)); - if (idx >= CODING_CATEGORY_IDX_MAX) - break; - coding_priorities[i++] = (1 << idx); - val = XCDR (val); - } - /* If coding-category-list is valid and contains all coding - categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not, - the following code saves Emacs from crashing. */ - while (i < CODING_CATEGORY_IDX_MAX) - coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT; - - return Qnil; -} - -#endif /* emacs */ - - -/*** 9. Post-amble ***/ - -void -init_coding_once () -{ - int i; - - /* Emacs' internal format specific initialize routine. 
*/ - for (i = 0; i <= 0x20; i++) - emacs_code_class[i] = EMACS_control_code; - emacs_code_class[0x0A] = EMACS_linefeed_code; - emacs_code_class[0x0D] = EMACS_carriage_return_code; - for (i = 0x21 ; i < 0x7F; i++) - emacs_code_class[i] = EMACS_ascii_code; - emacs_code_class[0x7F] = EMACS_control_code; - for (i = 0x80; i < 0xFF; i++) - emacs_code_class[i] = EMACS_invalid_code; - emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3; - emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3; - emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4; - emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4; - - /* ISO2022 specific initialize routine. */ - for (i = 0; i < 0x20; i++) - iso_code_class[i] = ISO_control_0; - for (i = 0x21; i < 0x7F; i++) - iso_code_class[i] = ISO_graphic_plane_0; - for (i = 0x80; i < 0xA0; i++) - iso_code_class[i] = ISO_control_1; - for (i = 0xA1; i < 0xFF; i++) - iso_code_class[i] = ISO_graphic_plane_1; - iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F; - iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF; - iso_code_class[ISO_CODE_CR] = ISO_carriage_return; - iso_code_class[ISO_CODE_SO] = ISO_shift_out; - iso_code_class[ISO_CODE_SI] = ISO_shift_in; - iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7; - iso_code_class[ISO_CODE_ESC] = ISO_escape; - iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2; - iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3; - iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer; - - setup_coding_system (Qnil, &keyboard_coding); - setup_coding_system (Qnil, &terminal_coding); - setup_coding_system (Qnil, &safe_terminal_coding); - setup_coding_system (Qnil, &default_buffer_file_coding); - - bzero (coding_system_table, sizeof coding_system_table); - - bzero (ascii_skip_code, sizeof ascii_skip_code); - for (i = 0; i < 128; i++) - ascii_skip_code[i] = 1; - -#if defined (MSDOS) || defined (WINDOWSNT) - system_eol_type = CODING_EOL_CRLF; 
-#else - system_eol_type = CODING_EOL_LF; -#endif - - inhibit_pre_post_conversion = 0; -} - -#ifdef emacs - -void -syms_of_coding () -{ - Qtarget_idx = intern ("target-idx"); - staticpro (&Qtarget_idx); - - Qcoding_system_history = intern ("coding-system-history"); - staticpro (&Qcoding_system_history); - Fset (Qcoding_system_history, Qnil); - - /* Target FILENAME is the first argument. */ - Fput (Qinsert_file_contents, Qtarget_idx, make_number (0)); - /* Target FILENAME is the third argument. */ - Fput (Qwrite_region, Qtarget_idx, make_number (2)); - - Qcall_process = intern ("call-process"); - staticpro (&Qcall_process); - /* Target PROGRAM is the first argument. */ - Fput (Qcall_process, Qtarget_idx, make_number (0)); - - Qcall_process_region = intern ("call-process-region"); - staticpro (&Qcall_process_region); - /* Target PROGRAM is the third argument. */ - Fput (Qcall_process_region, Qtarget_idx, make_number (2)); - - Qstart_process = intern ("start-process"); - staticpro (&Qstart_process); - /* Target PROGRAM is the third argument. */ - Fput (Qstart_process, Qtarget_idx, make_number (2)); - - Qopen_network_stream = intern ("open-network-stream"); - staticpro (&Qopen_network_stream); - /* Target SERVICE is the fourth argument. 
*/ - Fput (Qopen_network_stream, Qtarget_idx, make_number (3)); - - Qcoding_system = intern ("coding-system"); - staticpro (&Qcoding_system); - - Qeol_type = intern ("eol-type"); - staticpro (&Qeol_type); - - Qbuffer_file_coding_system = intern ("buffer-file-coding-system"); - staticpro (&Qbuffer_file_coding_system); - - Qpost_read_conversion = intern ("post-read-conversion"); - staticpro (&Qpost_read_conversion); - - Qpre_write_conversion = intern ("pre-write-conversion"); - staticpro (&Qpre_write_conversion); - - Qno_conversion = intern ("no-conversion"); - staticpro (&Qno_conversion); - - Qundecided = intern ("undecided"); - staticpro (&Qundecided); - - Qcoding_system_p = intern ("coding-system-p"); - staticpro (&Qcoding_system_p); - - Qcoding_system_error = intern ("coding-system-error"); - staticpro (&Qcoding_system_error); - - Fput (Qcoding_system_error, Qerror_conditions, - Fcons (Qcoding_system_error, Fcons (Qerror, Qnil))); - Fput (Qcoding_system_error, Qerror_message, - build_string ("Invalid coding system")); - - Qcoding_category = intern ("coding-category"); - staticpro (&Qcoding_category); - Qcoding_category_index = intern ("coding-category-index"); - staticpro (&Qcoding_category_index); - - Vcoding_category_table - = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil); - staticpro (&Vcoding_category_table); - { - int i; - for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++) - { - XVECTOR (Vcoding_category_table)->contents[i] - = intern (coding_category_name[i]); - Fput (XVECTOR (Vcoding_category_table)->contents[i], - Qcoding_category_index, make_number (i)); - } - } - - Qtranslation_table = intern ("translation-table"); - staticpro (&Qtranslation_table); - Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1)); - - Qtranslation_table_id = intern ("translation-table-id"); - staticpro (&Qtranslation_table_id); - - Qtranslation_table_for_decode = intern ("translation-table-for-decode"); - staticpro (&Qtranslation_table_for_decode); - - 
Qtranslation_table_for_encode = intern ("translation-table-for-encode"); - staticpro (&Qtranslation_table_for_encode); - - Qsafe_chars = intern ("safe-chars"); - staticpro (&Qsafe_chars); - - Qchar_coding_system = intern ("char-coding-system"); - staticpro (&Qchar_coding_system); - - /* Intern this now in case it isn't already done. - Setting this variable twice is harmless. - But don't staticpro it here--that is done in alloc.c. */ - Qchar_table_extra_slots = intern ("char-table-extra-slots"); - Fput (Qsafe_chars, Qchar_table_extra_slots, make_number (0)); - Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (2)); - - Qvalid_codes = intern ("valid-codes"); - staticpro (&Qvalid_codes); - - Qemacs_mule = intern ("emacs-mule"); - staticpro (&Qemacs_mule); - - Qraw_text = intern ("raw-text"); - staticpro (&Qraw_text); - - defsubr (&Scoding_system_p); - defsubr (&Sread_coding_system); - defsubr (&Sread_non_nil_coding_system); - defsubr (&Scheck_coding_system); - defsubr (&Sdetect_coding_region); - defsubr (&Sdetect_coding_string); - defsubr (&Sfind_coding_systems_region_internal); - defsubr (&Sdecode_coding_region); - defsubr (&Sencode_coding_region); - defsubr (&Sdecode_coding_string); - defsubr (&Sencode_coding_string); - defsubr (&Sdecode_sjis_char); - defsubr (&Sencode_sjis_char); - defsubr (&Sdecode_big5_char); - defsubr (&Sencode_big5_char); - defsubr (&Sset_terminal_coding_system_internal); - defsubr (&Sset_safe_terminal_coding_system_internal); - defsubr (&Sterminal_coding_system); - defsubr (&Sset_keyboard_coding_system_internal); - defsubr (&Skeyboard_coding_system); - defsubr (&Sfind_operation_coding_system); - defsubr (&Supdate_coding_systems_internal); - defsubr (&Sset_coding_priority_internal); - - DEFVAR_LISP ("coding-system-list", &Vcoding_system_list, - doc: /* List of coding systems. - -Do not alter the value of this variable manually. 
This variable should be -updated by the functions `make-coding-system' and -`define-coding-system-alias'. */); - Vcoding_system_list = Qnil; - - DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist, - doc: /* Alist of coding system names. -Each element is one element list of coding system name. -This variable is given to `completing-read' as TABLE argument. - -Do not alter the value of this variable manually. This variable should be -updated by the functions `make-coding-system' and -`define-coding-system-alias'. */); - Vcoding_system_alist = Qnil; - - DEFVAR_LISP ("coding-category-list", &Vcoding_category_list, - doc: /* List of coding-categories (symbols) ordered by priority. - -On detecting a coding system, Emacs tries code detection algorithms -associated with each coding-category one by one in this order. When -one algorithm agrees with a byte sequence of source text, the coding -system bound to the corresponding coding-category is selected. */); - { - int i; - - Vcoding_category_list = Qnil; - for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--) - Vcoding_category_list - = Fcons (XVECTOR (Vcoding_category_table)->contents[i], - Vcoding_category_list); - } - - DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read, - doc: /* Specify the coding system for read operations. -It is useful to bind this variable with `let', but do not set it globally. -If the value is a coding system, it is used for decoding on read operation. -If not, an appropriate element is used from one of the coding system alists: -There are three such tables, `file-coding-system-alist', -`process-coding-system-alist', and `network-coding-system-alist'. */); - Vcoding_system_for_read = Qnil; - - DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write, - doc: /* Specify the coding system for write operations. -Programs bind this variable with `let', but you should not set it globally. 
-If the value is a coding system, it is used for encoding of output, -when writing it to a file and when sending it to a file or subprocess. - -If this does not specify a coding system, an appropriate element -is used from one of the coding system alists: -There are three such tables, `file-coding-system-alist', -`process-coding-system-alist', and `network-coding-system-alist'. -For output to files, if the above procedure does not specify a coding system, -the value of `buffer-file-coding-system' is used. */); - Vcoding_system_for_write = Qnil; - - DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used, - doc: /* Coding system used in the latest file or process I/O. */); - Vlast_coding_system_used = Qnil; - - DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion, - doc: /* *Non-nil means always inhibit code conversion of end-of-line format. -See info node `Coding Systems' and info node `Text and Binary' concerning -such conversion. */); - inhibit_eol_conversion = 0; - - DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system, - doc: /* Non-nil means process buffer inherits coding system of process output. -Bind it to t if the process output is to be treated as if it were a file -read from some filesystem. */); - inherit_process_coding_system = 0; - - DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist, - doc: /* Alist to decide a coding system to use for a file I/O operation. -The format is ((PATTERN . VAL) ...), -where PATTERN is a regular expression matching a file name, -VAL is a coding system, a cons of coding systems, or a function symbol. -If VAL is a coding system, it is used for both decoding and encoding -the file contents. -If VAL is a cons of coding systems, the car part is used for decoding, -and the cdr part is used for encoding. -If VAL is a function symbol, the function must return a coding system -or a cons of coding systems which are used as above. 
The function gets -the arguments with which `find-operation-coding-systems' was called. - -See also the function `find-operation-coding-system' -and the variable `auto-coding-alist'. */); - Vfile_coding_system_alist = Qnil; - - DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist, - doc: /* Alist to decide a coding system to use for a process I/O operation. -The format is ((PATTERN . VAL) ...), -where PATTERN is a regular expression matching a program name, -VAL is a coding system, a cons of coding systems, or a function symbol. -If VAL is a coding system, it is used for both decoding what received -from the program and encoding what sent to the program. -If VAL is a cons of coding systems, the car part is used for decoding, -and the cdr part is used for encoding. -If VAL is a function symbol, the function must return a coding system -or a cons of coding systems which are used as above. - -See also the function `find-operation-coding-system'. */); - Vprocess_coding_system_alist = Qnil; - - DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist, - doc: /* Alist to decide a coding system to use for a network I/O operation. -The format is ((PATTERN . VAL) ...), -where PATTERN is a regular expression matching a network service name -or is a port number to connect to, -VAL is a coding system, a cons of coding systems, or a function symbol. -If VAL is a coding system, it is used for both decoding what received -from the network stream and encoding what sent to the network stream. -If VAL is a cons of coding systems, the car part is used for decoding, -and the cdr part is used for encoding. -If VAL is a function symbol, the function must return a coding system -or a cons of coding systems which are used as above. - -See also the function `find-operation-coding-system'. */); - Vnetwork_coding_system_alist = Qnil; - - DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system, - doc: /* Coding system to use with system messages. 
-Also used for decoding keyboard input on X Window system. */); - Vlocale_coding_system = Qnil; - - /* The eol mnemonics are reset in startup.el system-dependently. */ - DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix, - doc: /* *String displayed in mode line for UNIX-like (LF) end-of-line format. */); - eol_mnemonic_unix = build_string (":"); - - DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos, - doc: /* *String displayed in mode line for DOS-like (CRLF) end-of-line format. */); - eol_mnemonic_dos = build_string ("\\"); - - DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac, - doc: /* *String displayed in mode line for MAC-like (CR) end-of-line format. */); - eol_mnemonic_mac = build_string ("/"); - - DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided, - doc: /* *String displayed in mode line when end-of-line format is not yet determined. */); - eol_mnemonic_undecided = build_string (":"); - - DEFVAR_LISP ("enable-character-translation", &Venable_character_translation, - doc: /* *Non-nil enables character translation while encoding and decoding. */); - Venable_character_translation = Qt; - - DEFVAR_LISP ("standard-translation-table-for-decode", - &Vstandard_translation_table_for_decode, - doc: /* Table for translating characters while decoding. */); - Vstandard_translation_table_for_decode = Qnil; - - DEFVAR_LISP ("standard-translation-table-for-encode", - &Vstandard_translation_table_for_encode, - doc: /* Table for translating characters while encoding. */); - Vstandard_translation_table_for_encode = Qnil; - - DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist, - doc: /* Alist of charsets vs revision numbers. -While encoding, if a charset (car part of an element) is found, -designate it with the escape sequence identifying revision (cdr part of the element). 
*/); - Vcharset_revision_alist = Qnil; - - DEFVAR_LISP ("default-process-coding-system", - &Vdefault_process_coding_system, - doc: /* Cons of coding systems used for process I/O by default. -The car part is used for decoding a process output, -the cdr part is used for encoding a text to be sent to a process. */); - Vdefault_process_coding_system = Qnil; - - DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table, - doc: /* Table of extra Latin codes in the range 128..159 (inclusive). -This is a vector of length 256. -If Nth element is non-nil, the existence of code N in a file -\(or output of subprocess) doesn't prevent it to be detected as -a coding system of ISO 2022 variant which has a flag -`accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file -or reading output of a subprocess. -Only 128th through 159th elements has a meaning. */); - Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil); - - DEFVAR_LISP ("select-safe-coding-system-function", - &Vselect_safe_coding_system_function, - doc: /* Function to call to select safe coding system for encoding a text. - -If set, this function is called to force a user to select a proper -coding system which can encode the text in the case that a default -coding system used in each operation can't encode the text. - -The default value is `select-safe-coding-system' (which see). */); - Vselect_safe_coding_system_function = Qnil; - - DEFVAR_LISP ("char-coding-system-table", &Vchar_coding_system_table, - doc: /* Char-table containing safe coding systems of each characters. -Each element doesn't include such generic coding systems that can -encode any characters. They are in the first extra slot. */); - Vchar_coding_system_table = Fmake_char_table (Qchar_coding_system, Qnil); - - DEFVAR_BOOL ("inhibit-iso-escape-detection", - &inhibit_iso_escape_detection, - doc: /* If non-nil, Emacs ignores ISO2022's escape sequence on code detection. 
- -By default, on reading a file, Emacs tries to detect how the text is -encoded. This code detection is sensitive to escape sequences. If -the sequence is valid as ISO2022, the code is determined as one of -the ISO2022 encodings, and the file is decoded by the corresponding -coding system (e.g. `iso-2022-7bit'). - -However, there may be a case that you want to read escape sequences in -a file as is. In such a case, you can set this variable to non-nil. -Then, as the code detection ignores any escape sequences, no file is -detected as encoded in some ISO2022 encoding. The result is that all -escape sequences become visible in a buffer. - -The default value is nil, and it is strongly recommended not to change -it. That is because many Emacs Lisp source files that contain -non-ASCII characters are encoded by the coding system `iso-2022-7bit' -in Emacs's distribution, and they won't be decoded correctly on -reading if you suppress escape sequence detection. - -The other way to read escape sequences in a file without decoding is -to explicitly specify some coding system that doesn't use ISO2022's -escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */); - inhibit_iso_escape_detection = 0; -} - -char * -emacs_strerror (error_number) - int error_number; -{ - char *str; - - synchronize_system_messages_locale (); - str = strerror (error_number); - - if (! NILP (Vlocale_coding_system)) - { - Lisp_Object dec = code_convert_string_norecord (build_string (str), - Vlocale_coding_system, - 0); - str = (char *) XSTRING (dec)->data; - } - - return str; -} - -#endif /* emacs */ - +/* Coding system handler (conversion, detection, and etc). + Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN. + Licensed to the Free Software Foundation. + Copyright (C) 2001 Free Software Foundation, Inc. 
+ Copyright (C) 2001, 2002 + National Institute of Advanced Industrial Science and Technology (AIST) + Registration Number H13PRO009 + +This file is part of GNU Emacs. + +GNU Emacs is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2, or (at your option) +any later version. + +GNU Emacs is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GNU Emacs; see the file COPYING. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, +Boston, MA 02111-1307, USA. */ + +/*** TABLE OF CONTENTS *** + + 0. General comments + 1. Preamble + 2. Emacs' internal format (emacs-utf-8) handlers + 3. UTF-8 handlers + 4. UTF-16 handlers + 5. Charset-base coding systems handlers + 6. emacs-mule (old Emacs' internal format) handlers + 7. ISO2022 handlers + 8. Shift-JIS and BIG5 handlers + 9. CCL handlers + 10. C library functions + 11. Emacs Lisp library functions + 12. Postamble + +*/ + +/*** 0. General comments *** + + +CODING SYSTEM + + Coding system is an encoding mechanism of one or more character + sets. Here's a list of coding system types supported by Emacs. + When we say "decode", it means converting a text encoded by some + coding system into Emacs' internal format (emacs-utf-8), and when we + say "encode", it means converting a text of emacs-utf-8 to some + other coding system. + + Emacs represents a coding system by a Lisp symbol. Each symbol is a + key to the hash table Vcharset_hash_table. This hash table + associates the symbol to the corresponding detailed specifications. 
+ + Before using a coding system for decoding and encoding, we set up a + structure of type `struct coding_system'. This structure keeps + various information about a specific code conversion (e.g. the + location of source and destination data). + + Coding systems are classified into the following types by how to + represent a character in a byte sequence. Here's a brief description + of each type. + + o Emacs' internal format (emacs-utf-8) + + The extended UTF-8 which allows eight-bit raw bytes mixed with + character codes. Emacs holds characters in buffers and strings by + this format. + + o UTF-8 + + o UTF-16 + + o Charset-base coding system + + A coding system defined by one or more (coded) character sets. + Decoding and encoding are done by code converter defined for each + character set. + + o Old Emacs' internal format (emacs-mule) + + The coding system adopted by old versions of Emacs (20 and 21). + + o ISO2022-base coding system + + The most famous coding system for multiple character sets. X's + Compound Text, various EUCs (Extended Unix Code), and coding systems + used in the Internet communication such as ISO-2022-JP are all + variants of ISO2022. + + o SJIS (or Shift-JIS or MS-Kanji-Code) + + A coding system to encode character sets: ASCII, JISX0201, and + JISX0208. Widely used for PC's in Japan. Details are described in + section 8. + + o BIG5 + + A coding system to encode character sets: ASCII and Big5. Widely + used by Chinese (mainly in Taiwan and Hong Kong). Details are + described in section 8. In this file, when we write "big5" (all + lowercase), we mean the coding system, and when we write "Big5" + (capitalized), we mean the character set. + + o CCL + + If a user wants to decode/encode a text encoded in a coding system + not listed above, he can supply a decoder and an encoder for it in + CCL (Code Conversion Language) programs. Emacs executes the CCL + program while decoding/encoding. 
+ + o Raw-text + + A coding system for a text containing raw eight-bit data. Emacs + treats each byte of source text as a character (except for + end-of-line conversion). + + o No-conversion + + Like raw text, but don't do end-of-line conversion. + + +END-OF-LINE FORMAT + + How end-of-line of a text is encoded depends on a system. For + instance, Unix's format is just one byte of LF (line-feed) code, + whereas DOS's format is two-byte sequence of `carriage-return' and + `line-feed' codes. MacOS's format is usually one byte of + `carriage-return'. + + Since text characters encoding and end-of-line encoding are + independent, any coding system described above can take any format + of end-of-line (except for no-conversion). + +*/ + +/* COMMON MACROS */ + + +/*** GENERAL NOTES on `detect_coding_XXX ()' functions *** + + These functions check if a byte sequence specified as a source in + CODING conforms to the format of XXX. Return 1 if the data contains + a byte sequence which can be decoded into non-ASCII characters by + the coding system. Otherwise (i.e. the data contains only ASCII + characters or invalid sequence) return 0. + + It also resets some bits of an integer pointed to by MASK. The macros + CATEGORY_MASK_XXX specify each bit of this integer. + + Below is the template of these functions. */ + +#if 0 +static int +detect_coding_XXX (coding, mask) + struct coding_system *coding; + int *mask; +{ + unsigned char *src = coding->source; + unsigned char *src_end = coding->source + coding->src_bytes; + int multibytep = coding->src_multibyte; + int c; + int found = 0; + ...; + + while (1) + { + /* Get one byte from the source. If the source is exhausted, jump + to no_more_source:. */ + ONE_MORE_BYTE (c); + /* Check if it conforms to XXX. If not, break the loop. */ + } + /* As the data is invalid for XXX, reset the proper bits. */ + *mask &= ~CODING_CATEGORY_XXX; + return 0; + no_more_source: + /* The source is exhausted. */ + if (!found) + /* ASCII characters only. 
*/ + return 0; + /* Some data should be decoded into non-ASCII characters. */ + *mask &= CODING_CATEGORY_XXX; + return 1; +} +#endif + +/*** GENERAL NOTES on `decode_coding_XXX ()' functions *** + + These functions decode a byte sequence specified as a source by + CODING. The resulting multibyte text goes to a place pointed to by + CODING->charbuf, the length of which should not exceed + CODING->charbuf_size; + + These functions set the information of original and decoded texts in + CODING->consumed, CODING->consumed_char, and CODING->charbuf_used. + They also set CODING->result to one of CODING_RESULT_XXX indicating + how the decoding is finished. + + Below is the template of these functions. */ + +#if 0 +static void +decode_coding_XXXX (coding) + struct coding_system *coding; +{ + unsigned char *src = coding->source + coding->consumed; + unsigned char *src_end = coding->source + coding->src_bytes; + /* SRC_BASE remembers the start position in source in each loop. + The loop will be exited when there's not enough source code, or + when there's no room in CHARBUF for a decoded character. */ + unsigned char *src_base; + /* A buffer to produce decoded characters. */ + int *charbuf = coding->charbuf; + int *charbuf_end = charbuf + coding->charbuf_size; + int multibytep = coding->src_multibyte; + + while (1) + { + src_base = src; + if (charbuf < charbuf_end) + /* No more room to produce a decoded character. */ + break; + ONE_MORE_BYTE (c); + /* Decode it. */ + } + + no_more_source: + if (src_base < src_end + && coding->mode & CODING_MODE_LAST_BLOCK) + /* If the source ends by partial bytes to construct a character, + treat them as eight-bit raw data. */ + while (src_base < src_end && charbuf < charbuf_end) + *charbuf++ = *src_base++; + /* Remember how many bytes and characters we consumed. If the + source is multibyte, the bytes and chars are not identical. 
*/ + coding->consumed = coding->consumed_char = src_base - coding->source; + /* Remember how many characters we produced. */ + coding->charbuf_used = charbuf - coding->charbuf; +} +#endif + +/*** GENERAL NOTES on `encode_coding_XXX ()' functions *** + + These functions encode SRC_BYTES length text at SOURCE of Emacs' + internal multibyte format by CODING. The resulting byte sequence + goes to a place pointed to by DESTINATION, the length of which + should not exceed DST_BYTES. + + These functions set the information of original and encoded texts in + the members produced, produced_char, consumed, and consumed_char of + the structure *CODING. They also set the member result to one of + CODING_RESULT_XXX indicating how the encoding finished. + + DST_BYTES zero means that source area and destination area are + overlapped, which means that we can produce a encoded text until it + reaches at the head of not-yet-encoded source text. + + Below is a template of these functions. */ +#if 0 +static void +encode_coding_XXX (coding) + struct coding_system *coding; +{ + int multibytep = coding->dst_multibyte; + int *charbuf = coding->charbuf; + int *charbuf_end = charbuf->charbuf + coding->charbuf_used; + unsigned char *dst = coding->destination + coding->produced; + unsigned char *dst_end = coding->destination + coding->dst_bytes; + unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_; + int produced_chars = 0; + + for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++) + { + int c = *charbuf; + /* Encode C into DST, and increment DST. */ + } + label_no_more_destination: + /* How many chars and bytes we produced. */ + coding->produced_char += produced_chars; + coding->produced = dst - coding->destination; +} +#endif + + +/*** 1. 
Preamble ***/ + +#include +#include + +#include "lisp.h" +#include "buffer.h" +#include "character.h" +#include "charset.h" +#include "ccl.h" +#include "composite.h" +#include "coding.h" +#include "window.h" + +Lisp_Object Vcoding_system_hash_table; + +Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type; +Lisp_Object Qunix, Qdos, Qmac; +Lisp_Object Qbuffer_file_coding_system; +Lisp_Object Qpost_read_conversion, Qpre_write_conversion; +Lisp_Object Qdefault_char; +Lisp_Object Qno_conversion, Qundecided; +Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5; +Lisp_Object Qutf_16_be_nosig, Qutf_16_be, Qutf_16_le_nosig, Qutf_16_le; +Lisp_Object Qsignature, Qendian, Qbig, Qlittle; +Lisp_Object Qcoding_system_history; +Lisp_Object Qvalid_codes; + +extern Lisp_Object Qinsert_file_contents, Qwrite_region; +Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument; +Lisp_Object Qstart_process, Qopen_network_stream; +Lisp_Object Qtarget_idx; + +Lisp_Object Vselect_safe_coding_system_function; + +/* Mnemonic string for each format of end-of-line. */ +Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac; +/* Mnemonic string to indicate format of end-of-line is not yet + decided. */ +Lisp_Object eol_mnemonic_undecided; + +#ifdef emacs + +Lisp_Object Vcoding_system_list, Vcoding_system_alist; + +Lisp_Object Qcoding_system_p, Qcoding_system_error; + +/* Coding system emacs-mule and raw-text are for converting only + end-of-line format. */ +Lisp_Object Qemacs_mule, Qraw_text; + +/* Coding-systems are handed between Emacs Lisp programs and C internal + routines by the following three variables. */ +/* Coding-system for reading files and receiving data from process. */ +Lisp_Object Vcoding_system_for_read; +/* Coding-system for writing files and sending data to process. */ +Lisp_Object Vcoding_system_for_write; +/* Coding-system actually used in the latest I/O. 
*/ +Lisp_Object Vlast_coding_system_used; + +/* A vector of length 256 which contains information about special + Latin codes (especially for dealing with Microsoft codes). */ +Lisp_Object Vlatin_extra_code_table; + +/* Flag to inhibit code conversion of end-of-line format. */ +int inhibit_eol_conversion; + +/* Flag to inhibit ISO2022 escape sequence detection. */ +int inhibit_iso_escape_detection; + +/* Flag to make buffer-file-coding-system inherit from process-coding. */ +int inherit_process_coding_system; + +/* Coding system to be used to encode text for terminal display. */ +struct coding_system terminal_coding; + +/* Coding system to be used to encode text for terminal display when + terminal coding system is nil. */ +struct coding_system safe_terminal_coding; + +/* Coding system of what is sent from terminal keyboard. */ +struct coding_system keyboard_coding; + +Lisp_Object Vfile_coding_system_alist; +Lisp_Object Vprocess_coding_system_alist; +Lisp_Object Vnetwork_coding_system_alist; + +Lisp_Object Vlocale_coding_system; + +#endif /* emacs */ + +/* Flag to tell if we look up translation table on character code + conversion. */ +Lisp_Object Venable_character_translation; +/* Standard translation table to look up on decoding (reading). */ +Lisp_Object Vstandard_translation_table_for_decode; +/* Standard translation table to look up on encoding (writing). */ +Lisp_Object Vstandard_translation_table_for_encode; + +Lisp_Object Qtranslation_table; +Lisp_Object Qtranslation_table_id; +Lisp_Object Qtranslation_table_for_decode; +Lisp_Object Qtranslation_table_for_encode; + +/* Alist of charsets vs revision number. */ +static Lisp_Object Vcharset_revision_table; + +/* Default coding systems used for process I/O. */ +Lisp_Object Vdefault_process_coding_system; + +/* Global flag to tell that we can't call post-read-conversion and + pre-write-conversion functions. Usually the value is zero, but it + is set to 1 temporarily while such functions are running. 
This is + to avoid infinite recursive call. */ +static int inhibit_pre_post_conversion; + +/* Char-table containing safe coding systems of each character. */ +Lisp_Object Vchar_coding_system_table; +Lisp_Object Qchar_coding_system; + +/* Two special coding systems. */ +Lisp_Object Vsjis_coding_system; +Lisp_Object Vbig5_coding_system; + + +static int detect_coding_utf_8 P_ ((struct coding_system *, int *)); +static void decode_coding_utf_8 P_ ((struct coding_system *)); +static int encode_coding_utf_8 P_ ((struct coding_system *)); + +static int detect_coding_utf_16 P_ ((struct coding_system *, int *)); +static void decode_coding_utf_16 P_ ((struct coding_system *)); +static int encode_coding_utf_16 P_ ((struct coding_system *)); + +static int detect_coding_iso_2022 P_ ((struct coding_system *, int *)); +static void decode_coding_iso_2022 P_ ((struct coding_system *)); +static int encode_coding_iso_2022 P_ ((struct coding_system *)); + +static int detect_coding_emacs_mule P_ ((struct coding_system *, int *)); +static void decode_coding_emacs_mule P_ ((struct coding_system *)); +static int encode_coding_emacs_mule P_ ((struct coding_system *)); + +static int detect_coding_sjis P_ ((struct coding_system *, int *)); +static void decode_coding_sjis P_ ((struct coding_system *)); +static int encode_coding_sjis P_ ((struct coding_system *)); + +static int detect_coding_big5 P_ ((struct coding_system *, int *)); +static void decode_coding_big5 P_ ((struct coding_system *)); +static int encode_coding_big5 P_ ((struct coding_system *)); + +static int detect_coding_ccl P_ ((struct coding_system *, int *)); +static void decode_coding_ccl P_ ((struct coding_system *)); +static int encode_coding_ccl P_ ((struct coding_system *)); + +static void decode_coding_raw_text P_ ((struct coding_system *)); +static int encode_coding_raw_text P_ ((struct coding_system *)); + + +/* ISO2022 section */ + +#define CODING_ISO_INITIAL(coding, reg) \ + (XINT (AREF (AREF (CODING_ID_ATTRS 
((coding)->id), \ + coding_attr_iso_initial), \ + reg))) + + +#define CODING_ISO_REQUEST(coding, charset_id) \ + ((charset_id <= (coding)->max_charset_id \ + ? (coding)->safe_charsets[charset_id] \ + : -1)) + + +#define CODING_ISO_FLAGS(coding) \ + ((coding)->spec.iso_2022.flags) +#define CODING_ISO_DESIGNATION(coding, reg) \ + ((coding)->spec.iso_2022.current_designation[reg]) +#define CODING_ISO_INVOCATION(coding, plane) \ + ((coding)->spec.iso_2022.current_invocation[plane]) +#define CODING_ISO_SINGLE_SHIFTING(coding) \ + ((coding)->spec.iso_2022.single_shifting) +#define CODING_ISO_BOL(coding) \ + ((coding)->spec.iso_2022.bol) +#define CODING_ISO_INVOKED_CHARSET(coding, plane) \ + CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane))) + +/* Control characters of ISO2022. */ + /* code */ /* function */ +#define ISO_CODE_LF 0x0A /* line-feed */ +#define ISO_CODE_CR 0x0D /* carriage-return */ +#define ISO_CODE_SO 0x0E /* shift-out */ +#define ISO_CODE_SI 0x0F /* shift-in */ +#define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */ +#define ISO_CODE_ESC 0x1B /* escape */ +#define ISO_CODE_SS2 0x8E /* single-shift-2 */ +#define ISO_CODE_SS3 0x8F /* single-shift-3 */ +#define ISO_CODE_CSI 0x9B /* control-sequence-introducer */ + +/* All code (1-byte) of ISO2022 is classified into one of the + followings. */ +enum iso_code_class_type + { + ISO_control_0, /* Control codes in the range + 0x00..0x1F and 0x7F, except for the + following 5 codes. */ + ISO_carriage_return, /* ISO_CODE_CR (0x0D) */ + ISO_shift_out, /* ISO_CODE_SO (0x0E) */ + ISO_shift_in, /* ISO_CODE_SI (0x0F) */ + ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */ + ISO_escape, /* ISO_CODE_SO (0x1B) */ + ISO_control_1, /* Control codes in the range + 0x80..0x9F, except for the + following 3 codes. 
*/ + ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */ + ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */ + ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */ + ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */ + ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */ + ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */ + ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */ + }; + +/** The macros CODING_ISO_FLAG_XXX defines a flag bit of the + `iso-flags' attribute of an iso2022 coding system. */ + +/* If set, produce long-form designation sequence (e.g. ESC $ ( A) + instead of the correct short-form sequence (e.g. ESC $ A). */ +#define CODING_ISO_FLAG_LONG_FORM 0x0001 + +/* If set, reset graphic planes and registers at end-of-line to the + initial state. */ +#define CODING_ISO_FLAG_RESET_AT_EOL 0x0002 + +/* If set, reset graphic planes and registers before any control + characters to the initial state. */ +#define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004 + +/* If set, encode by 7-bit environment. */ +#define CODING_ISO_FLAG_SEVEN_BITS 0x0008 + +/* If set, use locking-shift function. */ +#define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010 + +/* If set, use single-shift function. Overwrite + CODING_ISO_FLAG_LOCKING_SHIFT. */ +#define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020 + +/* If set, use designation escape sequence. */ +#define CODING_ISO_FLAG_DESIGNATION 0x0040 + +/* If set, produce revision number sequence. */ +#define CODING_ISO_FLAG_REVISION 0x0080 + +/* If set, produce ISO6429's direction specifying sequence. */ +#define CODING_ISO_FLAG_DIRECTION 0x0100 + +/* If set, assume designation states are reset at beginning of line on + output. */ +#define CODING_ISO_FLAG_INIT_AT_BOL 0x0200 + +/* If set, designation sequence should be placed at beginning of line + on output. */ +#define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400 + +/* If set, do not encode unsafe charactes on output. 
*/ +#define CODING_ISO_FLAG_SAFE 0x0800 + +/* If set, extra latin codes (128..159) are accepted as a valid code + on input. */ +#define CODING_ISO_FLAG_LATIN_EXTRA 0x1000 + +#define CODING_ISO_FLAG_COMPOSITION 0x2000 + +#define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 + +#define CODING_ISO_FLAG_FULL_SUPPORT 0x8000 + +/* A character to be produced on output if encoding of the original + character is prohibited by CODING_ISO_FLAG_SAFE. */ +#define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?' + + +/* UTF-16 section */ +#define CODING_UTF_16_BOM(coding) \ + ((coding)->spec.utf_16.bom) + +#define CODING_UTF_16_ENDIAN(coding) \ + ((coding)->spec.utf_16.endian) + +#define CODING_UTF_16_SURROGATE(coding) \ + ((coding)->spec.utf_16.surrogate) + + +/* CCL section */ +#define CODING_CCL_DECODER(coding) \ + AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder) +#define CODING_CCL_ENCODER(coding) \ + AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder) +#define CODING_CCL_VALIDS(coding) \ + (XSTRING (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)) \ + ->data) + +/* Index for each coding category in `coding_category_table' */ + +enum coding_category + { + coding_category_iso_7, + coding_category_iso_7_tight, + coding_category_iso_8_1, + coding_category_iso_8_2, + coding_category_iso_7_else, + coding_category_iso_8_else, + coding_category_utf_8, + coding_category_utf_16_auto, + coding_category_utf_16_be, + coding_category_utf_16_le, + coding_category_utf_16_be_nosig, + coding_category_utf_16_le_nosig, + coding_category_charset, + coding_category_sjis, + coding_category_big5, + coding_category_ccl, + coding_category_emacs_mule, + /* All above are targets of code detection. */ + coding_category_raw_text, + coding_category_undecided, + coding_category_max + }; + +/* Definitions of flag bits used in detect_coding_XXXX. 
*/ +#define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7) +#define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight) +#define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1) +#define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2) +#define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else) +#define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else) +#define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8) +#define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be) +#define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le) +#define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig) +#define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig) +#define CATEGORY_MASK_CHARSET (1 << coding_category_charset) +#define CATEGORY_MASK_SJIS (1 << coding_category_sjis) +#define CATEGORY_MASK_BIG5 (1 << coding_category_big5) +#define CATEGORY_MASK_CCL (1 << coding_category_ccl) +#define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule) + +/* This value is returned if detect_coding_mask () find nothing other + than ASCII characters. 
*/ +#define CATEGORY_MASK_ANY \ + (CATEGORY_MASK_ISO_7 \ + | CATEGORY_MASK_ISO_7_TIGHT \ + | CATEGORY_MASK_ISO_8_1 \ + | CATEGORY_MASK_ISO_8_2 \ + | CATEGORY_MASK_ISO_7_ELSE \ + | CATEGORY_MASK_ISO_8_ELSE \ + | CATEGORY_MASK_UTF_8 \ + | CATEGORY_MASK_UTF_16_BE \ + | CATEGORY_MASK_UTF_16_LE \ + | CATEGORY_MASK_UTF_16_BE_NOSIG \ + | CATEGORY_MASK_UTF_16_LE_NOSIG \ + | CATEGORY_MASK_CHARSET \ + | CATEGORY_MASK_SJIS \ + | CATEGORY_MASK_BIG5 \ + | CATEGORY_MASK_CCL \ + | CATEGORY_MASK_EMACS_MULE) + + +#define CATEGORY_MASK_ISO_7BIT \ + (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT) + +#define CATEGORY_MASK_ISO_8BIT \ + (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2) + +#define CATEGORY_MASK_ISO_ELSE \ + (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE) + +#define CATEGORY_MASK_ISO_ESCAPE \ + (CATEGORY_MASK_ISO_7 \ + | CATEGORY_MASK_ISO_7_TIGHT \ + | CATEGORY_MASK_ISO_7_ELSE \ + | CATEGORY_MASK_ISO_8_ELSE) + +#define CATEGORY_MASK_ISO \ + ( CATEGORY_MASK_ISO_7BIT \ + | CATEGORY_MASK_ISO_8BIT \ + | CATEGORY_MASK_ISO_ELSE) + +#define CATEGORY_MASK_UTF_16 \ + (CATEGORY_MASK_UTF_16_BE \ + | CATEGORY_MASK_UTF_16_LE \ + | CATEGORY_MASK_UTF_16_BE_NOSIG \ + | CATEGORY_MASK_UTF_16_LE_NOSIG) + + +/* List of symbols `coding-category-xxx' ordered by priority. This + variable is exposed to Emacs Lisp. */ +static Lisp_Object Vcoding_category_list; + +/* Table of coding categories (Lisp symbols). This variable is for + internal use oly. */ +static Lisp_Object Vcoding_category_table; + +/* Table of coding-categories ordered by priority. */ +static enum coding_category coding_priorities[coding_category_max]; + +/* Nth element is a coding context for the coding system bound to the + Nth coding category. 
*/ +static struct coding_system coding_categories[coding_category_max]; + +static int detected_mask[coding_category_raw_text] = + { CATEGORY_MASK_ISO, + CATEGORY_MASK_ISO, + CATEGORY_MASK_ISO, + CATEGORY_MASK_ISO, + CATEGORY_MASK_ISO, + CATEGORY_MASK_ISO, + CATEGORY_MASK_UTF_8, + CATEGORY_MASK_UTF_16, + CATEGORY_MASK_UTF_16, + CATEGORY_MASK_UTF_16, + CATEGORY_MASK_UTF_16, + CATEGORY_MASK_UTF_16, + CATEGORY_MASK_CHARSET, + CATEGORY_MASK_SJIS, + CATEGORY_MASK_BIG5, + CATEGORY_MASK_CCL, + CATEGORY_MASK_EMACS_MULE + }; + +/*** Commonly used macros and functions ***/ + +#ifndef min +#define min(a, b) ((a) < (b) ? (a) : (b)) +#endif +#ifndef max +#define max(a, b) ((a) > (b) ? (a) : (b)) +#endif + +#define CODING_GET_INFO(coding, attrs, eol_type, charset_list) \ + do { \ + attrs = CODING_ID_ATTRS (coding->id); \ + eol_type = CODING_ID_EOL_TYPE (coding->id); \ + if (VECTORP (eol_type)) \ + eol_type = Qunix; \ + charset_list = CODING_ATTR_CHARSET_LIST (attrs); \ + } while (0) + + +/* Safely get one byte from the source text pointed by SRC which ends + at SRC_END, and set C to that byte. If there are not enough bytes + in the source, it jumps to `no_more_source'. 
The caller + should declare and set these variables appropriately in advance: + src, src_end, multibytep +*/ + +#define ONE_MORE_BYTE(c) \ + do { \ + if (src == src_end) \ + { \ + if (src_base < src) \ + coding->result = CODING_RESULT_INSUFFICIENT_SRC; \ + goto no_more_source; \ + } \ + c = *src++; \ + if (multibytep && (c & 0x80)) \ + { \ + if ((c & 0xFE) != 0xC0) \ + error ("Undecodable char found"); \ + c = ((c & 1) << 6) | *src++; \ + } \ + consumed_chars++; \ + } while (0) + + +#define ONE_MORE_BYTE_NO_CHECK(c) \ + do { \ + c = *src++; \ + if (multibytep && (c & 0x80)) \ + { \ + if ((c & 0xFE) != 0xC0) \ + error ("Undecodable char found"); \ + c = ((c & 1) << 6) | *src++; \ + } \ + } while (0) + + +/* Store a byte C in the place pointed by DST and increment DST to the + next free point, and increment PRODUCED_CHARS. The caller should + assure that C is 0..127, and declare and set the variable `dst' + appropriately in advance. +*/ + + +#define EMIT_ONE_ASCII_BYTE(c) \ + do { \ + produced_chars++; \ + *dst++ = (c); \ + } while (0) + + +/* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2. */ + +#define EMIT_TWO_ASCII_BYTES(c1, c2) \ + do { \ + produced_chars += 2; \ + *dst++ = (c1), *dst++ = (c2); \ + } while (0) + + +/* Store a byte C in the place pointed by DST and increment DST to the + next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is + nonzero, store in an appropriate multibyte from. The caller should + declare and set the variables `dst' and `multibytep' appropriately + in advance. */ + +#define EMIT_ONE_BYTE(c) \ + do { \ + produced_chars++; \ + if (multibytep) \ + { \ + int ch = (c); \ + if (ch >= 0x80) \ + ch = BYTE8_TO_CHAR (ch); \ + CHAR_STRING_ADVANCE (ch, dst); \ + } \ + else \ + *dst++ = (c); \ + } while (0) + + +/* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. 
/* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  Each argument
   is evaluated exactly once.

   If MULTIBYTEP is nonzero, each byte is stored in multibyte form by
   CHAR_STRING_ADVANCE; a byte of 0x80 or above is first converted by
   BYTE8_TO_CHAR, exactly as EMIT_ONE_BYTE does, so that both macros
   store an eight-bit byte the same way.  Otherwise the two bytes are
   stored as is.

   PRODUCED_CHARS is incremented by two in either case.  The caller
   should declare and set the variables `dst', `multibytep' and
   `produced_chars' appropriately in advance.  */

#define EMIT_TWO_BYTES(c1, c2)                  \
  do {                                          \
    produced_chars += 2;                        \
    if (multibytep)                             \
      {                                         \
        int ch;                                 \
                                                \
        ch = (c1);                              \
        if (ch >= 0x80)                         \
          ch = BYTE8_TO_CHAR (ch);              \
        CHAR_STRING_ADVANCE (ch, dst);          \
        ch = (c2);                              \
        if (ch >= 0x80)                         \
          ch = BYTE8_TO_CHAR (ch);              \
        CHAR_STRING_ADVANCE (ch, dst);          \
      }                                         \
    else                                        \
      {                                         \
        *dst++ = (c1);                          \
        *dst++ = (c2);                          \
      }                                         \
  } while (0)
*/ + ; +} + +static void +coding_set_destination (coding) + struct coding_system *coding; +{ + if (BUFFERP (coding->dst_object)) + { + /* We are sure that coding->dst_pos_byte is before the gap of the + buffer. */ + coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object)) + + coding->dst_pos_byte - 1); + if (coding->src_pos < 0) + /* The source and destination is in the same buffer. */ + coding->dst_bytes = (GAP_END_ADDR + - (coding->src_bytes - coding->consumed) + - coding->destination); + else + coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object)) + - coding->destination); + } + else + /* Otherwise, the destination is C string and is never relocated + automatically. Thus we don't have to update anything. */ + ; +} + + +static void +coding_alloc_by_realloc (coding, bytes) + struct coding_system *coding; + EMACS_INT bytes; +{ + coding->destination = (unsigned char *) xrealloc (coding->destination, + coding->dst_bytes + bytes); + coding->dst_bytes += bytes; +} + +static void +coding_alloc_by_making_gap (coding, bytes) + struct coding_system *coding; + EMACS_INT bytes; +{ + Lisp_Object this_buffer; + + this_buffer = Fcurrent_buffer (); + if (EQ (this_buffer, coding->dst_object)) + { + EMACS_INT add = coding->src_bytes - coding->consumed; + + GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add; + make_gap (bytes); + GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add; + } + else + { + set_buffer_internal (XBUFFER (coding->dst_object)); + make_gap (bytes); + set_buffer_internal (XBUFFER (this_buffer)); + } +} + + +static unsigned char * +alloc_destination (coding, nbytes, dst) + struct coding_system *coding; + int nbytes; + unsigned char *dst; +{ + EMACS_INT offset = dst - coding->destination; + + if (BUFFERP (coding->dst_object)) + coding_alloc_by_making_gap (coding, nbytes); + else + coding_alloc_by_realloc (coding, nbytes); + coding->result = CODING_RESULT_SUCCESS; + coding_set_destination (coding); + dst = 
coding->destination + offset; + return dst; +} + + +/*** 2. Emacs' internal format (emacs-utf-8) ***/ + + + + +/*** 3. UTF-8 ***/ + +/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". + Check if a text is encoded in UTF-8. If it is, return + CATEGORY_MASK_UTF_8, else return 0. */ + +#define UTF_8_1_OCTET_P(c) ((c) < 0x80) +#define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80) +#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0) +#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0) +#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0) +#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8) + +static int +detect_coding_utf_8 (coding, mask) + struct coding_system *coding; + int *mask; +{ + unsigned char *src = coding->source, *src_base = src; + unsigned char *src_end = coding->source + coding->src_bytes; + int multibytep = coding->src_multibyte; + int consumed_chars = 0; + int found = 0; + + /* A coding system of this category is always ASCII compatible. */ + src += coding->head_ascii; + + while (1) + { + int c, c1, c2, c3, c4; + + ONE_MORE_BYTE (c); + if (UTF_8_1_OCTET_P (c)) + continue; + ONE_MORE_BYTE (c1); + if (! UTF_8_EXTRA_OCTET_P (c1)) + break; + if (UTF_8_2_OCTET_LEADING_P (c)) + { + found++; + continue; + } + ONE_MORE_BYTE (c2); + if (! UTF_8_EXTRA_OCTET_P (c2)) + break; + if (UTF_8_3_OCTET_LEADING_P (c)) + { + found++; + continue; + } + ONE_MORE_BYTE (c3); + if (! UTF_8_EXTRA_OCTET_P (c3)) + break; + if (UTF_8_4_OCTET_LEADING_P (c)) + { + found++; + continue; + } + ONE_MORE_BYTE (c4); + if (! UTF_8_EXTRA_OCTET_P (c4)) + break; + if (UTF_8_5_OCTET_LEADING_P (c)) + { + found++; + continue; + } + break; + } + *mask &= ~CATEGORY_MASK_UTF_8; + return 0; + + no_more_source: + if (! 
found) + return 0; + *mask &= CATEGORY_MASK_UTF_8; + return 1; +} + + +static void +decode_coding_utf_8 (coding) + struct coding_system *coding; +{ + unsigned char *src = coding->source + coding->consumed; + unsigned char *src_end = coding->source + coding->src_bytes; + unsigned char *src_base; + int *charbuf = coding->charbuf; + int *charbuf_end = charbuf + coding->charbuf_size; + int consumed_chars = 0, consumed_chars_base; + int multibytep = coding->src_multibyte; + Lisp_Object attr, eol_type, charset_list; + + CODING_GET_INFO (coding, attr, eol_type, charset_list); + + while (1) + { + int c, c1, c2, c3, c4, c5; + + src_base = src; + consumed_chars_base = consumed_chars; + + if (charbuf >= charbuf_end) + break; + + ONE_MORE_BYTE (c1); + if (UTF_8_1_OCTET_P(c1)) + { + c = c1; + if (c == '\r') + { + if (EQ (eol_type, Qdos)) + { + if (src == src_end) + goto no_more_source; + if (*src == '\n') + ONE_MORE_BYTE (c); + } + else if (EQ (eol_type, Qmac)) + c = '\n'; + } + } + else + { + ONE_MORE_BYTE (c2); + if (! UTF_8_EXTRA_OCTET_P (c2)) + goto invalid_code; + if (UTF_8_2_OCTET_LEADING_P (c1)) + c = ((c1 & 0x1F) << 6) | (c2 & 0x3F); + else + { + ONE_MORE_BYTE (c3); + if (! UTF_8_EXTRA_OCTET_P (c3)) + goto invalid_code; + if (UTF_8_3_OCTET_LEADING_P (c1)) + c = (((c1 & 0xF) << 12) + | ((c2 & 0x3F) << 6) | (c3 & 0x3F)); + else + { + ONE_MORE_BYTE (c4); + if (! UTF_8_EXTRA_OCTET_P (c4)) + goto invalid_code; + if (UTF_8_4_OCTET_LEADING_P (c1)) + c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12) + | ((c3 & 0x3F) << 6) | (c4 & 0x3F)); + else + { + ONE_MORE_BYTE (c5); + if (! 
UTF_8_EXTRA_OCTET_P (c5)) + goto invalid_code; + if (UTF_8_5_OCTET_LEADING_P (c1)) + { + c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18) + | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6) + | (c5 & 0x3F)); + if (c > MAX_CHAR) + goto invalid_code; + } + else + goto invalid_code; + } + } + } + } + + *charbuf++ = c; + continue; + + invalid_code: + src = src_base; + consumed_chars = consumed_chars_base; + ONE_MORE_BYTE (c); + *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c); + coding->errors++; + } + + no_more_source: + coding->consumed_char += consumed_chars_base; + coding->consumed = src_base - coding->source; + coding->charbuf_used = charbuf - coding->charbuf; +} + + +static int +encode_coding_utf_8 (coding) + struct coding_system *coding; +{ + int multibytep = coding->dst_multibyte; + int *charbuf = coding->charbuf; + int *charbuf_end = charbuf + coding->charbuf_used; + unsigned char *dst = coding->destination + coding->produced; + unsigned char *dst_end = coding->destination + coding->dst_bytes; + int produced_chars; + int c; + + if (multibytep) + { + int safe_room = MAX_MULTIBYTE_LENGTH * 2; + + while (charbuf < charbuf_end) + { + unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str; + + ASSURE_DESTINATION (safe_room); + c = *charbuf++; + CHAR_STRING_ADVANCE (c, pend); + for (p = str; p < pend; p++) + EMIT_ONE_BYTE (*p); + } + } + else + { + int safe_room = MAX_MULTIBYTE_LENGTH; + + while (charbuf < charbuf_end) + { + ASSURE_DESTINATION (safe_room); + c = *charbuf++; + dst += CHAR_STRING (c, dst); + produced_chars++; + } + } + coding->result = CODING_RESULT_SUCCESS; + coding->produced_char += produced_chars; + coding->produced = dst - coding->destination; + return 0; +} + + +/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". + Check if a text is encoded in UTF-16 Big Endian (endian == 1) or + Little Endian (otherwise). If it is, return + CATEGORY_MASK_UTF_16_BE or CATEGORY_MASK_UTF_16_LE, + else return 0. 
*/ + +#define UTF_16_HIGH_SURROGATE_P(val) \ + (((val) & 0xFC00) == 0xD800) + +#define UTF_16_LOW_SURROGATE_P(val) \ + (((val) & 0xFC00) == 0xDC00) + +#define UTF_16_INVALID_P(val) \ + (((val) == 0xFFFE) \ + || ((val) == 0xFFFF) \ + || UTF_16_LOW_SURROGATE_P (val)) + + +static int +detect_coding_utf_16 (coding, mask) + struct coding_system *coding; + int *mask; +{ + unsigned char *src = coding->source, *src_base = src; + unsigned char *src_end = coding->source + coding->src_bytes; + int multibytep = coding->src_multibyte; + int consumed_chars = 0; + int c1, c2; + + ONE_MORE_BYTE (c1); + ONE_MORE_BYTE (c2); + + if ((c1 == 0xFF) && (c2 == 0xFE)) + { + *mask &= CATEGORY_MASK_UTF_16_LE; + return 1; + } + else if ((c1 == 0xFE) && (c2 == 0xFF)) + { + *mask &= CATEGORY_MASK_UTF_16_BE; + return 1; + } + no_more_source: + return 0; +} + +static void +decode_coding_utf_16 (coding) + struct coding_system *coding; +{ + unsigned char *src = coding->source + coding->consumed; + unsigned char *src_end = coding->source + coding->src_bytes; + unsigned char *src_base, *surrogate_high_base; + int *charbuf = coding->charbuf; + int *charbuf_end = charbuf + coding->charbuf_size; + int consumed_chars = 0, consumed_chars_base; + int multibytep = coding->src_multibyte; + enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding); + enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding); + int surrogate = CODING_UTF_16_SURROGATE (coding); + Lisp_Object attr, eol_type, charset_list; + + CODING_GET_INFO (coding, attr, eol_type, charset_list); + + if (bom != utf_16_without_bom) + { + int c, c1, c2; + + src_base = src; + ONE_MORE_BYTE (c1); + ONE_MORE_BYTE (c2); + c = (c1 << 16) | c2; + if (bom == utf_16_with_bom) + { + if (endian == utf_16_big_endian + ? c != 0xFFFE : c != 0xFEFF) + { + /* We are sure that there's enouph room at CHARBUF. 
*/ + *charbuf++ = c1; + *charbuf++ = c2; + coding->errors++; + } + } + else + { + if (c == 0xFFFE) + CODING_UTF_16_ENDIAN (coding) + = endian = utf_16_big_endian; + else if (c == 0xFEFF) + CODING_UTF_16_ENDIAN (coding) + = endian = utf_16_little_endian; + else + { + CODING_UTF_16_ENDIAN (coding) + = endian = utf_16_big_endian; + src = src_base; + } + } + CODING_UTF_16_BOM (coding) = utf_16_with_bom; + } + + while (1) + { + int c, c1, c2; + + src_base = src; + consumed_chars_base = consumed_chars; + + if (charbuf + 2 >= charbuf_end) + break; + + ONE_MORE_BYTE (c1); + ONE_MORE_BYTE (c2); + c = (endian == utf_16_big_endian + ? ((c1 << 16) | c2) : ((c2 << 16) | c1)); + if (surrogate) + { + if (! UTF_16_LOW_SURROGATE_P (c)) + { + if (endian == utf_16_big_endian) + c1 = surrogate >> 8, c2 = surrogate & 0xFF; + else + c1 = surrogate & 0xFF, c2 = surrogate >> 8; + *charbuf++ = c1; + *charbuf++ = c2; + coding->errors++; + if (UTF_16_HIGH_SURROGATE_P (c)) + CODING_UTF_16_SURROGATE (coding) = surrogate = c; + else + *charbuf++ = c; + } + else + { + c = ((surrogate - 0xD800) << 10) | (c - 0xDC00); + CODING_UTF_16_SURROGATE (coding) = surrogate = 0; + *charbuf++ = c; + } + } + else + { + if (UTF_16_HIGH_SURROGATE_P (c)) + CODING_UTF_16_SURROGATE (coding) = surrogate = c; + else + *charbuf++ = c; + } + } + + no_more_source: + coding->consumed_char += consumed_chars_base; + coding->consumed = src_base - coding->source; + coding->charbuf_used = charbuf - coding->charbuf; +} + +static int +encode_coding_utf_16 (coding) + struct coding_system *coding; +{ + int multibytep = coding->dst_multibyte; + int *charbuf = coding->charbuf; + int *charbuf_end = charbuf + coding->charbuf_used; + unsigned char *dst = coding->destination + coding->produced; + unsigned char *dst_end = coding->destination + coding->dst_bytes; + int safe_room = 8; + enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding); + int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian; + int produced_chars = 0; + 
Lisp_Object attrs, eol_type, charset_list; + int c; + + CODING_GET_INFO (coding, attrs, eol_type, charset_list); + + if (bom == utf_16_with_bom) + { + ASSURE_DESTINATION (safe_room); + if (big_endian) + EMIT_TWO_BYTES (0xFF, 0xFE); + else + EMIT_TWO_BYTES (0xFE, 0xFF); + CODING_UTF_16_BOM (coding) = utf_16_without_bom; + } + + while (charbuf < charbuf_end) + { + ASSURE_DESTINATION (safe_room); + c = *charbuf++; + if (c >= 0x110000) + c = 0xFFFF; + + if (c < 0x10000) + { + if (big_endian) + EMIT_TWO_BYTES (c >> 8, c & 0xFF); + else + EMIT_TWO_BYTES (c & 0xFF, c >> 8); + } + else + { + int c1, c2; + + c -= 0x10000; + c1 = (c >> 10) + 0xD800; + c2 = (c & 0x3FF) + 0xDC00; + if (big_endian) + EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF); + else + EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8); + } + } + coding->result = CODING_RESULT_SUCCESS; + coding->produced = dst - coding->destination; + coding->produced_char += produced_chars; + return 0; +} + + +/*** 6. Old Emacs' internal format (emacs-mule) ***/ + +/* Emacs' internal format for representation of multiple character + sets is a kind of multi-byte encoding, i.e. characters are + represented by variable-length sequences of one-byte codes. + + ASCII characters and control characters (e.g. `tab', `newline') are + represented by one-byte sequences which are their ASCII codes, in + the range 0x00 through 0x7F. + + 8-bit characters of the range 0x80..0x9F are represented by + two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit + code + 0x20). + + 8-bit characters of the range 0xA0..0xFF are represented by + one-byte sequences which are their 8-bit code. + + The other characters are represented by a sequence of `base + leading-code', optional `extended leading-code', and one or two + `position-code's. The length of the sequence is determined by the + base leading-code. 
Leading-code takes the range 0x81 through 0x9D, + whereas extended leading-code and position-code take the range 0xA0 + through 0xFF. See `charset.h' for more details about leading-code + and position-code. + + --- CODE RANGE of Emacs' internal format --- + character set range + ------------- ----- + ascii 0x00..0x7F + eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF + eight-bit-graphic 0xA0..0xBF + ELSE 0x81..0x9D + [0xA0..0xFF]+ + --------------------------------------------- + + As this is the internal character representation, the format is + usually not used externally (i.e. in a file or in a data sent to a + process). But, it is possible to have a text externally in this + format (i.e. by encoding by the coding system `emacs-mule'). + + In that case, a sequence of one-byte codes has a slightly different + form. + + At first, all characters in eight-bit-control are represented by + one-byte sequences which are their 8-bit code. + + Next, character composition data are represented by the byte + sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ..., + where, + METHOD is 0xF0 plus one of composition method (enum + composition_method), + + BYTES is 0xA0 plus a byte length of this composition data, + + CHARS is 0x20 plus a number of characters composed by this + data, + + COMPONENTs are characters of multibye form or composition + rules encoded by two-byte of ASCII codes. + + In addition, for backward compatibility, the following formats are + also recognized as composition data on decoding. + + 0x80 MSEQ ... + 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ + + Here, + MSEQ is a multibyte form but in these special format: + ASCII: 0xA0 ASCII_CODE+0x80, + other: LEADING_CODE+0x20 FOLLOWING-BYTE ..., + RULE is a one byte code of the range 0xA0..0xF0 that + represents a composition rule. + */ + +char emacs_mule_bytes[256]; + +/* Leading-code followed by extended leading-code. 
*/ +#define LEADING_CODE_PRIVATE_11 0x9A /* for private DIMENSION1 of 1-column */ +#define LEADING_CODE_PRIVATE_12 0x9B /* for private DIMENSION1 of 2-column */ +#define LEADING_CODE_PRIVATE_21 0x9C /* for private DIMENSION2 of 1-column */ +#define LEADING_CODE_PRIVATE_22 0x9D /* for private DIMENSION2 of 2-column */ + + +int +emacs_mule_char (coding, composition, nbytes, nchars) + struct coding_system *coding; + int composition; + int *nbytes, *nchars; +{ + unsigned char *src = coding->source + coding->consumed; + unsigned char *src_end = coding->source + coding->src_bytes; + int multibytep = coding->src_multibyte; + unsigned char *src_base = src; + struct charset *charset; + unsigned code; + int c; + int consumed_chars = 0; + + ONE_MORE_BYTE (c); + if (composition) + { + c -= 0x20; + if (c == 0x80) + { + ONE_MORE_BYTE (c); + if (c < 0xA0) + goto invalid_code; + *nbytes = src - src_base; + *nchars = consumed_chars; + return (c - 0x80); + } + } + + switch (emacs_mule_bytes[c]) + { + case 2: + if (! (charset = emacs_mule_charset[c])) + goto invalid_code; + ONE_MORE_BYTE (c); + code = c & 0x7F; + break; + + case 3: + if (c == LEADING_CODE_PRIVATE_11 + || c == LEADING_CODE_PRIVATE_12) + { + ONE_MORE_BYTE (c); + if (! (charset = emacs_mule_charset[c])) + goto invalid_code; + ONE_MORE_BYTE (c); + code = c & 0x7F; + } + else + { + if (! (charset = emacs_mule_charset[c])) + goto invalid_code; + ONE_MORE_BYTE (c); + code = (c & 0x7F) << 7; + ONE_MORE_BYTE (c); + code |= c & 0x7F; + } + break; + + case 4: + if (! (charset = emacs_mule_charset[c])) + goto invalid_code; + ONE_MORE_BYTE (c); + code = (c & 0x7F) << 7; + ONE_MORE_BYTE (c); + code |= c & 0x7F; + break; + + case 1: + code = c; + charset = CHARSET_FROM_ID (ASCII_BYTE_P (code) ? charset_ascii + : code < 0xA0 ? 
charset_8_bit_control + : charset_8_bit_graphic); + break; + + default: + abort (); + } + c = DECODE_CHAR (charset, code); + if (c < 0) + goto invalid_code; + *nbytes = src - src_base; + *nchars = consumed_chars; + return c; + + no_more_source: + return -2; + + invalid_code: + return -1; +} + + +/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". + Check if a text is encoded in `emacs-mule'. */ + +static int +detect_coding_emacs_mule (coding, mask) + struct coding_system *coding; + int *mask; +{ + unsigned char *src = coding->source, *src_base = src; + unsigned char *src_end = coding->source + coding->src_bytes; + int multibytep = coding->src_multibyte; + int consumed_chars = 0; + int c; + int found = 0; + + /* A coding system of this category is always ASCII compatible. */ + src += coding->head_ascii; + + while (1) + { + ONE_MORE_BYTE (c); + + if (c == 0x80) + { + /* Perhaps the start of composite character. We simple skip + it because analyzing it is too heavy for detecting. But, + at least, we check that the composite character + constitues of more than 4 bytes. */ + unsigned char *src_base; + + repeat: + src_base = src; + do + { + ONE_MORE_BYTE (c); + } + while (c >= 0xA0); + + if (src - src_base <= 4) + break; + found = 1; + if (c == 0x80) + goto repeat; + } + + if (c < 0x80) + { + if (c < 0x20 + && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)) + break; + } + else + { + unsigned char *src_base = src - 1; + + do + { + ONE_MORE_BYTE (c); + } + while (c >= 0xA0); + if (src - src_base != emacs_mule_bytes[*src_base]) + break; + found = 1; + } + } + *mask &= ~CATEGORY_MASK_EMACS_MULE; + return 0; + + no_more_source: + if (!found) + return 0; + *mask &= CATEGORY_MASK_EMACS_MULE; + return 1; +} + + +/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */ + +/* Decode a character represented as a component of composition + sequence of Emacs 20/21 style at SRC. 
Set C to that character and + update SRC to the head of next character (or an encoded composition + rule). If SRC doesn't points a composition component, set C to -1. + If SRC points an invalid byte sequence, global exit by a return + value 0. */ + +#define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \ + if (1) \ + { \ + int c; \ + int nbytes, nchars; \ + \ + if (src == src_end) \ + break; \ + c = emacs_mule_char (coding, 1, &nbytes, &nchars); \ + if (c < 0) \ + { \ + if (c == -2) \ + break; \ + goto invalid_code; \ + } \ + *buf++ = c; \ + src += nbytes; \ + consumed_chars += nchars; \ + } \ + else + + +/* Decode a composition rule represented as a component of composition + sequence of Emacs 20 style at SRC. Set C to the rule. If SRC + points an invalid byte sequence, set C to -1. */ + +#define DECODE_EMACS_MULE_COMPOSITION_RULE(buf) \ + do { \ + int c, gref, nref; \ + \ + if (src < src_end) \ + goto invalid_code; \ + ONE_MORE_BYTE_NO_CHECK (c); \ + c -= 0xA0; \ + if (c < 0 || c >= 81) \ + goto invalid_code; \ + \ + gref = c / 9, nref = c % 9; \ + *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \ + } while (0) + + +#define ADD_COMPOSITION_DATA(buf, method, nchars) \ + do { \ + *buf++ = -5; \ + *buf++ = coding->produced_char + char_offset; \ + *buf++ = CODING_ANNOTATE_COMPOSITION_MASK; \ + *buf++ = method; \ + *buf++ = nchars; \ + } while (0) + + +#define DECODE_EMACS_MULE_21_COMPOSITION(c) \ + do { \ + /* Emacs 21 style format. The first three bytes at SRC are \ + (METHOD - 0xF0), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \ + the byte length of this composition information, CHARS is the \ + number of characters composed by this composition. 
*/ \ + enum composition_method method = c - 0xF0; \ + int consumed_chars_limit; \ + int nbytes, nchars; \ + \ + ONE_MORE_BYTE (c); \ + nbytes = c - 0xA0; \ + if (nbytes < 3) \ + goto invalid_code; \ + ONE_MORE_BYTE (c); \ + nchars = c - 0xA0; \ + ADD_COMPOSITION_DATA (charbuf, method, nchars); \ + consumed_chars_limit = consumed_chars_base + nbytes; \ + if (method != COMPOSITION_RELATIVE) \ + { \ + int i = 0; \ + while (consumed_chars < consumed_chars_limit) \ + { \ + if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \ + DECODE_EMACS_MULE_COMPOSITION_RULE (charbuf); \ + else \ + DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \ + } \ + if (consumed_chars < consumed_chars_limit) \ + goto invalid_code; \ + } \ + } while (0) + + +#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \ + do { \ + /* Emacs 20 style format for relative composition. */ \ + /* Store multibyte form of characters to be composed. */ \ + int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \ + int *buf = components; \ + int i, j; \ + \ + src = src_base; \ + ONE_MORE_BYTE (c); /* skip 0x80 */ \ + for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \ + DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \ + if (i < 2) \ + goto invalid_code; \ + ADD_COMPOSITION_DATA (charbuf, COMPOSITION_RELATIVE, i); \ + for (j = 0; j < i; j++) \ + *charbuf++ = components[j]; \ + } while (0) + + +#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \ + do { \ + /* Emacs 20 style format for rule-base composition. */ \ + /* Store multibyte form of characters to be composed. 
*/ \ + int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \ + int *buf = components; \ + int i, j; \ + \ + DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \ + for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \ + { \ + DECODE_EMACS_MULE_COMPOSITION_RULE (buf); \ + DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \ + } \ + if (i < 1 || (buf - components) % 2 == 0) \ + goto invalid_code; \ + if (charbuf + i + (i / 2) + 1 < charbuf_end) \ + goto no_more_source; \ + ADD_COMPOSITION_DATA (buf, COMPOSITION_WITH_RULE, i); \ + for (j = 0; j < i; j++) \ + *charbuf++ = components[j]; \ + for (j = 0; j < i; j += 2) \ + *charbuf++ = components[j]; \ + } while (0) + + +static void +decode_coding_emacs_mule (coding) + struct coding_system *coding; +{ + unsigned char *src = coding->source + coding->consumed; + unsigned char *src_end = coding->source + coding->src_bytes; + unsigned char *src_base; + int *charbuf = coding->charbuf; + int *charbuf_end = charbuf + coding->charbuf_size; + int consumed_chars = 0, consumed_chars_base; + int char_offset = 0; + int multibytep = coding->src_multibyte; + Lisp_Object attrs, eol_type, charset_list; + + CODING_GET_INFO (coding, attrs, eol_type, charset_list); + + while (1) + { + int c; + + src_base = src; + consumed_chars_base = consumed_chars; + + if (charbuf >= charbuf_end) + break; + + ONE_MORE_BYTE (c); + + if (c < 0x80) + { + if (c == '\r') + { + if (EQ (eol_type, Qdos)) + { + if (src == src_end) + goto no_more_source; + if (*src == '\n') + ONE_MORE_BYTE (c); + } + else if (EQ (eol_type, Qmac)) + c = '\n'; + } + *charbuf++ = c; + char_offset++; + } + else if (c == 0x80) + { + if (charbuf + 5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 > charbuf_end) + break; + ONE_MORE_BYTE (c); + if (c - 0xF0 >= COMPOSITION_RELATIVE + && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS) + DECODE_EMACS_MULE_21_COMPOSITION (c); + else if (c < 0xC0) + DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c); + else if (c == 0xFF) + DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c); + else + goto 
invalid_code; + } + else if (c < 0xA0 && emacs_mule_bytes[c] > 1) + { + int nbytes, nchars; + src--; + c = emacs_mule_char (coding, 0, &nbytes, &nchars); + if (c < 0) + { + if (c == -2) + break; + goto invalid_code; + } + *charbuf++ = c; + char_offset++; + } + continue; + + invalid_code: + src = src_base; + consumed_chars = consumed_chars_base; + ONE_MORE_BYTE (c); + *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c); + coding->errors++; + } + + no_more_source: + coding->consumed_char += consumed_chars_base; + coding->consumed = src_base - coding->source; + coding->charbuf_used = charbuf - coding->charbuf; +} + + +#define EMACS_MULE_LEADING_CODES(id, codes) \ + do { \ + if (id < 0xA0) \ + codes[0] = id, codes[1] = 0; \ + else if (id < 0xE0) \ + codes[0] = 0x9A, codes[1] = id; \ + else if (id < 0xF0) \ + codes[0] = 0x9B, codes[1] = id; \ + else if (id < 0xF5) \ + codes[0] = 0x9C, codes[1] = id; \ + else \ + codes[0] = 0x9D, codes[1] = id; \ + } while (0); + + +static int +encode_coding_emacs_mule (coding) + struct coding_system *coding; +{ + int multibytep = coding->dst_multibyte; + int *charbuf = coding->charbuf; + int *charbuf_end = charbuf + coding->charbuf_used; + unsigned char *dst = coding->destination + coding->produced; + unsigned char *dst_end = coding->destination + coding->dst_bytes; + int safe_room = 8; + unsigned char *adjusted_dst_end =dst_end - 8; + int produced_chars = 0; + Lisp_Object attrs, eol_type, charset_list; + int c; + + CODING_GET_INFO (coding, attrs, eol_type, charset_list); + + while (charbuf < charbuf_end) + { + ASSURE_DESTINATION (safe_room); + c = *charbuf++; + if (ASCII_CHAR_P (c)) + EMIT_ONE_ASCII_BYTE (c); + else + { + struct charset *charset; + unsigned code; + int dimension; + int emacs_mule_id; + unsigned char leading_codes[2]; + + charset = char_charset (c, charset_list, &code); + if (! 
charset) + { + c = coding->default_char; + if (ASCII_CHAR_P (c)) + { + EMIT_ONE_ASCII_BYTE (c); + continue; + } + charset = char_charset (c, charset_list, &code); + } + dimension = CHARSET_DIMENSION (charset); + emacs_mule_id = CHARSET_EMACS_MULE_ID (charset); + EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes); + EMIT_ONE_BYTE (leading_codes[0]); + if (leading_codes[1]) + EMIT_ONE_BYTE (leading_codes[1]); + if (dimension == 1) + EMIT_ONE_BYTE (code); + else + { + EMIT_ONE_BYTE (code >> 8); + EMIT_ONE_BYTE (code & 0xFF); + } + } + } + coding->result = CODING_RESULT_SUCCESS; + coding->produced_char += produced_chars; + coding->produced = dst - coding->destination; + return 0; +} + + +/*** 7. ISO2022 handlers ***/ + +/* The following note describes the coding system ISO2022 briefly. + Since the intention of this note is to help understand the + functions in this file, some parts are NOT ACCURATE or OVERLY + SIMPLIFIED. For thorough understanding, please refer to the + original document of ISO2022. + + ISO2022 provides many mechanisms to encode several character sets + in 7-bit and 8-bit environments. For 7-bite environments, all text + is encoded using bytes less than 128. This may make the encoded + text a little bit longer, but the text passes more easily through + several gateways, some of which strip off MSB (Most Signigant Bit). + + There are two kinds of character sets: control character set and + graphic character set. The former contains control characters such + as `newline' and `escape' to provide control functions (control + functions are also provided by escape sequences). The latter + contains graphic characters such as 'A' and '-'. Emacs recognizes + two control character sets and many graphic character sets. 
+ + Graphic character sets are classified into one of the following + four classes, according to the number of bytes (DIMENSION) and + number of characters in one dimension (CHARS) of the set: + - DIMENSION1_CHARS94 + - DIMENSION1_CHARS96 + - DIMENSION2_CHARS94 + - DIMENSION2_CHARS96 + + In addition, each character set is assigned an identification tag, + unique for each set, called "final character" (denoted as <F> + hereafter). The <F> of each character set is decided by ECMA(*) + when it is registered in ISO. The code range of <F> is 0x30..0x7F + (0x30..0x3F are for private use only). + + Note (*): ECMA = European Computer Manufacturers Association + + Here are examples of graphic character set [NAME(<F>)]: + o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ... + o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ... + o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ... + o DIMENSION2_CHARS96 -- none for the moment + + A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR. + C0 [0x00..0x1F] -- control character plane 0 + GL [0x20..0x7F] -- graphic character plane 0 + C1 [0x80..0x9F] -- control character plane 1 + GR [0xA0..0xFF] -- graphic character plane 1 + + A control character set is directly designated and invoked to C0 or + C1 by an escape sequence. The most common case is that: + - ISO646's control character set is designated/invoked to C0, and + - ISO6429's control character set is designated/invoked to C1, + and usually these designations/invocations are omitted in encoded + text. In a 7-bit environment, only C0 can be used, and a control + character for C1 is encoded by an appropriate escape sequence to + fit into the environment. All control characters for C1 are + defined to have corresponding escape sequences. + + A graphic character set is at first designated to one of four + graphic registers (G0 through G3), then these graphic registers are + invoked to GL or GR.
These designations and invocations can be + done independently. The most common case is that G0 is invoked to + GL, G1 is invoked to GR, and ASCII is designated to G0. Usually + these invocations and designations are omitted in encoded text. + In a 7-bit environment, only GL can be used. + + When a graphic character set of CHARS94 is invoked to GL, codes + 0x20 and 0x7F of the GL area work as control characters SPACE and + DEL respectively, and codes 0xA0 and 0xFF of the GR area should not + be used. + + There are two ways of invocation: locking-shift and single-shift. + With locking-shift, the invocation lasts until the next different + invocation, whereas with single-shift, the invocation affects the + following character only and doesn't affect the locking-shift + state. Invocations are done by the following control characters or + escape sequences: + + ---------------------------------------------------------------------- + abbrev function cntrl escape seq description + ---------------------------------------------------------------------- + SI/LS0 (shift-in) 0x0F none invoke G0 into GL + SO/LS1 (shift-out) 0x0E none invoke G1 into GL + LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL + LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL + LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*) + LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*) + LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*) + SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char + SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char + ---------------------------------------------------------------------- + (*) These are not used by any known coding system. + + Control characters for these functions are defined by macros + ISO_CODE_XXX in `coding.h'. 
+ + Designations are done by the following escape sequences: + ---------------------------------------------------------------------- + escape sequence description + ---------------------------------------------------------------------- + ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0 + ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1 + ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2 + ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3 + ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*) + ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1 + ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2 + ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3 + ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**) + ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1 + ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2 + ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3 + ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*) + ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1 + ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2 + ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3 + ---------------------------------------------------------------------- + + In this list, "DIMENSION1_CHARS94<F>" means a graphic character set + of dimension 1, chars 94, and final character <F>, etc... + + Note (*): Although these designations are not allowed in ISO2022, + Emacs accepts them on decoding, and produces them on encoding + CHARS96 character sets in a coding system which is characterized as + 7-bit environment, non-locking-shift, and non-single-shift. + + Note (**): If <F> is '@', 'A', or 'B', the intermediate character + '(' must be omitted. We refer to this as "short-form" hereafter. + + Now you may notice that there are a lot of ways for encoding the + same multilingual text in ISO2022.
Actually, there exist many + coding systems such as Compound Text (used in X11's inter client + communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR + (used in Korean internet), EUC (Extended UNIX Code, used in Asian + localized platforms), and all of these are variants of ISO2022. + + In addition to the above, Emacs handles two more kinds of escape + sequences: ISO6429's direction specification and Emacs' private + sequence for specifying character composition. + + ISO6429's direction specification takes the following form: + o CSI ']' -- end of the current direction + o CSI '0' ']' -- end of the current direction + o CSI '1' ']' -- start of left-to-right text + o CSI '2' ']' -- start of right-to-left text + The control character CSI (0x9B: control sequence introducer) is + abbreviated to the escape sequence ESC '[' in a 7-bit environment. + + Character composition specification takes the following form: + o ESC '0' -- start relative composition + o ESC '1' -- end composition + o ESC '2' -- start rule-base composition (*) + o ESC '3' -- start relative composition with alternate chars (**) + o ESC '4' -- start rule-base composition with alternate chars (**) + Since these are not standard escape sequences of any ISO standard, + the use of them for these meanings is restricted to Emacs only. + + (*) This form is used only in Emacs 20.5 and the older versions, + but the newer versions can safely decode it. + (**) This form is used only in Emacs 21.1 and the newer versions, + and the older versions can't decode it. + + Here's a list of example usages of these composition escape + sequences (categorized by `enum composition_method').
+ + COMPOSITION_RELATIVE: + ESC 0 CHAR [ CHAR ] ESC 1 + COMPOSITOIN_WITH_RULE: + ESC 2 CHAR [ RULE CHAR ] ESC 1 + COMPOSITION_WITH_ALTCHARS: + ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 + COMPOSITION_WITH_RULE_ALTCHARS: + ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */ + +enum iso_code_class_type iso_code_class[256]; + +#define SAFE_CHARSET_P(coding, id) \ + ((id) <= (coding)->max_charset_id \ + && (coding)->safe_charsets[id] >= 0) + + +#define SHIFT_OUT_OK(category) \ + (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0) + +static void +setup_iso_safe_charsets (Lisp_Object attrs) +{ + Lisp_Object charset_list, safe_charsets; + Lisp_Object request; + Lisp_Object reg_usage; + Lisp_Object tail; + int reg94, reg96; + int flags = XINT (AREF (attrs, coding_attr_iso_flags)); + int max_charset_id; + + charset_list = CODING_ATTR_CHARSET_LIST (attrs); + if ((flags & CODING_ISO_FLAG_FULL_SUPPORT) + && ! EQ (charset_list, Viso_2022_charset_list)) + { + CODING_ATTR_CHARSET_LIST (attrs) + = charset_list = Viso_2022_charset_list; + ASET (attrs, coding_attr_safe_charsets, Qnil); + } + + if (STRINGP (AREF (attrs, coding_attr_safe_charsets))) + return; + + max_charset_id = 0; + for (tail = charset_list; CONSP (tail); tail = XCDR (tail)) + { + int id = XINT (XCAR (tail)); + if (max_charset_id < id) + max_charset_id = id; + } + + safe_charsets = Fmake_string (make_number (max_charset_id + 1), + make_number (255)); + request = AREF (attrs, coding_attr_iso_request); + reg_usage = AREF (attrs, coding_attr_iso_usage); + reg94 = XINT (XCAR (reg_usage)); + reg96 = XINT (XCDR (reg_usage)); + + for (tail = charset_list; CONSP (tail); tail = XCDR (tail)) + { + Lisp_Object id; + Lisp_Object reg; + struct charset *charset; + + id = XCAR (tail); + charset = CHARSET_FROM_ID (XINT (id)); + reg = Fcdr (Fassq (request, id)); + if (! 
NILP (reg)) + XSTRING (safe_charsets)->data[XINT (id)] = XINT (reg); + else if (charset->iso_chars_96) + { + if (reg96 < 4) + XSTRING (safe_charsets)->data[XINT (id)] = reg96; + } + else + { + if (reg94 < 4) + XSTRING (safe_charsets)->data[XINT (id)] = reg94; + } + } + ASET (attrs, coding_attr_safe_charsets, safe_charsets); +} + + +/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". + Check if a text is encoded in ISO2022. If it is, returns an + integer in which appropriate flag bits any of: + CATEGORY_MASK_ISO_7 + CATEGORY_MASK_ISO_7_TIGHT + CATEGORY_MASK_ISO_8_1 + CATEGORY_MASK_ISO_8_2 + CATEGORY_MASK_ISO_7_ELSE + CATEGORY_MASK_ISO_8_ELSE + are set. If a code which should never appear in ISO2022 is found, + returns 0. */ + +static int +detect_coding_iso_2022 (coding, mask) + struct coding_system *coding; + int *mask; +{ + unsigned char *src = coding->source, *src_base = src; + unsigned char *src_end = coding->source + coding->src_bytes; + int multibytep = coding->src_multibyte; + int mask_iso = CATEGORY_MASK_ISO; + int mask_found = 0, mask_8bit_found = 0; + int reg[4], shift_out = 0, single_shifting = 0; + int id; + int c, c1; + int consumed_chars = 0; + int i; + + for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++) + { + struct coding_system *this = &(coding_categories[i]); + Lisp_Object attrs, val; + + attrs = CODING_ID_ATTRS (this->id); + if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT + && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list)) + setup_iso_safe_charsets (attrs); + val = CODING_ATTR_SAFE_CHARSETS (attrs); + this->max_charset_id = XSTRING (val)->size - 1; + this->safe_charsets = (char *) XSTRING (val)->data; + } + + /* A coding system of this category is always ASCII compatible. 
*/ + src += coding->head_ascii; + + reg[0] = charset_ascii, reg[1] = reg[2] = reg[3] = -1; + while (mask_iso && src < src_end) + { + ONE_MORE_BYTE (c); + switch (c) + { + case ISO_CODE_ESC: + if (inhibit_iso_escape_detection) + break; + single_shifting = 0; + ONE_MORE_BYTE (c); + if (c >= '(' && c <= '/') + { + /* Designation sequence for a charset of dimension 1. */ + ONE_MORE_BYTE (c1); + if (c1 < ' ' || c1 >= 0x80 + || (id = iso_charset_table[0][c >= ','][c1]) < 0) + /* Invalid designation sequence. Just ignore. */ + break; + reg[(c - '(') % 4] = id; + } + else if (c == '$') + { + /* Designation sequence for a charset of dimension 2. */ + ONE_MORE_BYTE (c); + if (c >= '@' && c <= 'B') + /* Designation for JISX0208.1978, GB2312, or JISX0208. */ + reg[0] = id = iso_charset_table[1][0][c]; + else if (c >= '(' && c <= '/') + { + ONE_MORE_BYTE (c1); + if (c1 < ' ' || c1 >= 0x80 + || (id = iso_charset_table[1][c >= ','][c1]) < 0) + /* Invalid designation sequence. Just ignore. */ + break; + reg[(c - '(') % 4] = id; + } + else + /* Invalid designation sequence. Just ignore. */ + break; + } + else if (c == 'N' || c == 'O') + { + /* ESC for SS2 or SS3. */ + mask_iso &= CATEGORY_MASK_ISO_7_ELSE; + break; + } + else if (c >= '0' && c <= '4') + { + /* ESC for start/end composition. */ + mask_found |= CATEGORY_MASK_ISO; + break; + } + else + { + /* Invalid escape sequence. */ + mask_iso &= ~CATEGORY_MASK_ISO_ESCAPE; + break; + } + + /* We found a valid designation sequence for CHARSET. 
*/ + mask_iso &= ~CATEGORY_MASK_ISO_8BIT; + if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7], + id)) + mask_found |= CATEGORY_MASK_ISO_7; + else + mask_iso &= ~CATEGORY_MASK_ISO_7; + if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight], + id)) + mask_found |= CATEGORY_MASK_ISO_7_TIGHT; + else + mask_iso &= ~CATEGORY_MASK_ISO_7_TIGHT; + if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else], + id)) + mask_found |= CATEGORY_MASK_ISO_7_ELSE; + else + mask_iso &= ~CATEGORY_MASK_ISO_7_ELSE; + if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else], + id)) + mask_found |= CATEGORY_MASK_ISO_8_ELSE; + else + mask_iso &= ~CATEGORY_MASK_ISO_8_ELSE; + break; + + case ISO_CODE_SO: + if (inhibit_iso_escape_detection) + break; + single_shifting = 0; + if (shift_out == 0 + && (reg[1] >= 0 + || SHIFT_OUT_OK (coding_category_iso_7_else) + || SHIFT_OUT_OK (coding_category_iso_8_else))) + { + /* Locking shift out. */ + mask_iso &= ~CATEGORY_MASK_ISO_7BIT; + mask_found |= CATEGORY_MASK_ISO_ELSE; + } + break; + + case ISO_CODE_SI: + if (inhibit_iso_escape_detection) + break; + single_shifting = 0; + if (shift_out == 1) + { + /* Locking shift in. 
*/ + mask_iso &= ~CATEGORY_MASK_ISO_7BIT; + mask_found |= CATEGORY_MASK_ISO_ELSE; + } + break; + + case ISO_CODE_CSI: + single_shifting = 0; + case ISO_CODE_SS2: + case ISO_CODE_SS3: + { + int newmask = CATEGORY_MASK_ISO_8_ELSE; + + if (inhibit_iso_escape_detection) + break; + if (c != ISO_CODE_CSI) + { + if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1]) + & CODING_ISO_FLAG_SINGLE_SHIFT) + newmask |= CATEGORY_MASK_ISO_8_1; + if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2]) + & CODING_ISO_FLAG_SINGLE_SHIFT) + newmask |= CATEGORY_MASK_ISO_8_2; + single_shifting = 1; + } + if (VECTORP (Vlatin_extra_code_table) + && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])) + { + if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1]) + & CODING_ISO_FLAG_LATIN_EXTRA) + newmask |= CATEGORY_MASK_ISO_8_1; + if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2]) + & CODING_ISO_FLAG_LATIN_EXTRA) + newmask |= CATEGORY_MASK_ISO_8_2; + } + mask_iso &= newmask; + mask_found |= newmask; + } + break; + + default: + if (c < 0x80) + { + single_shifting = 0; + break; + } + else if (c < 0xA0) + { + single_shifting = 0; + mask_8bit_found = 1; + if (VECTORP (Vlatin_extra_code_table) + && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])) + { + int newmask = 0; + + if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1]) + & CODING_ISO_FLAG_LATIN_EXTRA) + newmask |= CATEGORY_MASK_ISO_8_1; + if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2]) + & CODING_ISO_FLAG_LATIN_EXTRA) + newmask |= CATEGORY_MASK_ISO_8_2; + mask_iso &= newmask; + mask_found |= newmask; + } + else + return 0; + } + else + { + mask_iso &= ~(CATEGORY_MASK_ISO_7BIT + | CATEGORY_MASK_ISO_7_ELSE); + mask_found |= CATEGORY_MASK_ISO_8_1; + mask_8bit_found = 1; + /* Check the length of succeeding codes of the range + 0xA0..0FF. If the byte length is odd, we exclude + CATEGORY_MASK_ISO_8_2. 
We can check this only + when we are not single shifting. */ + if (!single_shifting + && mask_iso & CATEGORY_MASK_ISO_8_2) + { + int i = 1; + while (src < src_end) + { + ONE_MORE_BYTE (c); + if (c < 0xA0) + break; + i++; + } + + if (i & 1 && src < src_end) + mask_iso &= ~CATEGORY_MASK_ISO_8_2; + else + mask_found |= CATEGORY_MASK_ISO_8_2; + } + } + break; + } + } + no_more_source: + if (!mask_iso) + { + *mask &= ~CATEGORY_MASK_ISO; + return 0; + } + if (!mask_found) + return 0; + *mask &= mask_iso & mask_found; + if (! mask_8bit_found) + *mask &= ~(CATEGORY_MASK_ISO_8BIT | CATEGORY_MASK_ISO_8_ELSE); + return 1; +} + + +/* Set designation state into CODING. */ +#define DECODE_DESIGNATION(reg, dim, chars_96, final) \ + do { \ + int id, prev; \ + \ + if (final < '0' || final >= 128 \ + || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \ + || !SAFE_CHARSET_P (coding, id)) \ + { \ + CODING_ISO_DESIGNATION (coding, reg) = -2; \ + goto invalid_code; \ + } \ + prev = CODING_ISO_DESIGNATION (coding, reg); \ + CODING_ISO_DESIGNATION (coding, reg) = id; \ + /* If there was an invalid designation to REG previously, and this \ + designation is ASCII to REG, we should keep this designation \ + sequence. */ \ + if (prev == -2 && id == charset_ascii) \ + goto invalid_code; \ + } while (0) + + +#define MAYBE_FINISH_COMPOSITION() \ + do { \ + int i; \ + if (composition_state == COMPOSING_NO) \ + break; \ + /* It is assured that we have enough room for producing \ + characters stored in the table `components'. 
*/ \ + if (charbuf + component_idx > charbuf_end) \ + goto no_more_source; \ + composition_state = COMPOSING_NO; \ + if (method == COMPOSITION_RELATIVE \ + || method == COMPOSITION_WITH_ALTCHARS) \ + { \ + for (i = 0; i < component_idx; i++) \ + *charbuf++ = components[i]; \ + char_offset += component_idx; \ + } \ + else \ + { \ + for (i = 0; i < component_idx; i += 2) \ + *charbuf++ = components[i]; \ + char_offset += (component_idx / 2) + 1; \ + } \ + } while (0) + + +/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4. + ESC 0 : relative composition : ESC 0 CHAR ... ESC 1 + ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1 + ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1 + ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1 + */ + +#define DECODE_COMPOSITION_START(c1) \ + do { \ + if (c1 == '0' \ + && composition_state == COMPOSING_COMPONENT_CHAR) \ + { \ + component_len = component_idx; \ + composition_state = COMPOSING_CHAR; \ + } \ + else \ + { \ + unsigned char *p; \ + \ + MAYBE_FINISH_COMPOSITION (); \ + if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \ + goto no_more_source; \ + for (p = src; p < src_end - 1; p++) \ + if (*p == ISO_CODE_ESC && p[1] == '1') \ + break; \ + if (p == src_end - 1) \ + { \ + if (coding->mode & CODING_MODE_LAST_BLOCK) \ + goto invalid_code; \ + goto no_more_source; \ + } \ + \ + /* This is surely the start of a composition. */ \ + method = (c1 == '0' ? COMPOSITION_RELATIVE \ + : c1 == '2' ? COMPOSITION_WITH_RULE \ + : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \ + : COMPOSITION_WITH_RULE_ALTCHARS); \ + composition_state = (c1 <= '2' ? COMPOSING_CHAR \ + : COMPOSING_COMPONENT_CHAR); \ + component_idx = component_len = 0; \ + } \ + } while (0) + + +/* Handle compositoin end sequence ESC 1. */ + +#define DECODE_COMPOSITION_END() \ + do { \ + int nchars = (component_len > 0 ? component_idx - component_len \ + : method == COMPOSITION_RELATIVE ? 
component_idx \ + : (component_idx + 1) / 2); \ + int i; \ + int *saved_charbuf = charbuf; \ + \ + ADD_COMPOSITION_DATA (charbuf, method, nchars); \ + if (method != COMPOSITION_RELATIVE) \ + { \ + if (component_len == 0) \ + for (i = 0; i < component_idx; i++) \ + *charbuf++ = components[i]; \ + else \ + for (i = 0; i < component_len; i++) \ + *charbuf++ = components[i]; \ + *saved_charbuf = saved_charbuf - charbuf; \ + } \ + if (method == COMPOSITION_WITH_RULE) \ + for (i = 0; i < component_idx; i += 2, char_offset++) \ + *charbuf++ = components[i]; \ + else \ + for (i = component_len; i < component_idx; i++, char_offset++) \ + *charbuf++ = components[i]; \ + coding->annotated = 1; \ + composition_state = COMPOSING_NO; \ + } while (0) + + +/* Decode a composition rule from the byte C1 (and maybe one more byte + from SRC) and store one encoded composition rule in + coding->cmp_data. */ + +#define DECODE_COMPOSITION_RULE(c1) \ + do { \ + (c1) -= 32; \ + if (c1 < 81) /* old format (before ver.21) */ \ + { \ + int gref = (c1) / 9; \ + int nref = (c1) % 9; \ + if (gref == 4) gref = 10; \ + if (nref == 4) nref = 10; \ + c1 = COMPOSITION_ENCODE_RULE (gref, nref); \ + } \ + else if (c1 < 93) /* new format (after ver.21) */ \ + { \ + ONE_MORE_BYTE (c2); \ + c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \ + } \ + else \ + c1 = 0; \ + } while (0) + + +/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */ + +static void +decode_coding_iso_2022 (coding) + struct coding_system *coding; +{ + unsigned char *src = coding->source + coding->consumed; + unsigned char *src_end = coding->source + coding->src_bytes; + unsigned char *src_base; + int *charbuf = coding->charbuf; + int *charbuf_end = charbuf + coding->charbuf_size - 4; + int consumed_chars = 0, consumed_chars_base; + int char_offset = 0; + int multibytep = coding->src_multibyte; + /* Charsets invoked to graphic plane 0 and 1 respectively. 
*/ + int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0); + int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1); + struct charset *charset; + int c; + /* For handling composition sequence. */ +#define COMPOSING_NO 0 +#define COMPOSING_CHAR 1 +#define COMPOSING_RULE 2 +#define COMPOSING_COMPONENT_CHAR 3 +#define COMPOSING_COMPONENT_RULE 4 + + int composition_state = COMPOSING_NO; + enum composition_method method; + int components[MAX_COMPOSITION_COMPONENTS * 2 + 1]; + int component_idx; + int component_len; + Lisp_Object attrs, eol_type, charset_list; + + CODING_GET_INFO (coding, attrs, eol_type, charset_list); + setup_iso_safe_charsets (attrs); + + while (1) + { + int c1, c2; + + src_base = src; + consumed_chars_base = consumed_chars; + + if (charbuf >= charbuf_end) + break; + + ONE_MORE_BYTE (c1); + + /* We produce no character or one character. */ + switch (iso_code_class [c1]) + { + case ISO_0x20_or_0x7F: + if (composition_state != COMPOSING_NO) + { + if (composition_state == COMPOSING_RULE + || composition_state == COMPOSING_COMPONENT_RULE) + { + DECODE_COMPOSITION_RULE (c1); + components[component_idx++] = c1; + composition_state--; + continue; + } + else if (method == COMPOSITION_WITH_RULE) + composition_state = COMPOSING_RULE; + else if (method == COMPOSITION_WITH_RULE_ALTCHARS + && composition_state == COMPOSING_COMPONENT_CHAR) + composition_state = COMPOSING_COMPONENT_CHAR; + } + if (charset_id_0 < 0 + || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0))) + { + /* This is SPACE or DEL. */ + charset = CHARSET_FROM_ID (charset_ascii); + break; + } + /* This is a graphic character, we fall down ... */ + + case ISO_graphic_plane_0: + if (composition_state == COMPOSING_RULE) + { + DECODE_COMPOSITION_RULE (c1); + components[component_idx++] = c1; + composition_state = COMPOSING_CHAR; + } + charset = CHARSET_FROM_ID (charset_id_0); + break; + + case ISO_0xA0_or_0xFF: + if (charset_id_1 < 0 + || ! 
CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1)) + || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) + goto invalid_code; + /* This is a graphic character, we fall down ... */ + + case ISO_graphic_plane_1: + if (charset_id_1 < 0) + goto invalid_code; + charset = CHARSET_FROM_ID (charset_id_1); + break; + + case ISO_carriage_return: + if (c1 == '\r') + { + if (EQ (eol_type, Qdos)) + { + /* With DOS end-of-line, a CR directly followed by LF is + read as one unit: the LF replaces the CR in C1. */ + if (src == src_end) + goto no_more_source; + if (*src == '\n') + ONE_MORE_BYTE (c1); + } + else if (EQ (eol_type, Qmac)) + c1 = '\n'; + } + /* fall through */ + + case ISO_control_0: + MAYBE_FINISH_COMPOSITION (); + charset = CHARSET_FROM_ID (charset_ascii); + break; + + case ISO_control_1: + MAYBE_FINISH_COMPOSITION (); + goto invalid_code; + + case ISO_shift_out: + if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT) + || CODING_ISO_DESIGNATION (coding, 1) < 0) + goto invalid_code; + CODING_ISO_INVOCATION (coding, 0) = 1; + charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0); + continue; + + case ISO_shift_in: + if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)) + goto invalid_code; + CODING_ISO_INVOCATION (coding, 0) = 0; + charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0); + continue; + + case ISO_single_shift_2_7: + case ISO_single_shift_2: + if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)) + goto invalid_code; + /* SS2 is handled as an escape sequence of ESC 'N' */ + c1 = 'N'; + goto label_escape_sequence; + + case ISO_single_shift_3: + if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)) + goto invalid_code; + /* SS3 is handled as an escape sequence of ESC 'O' */ + c1 = 'O'; + goto label_escape_sequence; + + case ISO_control_sequence_introducer: + /* CSI is handled as an escape sequence of ESC '[' ... 
*/ + c1 = '['; + goto label_escape_sequence; + + case ISO_escape: + ONE_MORE_BYTE (c1); + label_escape_sequence: + /* Escape sequences handled here are invocation, + designation, direction specification, and character + composition specification. */ + switch (c1) + { + case '&': /* revision of following character set */ + ONE_MORE_BYTE (c1); + if (!(c1 >= '@' && c1 <= '~')) + goto invalid_code; + ONE_MORE_BYTE (c1); + if (c1 != ISO_CODE_ESC) + goto invalid_code; + ONE_MORE_BYTE (c1); + goto label_escape_sequence; + + case '$': /* designation of 2-byte character set */ + if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION)) + goto invalid_code; + ONE_MORE_BYTE (c1); + if (c1 >= '@' && c1 <= 'B') + { /* designation of JISX0208.1978, GB2312.1980, + or JISX0208.1980 */ + DECODE_DESIGNATION (0, 2, 0, c1); + } + else if (c1 >= 0x28 && c1 <= 0x2B) + { /* designation of DIMENSION2_CHARS94 character set */ + ONE_MORE_BYTE (c2); + DECODE_DESIGNATION (c1 - 0x28, 2, 0, c2); + } + else if (c1 >= 0x2C && c1 <= 0x2F) + { /* designation of DIMENSION2_CHARS96 character set */ + ONE_MORE_BYTE (c2); + DECODE_DESIGNATION (c1 - 0x2C, 2, 1, c2); + } + else + goto invalid_code; + /* We must update these variables now. */ + charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0); + charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1); + continue; + + case 'n': /* invocation of locking-shift-2 */ + if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT) + || CODING_ISO_DESIGNATION (coding, 2) < 0) + goto invalid_code; + CODING_ISO_INVOCATION (coding, 0) = 2; + charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0); + continue; + + case 'o': /* invocation of locking-shift-3 */ + if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT) + || CODING_ISO_DESIGNATION (coding, 3) < 0) + goto invalid_code; + CODING_ISO_INVOCATION (coding, 0) = 3; + charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0); + continue; + + case 'N': /* invocation of single-shift-2 */ + if (! 
(CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT) + || CODING_ISO_DESIGNATION (coding, 2) < 0) + goto invalid_code; + charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 2)); + ONE_MORE_BYTE (c1); + if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)) + goto invalid_code; + break; + + case 'O': /* invocation of single-shift-3 */ + if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT) + || CODING_ISO_DESIGNATION (coding, 3) < 0) + goto invalid_code; + charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 3)); + ONE_MORE_BYTE (c1); + if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)) + goto invalid_code; + break; + + case '0': case '2': case '3': case '4': /* start composition */ + if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)) + goto invalid_code; + DECODE_COMPOSITION_START (c1); + continue; + + case '1': /* end composition */ + if (composition_state == COMPOSING_NO) + goto invalid_code; + DECODE_COMPOSITION_END (); + continue; + + case '[': /* specification of direction */ + /* Reject the sequence unless this coding system enables + direction specification. The parentheses matter: `!' binds + tighter than `&', so without them the flag bit was never + really tested. */ + if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)) + goto invalid_code; + /* For the moment, nested direction is not supported. + So, `coding->mode & CODING_MODE_DIRECTION' zero means + left-to-right, and nonzero means right-to-left. */ + ONE_MORE_BYTE (c1); + switch (c1) + { + case ']': /* end of the current direction */ + /* The sequence is already complete here (the parameter is + omitted), so do not fall through and consume another + byte. */ + coding->mode &= ~CODING_MODE_DIRECTION; + break; + + case '0': /* end of the current direction */ + case '1': /* start of left-to-right direction */ + ONE_MORE_BYTE (c1); + if (c1 == ']') + coding->mode &= ~CODING_MODE_DIRECTION; + else + goto invalid_code; + break; + + case '2': /* start of right-to-left direction */ + ONE_MORE_BYTE (c1); + if (c1 == ']') + coding->mode |= CODING_MODE_DIRECTION; + else + goto invalid_code; + break; + + default: + goto invalid_code; + } + continue; + + default: + if (! 
(CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION)) + goto invalid_code; + if (c1 >= 0x28 && c1 <= 0x2B) + { /* designation of DIMENSION1_CHARS94 character set */ + ONE_MORE_BYTE (c2); + DECODE_DESIGNATION (c1 - 0x28, 1, 0, c2); + } + else if (c1 >= 0x2C && c1 <= 0x2F) + { /* designation of DIMENSION1_CHARS96 character set */ + ONE_MORE_BYTE (c2); + DECODE_DESIGNATION (c1 - 0x2C, 1, 1, c2); + } + else + goto invalid_code; + /* We must update these variables now. */ + charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0); + charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1); + continue; + } + } + + /* Now we know CHARSET and 1st position code C1 of a character. + Produce a decoded character while getting 2nd position code + C2 if necessary. */ + c1 &= 0x7F; + if (CHARSET_DIMENSION (charset) > 1) + { + ONE_MORE_BYTE (c2); + if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)) + /* C2 is not in a valid range. */ + goto invalid_code; + c1 = (c1 << 8) | (c2 & 0x7F); + if (CHARSET_DIMENSION (charset) > 2) + { + ONE_MORE_BYTE (c2); + if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)) + /* C2 is not in a valid range. */ + goto invalid_code; + c1 = (c1 << 8) | (c2 & 0x7F); + } + } + + CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c); + if (c < 0) + { + MAYBE_FINISH_COMPOSITION (); + for (; src_base < src; src_base++, char_offset++) + { + if (ASCII_BYTE_P (*src_base)) + *charbuf++ = *src_base; + else + *charbuf++ = BYTE8_TO_CHAR (*src_base); + } + } + else if (composition_state == COMPOSING_NO) + { + *charbuf++ = c; + char_offset++; + } + else + components[component_idx++] = c; + continue; + + invalid_code: + MAYBE_FINISH_COMPOSITION (); + src = src_base; + consumed_chars = consumed_chars_base; + ONE_MORE_BYTE (c); + *charbuf++ = ASCII_BYTE_P (c) ? 
c : BYTE8_TO_CHAR (c); + coding->errors++; + } + + no_more_source: + coding->consumed_char += consumed_chars_base; + coding->consumed = src_base - coding->source; + coding->charbuf_used = charbuf - coding->charbuf; +} + + +/* ISO2022 encoding stuff. */ + +/* + It is not enough to say just "ISO2022" on encoding, we have to + specify more details. In Emacs, each coding system of ISO2022 + variant has the following specifications: + 1. Initial designation to G0 thru G3. + 2. Allows short-form designation? + 3. ASCII should be designated to G0 before control characters? + 4. ASCII should be designated to G0 at end of line? + 5. 7-bit environment or 8-bit environment? + 6. Use locking-shift? + 7. Use Single-shift? + And the following two are only for Japanese: + 8. Use ASCII in place of JIS0201-1976-Roman? + 9. Use JISX0208-1983 in place of JISX0208-1978? + These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits + defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more + details. +*/ + +/* Produce codes (escape sequence) for designating CHARSET to graphic + register REG at DST, and increment DST. If of CHARSET is + '@', 'A', or 'B' and the coding system CODING allows, produce + designation sequence of short-form. */ + +#define ENCODE_DESIGNATION(charset, reg, coding) \ + do { \ + unsigned char final_char = CHARSET_ISO_FINAL (charset); \ + char *intermediate_char_94 = "()*+"; \ + char *intermediate_char_96 = ",-./"; \ + int revision = -1; \ + int c; \ + \ + if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \ + revision = XINT (CHARSET_ISO_REVISION (charset)); \ + \ + if (revision >= 0) \ + { \ + EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \ + EMIT_ONE_BYTE ('@' + revision); \ + } \ + EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \ + if (CHARSET_DIMENSION (charset) == 1) \ + { \ + if (! 
CHARSET_ISO_CHARS_96 (charset)) \ + c = intermediate_char_94[reg]; \ + else \ + c = intermediate_char_96[reg]; \ + EMIT_ONE_ASCII_BYTE (c); \ + } \ + else \ + { \ + EMIT_ONE_ASCII_BYTE ('$'); \ + if (! CHARSET_ISO_CHARS_96 (charset)) \ + { \ + if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \ + || reg != 0 \ + || final_char < '@' || final_char > 'B') \ + EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \ + } \ + else \ + EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \ + } \ + EMIT_ONE_ASCII_BYTE (final_char); \ + \ + CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \ + } while (0) + + +/* The following two macros produce codes (control character or escape + sequence) for ISO2022 single-shift functions (single-shift-2 and + single-shift-3). */ + +#define ENCODE_SINGLE_SHIFT_2 \ + do { \ + if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \ + EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \ + else \ + EMIT_ONE_BYTE (ISO_CODE_SS2); \ + CODING_ISO_SINGLE_SHIFTING (coding) = 1; \ + } while (0) + + +#define ENCODE_SINGLE_SHIFT_3 \ + do { \ + if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \ + EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \ + else \ + EMIT_ONE_BYTE (ISO_CODE_SS3); \ + CODING_ISO_SINGLE_SHIFTING (coding) = 1; \ + } while (0) + + +/* The following four macros produce codes (control character or + escape sequence) for ISO2022 locking-shift functions (shift-in, + shift-out, locking-shift-2, and locking-shift-3). 
*/ + +#define ENCODE_SHIFT_IN \ + do { \ + EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \ + CODING_ISO_INVOCATION (coding, 0) = 0; \ + } while (0) + + +#define ENCODE_SHIFT_OUT \ + do { \ + EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \ + CODING_ISO_INVOCATION (coding, 0) = 1; \ + } while (0) + + +#define ENCODE_LOCKING_SHIFT_2 \ + do { \ + EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \ + CODING_ISO_INVOCATION (coding, 0) = 2; \ + } while (0) + + +/* Locking-shift-3 is ESC 'o'; ESC 'n' is locking-shift-2. The final + bytes must match the ones decode_coding_iso_2022 dispatches on. */ +#define ENCODE_LOCKING_SHIFT_3 \ + do { \ + EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o'); \ + CODING_ISO_INVOCATION (coding, 0) = 3; \ + } while (0) + + +/* Produce codes for a DIMENSION1 character whose character set is + CHARSET and whose position-code is C1. Designation and invocation + sequences are also produced in advance if necessary. */ + +#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \ + do { \ + int id = CHARSET_ID (charset); \ + if (CODING_ISO_SINGLE_SHIFTING (coding)) \ + { \ + if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \ + EMIT_ONE_ASCII_BYTE ((c1) & 0x7F); \ + else \ + EMIT_ONE_BYTE ((c1) | 0x80); \ + CODING_ISO_SINGLE_SHIFTING (coding) = 0; \ + break; \ + } \ + else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \ + { \ + EMIT_ONE_ASCII_BYTE ((c1) & 0x7F); \ + break; \ + } \ + else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \ + { \ + EMIT_ONE_BYTE ((c1) | 0x80); \ + break; \ + } \ + else \ + /* Since CHARSET is not yet invoked to any graphic planes, we \ + must invoke it, or, at first, designate it to some graphic \ + register. Then repeat the loop to actually produce the \ + character. */ \ + dst = encode_invocation_designation (charset, coding, dst, \ + &produced_chars); \ + } while (1) + + +/* Produce codes for a DIMENSION2 character whose character set is + CHARSET and whose position-codes are C1 and C2. Designation and + invocation codes are also produced in advance if necessary. 
*/ + +#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \ + do { \ + int id = CHARSET_ID (charset); \ + if (CODING_ISO_SINGLE_SHIFTING (coding)) \ + { \ + if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \ + EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \ + else \ + EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \ + CODING_ISO_SINGLE_SHIFTING (coding) = 0; \ + break; \ + } \ + else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \ + { \ + EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \ + break; \ + } \ + else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \ + { \ + EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \ + break; \ + } \ + else \ + /* Since CHARSET is not yet invoked to any graphic planes, we \ + must invoke it, or, at first, designate it to some graphic \ + register. Then repeat the loop to actually produce the \ + character. */ \ + dst = encode_invocation_designation (charset, coding, dst, \ + &produced_chars); \ + } while (1) + + +#define ENCODE_ISO_CHARACTER(charset, c) \ + do { \ + int code = ENCODE_CHAR ((charset),(c)); \ + \ + if (CHARSET_DIMENSION (charset) == 1) \ + ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \ + else \ + ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \ + } while (0) + + +/* Produce designation and invocation codes at a place pointed by DST + to use CHARSET. The element `spec.iso_2022' of *CODING is updated. + Return new DST. */ + +unsigned char * +encode_invocation_designation (charset, coding, dst, p_nchars) + struct charset *charset; + struct coding_system *coding; + unsigned char *dst; + int *p_nchars; +{ + int multibytep = coding->dst_multibyte; + int produced_chars = *p_nchars; + int reg; /* graphic register number */ + int id = CHARSET_ID (charset); + + /* At first, check designations. */ + for (reg = 0; reg < 4; reg++) + if (id == CODING_ISO_DESIGNATION (coding, reg)) + break; + + if (reg >= 4) + { + /* CHARSET is not yet designated to any graphic registers. 
*/ + /* At first check the requested designation. */ + reg = CODING_ISO_REQUEST (coding, id); + if (reg < 0) + /* Since CHARSET requests no special designation, designate it + to graphic register 0. */ + reg = 0; + + ENCODE_DESIGNATION (charset, reg, coding); + } + + if (CODING_ISO_INVOCATION (coding, 0) != reg + && CODING_ISO_INVOCATION (coding, 1) != reg) + { + /* Since the graphic register REG is not invoked to any graphic + planes, invoke it to graphic plane 0. */ + switch (reg) + { + case 0: /* graphic register 0 */ + ENCODE_SHIFT_IN; + break; + + case 1: /* graphic register 1 */ + ENCODE_SHIFT_OUT; + break; + + case 2: /* graphic register 2 */ + if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT) + ENCODE_SINGLE_SHIFT_2; + else + ENCODE_LOCKING_SHIFT_2; + break; + + case 3: /* graphic register 3 */ + if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT) + ENCODE_SINGLE_SHIFT_3; + else + ENCODE_LOCKING_SHIFT_3; + break; + } + } + + *p_nchars = produced_chars; + return dst; +} + +/* The following three macros produce codes for indicating direction + of text. ENCODE_CONTROL_SEQUENCE_INTRODUCER takes no argument; it + emits ESC '[' when the SEVEN_BITS flag is set and the single byte + ISO_CODE_CSI otherwise. */ +#define ENCODE_CONTROL_SEQUENCE_INTRODUCER \ + do { \ + /* Test the SEVEN_BITS bit with `&'; comparing the whole flag \ + word with `==' matches only when no other flag is set. */ \ + if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \ + EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '['); \ + else \ + EMIT_ONE_BYTE (ISO_CODE_CSI); \ + } while (0) + + +#define ENCODE_DIRECTION_R2L() \ + do { \ + ENCODE_CONTROL_SEQUENCE_INTRODUCER; \ + EMIT_TWO_ASCII_BYTES ('2', ']'); \ + } while (0) + + +#define ENCODE_DIRECTION_L2R() \ + do { \ + ENCODE_CONTROL_SEQUENCE_INTRODUCER; \ + EMIT_TWO_ASCII_BYTES ('0', ']'); \ + } while (0) + + +/* Produce codes for designation and invocation to reset the graphic + planes and registers to initial state. 
*/ +#define ENCODE_RESET_PLANE_AND_REGISTER() \ + do { \ + int reg; \ + struct charset *charset; \ + \ + if (CODING_ISO_INVOCATION (coding, 0) != 0) \ + ENCODE_SHIFT_IN; \ + for (reg = 0; reg < 4; reg++) \ + if (CODING_ISO_INITIAL (coding, reg) >= 0 \ + && (CODING_ISO_DESIGNATION (coding, reg) \ + != CODING_ISO_INITIAL (coding, reg))) \ + { \ + charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \ + ENCODE_DESIGNATION (charset, reg, coding); \ + } \ + } while (0) + + +/* Produce designation sequences of charsets in the line started from + CHARBUF to a place pointed by DST, and return updated DST. + + Characters are scanned from CHARBUF up to (not including) + CHARBUF_END, stopping early at a newline or once all four graphic + registers have a charset to designate. + + If the current block ends before any end-of-line, we may fail to + find all the necessary designations. */ + +static unsigned char * +encode_designation_at_bol (coding, charbuf, charbuf_end, dst) + struct coding_system *coding; + int *charbuf, *charbuf_end; + unsigned char *dst; +{ + struct charset *charset; + /* Table of charsets to be designated to each graphic register. */ + int r[4]; + int c, found = 0, reg; + int produced_chars = 0; + int multibytep = coding->dst_multibyte; + Lisp_Object attrs; + Lisp_Object charset_list; + + attrs = CODING_ID_ATTRS (coding->id); + charset_list = CODING_ATTR_CHARSET_LIST (attrs); + if (EQ (charset_list, Qiso_2022)) + charset_list = Viso_2022_charset_list; + + for (reg = 0; reg < 4; reg++) + r[reg] = -1; + + /* Never read beyond CHARBUF_END: the block may end before any + newline is seen. */ + while (found < 4 && charbuf < charbuf_end) + { + int id; + + c = *charbuf++; + if (c == '\n') + break; + charset = char_charset (c, charset_list, NULL); + if (! charset) + /* char_charset found no charset for C (encode_coding_iso_2022 + checks for the same NULL result); nothing to designate. */ + continue; + id = CHARSET_ID (charset); + reg = CODING_ISO_REQUEST (coding, id); + if (reg >= 0 && r[reg] < 0) + { + found++; + r[reg] = id; + } + } + + if (found) + { + for (reg = 0; reg < 4; reg++) + if (r[reg] >= 0 + && CODING_ISO_DESIGNATION (coding, reg) != r[reg]) + ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding); + } + + return dst; +} + +/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". 
*/ + +static int +encode_coding_iso_2022 (coding) + struct coding_system *coding; +{ + int multibytep = coding->dst_multibyte; + int *charbuf = coding->charbuf; + int *charbuf_end = charbuf + coding->charbuf_used; + unsigned char *dst = coding->destination + coding->produced; + unsigned char *dst_end = coding->destination + coding->dst_bytes; + int safe_room = 16; + int bol_designation + = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL + && CODING_ISO_BOL (coding)); + int produced_chars = 0; + Lisp_Object attrs, eol_type, charset_list; + int ascii_compatible; + int c; + + CODING_GET_INFO (coding, attrs, eol_type, charset_list); + + ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)); + + while (charbuf < charbuf_end) + { + ASSURE_DESTINATION (safe_room); + + if (bol_designation) + { + unsigned char *dst_prev = dst; + + /* We have to produce designation sequences if any now. */ + dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst); + bol_designation = 0; + /* We are sure that designation sequences are all ASCII bytes. */ + produced_chars += dst - dst_prev; + } + + c = *charbuf++; + + /* Now encode the character C. 
*/ + if (c < 0x20 || c == 0x7F) + { + if (c == '\n' + || (c == '\r' && EQ (eol_type, Qmac))) + { + if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL) + ENCODE_RESET_PLANE_AND_REGISTER (); + if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL) + { + int i; + + for (i = 0; i < 4; i++) + CODING_ISO_DESIGNATION (coding, i) + = CODING_ISO_INITIAL (coding, i); + } + bol_designation + = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL; + } + else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL) + ENCODE_RESET_PLANE_AND_REGISTER (); + EMIT_ONE_ASCII_BYTE (c); + } + else if (ASCII_CHAR_P (c)) + { + if (ascii_compatible) + EMIT_ONE_ASCII_BYTE (c); + else + ENCODE_ISO_CHARACTER (CHARSET_FROM_ID (charset_ascii), c); + } + else + { + struct charset *charset = char_charset (c, charset_list, NULL); + + if (!charset) + { + c = coding->default_char; + charset = char_charset (c, charset_list, NULL); + } + ENCODE_ISO_CHARACTER (charset, c); + } + } + + if (coding->mode & CODING_MODE_LAST_BLOCK + && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL) + { + ASSURE_DESTINATION (safe_room); + ENCODE_RESET_PLANE_AND_REGISTER (); + } + coding->result = CODING_RESULT_SUCCESS; + CODING_ISO_BOL (coding) = bol_designation; + coding->produced_char += produced_chars; + coding->produced = dst - coding->destination; + return 0; +} + + +/*** 8,9. SJIS and BIG5 handlers ***/ + +/* Although SJIS and BIG5 are not ISO's coding system, they are used + quite widely. So, for the moment, Emacs supports them in the bare + C code. But, in the future, they may be supported only by CCL. */ + +/* SJIS is a coding system encoding three character sets: ASCII, right + half of JISX0201-Kana, and JISX0208. An ASCII character is encoded + as is. A character of charset katakana-jisx0201 is encoded by + "position-code + 0x80". 
A character of charset japanese-jisx0208 + is encoded in 2-byte but two position-codes are divided and shifted + so that it fit in the range below. + + --- CODE RANGE of SJIS --- + (character set) (range) + ASCII 0x00 .. 0x7F + KATAKANA-JISX0201 0xA0 .. 0xDF + JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF + (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC + ------------------------------- + +*/ + +/* BIG5 is a coding system encoding two character sets: ASCII and + Big5. An ASCII character is encoded as is. Big5 is a two-byte + character set and is encoded in two-byte. + + --- CODE RANGE of BIG5 --- + (character set) (range) + ASCII 0x00 .. 0x7F + Big5 (1st byte) 0xA1 .. 0xFE + (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE + -------------------------- + + */ + +/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". + Check if a text is encoded in SJIS. If it is, return + CATEGORY_MASK_SJIS, else return 0. */ + +static int +detect_coding_sjis (coding, mask) + struct coding_system *coding; + int *mask; +{ + unsigned char *src = coding->source, *src_base = src; + unsigned char *src_end = coding->source + coding->src_bytes; + int multibytep = coding->src_multibyte; + int consumed_chars = 0; + int found = 0; + int c; + + /* A coding system of this category is always ASCII compatible. */ + src += coding->head_ascii; + + while (1) + { + ONE_MORE_BYTE (c); + if (c < 0x80) + continue; + if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF)) + { + ONE_MORE_BYTE (c); + if (c < 0x40 || c == 0x7F || c > 0xFC) + break; + found = 1; + } + else if (c >= 0xA0 && c < 0xE0) + found = 1; + else + break; + } + *mask &= ~CATEGORY_MASK_SJIS; + return 0; + + no_more_source: + if (!found) + return 0; + *mask &= CATEGORY_MASK_SJIS; + return 1; +} + +/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". + Check if a text is encoded in BIG5. If it is, return + CATEGORY_MASK_BIG5, else return 0. 
*/ + +static int +detect_coding_big5 (coding, mask) + struct coding_system *coding; + int *mask; +{ + unsigned char *src = coding->source, *src_base = src; + unsigned char *src_end = coding->source + coding->src_bytes; + int multibytep = coding->src_multibyte; + int consumed_chars = 0; + int found = 0; + int c; + + /* A coding system of this category is always ASCII compatible. */ + src += coding->head_ascii; + + while (1) + { + ONE_MORE_BYTE (c); + if (c < 0x80) + continue; + if (c >= 0xA1) + { + ONE_MORE_BYTE (c); + if (c < 0x40 || (c >= 0x7F && c <= 0xA0)) + return 0; + found = 1; + } + else + break; + } + *mask &= ~CATEGORY_MASK_BIG5; + return 0; + + no_more_source: + if (!found) + return 0; + *mask &= CATEGORY_MASK_BIG5; + return 1; +} + +/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". + If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */ + +static void +decode_coding_sjis (coding) + struct coding_system *coding; +{ + unsigned char *src = coding->source + coding->consumed; + unsigned char *src_end = coding->source + coding->src_bytes; + unsigned char *src_base; + int *charbuf = coding->charbuf; + int *charbuf_end = charbuf + coding->charbuf_size; + int consumed_chars = 0, consumed_chars_base; + int multibytep = coding->src_multibyte; + struct charset *charset_roman, *charset_kanji, *charset_kana; + Lisp_Object attrs, eol_type, charset_list, val; + + CODING_GET_INFO (coding, attrs, eol_type, charset_list); + + val = charset_list; + charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val); + charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val); + charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))); + + while (1) + { + int c, c1; + + src_base = src; + consumed_chars_base = consumed_chars; + + if (charbuf >= charbuf_end) + break; + + ONE_MORE_BYTE (c); + + if (c == '\r') + { + if (EQ (eol_type, Qdos)) + { + if (src == src_end) + goto no_more_source; + if (*src == '\n') + ONE_MORE_BYTE (c); + } + else if 
(EQ (eol_type, Qmac)) + c = '\n'; + } + else + { + struct charset *charset; + + if (c < 0x80) + charset = charset_roman; + else + { + if (c >= 0xF0) + goto invalid_code; + if (c < 0xA0 || c >= 0xE0) + { + /* SJIS -> JISX0208 */ + ONE_MORE_BYTE (c1); + if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC) + goto invalid_code; + c = (c << 8) | c1; + SJIS_TO_JIS (c); + charset = charset_kanji; + } + else + /* SJIS -> JISX0201-Kana */ + charset = charset_kana; + } + CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c); + } + *charbuf++ = c; + continue; + + invalid_code: + src = src_base; + consumed_chars = consumed_chars_base; + ONE_MORE_BYTE (c); + *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c); + coding->errors++; + } + + no_more_source: + coding->consumed_char += consumed_chars_base; + coding->consumed = src_base - coding->source; + coding->charbuf_used = charbuf - coding->charbuf; +} + +static void +decode_coding_big5 (coding) + struct coding_system *coding; +{ + unsigned char *src = coding->source + coding->consumed; + unsigned char *src_end = coding->source + coding->src_bytes; + unsigned char *src_base; + int *charbuf = coding->charbuf; + int *charbuf_end = charbuf + coding->charbuf_size; + int consumed_chars = 0, consumed_chars_base; + int multibytep = coding->src_multibyte; + struct charset *charset_roman, *charset_big5; + Lisp_Object attrs, eol_type, charset_list, val; + + CODING_GET_INFO (coding, attrs, eol_type, charset_list); + val = charset_list; + charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val); + charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val))); + + while (1) + { + int c, c1; + + src_base = src; + consumed_chars_base = consumed_chars; + + if (charbuf >= charbuf_end) + break; + + ONE_MORE_BYTE (c); + + if (c == '\r') + { + if (EQ (eol_type, Qdos)) + { + if (src == src_end) + goto no_more_source; + if (*src == '\n') + ONE_MORE_BYTE (c); + } + else if (EQ (eol_type, Qmac)) + c = '\n'; + } + else + { + struct charset 
*charset; + if (c < 0x80) + charset = charset_roman; + else + { + /* BIG5 -> Big5 */ + if (c < 0xA1 || c > 0xFE) + goto invalid_code; + ONE_MORE_BYTE (c1); + if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE) + goto invalid_code; + c = c << 8 | c1; + charset = charset_big5; + } + CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c); + } + + *charbuf++ = c; + continue; + + invalid_code: + src = src_base; + consumed_chars = consumed_chars_base; + ONE_MORE_BYTE (c); + *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c); + coding->errors++; + } + + no_more_source: + coding->consumed_char += consumed_chars_base; + coding->consumed = src_base - coding->source; + coding->charbuf_used = charbuf - coding->charbuf; +} + +/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". + This function can encode charsets `ascii', `katakana-jisx0201', + `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We + are sure that all these charsets are registered as official charset + (i.e. do not have extended leading-codes). Characters of other + charsets are produced without any encoding. If SJIS_P is 1, encode + SJIS text, else encode BIG5 text. 
*/ + +static int +encode_coding_sjis (coding) + struct coding_system *coding; +{ + int multibytep = coding->dst_multibyte; + int *charbuf = coding->charbuf; + int *charbuf_end = charbuf + coding->charbuf_used; + unsigned char *dst = coding->destination + coding->produced; + unsigned char *dst_end = coding->destination + coding->dst_bytes; + int safe_room = 4; + int produced_chars = 0; + Lisp_Object attrs, eol_type, charset_list, val; + int ascii_compatible; + struct charset *charset_roman, *charset_kanji, *charset_kana; + int c; + + CODING_GET_INFO (coding, attrs, eol_type, charset_list); + val = charset_list; + charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val); + charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val); + charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))); + + ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)); + + while (charbuf < charbuf_end) + { + ASSURE_DESTINATION (safe_room); + c = *charbuf++; + /* Now encode the character C. 
*/ + if (ASCII_CHAR_P (c) && ascii_compatible) + EMIT_ONE_ASCII_BYTE (c); + else + { + unsigned code; + struct charset *charset = char_charset (c, charset_list, &code); + + if (!charset) + { + c = coding->default_char; + charset = char_charset (c, charset_list, &code); + } + if (code == CHARSET_INVALID_CODE (charset)) + abort (); + if (charset == charset_kanji) + { + int c1, c2; + JIS_TO_SJIS (code); + c1 = code >> 8, c2 = code & 0xFF; + EMIT_TWO_BYTES (c1, c2); + } + else if (charset == charset_kana) + EMIT_ONE_BYTE (code | 0x80); + else + EMIT_ONE_ASCII_BYTE (code & 0x7F); + } + } + coding->result = CODING_RESULT_SUCCESS; + coding->produced_char += produced_chars; + coding->produced = dst - coding->destination; + return 0; +} + +static int +encode_coding_big5 (coding) + struct coding_system *coding; +{ + int multibytep = coding->dst_multibyte; + int *charbuf = coding->charbuf; + int *charbuf_end = charbuf + coding->charbuf_used; + unsigned char *dst = coding->destination + coding->produced; + unsigned char *dst_end = coding->destination + coding->dst_bytes; + int safe_room = 4; + int produced_chars = 0; + Lisp_Object attrs, eol_type, charset_list, val; + int ascii_compatible; + struct charset *charset_roman, *charset_big5; + int c; + + CODING_GET_INFO (coding, attrs, eol_type, charset_list); + val = charset_list; + charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val); + charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val))); + ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)); + + while (charbuf < charbuf_end) + { + ASSURE_DESTINATION (safe_room); + c = *charbuf++; + /* Now encode the character C. */ + if (ASCII_CHAR_P (c) && ascii_compatible) + EMIT_ONE_ASCII_BYTE (c); + else + { + unsigned code; + struct charset *charset = char_charset (c, charset_list, &code); + + if (! 
charset) + { + c = coding->default_char; + charset = char_charset (c, charset_list, &code); + } + if (code == CHARSET_INVALID_CODE (charset)) + abort (); + if (charset == charset_big5) + { + int c1, c2; + + c1 = code >> 8, c2 = code & 0xFF; + EMIT_TWO_BYTES (c1, c2); + } + else + EMIT_ONE_ASCII_BYTE (code & 0x7F); + } + } + coding->result = CODING_RESULT_SUCCESS; + coding->produced_char += produced_chars; + coding->produced = dst - coding->destination; + return 0; +} + + +/*** 10. CCL handlers ***/ + +/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". + Check if a text is encoded in a coding system of which + encoder/decoder are written in CCL program. If it is, return + CATEGORY_MASK_CCL, else return 0. */ + +static int +detect_coding_ccl (coding, mask) + struct coding_system *coding; + int *mask; +{ + unsigned char *src = coding->source, *src_base = src; + unsigned char *src_end = coding->source + coding->src_bytes; + int multibytep = coding->src_multibyte; + int consumed_chars = 0; + int found = 0; + unsigned char *valids = CODING_CCL_VALIDS (coding); + int head_ascii = coding->head_ascii; + Lisp_Object attrs; + + coding = &coding_categories[coding_category_ccl]; + attrs = CODING_ID_ATTRS (coding->id); + if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))) + src += head_ascii; + + while (1) + { + int c; + ONE_MORE_BYTE (c); + if (! 
valids[c]) + break; + if (!found && valids[c] > 1) + found = 1; + } + *mask &= ~CATEGORY_MASK_CCL; + return 0; + + no_more_source: + if (!found) + return 0; + *mask &= CATEGORY_MASK_CCL; + return 1; +} + +static void +decode_coding_ccl (coding) + struct coding_system *coding; +{ + unsigned char *src = coding->source + coding->consumed; + unsigned char *src_end = coding->source + coding->src_bytes; + int *charbuf = coding->charbuf; + int *charbuf_end = charbuf + coding->charbuf_size; + int consumed_chars = 0; + int multibytep = coding->src_multibyte; + struct ccl_program ccl; + int source_charbuf[1024]; + int source_byteidx[1024]; + + setup_ccl_program (&ccl, CODING_CCL_DECODER (coding)); + + while (src < src_end) + { + unsigned char *p = src; + int *source, *source_end; + int i = 0; + + if (multibytep) + while (i < 1024 && p < src_end) + { + source_byteidx[i] = p - src; + source_charbuf[i++] = STRING_CHAR_ADVANCE (p); + } + else + while (i < 1024 && p < src_end) + source_charbuf[i++] = *p++; + + if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK) + ccl.last_block = 1; + + source = source_charbuf; + source_end = source + i; + while (source < source_end) + { + ccl_driver (&ccl, source, charbuf, + source_end - source, charbuf_end - charbuf); + source += ccl.consumed; + charbuf += ccl.produced; + if (ccl.status != CCL_STAT_SUSPEND_BY_DST) + break; + } + if (source < source_end) + src += source_byteidx[source - source_charbuf]; + else + src = p; + consumed_chars += source - source_charbuf; + + if (ccl.status != CCL_STAT_SUSPEND_BY_SRC + && ccl.status != CODING_RESULT_INSUFFICIENT_SRC) + break; + } + + switch (ccl.status) + { + case CCL_STAT_SUSPEND_BY_SRC: + coding->result = CODING_RESULT_INSUFFICIENT_SRC; + break; + case CCL_STAT_SUSPEND_BY_DST: + break; + case CCL_STAT_QUIT: + case CCL_STAT_INVALID_CMD: + coding->result = CODING_RESULT_INTERRUPT; + break; + default: + coding->result = CODING_RESULT_SUCCESS; + break; + } + coding->consumed_char += 
consumed_chars; + coding->consumed = src - coding->source; + coding->charbuf_used = charbuf - coding->charbuf; +} + +static int +encode_coding_ccl (coding) + struct coding_system *coding; +{ + struct ccl_program ccl; + int multibytep = coding->dst_multibyte; + int *charbuf = coding->charbuf; + int *charbuf_end = charbuf + coding->charbuf_used; + unsigned char *dst = coding->destination + coding->produced; + unsigned char *dst_end = coding->destination + coding->dst_bytes; + unsigned char *adjusted_dst_end = dst_end - 1; + int destination_charbuf[1024]; + int i, produced_chars = 0; + + setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding)); + + ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK; + ccl.dst_multibyte = coding->dst_multibyte; + + while (charbuf < charbuf_end && dst < adjusted_dst_end) + { + int dst_bytes = dst_end - dst; + if (dst_bytes > 1024) + dst_bytes = 1024; + + ccl_driver (&ccl, charbuf, destination_charbuf, + charbuf_end - charbuf, dst_bytes); + charbuf += ccl.consumed; + if (multibytep) + for (i = 0; i < ccl.produced; i++) + EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF); + else + { + for (i = 0; i < ccl.produced; i++) + *dst++ = destination_charbuf[i] & 0xFF; + produced_chars += ccl.produced; + } + } + + switch (ccl.status) + { + case CCL_STAT_SUSPEND_BY_SRC: + coding->result = CODING_RESULT_INSUFFICIENT_SRC; + break; + case CCL_STAT_SUSPEND_BY_DST: + coding->result = CODING_RESULT_INSUFFICIENT_DST; + break; + case CCL_STAT_QUIT: + case CCL_STAT_INVALID_CMD: + coding->result = CODING_RESULT_INTERRUPT; + break; + default: + coding->result = CODING_RESULT_SUCCESS; + break; + } + + coding->produced_char += produced_chars; + coding->produced = dst - coding->destination; + return 0; +} + + + +/*** 10, 11. no-conversion handlers ***/ + +/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". 
*/ + +static void +decode_coding_raw_text (coding) + struct coding_system *coding; +{ + coding->chars_at_source = 1; + coding->consumed_char = coding->src_chars; + coding->consumed = coding->src_bytes; + coding->result = CODING_RESULT_SUCCESS; +} + +static int +encode_coding_raw_text (coding) + struct coding_system *coding; +{ + int multibytep = coding->dst_multibyte; + int *charbuf = coding->charbuf; + int *charbuf_end = coding->charbuf + coding->charbuf_used; + unsigned char *dst = coding->destination + coding->produced; + unsigned char *dst_end = coding->destination + coding->dst_bytes; + int produced_chars = 0; + int c; + + if (multibytep) + { + int safe_room = MAX_MULTIBYTE_LENGTH * 2; + + if (coding->src_multibyte) + while (charbuf < charbuf_end) + { + ASSURE_DESTINATION (safe_room); + c = *charbuf++; + if (ASCII_CHAR_P (c)) + EMIT_ONE_ASCII_BYTE (c); + else if (CHAR_BYTE8_P (c)) + { + c = CHAR_TO_BYTE8 (c); + EMIT_ONE_BYTE (c); + } + else + { + unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str; + + CHAR_STRING_ADVANCE (c, p1); + while (p0 < p1) + EMIT_ONE_BYTE (*p0); + } + } + else + while (charbuf < charbuf_end) + { + ASSURE_DESTINATION (safe_room); + c = *charbuf++; + EMIT_ONE_BYTE (c); + } + } + else + { + if (coding->src_multibyte) + { + int safe_room = MAX_MULTIBYTE_LENGTH; + + while (charbuf < charbuf_end) + { + ASSURE_DESTINATION (safe_room); + c = *charbuf++; + if (ASCII_CHAR_P (c)) + *dst++ = c; + else if (CHAR_BYTE8_P (c)) + *dst++ = CHAR_TO_BYTE8 (c); + else + CHAR_STRING_ADVANCE (c, dst); + produced_chars++; + } + } + else + { + ASSURE_DESTINATION (charbuf_end - charbuf); + while (charbuf < charbuf_end && dst < dst_end) + *dst++ = *charbuf++; + produced_chars = dst - (coding->destination + coding->dst_bytes); + } + } + coding->result = CODING_RESULT_SUCCESS; + coding->produced_char += produced_chars; + coding->produced = dst - coding->destination; + return 0; +} + +static int +detect_coding_charset (coding, mask) + struct 
coding_system *coding; + int *mask; +{ + unsigned char *src = coding->source, *src_base = src; + unsigned char *src_end = coding->source + coding->src_bytes; + int multibytep = coding->src_multibyte; + int consumed_chars = 0; + Lisp_Object attrs, valids; + + coding = &coding_categories[coding_category_charset]; + attrs = CODING_ID_ATTRS (coding->id); + valids = AREF (attrs, coding_attr_charset_valids); + + if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))) + src += coding->head_ascii; + + while (1) + { + int c; + + ONE_MORE_BYTE (c); + if (NILP (AREF (valids, c))) + break; + } + *mask &= ~CATEGORY_MASK_CHARSET; + return 0; + + no_more_source: + *mask &= CATEGORY_MASK_CHARSET; + return 1; +} + +static void +decode_coding_charset (coding) + struct coding_system *coding; +{ + unsigned char *src = coding->source + coding->consumed; + unsigned char *src_end = coding->source + coding->src_bytes; + unsigned char *src_base; + int *charbuf = coding->charbuf; + int *charbuf_end = charbuf + coding->charbuf_size; + int consumed_chars = 0, consumed_chars_base; + int multibytep = coding->src_multibyte; + struct charset *charset; + Lisp_Object attrs, eol_type, charset_list; + + CODING_GET_INFO (coding, attrs, eol_type, charset_list); + charset = CHARSET_FROM_ID (XINT (XCAR (charset_list))); + + while (1) + { + int c, c1; + + src_base = src; + consumed_chars_base = consumed_chars; + + if (charbuf >= charbuf_end) + break; + + ONE_MORE_BYTE (c1); + if (c == '\r') + { + if (EQ (eol_type, Qdos)) + { + if (src == src_end) + goto no_more_source; + if (*src == '\n') + ONE_MORE_BYTE (c); + } + else if (EQ (eol_type, Qmac)) + c = '\n'; + } + else + { + CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c); + if (c < 0) + goto invalid_code; + } + *charbuf++ = c; + continue; + + invalid_code: + src = src_base; + consumed_chars = consumed_chars_base; + ONE_MORE_BYTE (c); + *charbuf++ = ASCII_BYTE_P (c) ? 
c : BYTE8_TO_CHAR (c); + coding->errors++; + } + + no_more_source: + coding->consumed_char += consumed_chars_base; + coding->consumed = src_base - coding->source; + coding->charbuf_used = charbuf - coding->charbuf; +} + +static int +encode_coding_charset (coding) + struct coding_system *coding; +{ + int multibytep = coding->dst_multibyte; + int *charbuf = coding->charbuf; + int *charbuf_end = charbuf + coding->charbuf_used; + unsigned char *dst = coding->destination + coding->produced; + unsigned char *dst_end = coding->destination + coding->dst_bytes; + int safe_room = MAX_MULTIBYTE_LENGTH; + int produced_chars = 0; + struct charset *charset; + Lisp_Object attrs, eol_type, charset_list; + int ascii_compatible; + int c; + + CODING_GET_INFO (coding, attrs, eol_type, charset_list); + charset = CHARSET_FROM_ID (XINT (XCAR (charset_list))); + ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)); + + while (charbuf < charbuf_end) + { + unsigned code; + + ASSURE_DESTINATION (safe_room); + c = *charbuf++; + if (ascii_compatible && ASCII_CHAR_P (c)) + EMIT_ONE_ASCII_BYTE (c); + else if ((code = ENCODE_CHAR (charset, c)) + != CHARSET_INVALID_CODE (charset)) + EMIT_ONE_BYTE (code); + else + EMIT_ONE_BYTE (coding->default_char); + } + + coding->result = CODING_RESULT_SUCCESS; + coding->produced_char += produced_chars; + coding->produced = dst - coding->destination; + return 0; +} + + +/*** 7. C library functions ***/ + +/* In Emacs Lisp, coding system is represented by a Lisp symbol which + has a property `coding-system'. The value of this property is a + vector of length 5 (called as coding-vector). Among elements of + this vector, the first (element[0]) and the fifth (element[4]) + carry important information for decoding/encoding. Before + decoding/encoding, this information should be set in fields of a + structure of type `coding_system'. + + A value of property `coding-system' can be a symbol of another + subsidiary coding-system. 
In that case, Emacs gets coding-vector + from that symbol. + + `element[0]' contains information to be set in `coding->type'. The + value and its meaning is as follows: + + 0 -- coding_type_emacs_mule + 1 -- coding_type_sjis + 2 -- coding_type_iso_2022 + 3 -- coding_type_big5 + 4 -- coding_type_ccl encoder/decoder written in CCL + nil -- coding_type_no_conversion + t -- coding_type_undecided (automatic conversion on decoding, + no-conversion on encoding) + + `element[4]' contains information to be set in `coding->flags' and + `coding->spec'. The meaning varies by `coding->type'. + + If `coding->type' is `coding_type_iso_2022', element[4] is a vector + of length 32 (of which the first 13 sub-elements are used now). + Meanings of these sub-elements are: + + sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso_2022' + If the value is an integer of valid charset, the charset is + assumed to be designated to graphic register N initially. + + If the value is minus, it is a minus value of charset which + reserves graphic register N, which means that the charset is + not designated initially but should be designated to graphic + register N just before encoding a character in that charset. + + If the value is nil, graphic register N is never used on + encoding. + + sub-element[N] where N is 4 through 11: to be set in `coding->flags' + Each value takes t or nil. See the section ISO2022 of + `coding.h' for more information. + + If `coding->type' is `coding_type_big5', element[4] is t to denote + BIG5-ETen or nil to denote BIG5-HKU. + + If `coding->type' takes the other value, element[4] is ignored. + + Emacs Lisp's coding system also carries information about format of + end-of-line in a value of property `eol-type'. If the value is + integer, 0 means eol_lf, 1 means eol_crlf, and 2 means eol_cr. If + it is not integer, it should be a vector of subsidiary coding + systems of which property `eol-type' has one of above values. 
+ +*/ + +/* Setup coding context CODING from information about CODING_SYSTEM. + If CODING_SYSTEM is nil, `no-conversion' is assumed. If + CODING_SYSTEM is invalid, signal an error. */ + +void +setup_coding_system (coding_system, coding) + Lisp_Object coding_system; + struct coding_system *coding; +{ + int id; + Lisp_Object attrs; + Lisp_Object eol_type; + Lisp_Object coding_type; + Lisp_Object val; + + if (NILP (coding_system)) + coding_system = Qno_conversion; + + CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id); + + attrs = CODING_ID_ATTRS (coding->id); + eol_type = CODING_ID_EOL_TYPE (coding->id); + + coding->mode = 0; + coding->head_ascii = -1; + coding->common_flags + = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0); + + val = CODING_ATTR_SAFE_CHARSETS (attrs); + coding->max_charset_id = XSTRING (val)->size - 1; + coding->safe_charsets = (char *) XSTRING (val)->data; + coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs)); + + coding_type = CODING_ATTR_TYPE (attrs); + if (EQ (coding_type, Qundecided)) + { + coding->detector = NULL; + coding->decoder = decode_coding_raw_text; + coding->encoder = encode_coding_raw_text; + coding->common_flags |= CODING_REQUIRE_DETECTION_MASK; + } + else if (EQ (coding_type, Qiso_2022)) + { + int i; + int flags = XINT (AREF (attrs, coding_attr_iso_flags)); + + /* Invoke graphic register 0 to plane 0. */ + CODING_ISO_INVOCATION (coding, 0) = 0; + /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */ + CODING_ISO_INVOCATION (coding, 1) + = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1); + /* Setup the initial status of designation. */ + for (i = 0; i < 4; i++) + CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i); + /* Not single shifting initially. */ + CODING_ISO_SINGLE_SHIFTING (coding) = 0; + /* Beginning of buffer should also be regarded as bol. 
*/ + CODING_ISO_BOL (coding) = 1; + coding->detector = detect_coding_iso_2022; + coding->decoder = decode_coding_iso_2022; + coding->encoder = encode_coding_iso_2022; + if (flags & CODING_ISO_FLAG_SAFE) + coding->mode |= CODING_MODE_SAFE_ENCODING; + coding->common_flags + |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK + | CODING_REQUIRE_FLUSHING_MASK); + if (flags & CODING_ISO_FLAG_COMPOSITION) + coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK; + if (flags & CODING_ISO_FLAG_FULL_SUPPORT) + { + setup_iso_safe_charsets (attrs); + val = CODING_ATTR_SAFE_CHARSETS (attrs); + coding->max_charset_id = XSTRING (val)->size - 1; + coding->safe_charsets = (char *) XSTRING (val)->data; + } + CODING_ISO_FLAGS (coding) = flags; + } + else if (EQ (coding_type, Qcharset)) + { + coding->detector = detect_coding_charset; + coding->decoder = decode_coding_charset; + coding->encoder = encode_coding_charset; + coding->common_flags + |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK); + } + else if (EQ (coding_type, Qutf_8)) + { + coding->detector = detect_coding_utf_8; + coding->decoder = decode_coding_utf_8; + coding->encoder = encode_coding_utf_8; + coding->common_flags + |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK); + } + else if (EQ (coding_type, Qutf_16)) + { + val = AREF (attrs, coding_attr_utf_16_bom); + CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom + : EQ (val, Qt) ? utf_16_with_bom + : utf_16_without_bom); + val = AREF (attrs, coding_attr_utf_16_endian); + CODING_UTF_16_ENDIAN (coding) = (NILP (val) ? 
utf_16_big_endian + : utf_16_little_endian); + coding->detector = detect_coding_utf_16; + coding->decoder = decode_coding_utf_16; + coding->encoder = encode_coding_utf_16; + coding->common_flags + |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK); + } + else if (EQ (coding_type, Qccl)) + { + coding->detector = detect_coding_ccl; + coding->decoder = decode_coding_ccl; + coding->encoder = encode_coding_ccl; + coding->common_flags + |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK + | CODING_REQUIRE_FLUSHING_MASK); + } + else if (EQ (coding_type, Qemacs_mule)) + { + coding->detector = detect_coding_emacs_mule; + coding->decoder = decode_coding_emacs_mule; + coding->encoder = encode_coding_emacs_mule; + coding->common_flags + |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK); + if (! NILP (AREF (attrs, coding_attr_emacs_mule_full)) + && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list)) + { + Lisp_Object tail, safe_charsets; + int max_charset_id = 0; + + for (tail = Vemacs_mule_charset_list; CONSP (tail); + tail = XCDR (tail)) + if (max_charset_id < XFASTINT (XCAR (tail))) + max_charset_id = XFASTINT (XCAR (tail)); + safe_charsets = Fmake_string (make_number (max_charset_id + 1), + make_number (255)); + for (tail = Vemacs_mule_charset_list; CONSP (tail); + tail = XCDR (tail)) + XSTRING (safe_charsets)->data[XFASTINT (XCAR (tail))] = 0; + coding->max_charset_id = max_charset_id; + coding->safe_charsets = (char *) XSTRING (safe_charsets)->data; + } + } + else if (EQ (coding_type, Qshift_jis)) + { + coding->detector = detect_coding_sjis; + coding->decoder = decode_coding_sjis; + coding->encoder = encode_coding_sjis; + coding->common_flags + |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK); + } + else if (EQ (coding_type, Qbig5)) + { + coding->detector = detect_coding_big5; + coding->decoder = decode_coding_big5; + coding->encoder = encode_coding_big5; + coding->common_flags + |= 
(CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK); + } + else /* EQ (coding_type, Qraw_text) */ + { + coding->detector = NULL; + coding->decoder = decode_coding_raw_text; + coding->encoder = encode_coding_raw_text; + coding->common_flags |= CODING_FOR_UNIBYTE_MASK; + } + + return; +} + +/* Return raw-text or one of its subsidiaries that has the same + eol_type as CODING-SYSTEM. */ + +Lisp_Object +raw_text_coding_system (coding_system) + Lisp_Object coding_system; +{ + Lisp_Object spec, attrs, coding_type; + Lisp_Object eol_type, raw_text_eol_type; + + spec = CODING_SYSTEM_SPEC (coding_system); + attrs = AREF (spec, 0); + + if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text)) + return coding_system; + + eol_type = AREF (spec, 2); + if (VECTORP (eol_type)) + return Qraw_text; + spec = CODING_SYSTEM_SPEC (Qraw_text); + raw_text_eol_type = AREF (spec, 2); + return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0) + : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1) + : AREF (raw_text_eol_type, 2)); +} + + +/* If CODING_SYSTEM doesn't specify end-of-line format but PARENT + does, return one of the subsidiary that has the same eol-spec as + PARENT. Otherwise, return CODING_SYSTEM. 
*/ + +Lisp_Object +coding_inherit_eol_type (coding_system, parent) +{ + Lisp_Object spec, attrs, eol_type; + + spec = CODING_SYSTEM_SPEC (coding_system); + attrs = AREF (spec, 0); + eol_type = AREF (spec, 2); + if (VECTORP (eol_type)) + { + Lisp_Object parent_spec; + Lisp_Object parent_attrs; + Lisp_Object parent_eol_type; + + parent_spec + = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system); + parent_eol_type = AREF (parent_spec, 2); + if (EQ (parent_eol_type, Qunix)) + coding_system = AREF (eol_type, 0); + else if (EQ (parent_eol_type, Qdos)) + coding_system = AREF (eol_type, 1); + else if (EQ (parent_eol_type, Qmac)) + coding_system = AREF (eol_type, 2); + } + return coding_system; +} + +/* Emacs has a mechanism to automatically detect a coding system if it + is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But, + it's impossible to distinguish some coding systems accurately + because they use the same range of codes. So, at first, coding + systems are categorized into 7, those are: + + o coding-category-emacs-mule + + The category for a coding system which has the same code range + as Emacs' internal format. Assigned the coding-system (Lisp + symbol) `emacs-mule' by default. + + o coding-category-sjis + + The category for a coding system which has the same code range + as SJIS. Assigned the coding-system (Lisp + symbol) `japanese-shift-jis' by default. + + o coding-category-iso-7 + + The category for a coding system which has the same code range + as ISO2022 of 7-bit environment. This doesn't use any locking + shift and single shift functions. This can encode/decode all + charsets. Assigned the coding-system (Lisp symbol) + `iso-2022-7bit' by default. + + o coding-category-iso-7-tight + + Same as coding-category-iso-7 except that this can + encode/decode only the specified charsets. 
+ + o coding-category-iso-8-1 + + The category for a coding system which has the same code range + as ISO2022 of 8-bit environment and graphic plane 1 used only + for DIMENSION1 charset. This doesn't use any locking shift + and single shift functions. Assigned the coding-system (Lisp + symbol) `iso-latin-1' by default. + + o coding-category-iso-8-2 + + The category for a coding system which has the same code range + as ISO2022 of 8-bit environment and graphic plane 1 used only + for DIMENSION2 charset. This doesn't use any locking shift + and single shift functions. Assigned the coding-system (Lisp + symbol) `japanese-iso-8bit' by default. + + o coding-category-iso-7-else + + The category for a coding system which has the same code range + as ISO2022 of 7-bit environment but uses locking shift or + single shift functions. Assigned the coding-system (Lisp + symbol) `iso-2022-7bit-lock' by default. + + o coding-category-iso-8-else + + The category for a coding system which has the same code range + as ISO2022 of 8-bit environment but uses locking shift or + single shift functions. Assigned the coding-system (Lisp + symbol) `iso-2022-8bit-ss2' by default. + + o coding-category-big5 + + The category for a coding system which has the same code range + as BIG5. Assigned the coding-system (Lisp symbol) + `cn-big5' by default. + + o coding-category-utf-8 + + The category for a coding system which has the same code range + as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp + symbol) `utf-8' by default. + + o coding-category-utf-16-be + + The category for a coding system in which a text has a + Unicode signature (cf. Unicode Standard) in the order of BIG + endian at the head. Assigned the coding-system (Lisp symbol) + `utf-16-be' by default. + + o coding-category-utf-16-le + + The category for a coding system in which a text has a + Unicode signature (cf. Unicode Standard) in the order of + LITTLE endian at the head. 
Assigned the coding-system (Lisp + symbol) `utf-16-le' by default. + + o coding-category-ccl + + The category for a coding system of which encoder/decoder is + written in CCL programs. The default value is nil, i.e., no + coding system is assigned. + + o coding-category-binary + + The category for a coding system not categorized in any of the + above. Assigned the coding-system (Lisp symbol) + `no-conversion' by default. + + Each of them is a Lisp symbol and the value is an actual + `coding-system's (this is also a Lisp symbol) assigned by a user. + What Emacs does actually is to detect a category of coding system. + Then, it uses a `coding-system' assigned to it. If Emacs can't + decide only one possible category, it selects a category of the + highest priority. Priorities of categories are also specified by a + user in a Lisp variable `coding-category-list'. + +*/ + +#define EOL_SEEN_NONE 0 +#define EOL_SEEN_LF 1 +#define EOL_SEEN_CR 2 +#define EOL_SEEN_CRLF 4 + +/* Detect how end-of-line of a text of length CODING->src_bytes + pointed by CODING->source is encoded. Return one of + EOL_SEEN_XXX. 
*/ + +#define MAX_EOL_CHECK_COUNT 3 + +static int +detect_eol (coding, source, src_bytes) + struct coding_system *coding; + unsigned char *source; + EMACS_INT src_bytes; +{ + Lisp_Object attrs, coding_type; + unsigned char *src = source, *src_end = src + src_bytes; + unsigned char c; + int total = 0; + int eol_seen = EOL_SEEN_NONE; + int first_eol_seen; + + attrs = CODING_ID_ATTRS (coding->id); + coding_type = CODING_ATTR_TYPE (attrs); + + if (EQ (coding_type, Qccl)) + { + int msb, lsb; + + msb = coding->spec.utf_16.endian == utf_16_little_endian; + lsb = 1 - msb; + + while (src + 1 < src_end) + { + c = src[lsb]; + if (src[msb] == 0 && (c == '\n' || c == '\r')) + { + int this_eol; + + if (c == '\n') + this_eol = EOL_SEEN_LF; + else if (src + 3 >= src_end + || src[msb + 2] != 0 + || src[lsb + 2] != '\n') + this_eol = EOL_SEEN_CR; + else + this_eol = EOL_SEEN_CRLF; + + if (eol_seen == EOL_SEEN_NONE) + /* This is the first end-of-line. */ + eol_seen = this_eol; + else if (eol_seen != this_eol) + { + /* The found type is different from what found before. */ + eol_seen = EOL_SEEN_LF; + break; + } + if (++total == MAX_EOL_CHECK_COUNT) + break; + } + src += 2; + } + } + else + { + while (src < src_end) + { + c = *src++; + if (c == '\n' || c == '\r') + { + int this_eol; + + if (c == '\n') + this_eol = EOL_SEEN_LF; + else if (src >= src_end || *src != '\n') + this_eol = EOL_SEEN_CR; + else + this_eol = EOL_SEEN_CRLF, src++; + + if (eol_seen == EOL_SEEN_NONE) + /* This is the first end-of-line. */ + eol_seen = this_eol; + else if (eol_seen != this_eol) + { + /* The found type is different from what found before. 
*/
+                  eol_seen = EOL_SEEN_LF;
+                  break;
+                }
+              if (++total == MAX_EOL_CHECK_COUNT)
+                break;
+            }
+        }
+    }
+  return eol_seen;
+}
+
+
+/* Replace CODING->id by the id of the coding system found in the
+   EOL-type vector of the current CODING->id, chosen according to
+   EOL_SEEN (a combination of EOL_SEEN_XXX flags): element 0 for LF,
+   element 1 for CRLF, element 2 for CR.  When several flags are
+   set, LF takes precedence over CRLF, and CRLF over CR.  If no flag
+   is set, CODING is left unchanged.
+
+   The EOL type is indexed with AREF without any check here; the
+   callers visible in this file test VECTORP first.  */
+
+static void
+adjust_coding_eol_type (coding, eol_seen)
+     struct coding_system *coding;
+     int eol_seen;
+{
+  Lisp_Object eol_type;
+
+  eol_type = CODING_ID_EOL_TYPE (coding->id);
+  /* Every test must look at the EOL_SEEN flags; EOL_TYPE is a Lisp
+     object and carries no such bits.  */
+  if (eol_seen & EOL_SEEN_LF)
+    coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
+  else if (eol_seen & EOL_SEEN_CRLF)
+    coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
+  else if (eol_seen & EOL_SEEN_CR)
+    coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
+}
+
+/* Detect how a text specified in CODING is encoded.  If a coding
+   system is detected, update fields of CODING by the detected coding
+   system.  */
+
+void
+detect_coding (coding)
+     struct coding_system *coding;
+{
+  unsigned char *src, *src_end;
+  Lisp_Object attrs, coding_type;
+
+  coding->consumed = coding->consumed_char = 0;
+  coding->produced = coding->produced_char = 0;
+  coding_set_source (coding);
+
+  src_end = coding->source + coding->src_bytes;
+
+  /* If we have not yet decided the text encoding type, detect it
+     now.  */
+  if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
+    {
+      int mask = CATEGORY_MASK_ANY;
+      int c, i;
+
+      for (src = coding->source; src < src_end; src++)
+        {
+          c = *src;
+          if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
+                                        || c == ISO_CODE_SI
+                                        || c == ISO_CODE_SO)))
+            break;
+        }
+      coding->head_ascii = src - (coding->source + coding->consumed);
+
+      if (coding->head_ascii < coding->src_bytes)
+        {
+          int detected = 0;
+
+          for (i = 0; i < coding_category_raw_text; i++)
+            {
+              enum coding_category category = coding_priorities[i];
+              struct coding_system *this = coding_categories + category;
+
+              if (category >= coding_category_raw_text
+                  || detected & (1 << category))
+                continue;
+
+              if (this->id < 0)
+                {
+                  /* No coding system of this category is defined.
*/ + mask &= ~(1 << category); + } + else + { + detected |= detected_mask[category]; + if ((*(this->detector)) (coding, &mask)) + break; + } + } + if (! mask) + setup_coding_system (Qraw_text, coding); + else if (mask != CATEGORY_MASK_ANY) + for (i = 0; i < coding_category_raw_text; i++) + { + enum coding_category category = coding_priorities[i]; + struct coding_system *this = coding_categories + category; + + if (mask & (1 << category)) + { + setup_coding_system (CODING_ID_NAME (this->id), coding); + break; + } + } + } + } + + attrs = CODING_ID_ATTRS (coding->id); + coding_type = CODING_ATTR_TYPE (attrs); + + /* If we have not yet decided the EOL type, detect it now. But, the + detection is impossible for a CCL based coding system, in which + case, we detct the EOL type after decoding. */ + if (VECTORP (CODING_ID_EOL_TYPE (coding->id)) + && ! EQ (coding_type, Qccl)) + { + int eol_seen = detect_eol (coding, coding->source, coding->src_bytes); + + if (eol_seen != EOL_SEEN_NONE) + adjust_coding_eol_type (coding, eol_seen); + } +} + + +static void +decode_eol (coding) + struct coding_system *coding; +{ + if (VECTORP (CODING_ID_EOL_TYPE (coding->id))) + { + unsigned char *p = CHAR_POS_ADDR (coding->dst_pos); + unsigned char *pend = p + coding->produced; + int eol_seen = EOL_SEEN_NONE; + + for (; p < pend; p++) + { + if (*p == '\n') + eol_seen |= EOL_SEEN_LF; + else if (*p == '\r') + { + if (p + 1 < pend && *(p + 1) == '\n') + { + eol_seen |= EOL_SEEN_CRLF; + p++; + } + else + eol_seen |= EOL_SEEN_CR; + } + } + if (eol_seen != EOL_SEEN_NONE) + adjust_coding_eol_type (coding, eol_seen); + } + + if (EQ (CODING_ID_EOL_TYPE (coding->id), Qmac)) + { + unsigned char *p = CHAR_POS_ADDR (coding->dst_pos); + unsigned char *pend = p + coding->produced; + + for (; p < pend; p++) + if (*p == '\r') + *p = '\n'; + } + else if (EQ (CODING_ID_EOL_TYPE (coding->id), Qdos)) + { + unsigned char *p, *pbeg, *pend; + Lisp_Object undo_list; + + move_gap_both (coding->dst_pos + 
coding->produced_char,
+                     coding->dst_pos_byte + coding->produced);
+      undo_list = current_buffer->undo_list;
+      current_buffer->undo_list = Qt;
+      del_range_2 (coding->dst_pos, coding->dst_pos_byte, GPT, GPT_BYTE, Qnil);
+      current_buffer->undo_list = undo_list;
+      pbeg = GPT_ADDR;
+      pend = pbeg + coding->produced;
+
+      for (p = pend - 1; p >= pbeg; p--)
+        if (*p == '\r')
+          {
+            safe_bcopy ((char *) (p + 1), (char *) p, pend - p - 1);
+            pend--;
+          }
+      coding->produced_char -= coding->produced - (pend - pbeg);
+      coding->produced = pend - pbeg;
+      insert_from_gap (coding->produced_char, coding->produced);
+    }
+}
+
+/* Translate in place each character held in CODING->charbuf
+   (CODING->charbuf_used elements) through TABLE by translate_char.
+   Do nothing if CODING->chars_at_source is set, since the characters
+   are not in CODING->charbuf then.
+
+   A negative element starts an annotation: its absolute value is
+   the number of elements the annotation occupies, the element
+   itself included.  Annotations are skipped untranslated.  */
+
+static void
+translate_chars (coding, table)
+     struct coding_system *coding;
+     Lisp_Object table;
+{
+  int *charbuf = coding->charbuf;
+  int *charbuf_end = charbuf + coding->charbuf_used;
+  int c;
+
+  if (coding->chars_at_source)
+    return;
+
+  while (charbuf < charbuf_end)
+    {
+      c = *charbuf;
+      if (c < 0)
+        /* C is -LENGTH.  Advance by LENGTH; adding C itself would
+           move backwards.  */
+        charbuf += -c;
+      else
+        *charbuf++ = translate_char (table, c);
+    }
+}
+
+static int
+produce_chars (coding)
+     struct coding_system *coding;
+{
+  unsigned char *dst = coding->destination + coding->produced;
+  unsigned char *dst_end = coding->destination + coding->dst_bytes;
+  int produced;
+  int produced_chars = 0;
+
+  if (! coding->chars_at_source)
+    {
+      /* Characters are in coding->charbuf.  */
+      int *buf = coding->charbuf;
+      int *buf_end = buf + coding->charbuf_used;
+      unsigned char *adjusted_dst_end;
+
+      if (BUFFERP (coding->src_object)
+          && EQ (coding->src_object, coding->dst_object))
+        dst_end = coding->source + coding->consumed;
+      adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
+
+      while (buf < buf_end)
+        {
+          int c = *buf++;
+
+          if (dst >= adjusted_dst_end)
+            {
+              dst = alloc_destination (coding,
+                                       buf_end - buf + MAX_MULTIBYTE_LENGTH,
+                                       dst);
+              dst_end = coding->destination + coding->dst_bytes;
+              adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
+            }
+          if (c >= 0)
+            {
+              if (coding->dst_multibyte
+                  || !
CHAR_BYTE8_P (c)) + CHAR_STRING_ADVANCE (c, dst); + else + *dst++ = CHAR_TO_BYTE8 (c); + produced_chars++; + } + else + /* This is an annotation data. */ + buf -= c + 1; + } + } + else + { + int multibytep = coding->src_multibyte; + unsigned char *src = coding->source; + unsigned char *src_end = src + coding->src_bytes; + Lisp_Object eol_type; + + eol_type = CODING_ID_EOL_TYPE (coding->id); + + if (coding->src_multibyte != coding->dst_multibyte) + { + if (coding->src_multibyte) + { + int consumed_chars; + + while (1) + { + unsigned char *src_base = src; + int c; + + ONE_MORE_BYTE (c); + if (c == '\r') + { + if (EQ (eol_type, Qdos)) + { + if (src < src_end + && *src == '\n') + c = *src++; + } + else if (EQ (eol_type, Qmac)) + c = '\n'; + } + if (dst == dst_end) + { + EMACS_INT offset = src - coding->source; + + dst = alloc_destination (coding, src_end - src + 1, dst); + dst_end = coding->destination + coding->dst_bytes; + coding_set_source (coding); + src = coding->source + offset; + src_end = coding->source + coding->src_bytes; + } + *dst++ = c; + produced_chars++; + } + no_more_source: + ; + } + else + while (src < src_end) + { + int c = *src++; + + if (c == '\r') + { + if (EQ (eol_type, Qdos)) + { + if (src < src_end + && *src == '\n') + c = *src++; + } + else if (EQ (eol_type, Qmac)) + c = '\n'; + } + if (dst >= dst_end - 1) + { + EMACS_INT offset = src - coding->source; + + dst = alloc_destination (coding, src_end - src + 2, dst); + dst_end = coding->destination + coding->dst_bytes; + coding_set_source (coding); + src = coding->source + offset; + src_end = coding->source + coding->src_bytes; + } + EMIT_ONE_BYTE (c); + } + } + else + { + if (!EQ (coding->src_object, coding->dst_object)) + { + int require = coding->src_bytes - coding->dst_bytes; + + if (require > 0) + { + EMACS_INT offset = src - coding->source; + + dst = alloc_destination (coding, require, dst); + coding_set_source (coding); + src = coding->source + offset; + src_end = coding->source + 
coding->src_bytes; + } + } + produced_chars = coding->src_chars; + while (src < src_end) + { + int c = *src++; + + if (c == '\r') + { + if (EQ (eol_type, Qdos)) + { + if (src < src_end + && *src == '\n') + c = *src++; + produced_chars--; + } + else if (EQ (eol_type, Qmac)) + c = '\n'; + } + *dst++ = c; + } + } + } + + produced = dst - (coding->destination + coding->produced); + if (BUFFERP (coding->dst_object)) + insert_from_gap (produced_chars, produced); + coding->produced += produced; + coding->produced_char += produced_chars; + return produced_chars; +} + +/* [ -LENGTH CHAR_POS_OFFSET MASK METHOD COMP_LEN ] + or + [ -LENGTH CHAR_POS_OFFSET MASK METHOD COMP_LEN COMPONENTS... ] + */ + +static INLINE void +produce_composition (coding, charbuf) + struct coding_system *coding; + int *charbuf; +{ + Lisp_Object buffer; + int len; + EMACS_INT pos; + enum composition_method method; + int cmp_len; + Lisp_Object components; + + buffer = coding->dst_object; + len = -charbuf[0]; + pos = coding->dst_pos + charbuf[1]; + method = (enum composition_method) (charbuf[3]); + cmp_len = charbuf[4]; + + if (method == COMPOSITION_RELATIVE) + components = Qnil; + else + { + Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1]; + int i; + + len -= 5; + charbuf += 5; + for (i = 0; i < len; i++) + args[i] = make_number (charbuf[i]); + components = (method == COMPOSITION_WITH_ALTCHARS + ? 
Fstring (len, args) : Fvector (len, args)); + } + compose_text (pos, pos + cmp_len, components, Qnil, Qnil); +} + +static int * +save_composition_data (buf, buf_end, prop) + int *buf, *buf_end; + Lisp_Object prop; +{ + enum composition_method method = COMPOSITION_METHOD (prop); + int cmp_len = COMPOSITION_LENGTH (prop); + + if (buf + 4 + (MAX_COMPOSITION_COMPONENTS * 2 - 1) > buf_end) + return NULL; + + buf[1] = CODING_ANNOTATE_COMPOSITION_MASK; + buf[2] = method; + buf[3] = cmp_len; + + if (method == COMPOSITION_RELATIVE) + buf[0] = 4; + else + { + Lisp_Object components; + int len, i; + + components = COMPOSITION_COMPONENTS (prop); + if (VECTORP (components)) + { + len = XVECTOR (components)->size; + for (i = 0; i < len; i++) + buf[4 + i] = XINT (AREF (components, i)); + } + else if (STRINGP (components)) + { + int i_byte; + + len = XSTRING (components)->size; + i = i_byte = 0; + while (i < len) + FETCH_STRING_CHAR_ADVANCE (buf[4 + i], components, i, i_byte); + } + else if (INTEGERP (components)) + { + len = 1; + buf[4] = XINT (components); + } + else if (CONSP (components)) + { + for (len = 0; CONSP (components); + len++, components = XCDR (components)) + buf[4 + len] = XINT (XCAR (components)); + } + else + abort (); + buf[0] = 4 + len; + } + return (buf + buf[0]); +} + +#define CHARBUF_SIZE 0x4000 + +#define ALLOC_CONVERSION_WORK_AREA(coding) \ + do { \ + int size = CHARBUF_SIZE;; \ + \ + coding->charbuf = NULL; \ + while (size > 1024) \ + { \ + coding->charbuf = (int *) alloca (sizeof (int) * size); \ + if (coding->charbuf) \ + break; \ + size >>= 1; \ + } \ + if (! 
coding->charbuf) \ + { \ + coding->result = CODING_RESULT_INSUFFICIENT_MEM; \ + return coding->result; \ + } \ + coding->charbuf_size = size; \ + } while (0) + + +static void +produce_annotation (coding) + struct coding_system *coding; +{ + int *charbuf = coding->charbuf; + int *charbuf_end = charbuf + coding->charbuf_used; + + while (charbuf < charbuf_end) + { + if (*charbuf >= 0) + charbuf++; + else + { + int len = -*charbuf; + switch (charbuf[2]) + { + case CODING_ANNOTATE_COMPOSITION_MASK: + produce_composition (coding, charbuf); + break; + default: + abort (); + } + charbuf += len; + } + } +} + +/* Decode the data at CODING->src_object into CODING->dst_object. + CODING->src_object is a buffer, a string, or nil. + CODING->dst_object is a buffer. + + If CODING->src_object is a buffer, it must be the current buffer. + In this case, if CODING->src_pos is positive, it is a position of + the source text in the buffer, otherwise, the source text is in the + gap area of the buffer, and CODING->src_pos specifies the offset of + the text from GPT (which must be the same as PT). If this is the + same buffer as CODING->dst_object, CODING->src_pos must be + negative. + + If CODING->src_object is a string, CODING->src_pos in an index to + that string. + + If CODING->src_object is nil, CODING->source must already point to + the non-relocatable memory area. In this case, CODING->src_pos is + an offset from CODING->source. + + The decoded data is inserted at the current point of the buffer + CODING->dst_object. 
+*/
+
+static int
+decode_coding (coding)
+     struct coding_system *coding;
+{
+  Lisp_Object attrs;
+
+  /* If the source text in a buffer straddles the gap, move the gap
+     to the start of the text.  */
+  if (BUFFERP (coding->src_object)
+      && coding->src_pos > 0
+      && coding->src_pos < GPT
+      && coding->src_pos + coding->src_chars > GPT)
+    move_gap_both (coding->src_pos, coding->src_pos_byte);
+
+  /* Make the destination buffer current with the gap at point.  */
+  if (BUFFERP (coding->dst_object))
+    {
+      if (current_buffer != XBUFFER (coding->dst_object))
+        set_buffer_internal (XBUFFER (coding->dst_object));
+      if (GPT != PT)
+        move_gap_both (PT, PT_BYTE);
+    }
+
+  coding->consumed = coding->consumed_char = 0;
+  coding->produced = coding->produced_char = 0;
+  coding->chars_at_source = 0;
+  coding->result = CODING_RESULT_SUCCESS;
+  coding->errors = 0;
+
+  ALLOC_CONVERSION_WORK_AREA (coding);
+
+  attrs = CODING_ID_ATTRS (coding->id);
+
+  do
+    {
+      coding_set_source (coding);
+      coding->annotated = 0;
+      (*(coding->decoder)) (coding);
+      if (!NILP (CODING_ATTR_DECODE_TBL (attrs)))
+        /* translate_chars takes CODING first and the table second.  */
+        translate_chars (coding, CODING_ATTR_DECODE_TBL (attrs));
+      coding_set_destination (coding);
+      produce_chars (coding);
+      if (coding->annotated)
+        produce_annotation (coding);
+    }
+  while (coding->consumed < coding->src_bytes
+         && ! coding->result);
+
+  if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qccl)
+      && SYMBOLP (CODING_ID_EOL_TYPE (coding->id))
+      && ! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
+    decode_eol (coding);
+
+  coding->carryover_bytes = 0;
+  if (coding->consumed < coding->src_bytes)
+    {
+      int nbytes = coding->src_bytes - coding->consumed;
+      unsigned char *src;
+
+      coding_set_source (coding);
+      coding_set_destination (coding);
+      src = coding->source + coding->consumed;
+
+      if (coding->mode & CODING_MODE_LAST_BLOCK)
+        {
+          /* Flush out unprocessed data as binary chars.  We are sure
+             that the number of data is less than the size of
+             coding->charbuf.  */
+          int *charbuf = coding->charbuf;
+
+          while (nbytes-- > 0)
+            {
+              int c = *src++;
+              *charbuf++ = (c & 0x80 ?
- c : c); + } + produce_chars (coding); + } + else + { + /* Record unprocessed bytes in coding->carryover. We are + sure that the number of data is less than the size of + coding->carryover. */ + unsigned char *p = coding->carryover; + + coding->carryover_bytes = nbytes; + while (nbytes-- > 0) + *p++ = *src++; + } + coding->consumed = coding->src_bytes; + } + + if (BUFFERP (coding->dst_object)) + { + record_insert (coding->dst_pos, coding->produced_char); + } + + return coding->result; +} + +static void +consume_chars (coding) + struct coding_system *coding; +{ + int *buf = coding->charbuf; + /* -1 is to compensate for CRLF. */ + int *buf_end = coding->charbuf + coding->charbuf_size - 1; + unsigned char *src = coding->source + coding->consumed; + int pos = coding->src_pos + coding->consumed_char; + int end_pos = coding->src_pos + coding->src_chars; + int multibytep = coding->src_multibyte; + Lisp_Object eol_type; + int c; + int start, end, stop; + Lisp_Object object, prop; + + eol_type = CODING_ID_EOL_TYPE (coding->id); + if (VECTORP (eol_type)) + eol_type = Qunix; + + object = coding->src_object; + + /* Note: composition handling is not yet implemented. */ + coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK; + + if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK + && find_composition (pos, end_pos, &start, &end, &prop, object) + && end <= end_pos + && (start >= pos + || (find_composition (end, end_pos, &start, &end, &prop, object) + && end <= end_pos))) + stop = start; + else + stop = end_pos; + + while (buf < buf_end) + { + if (pos == stop) + { + int *p; + + if (pos == end_pos) + break; + p = save_composition_data (buf, buf_end, prop); + if (p == NULL) + break; + buf = p; + if (find_composition (end, end_pos, &start, &end, &prop, object) + && end <= end_pos) + stop = start; + else + stop = end_pos; + } + + if (! 
multibytep) + c = *src++; + else + c = STRING_CHAR_ADVANCE (src); + if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)) + c = '\n'; + if (! EQ (eol_type, Qunix)) + { + if (c == '\n') + { + if (EQ (eol_type, Qdos)) + *buf++ = '\r'; + else + c = '\r'; + } + } + *buf++ = c; + pos++; + } + + coding->consumed = src - coding->source; + coding->consumed_char = pos - coding->src_pos; + coding->charbuf_used = buf - coding->charbuf; + coding->chars_at_source = 0; +} + + +/* Encode the text at CODING->src_object into CODING->dst_object. + CODING->src_object is a buffer or a string. + CODING->dst_object is a buffer or nil. + + If CODING->src_object is a buffer, it must be the current buffer. + In this case, if CODING->src_pos is positive, it is a position of + the source text in the buffer, otherwise. the source text is in the + gap area of the buffer, and coding->src_pos specifies the offset of + the text from GPT (which must be the same as PT). If this is the + same buffer as CODING->dst_object, CODING->src_pos must be + negative and CODING should not have `pre-write-conversion'. + + If CODING->src_object is a string, CODING should not have + `pre-write-conversion'. + + If CODING->dst_object is a buffer, the encoded data is inserted at + the current point of that buffer. + + If CODING->dst_object is nil, the encoded data is placed at the + memory area specified by CODING->destination. */ + +static int +encode_coding (coding) + struct coding_system *coding; +{ + int error = 0; + Lisp_Object attrs; + + attrs = CODING_ID_ATTRS (coding->id); + + if (BUFFERP (coding->dst_object)) + { + set_buffer_internal (XBUFFER (coding->dst_object)); + coding->dst_multibyte + = ! 
NILP (current_buffer->enable_multibyte_characters);
+    }
+
+  coding->consumed = coding->consumed_char = 0;
+  coding->produced = coding->produced_char = 0;
+  coding->result = CODING_RESULT_SUCCESS;
+  coding->errors = 0;
+
+  ALLOC_CONVERSION_WORK_AREA (coding);
+
+  /* Repeatedly load characters into CODING->charbuf, translate them
+     if an encode table is set, and run the encoder, until all
+     CODING->src_chars characters are consumed.  */
+  do {
+    coding_set_source (coding);
+    consume_chars (coding);
+
+    if (!NILP (CODING_ATTR_ENCODE_TBL (attrs)))
+      /* translate_chars takes CODING first and the table second.  */
+      translate_chars (coding, CODING_ATTR_ENCODE_TBL (attrs));
+
+    coding_set_destination (coding);
+    (*(coding->encoder)) (coding);
+  } while (coding->consumed_char < coding->src_chars);
+
+  if (BUFFERP (coding->dst_object))
+    insert_from_gap (coding->produced_char, coding->produced);
+
+  return (coding->result);
+}
+
+/* Work buffer */
+
+/* List of currently used working buffer.  */
+Lisp_Object Vcode_conversion_work_buf_list;
+
+/* A working buffer used by the top level conversion.  */
+Lisp_Object Vcode_conversion_reused_work_buf;
+
+
+/* Return a working buffer that can be freely used by the following
+   code conversion.  MULTIBYTEP specifies the multibyteness of the
+   buffer.
+
+   The buffer is pushed on Vcode_conversion_work_buf_list.  When the
+   list is empty, Vcode_conversion_reused_work_buf is used (and
+   created on first use); otherwise a buffer whose name carries the
+   current list length is obtained.  The returned buffer is erased
+   and has undo recording disabled; the current buffer is restored
+   before returning.  */
+
+Lisp_Object
+make_conversion_work_buffer (multibytep)
+     int multibytep;
+{
+  struct buffer *current = current_buffer;
+  Lisp_Object buf;
+
+  if (NILP (Vcode_conversion_work_buf_list))
+    {
+      if (NILP (Vcode_conversion_reused_work_buf))
+        Vcode_conversion_reused_work_buf
+          = Fget_buffer_create (build_string (" *code-conversion-work*"));
+      Vcode_conversion_work_buf_list
+        = Fcons (Vcode_conversion_reused_work_buf, Qnil);
+    }
+  else
+    {
+      /* Flength returns a Lisp integer; extract the C value before
+         formatting it with %d.  */
+      int depth = XINT (Flength (Vcode_conversion_work_buf_list));
+      char str[128];
+
+      sprintf (str, " *code-conversion-work*<%d>", depth);
+      Vcode_conversion_work_buf_list
+        = Fcons (Fget_buffer_create (build_string (str)),
+                 Vcode_conversion_work_buf_list);
+    }
+
+  buf = XCAR (Vcode_conversion_work_buf_list);
+  set_buffer_internal (XBUFFER (buf));
+  current_buffer->undo_list = Qt;
+  Ferase_buffer ();
+  Fset_buffer_multibyte (multibytep ?
Qt : Qnil); + set_buffer_internal (current); + return buf; +} + +static struct coding_system *saved_coding; + +Lisp_Object +code_conversion_restore (info) + Lisp_Object info; +{ + int depth = Flength (Vcode_conversion_work_buf_list); + Lisp_Object buf; + + if (depth > 0) + { + buf = XCAR (Vcode_conversion_work_buf_list); + Vcode_conversion_work_buf_list = XCDR (Vcode_conversion_work_buf_list); + if (depth > 1 && !NILP (Fbuffer_live_p (buf))) + Fkill_buffer (buf); + } + + if (saved_coding->dst_object == Qt + && saved_coding->destination) + xfree (saved_coding->destination); + + return save_excursion_restore (info); +} + + +int +decode_coding_gap (coding, chars, bytes) + struct coding_system *coding; + EMACS_INT chars, bytes; +{ + int count = specpdl_ptr - specpdl; + + saved_coding = coding; + record_unwind_protect (code_conversion_restore, save_excursion_save ()); + + coding->src_object = Fcurrent_buffer (); + coding->src_chars = chars; + coding->src_bytes = bytes; + coding->src_pos = -chars; + coding->src_pos_byte = -bytes; + coding->src_multibyte = chars < bytes; + coding->dst_object = coding->src_object; + coding->dst_pos = PT; + coding->dst_pos_byte = PT_BYTE; + + if (CODING_REQUIRE_DETECTION (coding)) + detect_coding (coding); + + decode_coding (coding); + + unbind_to (count, Qnil); + return coding->result; +} + +int +encode_coding_gap (coding, chars, bytes) + struct coding_system *coding; + EMACS_INT chars, bytes; +{ + int count = specpdl_ptr - specpdl; + Lisp_Object buffer; + + saved_coding = coding; + record_unwind_protect (code_conversion_restore, save_excursion_save ()); + + buffer = Fcurrent_buffer (); + coding->src_object = buffer; + coding->src_chars = chars; + coding->src_bytes = bytes; + coding->src_pos = -chars; + coding->src_pos_byte = -bytes; + coding->src_multibyte = chars < bytes; + coding->dst_object = coding->src_object; + coding->dst_pos = PT; + coding->dst_pos_byte = PT_BYTE; + + encode_coding (coding); + + unbind_to (count, Qnil); + return 
coding->result; +} + + +/* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in + SRC_OBJECT into DST_OBJECT by coding context CODING. + + SRC_OBJECT is a buffer, a string, or Qnil. + + If it is a buffer, the text is at point of the buffer. FROM and TO + are positions in the buffer. + + If it is a string, the text is at the beginning of the string. + FROM and TO are indices to the string. + + If it is nil, the text is at coding->source. FROM and TO are + indices to coding->source. + + DST_OBJECT is a buffer, Qt, or Qnil. + + If it is a buffer, the decoded text is inserted at point of the + buffer. If the buffer is the same as SRC_OBJECT, the source text + is deleted. + + If it is Qt, a string is made from the decoded text, and + set in CODING->dst_object. + + If it is Qnil, the decoded text is stored at CODING->destination. + The caller must allocate CODING->dst_bytes bytes at + CODING->destination by xmalloc. If the decoded text is longer than + CODING->dst_bytes, CODING->destination is relocated by xrealloc. 
+ */ + +void +decode_coding_object (coding, src_object, from, from_byte, to, to_byte, + dst_object) + struct coding_system *coding; + Lisp_Object src_object; + EMACS_INT from, from_byte, to, to_byte; + Lisp_Object dst_object; +{ + int count = specpdl_ptr - specpdl; + unsigned char *destination; + EMACS_INT dst_bytes; + EMACS_INT chars = to - from; + EMACS_INT bytes = to_byte - from_byte; + Lisp_Object attrs; + + saved_coding = coding; + record_unwind_protect (code_conversion_restore, save_excursion_save ()); + + if (NILP (dst_object)) + { + destination = coding->destination; + dst_bytes = coding->dst_bytes; + } + + coding->src_object = src_object; + coding->src_chars = chars; + coding->src_bytes = bytes; + coding->src_multibyte = chars < bytes; + + if (STRINGP (src_object)) + { + coding->src_pos = from; + coding->src_pos_byte = from_byte; + } + else if (BUFFERP (src_object)) + { + set_buffer_internal (XBUFFER (src_object)); + if (from != GPT) + move_gap_both (from, from_byte); + if (EQ (src_object, dst_object)) + { + TEMP_SET_PT_BOTH (from, from_byte); + del_range_both (from, from_byte, to, to_byte, 1); + coding->src_pos = -chars; + coding->src_pos_byte = -bytes; + } + else + { + coding->src_pos = from; + coding->src_pos_byte = from_byte; + } + } + + if (CODING_REQUIRE_DETECTION (coding)) + detect_coding (coding); + attrs = CODING_ID_ATTRS (coding->id); + + if (! NILP (CODING_ATTR_POST_READ (attrs)) + || EQ (dst_object, Qt)) + { + coding->dst_object = make_conversion_work_buffer (1); + coding->dst_pos = BEG; + coding->dst_pos_byte = BEG_BYTE; + coding->dst_multibyte = 1; + } + else if (BUFFERP (dst_object)) + { + coding->dst_object = dst_object; + coding->dst_pos = BUF_PT (XBUFFER (dst_object)); + coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object)); + coding->dst_multibyte + = ! 
NILP (XBUFFER (dst_object)->enable_multibyte_characters); + } + else + { + coding->dst_object = Qnil; + coding->dst_multibyte = 1; + } + + decode_coding (coding); + + if (BUFFERP (coding->dst_object)) + set_buffer_internal (XBUFFER (coding->dst_object)); + + if (! NILP (CODING_ATTR_POST_READ (attrs))) + { + struct gcpro gcpro1, gcpro2; + EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE; + Lisp_Object val; + + GCPRO2 (coding->src_object, coding->dst_object); + val = call1 (CODING_ATTR_POST_READ (attrs), + make_number (coding->produced_char)); + UNGCPRO; + CHECK_NATNUM (val); + coding->produced_char += Z - prev_Z; + coding->produced += Z_BYTE - prev_Z_BYTE; + } + + if (EQ (dst_object, Qt)) + { + coding->dst_object = Fbuffer_string (); + } + else if (NILP (dst_object) && BUFFERP (coding->dst_object)) + { + set_buffer_internal (XBUFFER (coding->dst_object)); + if (dst_bytes < coding->produced) + { + destination + = (unsigned char *) xrealloc (destination, coding->produced); + if (! destination) + { + coding->result = CODING_RESULT_INSUFFICIENT_DST; + unbind_to (count, Qnil); + return; + } + if (BEGV < GPT && GPT < BEGV + coding->produced_char) + move_gap_both (BEGV, BEGV_BYTE); + bcopy (BEGV_ADDR, destination, coding->produced); + coding->destination = destination; + } + } + + unbind_to (count, Qnil); +} + + +void +encode_coding_object (coding, src_object, from, from_byte, to, to_byte, + dst_object) + struct coding_system *coding; + Lisp_Object src_object; + EMACS_INT from, from_byte, to, to_byte; + Lisp_Object dst_object; +{ + int count = specpdl_ptr - specpdl; + EMACS_INT chars = to - from; + EMACS_INT bytes = to_byte - from_byte; + Lisp_Object attrs; + + saved_coding = coding; + record_unwind_protect (code_conversion_restore, save_excursion_save ()); + + coding->src_object = src_object; + coding->src_chars = chars; + coding->src_bytes = bytes; + coding->src_multibyte = chars < bytes; + + attrs = CODING_ID_ATTRS (coding->id); + + if (! 
NILP (CODING_ATTR_PRE_WRITE (attrs))) + { + Lisp_Object val; + + coding->src_object = make_conversion_work_buffer (coding->src_multibyte); + set_buffer_internal (XBUFFER (coding->src_object)); + if (STRINGP (src_object)) + insert_from_string (src_object, from, from_byte, chars, bytes, 0); + else if (BUFFERP (src_object)) + insert_from_buffer (XBUFFER (src_object), from, chars, 0); + else + insert_1_both (coding->source + from, chars, bytes, 0, 0, 0); + + if (EQ (src_object, dst_object)) + { + set_buffer_internal (XBUFFER (src_object)); + del_range_both (from, from_byte, to, to_byte, 1); + set_buffer_internal (XBUFFER (coding->src_object)); + } + + val = call2 (CODING_ATTR_PRE_WRITE (attrs), + make_number (1), make_number (chars)); + CHECK_NATNUM (val); + if (BEG != GPT) + move_gap_both (BEG, BEG_BYTE); + coding->src_chars = Z - BEG; + coding->src_bytes = Z_BYTE - BEG_BYTE; + coding->src_pos = BEG; + coding->src_pos_byte = BEG_BYTE; + coding->src_multibyte = Z < Z_BYTE; + } + else if (STRINGP (src_object)) + { + coding->src_pos = from; + coding->src_pos_byte = from_byte; + } + else if (BUFFERP (src_object)) + { + set_buffer_internal (XBUFFER (src_object)); + if (from != GPT) + move_gap_both (from, from_byte); + if (EQ (src_object, dst_object)) + { + del_range_both (from, from_byte, to, to_byte, 1); + coding->src_pos = -chars; + coding->src_pos_byte = -bytes; + } + else + { + coding->src_pos = from; + coding->src_pos_byte = from_byte; + } + } + + if (BUFFERP (dst_object)) + { + coding->dst_object = dst_object; + coding->dst_pos = BUF_PT (XBUFFER (dst_object)); + coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object)); + coding->dst_multibyte + = ! 
NILP (XBUFFER (dst_object)->enable_multibyte_characters); + } + else if (EQ (dst_object, Qt)) + { + coding->dst_object = Qnil; + coding->destination = (unsigned char *) xmalloc (coding->src_chars); + coding->dst_bytes = coding->src_chars; + coding->dst_multibyte = 0; + } + else + { + coding->dst_object = Qnil; + coding->dst_multibyte = 0; + } + + encode_coding (coding); + + if (EQ (dst_object, Qt)) + { + if (BUFFERP (coding->dst_object)) + coding->dst_object = Fbuffer_string (); + else + { + coding->dst_object + = make_unibyte_string ((char *) coding->destination, + coding->produced); + xfree (coding->destination); + } + } + + unbind_to (count, Qnil); +} + + +Lisp_Object +preferred_coding_system () +{ + int id = coding_categories[coding_priorities[0]].id; + + return CODING_ID_NAME (id); +} + + +#ifdef emacs +/*** 8. Emacs Lisp library functions ***/ + +DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0, + doc: /* Return t if OBJECT is nil or a coding-system. +See the documentation of `define-coding-system' for information +about coding-system objects. */) + (obj) + Lisp_Object obj; +{ + return ((NILP (obj) || CODING_SYSTEM_P (obj)) ? Qt : Qnil); +} + +DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system, + Sread_non_nil_coding_system, 1, 1, 0, + doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */) + (prompt) + Lisp_Object prompt; +{ + Lisp_Object val; + do + { + val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil, + Qt, Qnil, Qcoding_system_history, Qnil, Qnil); + } + while (XSTRING (val)->size == 0); + return (Fintern (val, Qnil)); +} + +DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0, + doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. +If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. 
*/) + (prompt, default_coding_system) + Lisp_Object prompt, default_coding_system; +{ + Lisp_Object val; + if (SYMBOLP (default_coding_system)) + XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name); + val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil, + Qt, Qnil, Qcoding_system_history, + default_coding_system, Qnil); + return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil)); +} + +DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system, + 1, 1, 0, + doc: /* Check validity of CODING-SYSTEM. +If valid, return CODING-SYSTEM, else signal a `coding-system-error' error. +It is valid if it is a symbol with a non-nil `coding-system' property. +The value of property should be a vector of length 5. */) + (coding_system) + Lisp_Object coding_system; +{ + CHECK_SYMBOL (coding_system); + if (!NILP (Fcoding_system_p (coding_system))) + return coding_system; + /* Not a coding system: signal `coding-system-error' with the symbol + as data, and signal again should Fsignal ever return. */ + while (1) + Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil)); +} + + +/* Detect the coding system of the SRC_BYTES bytes starting at SRC. + MULTIBYTEP is stored as the multibyteness of the source. + CODING_SYSTEM nil is treated as `undecided'; if its category is not + `undecided', no category detection is done. If HIGHEST is nonzero, + return one coding system name, otherwise a list of names. */ + +Lisp_Object +detect_coding_system (src, src_bytes, highest, multibytep, coding_system) + unsigned char *src; + int src_bytes, highest; + int multibytep; + Lisp_Object coding_system; +{ + unsigned char *src_end = src + src_bytes; + int mask = CATEGORY_MASK_ANY; + int detected = 0; + int c, i; + Lisp_Object attrs, eol_type; + Lisp_Object val; + struct coding_system coding; + + if (NILP (coding_system)) + coding_system = Qundecided; + setup_coding_system (coding_system, &coding); + attrs = CODING_ID_ATTRS (coding.id); + eol_type = CODING_ID_EOL_TYPE (coding.id); + + coding.source = src; + coding.src_bytes = src_bytes; + coding.src_multibyte = multibytep; + coding.consumed = 0; + + if (XINT (CODING_ATTR_CATEGORY (attrs)) != coding_category_undecided) + { + /* The category is already fixed by CODING_SYSTEM. */ + mask = 1 << XINT (CODING_ATTR_CATEGORY (attrs)); + } + else + { + /* A nil CODING_SYSTEM makes the final loop below take coding + systems from the category table instead of CODING. */ + coding_system = Qnil; + /* Stop at the first byte with the high bit set, or at ESC, SI, + or SO. */ + for (; src < src_end; src++) + { + c = *src; + if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC + || c == ISO_CODE_SI + || c == ISO_CODE_SO))) + break; 
+ } + /* Number of leading bytes skipped by the scan above. */ + coding.head_ascii = src - coding.source; + + /* Run the detectors in priority order, but only if the scan + stopped before the end of the source. */ + if (src < src_end) + for (i = 0; i < coding_category_raw_text; i++) + { + enum coding_category category = coding_priorities[i]; + struct coding_system *this = coding_categories + category; + + if (category >= coding_category_raw_text + || detected & (1 << category)) + continue; + + if (this->id < 0) + { + /* No coding system of this category is defined. */ + mask &= ~(1 << category); + } + else + { + /* Record the categories in this detector's mask so that + they are skipped on later iterations. */ + detected |= detected_mask[category]; + if ((*(coding_categories[category].detector)) (&coding, &mask) + && highest) + { + mask &= detected_mask[category]; + break; + } + } + } + } + + /* Build VAL as a list of category numbers: raw-text if no category + is left in MASK, undecided if none was ruled out, otherwise the + categories left in MASK (only the first one in priority order + when HIGHEST is nonzero). */ + if (!mask) + val = Fcons (make_number (coding_category_raw_text), Qnil); + else if (mask == CATEGORY_MASK_ANY) + val = Fcons (make_number (coding_category_undecided), Qnil); + else if (highest) + { + for (i = 0; i < coding_category_raw_text; i++) + if (mask & (1 << coding_priorities[i])) + { + val = Fcons (make_number (coding_priorities[i]), Qnil); + break; + } + } + else + { + val = Qnil; + /* Cons from the lowest priority upward so that the list ends up + in priority order. */ + for (i = coding_category_raw_text - 1; i >= 0; i--) + if (mask & (1 << coding_priorities[i])) + val = Fcons (make_number (coding_priorities[i]), val); + } + + /* Replace each category number in VAL by a coding system name. */ + { + /* -1 means that end-of-line detection has not been done yet. */ + int one_byte_eol = -1, two_byte_eol = -1; + Lisp_Object tail; + + for (tail = val; CONSP (tail); tail = XCDR (tail)) + { + struct coding_system *this + = (NILP (coding_system) ? 
coding_categories + XINT (XCAR (tail)) + : &coding); + int this_eol; + + attrs = CODING_ID_ATTRS (this->id); + eol_type = CODING_ID_EOL_TYPE (this->id); + XSETCAR (tail, CODING_ID_NAME (this->id)); + /* When EOL_TYPE is a vector, replace the name by element 0, 1, + or 2 of it according to whether LF, CRLF, or CR was seen. */ + if (VECTORP (eol_type)) + { + /* End-of-line detection is done at most once for UTF-16 and + once for everything else; the result is reused. */ + if (EQ (CODING_ATTR_TYPE (attrs), Qutf_16)) + { + if (two_byte_eol < 0) + two_byte_eol = detect_eol (this, coding.source, src_bytes); + this_eol = two_byte_eol; + } + else + { + if (one_byte_eol < 0) + one_byte_eol =detect_eol (this, coding.source, src_bytes); + this_eol = one_byte_eol; + } + if (this_eol == EOL_SEEN_LF) + XSETCAR (tail, AREF (eol_type, 0)); + else if (this_eol == EOL_SEEN_CRLF) + XSETCAR (tail, AREF (eol_type, 1)); + else if (this_eol == EOL_SEEN_CR) + XSETCAR (tail, AREF (eol_type, 2)); + } + } + } + + return (highest ? XCAR (val) : val); +} + + +DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region, + 2, 3, 0, + doc: /* Detect coding system of the text in the region between START and END. +Return a list of possible coding systems ordered by priority. + +If only ASCII characters are found, it returns a list of single element +`undecided' or its subsidiary coding system according to a detected +end-of-line format. + +If optional argument HIGHEST is non-nil, return the coding system of +highest priority. */) + (start, end, highest) + Lisp_Object start, end, highest; +{ + int from, to; + int from_byte, to_byte; + + CHECK_NUMBER_COERCE_MARKER (start); + CHECK_NUMBER_COERCE_MARKER (end); + + validate_region (&start, &end); + from = XINT (start), to = XINT (end); + from_byte = CHAR_TO_BYTE (from); + to_byte = CHAR_TO_BYTE (to); + + /* If the gap lies within the region, move it to the end of the + region. NOTE(review): presumably so that the bytes handed to the + detector are contiguous -- confirm. */ + if (from < GPT && to >= GPT) + move_gap_both (to, to_byte); + + return detect_coding_system (BYTE_POS_ADDR (from_byte), + to_byte - from_byte, + !NILP (highest), + !NILP (current_buffer + ->enable_multibyte_characters), + Qnil); +} + +DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string, + 1, 2, 0, + doc: /* Detect coding system of the text in STRING. 
+Return a list of possible coding systems ordered by priority. + +If only ASCII characters are found, it returns a list of single element +`undecided' or its subsidiary coding system according to a detected +end-of-line format. + +If optional argument HIGHEST is non-nil, return the coding system of +highest priority. */) + (string, highest) + Lisp_Object string, highest; +{ + CHECK_STRING (string); + + return detect_coding_system (XSTRING (string)->data, + STRING_BYTES (XSTRING (string)), + !NILP (highest), + STRING_MULTIBYTE (string), + Qnil); +} + + +/* Return nonzero if character C belongs to one of the charsets in the + charset list of the coding system attribute vector ATTRS. */ + +static INLINE int +char_encodable_p (c, attrs) + int c; + Lisp_Object attrs; +{ + Lisp_Object tail; + /* NOTE(review): ID looks unused in this function -- confirm that no + macro below refers to it before removing it. */ + int id; + struct charset *charset; + + for (tail = CODING_ATTR_CHARSET_LIST (attrs); + CONSP (tail); tail = XCDR (tail)) + { + charset = CHARSET_FROM_ID (XINT (XCAR (tail))); + if (CHAR_CHARSET_P (c, charset)) + break; + } + /* TAIL is a cons only if the loop stopped at a matching charset. */ + return (! NILP (tail)); +} + + +/* Return a list of coding systems that safely encode the text between + START and END. START may also be a string, in which case the text + of that string is checked. If EXCLUDE is non-nil, it is a list of + coding systems not to check. The returned list doesn't contain any + such coding systems. In any case, if the text contains only ASCII + or is unibyte, return t. */ + +DEFUN ("find-coding-systems-region-internal", + Ffind_coding_systems_region_internal, + Sfind_coding_systems_region_internal, 2, 3, 0, + doc: /* Internal use only. 
*/) + (start, end, exclude) + Lisp_Object start, end, exclude; +{ + Lisp_Object coding_attrs_list, safe_codings; + EMACS_INT start_byte, end_byte; + unsigned char *p, *pbeg, *pend; + int c; + Lisp_Object tail, elt; + + if (STRINGP (start)) + { + /* A unibyte string, or a multibyte string in which every + character occupies one byte (i.e. only ASCII), needs no check. */ + if (!STRING_MULTIBYTE (start) + || XSTRING (start)->size == STRING_BYTES (XSTRING (start))) + return Qt; + start_byte = 0; + end_byte = STRING_BYTES (XSTRING (start)); + } + else + { + CHECK_NUMBER_COERCE_MARKER (start); + CHECK_NUMBER_COERCE_MARKER (end); + if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end)) + args_out_of_range (start, end); + if (NILP (current_buffer->enable_multibyte_characters)) + return Qt; + start_byte = CHAR_TO_BYTE (XINT (start)); + end_byte = CHAR_TO_BYTE (XINT (end)); + /* As many bytes as characters: the region is pure ASCII. */ + if (XINT (end) - XINT (start) == end_byte - start_byte) + return Qt; + + /* If the gap lies inside the region, move it to the nearer end. + START and END are Lisp objects, so extract their integer + values before comparing them with GPT or passing them on. */ + if (XINT (start) < GPT && XINT (end) > GPT) + { + if ((GPT - XINT (start)) < (XINT (end) - GPT)) + move_gap_both (XINT (start), start_byte); + else + move_gap_both (XINT (end), end_byte); + } + } + + /* Collect the attribute vectors of the candidate coding systems: + those not in EXCLUDE whose name is their own base name and whose + type is not `undecided'. */ + coding_attrs_list = Qnil; + for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail)) + if (NILP (exclude) + || NILP (Fmemq (XCAR (tail), exclude))) + { + Lisp_Object attrs; + + attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0); + if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs)) + && ! 
EQ (CODING_ATTR_TYPE (attrs), Qundecided)) + coding_attrs_list = Fcons (attrs, coding_attrs_list); + } + + if (STRINGP (start)) + p = pbeg = XSTRING (start)->data; + else + p = pbeg = BYTE_POS_ADDR (start_byte); + pend = p + (end_byte - start_byte); + + /* Trim ASCII bytes from both ends of the text. */ + while (p < pend && ASCII_BYTE_P (*p)) p++; + while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--; + + while (p < pend) + { + if (ASCII_BYTE_P (*p)) + p++; + else + { + c = STRING_CHAR_ADVANCE (p); + + charset_map_loaded = 0; + /* Remove from the list, in place, every coding system that + can't encode C: copy the next cell over the current one, + or, at the last cell, store nil in its car. */ + for (tail = coding_attrs_list; CONSP (tail);) + { + elt = XCAR (tail); + if (NILP (elt)) + tail = XCDR (tail); + else if (char_encodable_p (c, elt)) + tail = XCDR (tail); + else if (CONSP (XCDR (tail))) + { + XSETCAR (tail, XCAR (XCDR (tail))); + XSETCDR (tail, XCDR (XCDR (tail))); + } + else + { + XSETCAR (tail, Qnil); + tail = XCDR (tail); + } + } + /* CHARSET_MAP_LOADED was cleared above; if it is set now, + recompute the pointers from their offsets. + NOTE(review): presumably loading a charset map may + relocate buffer or string data -- confirm. */ + if (charset_map_loaded) + { + EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg; + + if (STRINGP (start)) + pbeg = XSTRING (start)->data; + else + pbeg = BYTE_POS_ADDR (start_byte); + p = pbeg + p_offset; + pend = pbeg + pend_offset; + } + } + } + + /* Return the base names of the coding systems left in the list. */ + safe_codings = Qnil; + for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail)) + if (! NILP (XCAR (tail))) + safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings); + + return safe_codings; +} + + +DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region, + Scheck_coding_systems_region, 3, 3, 0, + doc: /* Check if the region is encodable by coding systems. + +START and END are buffer positions specifying the region. +CODING-SYSTEM-LIST is a list of coding systems to check. + +The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where +CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the +whole region, POS0, POS1, ... are buffer positions where non-encodable +characters are found. + +If all coding systems in CODING-SYSTEM-LIST can encode the region, the +value is nil. + +START may be a string. 
In that case, check if the string is +encodable, and the value contains indices to the string instead of +buffer positions. END is ignored. */) + (start, end, coding_system_list) + Lisp_Object start, end, coding_system_list; +{ + Lisp_Object list; + EMACS_INT start_byte, end_byte; + int pos; + unsigned char *p, *pbeg, *pend; + int c; + Lisp_Object tail, elt; + + if (STRINGP (start)) + { + /* A unibyte string, or a multibyte string in which every + character occupies one byte (i.e. only ASCII), has nothing + that could fail to be encoded. */ + if (!STRING_MULTIBYTE (start) + || XSTRING (start)->size == STRING_BYTES (XSTRING (start))) + return Qnil; + start_byte = 0; + end_byte = STRING_BYTES (XSTRING (start)); + pos = 0; + } + else + { + CHECK_NUMBER_COERCE_MARKER (start); + CHECK_NUMBER_COERCE_MARKER (end); + if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end)) + args_out_of_range (start, end); + if (NILP (current_buffer->enable_multibyte_characters)) + return Qnil; + start_byte = CHAR_TO_BYTE (XINT (start)); + end_byte = CHAR_TO_BYTE (XINT (end)); + /* As many bytes as characters: the region is pure ASCII, so + every coding system can encode it and the value is nil. */ + if (XINT (end) - XINT (start) == end_byte - start_byte) + return Qnil; + + /* If the gap lies inside the region, move it to the nearer end. + START and END are Lisp objects, so extract their integer + values before comparing them with GPT or passing them on. */ + if (XINT (start) < GPT && XINT (end) > GPT) + { + if ((GPT - XINT (start)) < (XINT (end) - GPT)) + move_gap_both (XINT (start), start_byte); + else + move_gap_both (XINT (end), end_byte); + } + pos = XINT (start); + } + + /* Build a work list of elements (CODING-SYSTEM ATTRS), to which the + positions of non-encodable characters are added after ATTRS. */ + list = Qnil; + for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail)) + { + elt = XCAR (tail); + list = Fcons (Fcons (elt, Fcons (AREF (CODING_SYSTEM_SPEC (elt), 0), + Qnil)), + list); + } + + if (STRINGP (start)) + p = pbeg = XSTRING (start)->data; + else + p = pbeg = BYTE_POS_ADDR (start_byte); + pend = p + (end_byte - start_byte); + + /* Trim ASCII bytes from both ends, keeping POS in step with P. */ + while (p < pend && ASCII_BYTE_P (*p)) p++, pos++; + while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--; + + while (p < pend) + { + if (ASCII_BYTE_P (*p)) + p++; + else + { + c = STRING_CHAR_ADVANCE (p); + + charset_map_loaded = 0; + for (tail = list; CONSP (tail); tail = XCDR (tail)) + { + elt = XCDR (XCAR (tail)); + if (! 
char_encodable_p (c, XCAR (elt))) + XSETCDR (elt, Fcons (make_number (pos), XCDR (elt))); + } + /* CHARSET_MAP_LOADED was cleared above; if it is set now, + recompute the pointers from their offsets. + NOTE(review): presumably loading a charset map may + relocate buffer or string data -- confirm. */ + if (charset_map_loaded) + { + EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg; + + if (STRINGP (start)) + pbeg = XSTRING (start)->data; + else + pbeg = BYTE_POS_ADDR (start_byte); + p = pbeg + p_offset; + pend = pbeg + pend_offset; + } + } + pos++; + } + + /* Keep only the coding systems for which a position was recorded. + Positions were pushed in reverse order, so reverse them; the + attribute vector is dropped from the result. */ + tail = list; + list = Qnil; + for (; CONSP (tail); tail = XCDR (tail)) + { + elt = XCAR (tail); + if (CONSP (XCDR (XCDR (elt)))) + list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))), + list); + } + + return list; +} + + + +/* Encode (if ENCODEP is nonzero) or decode the text between START and + END of the current buffer by CODING_SYSTEM; nil means + `no-conversion'. DST_OBJECT nil means the current buffer itself; + otherwise it must be t or a buffer. Unless NORECORD is nonzero, set + Vlast_coding_system_used. Signal an error if the conversion does + not succeed. Return the number of produced characters if DST_OBJECT + is a buffer, otherwise the destination object of the conversion. */ + +Lisp_Object +code_convert_region (start, end, coding_system, dst_object, encodep, norecord) + Lisp_Object start, end, coding_system, dst_object; + int encodep, norecord; +{ + struct coding_system coding; + EMACS_INT from, from_byte, to, to_byte; + Lisp_Object src_object; + + CHECK_NUMBER_COERCE_MARKER (start); + CHECK_NUMBER_COERCE_MARKER (end); + if (NILP (coding_system)) + coding_system = Qno_conversion; + else + CHECK_CODING_SYSTEM (coding_system); + src_object = Fcurrent_buffer (); + if (NILP (dst_object)) + dst_object = src_object; + else if (! EQ (dst_object, Qt)) + CHECK_BUFFER (dst_object); + + validate_region (&start, &end); + from = XFASTINT (start); + from_byte = CHAR_TO_BYTE (from); + to = XFASTINT (end); + to_byte = CHAR_TO_BYTE (to); + + setup_coding_system (coding_system, &coding); + coding.mode |= CODING_MODE_LAST_BLOCK; + + if (encodep) + encode_coding_object (&coding, src_object, from, from_byte, to, to_byte, + dst_object); + else + decode_coding_object (&coding, src_object, from, from_byte, to, to_byte, + dst_object); + if (! norecord) + Vlast_coding_system_used = CODING_ID_NAME (coding.id); + + if (coding.result != CODING_RESULT_SUCCESS) + error ("Code conversion error: %d", coding.result); + + return (BUFFERP (dst_object) + ? 
make_number (coding.produced_char) + : coding.dst_object); +} + + +DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region, + 3, 4, "r\nzCoding system: ", + doc: /* Decode the current region from the specified coding system. +When called from a program, takes four arguments: + START, END, CODING-SYSTEM, and DESTINATION. +START and END are buffer positions. + +Optional 4th argument DESTINATION specifies where the decoded text goes. +If nil, the region between START and END is replaced by the decoded text. +If a buffer, the decoded text is inserted in the buffer. +If t, the decoded text is returned. + +This function sets `last-coding-system-used' to the precise coding system +used (which may be different from CODING-SYSTEM if CODING-SYSTEM is +not fully specified.) +It returns the length of the decoded text. */) + (start, end, coding_system, destination) + Lisp_Object start, end, coding_system, destination; +{ + /* ENCODEP is 0 (decode) and NORECORD is 0. */ + return code_convert_region (start, end, coding_system, destination, 0, 0); +} + +DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region, + 3, 4, "r\nzCoding system: ", + doc: /* Encode the current region by specified coding system. +When called from a program, takes four arguments: + START, END, CODING-SYSTEM, and DESTINATION. +START and END are buffer positions. + +Optional 4th argument DESTINATION specifies where the encoded text goes. +If nil, the region between START and END is replaced by the encoded text. +If a buffer, the encoded text is inserted in the buffer. +If t, the encoded text is returned. + +This function sets `last-coding-system-used' to the precise coding system +used (which may be different from CODING-SYSTEM if CODING-SYSTEM is +not fully specified.) +It returns the length of the encoded text. 
*/) + (start, end, coding_system, destination) + Lisp_Object start, end, coding_system, destination; +{ + return code_convert_region (start, end, coding_system, destination, 1, 0); +} + +Lisp_Object +code_convert_string (string, coding_system, dst_object, + encodep, nocopy, norecord) + Lisp_Object string, coding_system, dst_object; + int encodep, nocopy, norecord; +{ + struct coding_system coding; + EMACS_INT chars, bytes; + + CHECK_STRING (string); + if (NILP (coding_system)) + { + if (! norecord) + Vlast_coding_system_used = Qno_conversion; + if (NILP (dst_object)) + return (nocopy ? Fcopy_sequence (string) : string); + } + + if (NILP (coding_system)) + coding_system = Qno_conversion; + else + CHECK_CODING_SYSTEM (coding_system); + if (NILP (dst_object)) + dst_object = Qt; + else if (! EQ (dst_object, Qt)) + CHECK_BUFFER (dst_object); + + setup_coding_system (coding_system, &coding); + coding.mode |= CODING_MODE_LAST_BLOCK; + chars = XSTRING (string)->size; + bytes = STRING_BYTES (XSTRING (string)); + if (encodep) + encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object); + else + decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object); + if (! norecord) + Vlast_coding_system_used = CODING_ID_NAME (coding.id); + + if (coding.result != CODING_RESULT_SUCCESS) + error ("Code conversion error: %d", coding.result); + + return (BUFFERP (dst_object) + ? make_number (coding.produced_char) + : coding.dst_object); +} + + +/* Encode or decode STRING according to CODING_SYSTEM. + Do not set Vlast_coding_system_used. + + This function is called only from macros DECODE_FILE and + ENCODE_FILE, thus we ignore character composition. 
*/ + +Lisp_Object +code_convert_string_norecord (string, coding_system, encodep) + Lisp_Object string, coding_system; + int encodep; +{ + /* DST_OBJECT is t (return the result), NOCOPY is 0, NORECORD is 1. + The converted object must be returned to the caller. */ + return code_convert_string (string, coding_system, Qt, encodep, 0, 1); +} + + +DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string, + 2, 4, 0, + doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result. + +Optional third arg NOCOPY non-nil means it is OK to return STRING itself +if the decoding operation is trivial. + +Optional fourth arg BUFFER non-nil means that the decoded text is +inserted in BUFFER instead of returned as a string. In this case, +the return value is the length of the decoded text. + +This function sets `last-coding-system-used' to the precise coding system +used (which may be different from CODING-SYSTEM if CODING-SYSTEM is +not fully specified.) */) + (string, coding_system, nocopy, buffer) + Lisp_Object string, coding_system, nocopy, buffer; +{ + /* ENCODEP is 0 (decode) and NORECORD is 0. */ + return code_convert_string (string, coding_system, buffer, + 0, ! NILP (nocopy), 0); +} + +DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string, + 2, 4, 0, + doc: /* Encode STRING to CODING-SYSTEM, and return the result. + +Optional third arg NOCOPY non-nil means it is OK to return STRING +itself if the encoding operation is trivial. + +Optional fourth arg BUFFER non-nil means that the encoded text is +inserted in BUFFER instead of returned as a string. In this case, +the return value is the length of the encoded text. + +This function sets `last-coding-system-used' to the precise coding system +used (which may be different from CODING-SYSTEM if CODING-SYSTEM is +not fully specified.) */) + (string, coding_system, nocopy, buffer) + Lisp_Object string, coding_system, nocopy, buffer; +{ + /* ENCODEP is 1 (encode); NORECORD is 0 because this function is + documented to set `last-coding-system-used'. */ + return code_convert_string (string, coding_system, buffer, + 1, ! NILP (nocopy), 0); +} + + +DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0, + doc: /* Decode a Japanese character which has CODE in shift_jis encoding. 
+Return the corresponding character. */) + (code) + Lisp_Object code; +{ + Lisp_Object spec, attrs, val; + struct charset *charset_roman, *charset_kanji, *charset_kana, *charset; + int c; + + CHECK_NATNUM (code); + c = XFASTINT (code); + CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec); + attrs = AREF (spec, 0); + + if (ASCII_BYTE_P (c) + && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))) + return code; + + val = CODING_ATTR_CHARSET_LIST (attrs); + charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val); + charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val); + charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))); + + if (c <= 0x7F) + charset = charset_roman; + else if (c >= 0xA0 && c < 0xDF) + { + charset = charset_kana; + c -= 0x80; + } + else + { + int s1 = c >> 8, s2 = c & 0x7F; + + if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF + || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC) + error ("Invalid code: %d", code); + SJIS_TO_JIS (c); + charset = charset_kanji; + } + c = DECODE_CHAR (charset, c); + if (c < 0) + error ("Invalid code: %d", code); + return make_number (c); +} + + +DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0, + doc: /* Encode a Japanese character CHAR to shift_jis encoding. +Return the corresponding code in SJIS. */) + (ch) + Lisp_Object ch; +{ + Lisp_Object spec, attrs, charset_list; + int c; + struct charset *charset; + unsigned code; + + CHECK_CHARACTER (ch); + c = XFASTINT (ch); + CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec); + attrs = AREF (spec, 0); + + if (ASCII_CHAR_P (c) + && ! 
NILP (CODING_ATTR_ASCII_COMPAT (attrs))) + return ch; + + charset_list = CODING_ATTR_CHARSET_LIST (attrs); + charset = char_charset (c, charset_list, &code); + if (code == CHARSET_INVALID_CODE (charset)) + error ("Can't encode by shift_jis encoding: %d", c); + JIS_TO_SJIS (code); + + return make_number (code); +} + +DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0, + doc: /* Decode a Big5 character which has CODE in BIG5 coding system. +Return the corresponding character. */) + (code) + Lisp_Object code; +{ + Lisp_Object spec, attrs, val; + struct charset *charset_roman, *charset_big5, *charset; + int c; + + CHECK_NATNUM (code); + c = XFASTINT (code); + CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec); + attrs = AREF (spec, 0); + + if (ASCII_BYTE_P (c) + && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))) + return code; + + val = CODING_ATTR_CHARSET_LIST (attrs); + charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val); + charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val))); + + if (c <= 0x7F) + charset = charset_roman; + else + { + int b1 = c >> 8, b2 = c & 0x7F; + if (b1 < 0xA1 || b1 > 0xFE + || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE) + error ("Invalid code: %d", code); + charset = charset_big5; + } + c = DECODE_CHAR (charset, (unsigned )c); + if (c < 0) + error ("Invalid code: %d", code); + return make_number (c); +} + +DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0, + doc: /* Encode the Big5 character CHAR to BIG5 coding system. +Return the corresponding character code in Big5. */) + (ch) + Lisp_Object ch; +{ + Lisp_Object spec, attrs, charset_list; + struct charset *charset; + int c; + unsigned code; + + CHECK_CHARACTER (ch); + c = XFASTINT (ch); + CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec); + attrs = AREF (spec, 0); + if (ASCII_CHAR_P (c) + && ! 
NILP (CODING_ATTR_ASCII_COMPAT (attrs))) + return ch; + + charset_list = CODING_ATTR_CHARSET_LIST (attrs); + charset = char_charset (c, charset_list, &code); + if (code == CHARSET_INVALID_CODE (charset)) + error ("Can't encode by Big5 encoding: %d", c); + + return make_number (code); +} + + +DEFUN ("set-terminal-coding-system-internal", + Fset_terminal_coding_system_internal, + Sset_terminal_coding_system_internal, 1, 1, 0, + doc: /* Internal use only. */) + (coding_system) +{ + CHECK_SYMBOL (coding_system); + setup_coding_system (Fcheck_coding_system (coding_system), + &terminal_coding); + + /* We had better not send unsafe characters to terminal. */ + terminal_coding.mode |= CODING_MODE_SAFE_ENCODING; + /* Characer composition should be disabled. */ + terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK; + terminal_coding.src_multibyte = 1; + terminal_coding.dst_multibyte = 0; + return Qnil; +} + +DEFUN ("set-safe-terminal-coding-system-internal", + Fset_safe_terminal_coding_system_internal, + Sset_safe_terminal_coding_system_internal, 1, 1, 0, + doc: /* Internal use only. */) + (coding_system) +{ + CHECK_SYMBOL (coding_system); + setup_coding_system (Fcheck_coding_system (coding_system), + &safe_terminal_coding); + /* Characer composition should be disabled. */ + safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK; + safe_terminal_coding.src_multibyte = 1; + safe_terminal_coding.dst_multibyte = 0; + return Qnil; +} + +DEFUN ("terminal-coding-system", + Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0, + doc: /* Return coding system specified for terminal output. */) + () +{ + return CODING_ID_NAME (terminal_coding.id); +} + +DEFUN ("set-keyboard-coding-system-internal", + Fset_keyboard_coding_system_internal, + Sset_keyboard_coding_system_internal, 1, 1, 0, + doc: /* Internal use only. 
*/) + (coding_system) + Lisp_Object coding_system; +{ + /* Validate CODING_SYSTEM and set up `keyboard_coding' from it. */ + CHECK_SYMBOL (coding_system); + setup_coding_system (Fcheck_coding_system (coding_system), + &keyboard_coding); + /* Character composition should be disabled. */ + keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK; + return Qnil; +} + +DEFUN ("keyboard-coding-system", + Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0, + doc: /* Return coding system specified for decoding keyboard input. */) + () +{ + return CODING_ID_NAME (keyboard_coding.id); +} + + +DEFUN ("find-operation-coding-system", Ffind_operation_coding_system, + Sfind_operation_coding_system, 1, MANY, 0, + doc: /* Choose a coding system for an operation based on the target name. +The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM). +DECODING-SYSTEM is the coding system to use for decoding +\(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system +for encoding (in case OPERATION does encoding). + +The first argument OPERATION specifies an I/O primitive: + For file I/O, `insert-file-contents' or `write-region'. + For process I/O, `call-process', `call-process-region', or `start-process'. + For network I/O, `open-network-stream'. + +The remaining arguments should be the same arguments that were passed +to the primitive. Depending on which primitive, one of those arguments +is selected as the TARGET. For example, if OPERATION does file I/O, +whichever argument specifies the file name is TARGET. + +TARGET has a meaning which depends on OPERATION: + For file I/O, TARGET is a file name. + For process I/O, TARGET is a process name. + For network I/O, TARGET is a service name or a port number. + +This function looks up what is specified for TARGET in +`file-coding-system-alist', `process-coding-system-alist', +or `network-coding-system-alist' depending on OPERATION. +They may specify a coding system, a cons of coding systems, +or a function symbol to call. 
+In the last case, we call the function with one argument, +which is a list of all the arguments given to this function. + +usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */) + (nargs, args) + int nargs; + Lisp_Object *args; +{ + Lisp_Object operation, target_idx, target, val; + register Lisp_Object chain; + + if (nargs < 2) + error ("Too few arguments"); + operation = args[0]; + if (!SYMBOLP (operation) + || !INTEGERP (target_idx = Fget (operation, Qtarget_idx))) + error ("Invalid first arguement"); + if (nargs < 1 + XINT (target_idx)) + error ("Too few arguments for operation: %s", + XSYMBOL (operation)->name->data); + target = args[XINT (target_idx) + 1]; + if (!(STRINGP (target) + || (EQ (operation, Qopen_network_stream) && INTEGERP (target)))) + error ("Invalid %dth argument", XINT (target_idx) + 1); + + chain = ((EQ (operation, Qinsert_file_contents) + || EQ (operation, Qwrite_region)) + ? Vfile_coding_system_alist + : (EQ (operation, Qopen_network_stream) + ? Vnetwork_coding_system_alist + : Vprocess_coding_system_alist)); + if (NILP (chain)) + return Qnil; + + for (; CONSP (chain); chain = XCDR (chain)) + { + Lisp_Object elt; + + elt = XCAR (chain); + if (CONSP (elt) + && ((STRINGP (target) + && STRINGP (XCAR (elt)) + && fast_string_match (XCAR (elt), target) >= 0) + || (INTEGERP (target) && EQ (target, XCAR (elt))))) + { + val = XCDR (elt); + /* Here, if VAL is both a valid coding system and a valid + function symbol, we return VAL as a coding system. */ + if (CONSP (val)) + return val; + if (! SYMBOLP (val)) + return Qnil; + if (! NILP (Fcoding_system_p (val))) + return Fcons (val, val); + if (! NILP (Ffboundp (val))) + { + val = call1 (val, Flist (nargs, args)); + if (CONSP (val)) + return val; + if (SYMBOLP (val) && ! 
NILP (Fcoding_system_p (val))) + return Fcons (val, val); + } + return Qnil; + } + } + return Qnil; +} + +DEFUN ("set-coding-system-priority", Fset_coding_system_priority, + Sset_coding_system_priority, 1, MANY, 0, + doc: /* Assign higher priority to the coding systems given as arguments. +If several of the coding systems belong to the same category, +all but the first one are ignored. */) + (nargs, args) + int nargs; + Lisp_Object *args; +{ + int i, j; + /* CHANGED[C] is nonzero once category C has been given a priority + by one of the arguments. */ + int changed[coding_category_max]; + enum coding_category priorities[coding_category_max]; + + bzero (changed, sizeof changed); + + for (i = j = 0; i < nargs; i++) + { + enum coding_category category; + Lisp_Object spec, attrs; + + CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec); + attrs = AREF (spec, 0); + category = XINT (CODING_ATTR_CATEGORY (attrs)); + if (changed[category]) + /* Ignore this coding system because a coding system of the + same category already had a higher priority. */ + continue; + changed[category] = 1; + priorities[j++] = category; + /* If the category already has a coding system set up and it is + not this one, set the category up again from this one. */ + if (coding_categories[category].id >= 0 + && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id))) + setup_coding_system (args[i], &coding_categories[category]); + } + + /* Now we have decided top J priorities. Reflect the order of the + original priorities to the remaining priorities. */ + + /* From here on, I is the next free slot in PRIORITIES and J walks + the old priority order, skipping categories already placed. */ + for (i = j, j = 0; i < coding_category_max; i++, j++) + { + while (j < coding_category_max + && changed[coding_priorities[j]]) + j++; + if (j == coding_category_max) + abort (); + priorities[i] = coding_priorities[j]; + } + + bcopy (priorities, coding_priorities, sizeof priorities); + return Qnil; +} + +DEFUN ("coding-system-priority-list", Fcoding_system_priority_list, + Scoding_system_priority_list, 0, 1, 0, + doc: /* Return a list of coding systems ordered by their priorities. +HIGHESTP non-nil means just return the highest priority one. 
*/) + (highestp) + Lisp_Object highestp; +{ + int i; + Lisp_Object val; + + for (i = 0, val = Qnil; i < coding_category_max; i++) + { + enum coding_category category = coding_priorities[i]; + int id = coding_categories[category].id; + Lisp_Object attrs; + + /* Skip categories whose coding system id is negative. */ + if (id < 0) + continue; + attrs = CODING_ID_ATTRS (id); + if (! NILP (highestp)) + return CODING_ATTR_BASE_NAME (attrs); + val = Fcons (CODING_ATTR_BASE_NAME (attrs), val); + } + /* The names were consed in reverse priority order. */ + return Fnreverse (val); +} + +/* Return a vector of three symbols whose names are the name of the + symbol BASE followed by "-unix", "-dos", and "-mac", in this + order. */ + +static Lisp_Object +make_subsidiaries (base) + Lisp_Object base; +{ + Lisp_Object subsidiaries; + char *suffixes[] = { "-unix", "-dos", "-mac" }; + int base_name_len = STRING_BYTES (XSYMBOL (base)->name); + /* Room for the longest suffix (5 bytes) and the terminating null. */ + char *buf = (char *) alloca (base_name_len + 6); + int i; + + bcopy (XSYMBOL (base)->name->data, buf, base_name_len); + subsidiaries = Fmake_vector (make_number (3), Qnil); + for (i = 0; i < 3; i++) + { + /* Copy the suffix together with its terminating null. */ + bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1); + ASET (subsidiaries, i, intern (buf)); + } + return subsidiaries; +} + + +DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal, + Sdefine_coding_system_internal, coding_arg_max, MANY, 0, + doc: /* For internal use only. */) + (nargs, args) + int nargs; + Lisp_Object *args; +{ + Lisp_Object name; + Lisp_Object spec_vec; /* [ ATTRS ALIASES EOL_TYPE ] */ + Lisp_Object attrs; /* Vector of attributes. */ + Lisp_Object eol_type; + Lisp_Object aliases; + Lisp_Object coding_type, charset_list, safe_charsets; + enum coding_category category; + Lisp_Object tail, val; + int max_charset_id = 0; + int i; + + if (nargs < coding_arg_max) + goto short_args; + + attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil); + + name = args[coding_arg_name]; + CHECK_SYMBOL (name); + CODING_ATTR_BASE_NAME (attrs) = name; + + val = args[coding_arg_mnemonic]; + if (! 
STRINGP (val)) + CHECK_CHARACTER (val); + CODING_ATTR_MNEMONIC (attrs) = val; + + coding_type = args[coding_arg_coding_type]; + CHECK_SYMBOL (coding_type); + CODING_ATTR_TYPE (attrs) = coding_type; + + charset_list = args[coding_arg_charset_list]; + if (SYMBOLP (charset_list)) + { + if (EQ (charset_list, Qiso_2022)) + { + if (! EQ (coding_type, Qiso_2022)) + error ("Invalid charset-list"); + charset_list = Viso_2022_charset_list; + } + else if (EQ (charset_list, Qemacs_mule)) + { + if (! EQ (coding_type, Qemacs_mule)) + error ("Invalid charset-list"); + charset_list = Vemacs_mule_charset_list; + } + for (tail = charset_list; CONSP (tail); tail = XCDR (tail)) + if (max_charset_id < XFASTINT (XCAR (tail))) + max_charset_id = XFASTINT (XCAR (tail)); + } + else + { + charset_list = Fcopy_sequence (charset_list); + for (tail = charset_list; !NILP (tail); tail = Fcdr (tail)) + { + struct charset *charset; + + val = Fcar (tail); + CHECK_CHARSET_GET_CHARSET (val, charset); + if (EQ (coding_type, Qiso_2022) + ? CHARSET_ISO_FINAL (charset) < 0 + : EQ (coding_type, Qemacs_mule) + ? CHARSET_EMACS_MULE_ID (charset) < 0 + : 0) + error ("Can't handle charset `%s'", + XSYMBOL (CHARSET_NAME (charset))->name->data); + + XCAR (tail) = make_number (charset->id); + if (max_charset_id < charset->id) + max_charset_id = charset->id; + } + } + CODING_ATTR_CHARSET_LIST (attrs) = charset_list; + + safe_charsets = Fmake_string (make_number (max_charset_id + 1), + make_number (255)); + for (tail = charset_list; CONSP (tail); tail = XCDR (tail)) + XSTRING (safe_charsets)->data[XFASTINT (XCAR (tail))] = 0; + CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets; + + val = args[coding_arg_decode_translation_table]; + if (! NILP (val)) + CHECK_CHAR_TABLE (val); + CODING_ATTR_DECODE_TBL (attrs) = val; + + val = args[coding_arg_encode_translation_table]; + if (! 
NILP (val)) + CHECK_CHAR_TABLE (val); + CODING_ATTR_ENCODE_TBL (attrs) = val; + + val = args[coding_arg_post_read_conversion]; + CHECK_SYMBOL (val); + CODING_ATTR_POST_READ (attrs) = val; + + val = args[coding_arg_pre_write_conversion]; + CHECK_SYMBOL (val); + CODING_ATTR_PRE_WRITE (attrs) = val; + + val = args[coding_arg_default_char]; + if (NILP (val)) + CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' '); + else + { + CHECK_CHARACTER (val); + CODING_ATTR_DEFAULT_CHAR (attrs) = val; + } + + val = args[coding_arg_plist]; + CHECK_LIST (val); + CODING_ATTR_PLIST (attrs) = val; + + if (EQ (coding_type, Qcharset)) + { + val = Fmake_vector (make_number (256), Qnil); + + for (tail = charset_list; CONSP (tail); tail = XCDR (tail)) + { + struct charset *charset = CHARSET_FROM_ID (XINT (XCAR (tail))); + + for (i = charset->code_space[0]; i <= charset->code_space[1]; i++) + if (NILP (AREF (val, i))) + ASET (val, i, XCAR (tail)); + } + ASET (attrs, coding_attr_charset_valids, val); + category = coding_category_charset; + } + else if (EQ (coding_type, Qccl)) + { + Lisp_Object valids; + + if (nargs < coding_arg_ccl_max) + goto short_args; + + val = args[coding_arg_ccl_decoder]; + CHECK_CCL_PROGRAM (val); + if (VECTORP (val)) + val = Fcopy_sequence (val); + ASET (attrs, coding_attr_ccl_decoder, val); + + val = args[coding_arg_ccl_encoder]; + CHECK_CCL_PROGRAM (val); + if (VECTORP (val)) + val = Fcopy_sequence (val); + ASET (attrs, coding_attr_ccl_encoder, val); + + val = args[coding_arg_ccl_valids]; + valids = Fmake_string (make_number (256), make_number (0)); + for (tail = val; !NILP (tail); tail = Fcdr (tail)) + { + val = Fcar (tail); + if (INTEGERP (val)) + ASET (valids, XINT (val), 1); + else + { + int from, to; + + CHECK_CONS (val); + CHECK_NUMBER (XCAR (val)); + CHECK_NUMBER (XCDR (val)); + from = XINT (XCAR (val)); + to = XINT (XCDR (val)); + for (i = from; i <= to; i++) + ASET (valids, i, 1); + } + } + ASET (attrs, coding_attr_ccl_valids, valids); + + category = 
coding_category_ccl; + } + else if (EQ (coding_type, Qutf_16)) + { + Lisp_Object bom, endian; + + if (nargs < coding_arg_utf16_max) + goto short_args; + + bom = args[coding_arg_utf16_bom]; + if (! NILP (bom) && ! EQ (bom, Qt)) + { + CHECK_CONS (bom); + CHECK_CODING_SYSTEM (XCAR (bom)); + CHECK_CODING_SYSTEM (XCDR (bom)); + } + ASET (attrs, coding_attr_utf_16_bom, bom); + + endian = args[coding_arg_utf16_endian]; + ASET (attrs, coding_attr_utf_16_endian, endian); + + category = (CONSP (bom) + ? coding_category_utf_16_auto + : NILP (bom) + ? (NILP (endian) + ? coding_category_utf_16_be_nosig + : coding_category_utf_16_le_nosig) + : (NILP (endian) + ? coding_category_utf_16_be + : coding_category_utf_16_le)); + } + else if (EQ (coding_type, Qiso_2022)) + { + Lisp_Object initial, reg_usage, request, flags; + struct charset *charset; + int i, id, max_id = -1; + + if (nargs < coding_arg_iso2022_max) + goto short_args; + + initial = Fcopy_sequence (args[coding_arg_iso2022_initial]); + CHECK_VECTOR (initial); + for (i = 0; i < 4; i++) + { + val = Faref (initial, make_number (i)); + if (! NILP (val)) + { + CHECK_CHARSET_GET_ID (val, id); + ASET (initial, i, make_number (id)); + } + else + ASET (initial, i, make_number (-1)); + } + + reg_usage = args[coding_arg_iso2022_reg_usage]; + CHECK_CONS (reg_usage); + CHECK_NATNUM (XCAR (reg_usage)); + CHECK_NATNUM (XCDR (reg_usage)); + + request = Fcopy_sequence (args[coding_arg_iso2022_request]); + for (tail = request; ! 
NILP (tail); tail = Fcdr (tail)) + { + int id; + + val = Fcar (tail); + CHECK_CONS (val); + CHECK_CHARSET_GET_ID (XCAR (val), id); + CHECK_NATNUM (XCDR (val)); + if (XINT (XCDR (val)) >= 4) + error ("Invalid graphic register number: %d", XINT (XCDR (val))); + XCAR (val) = make_number (id); + } + + flags = args[coding_arg_iso2022_flags]; + CHECK_NATNUM (flags); + i = XINT (flags); + if (EQ (args[coding_arg_charset_list], Qiso_2022)) + flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT); + + ASET (attrs, coding_attr_iso_initial, initial); + ASET (attrs, coding_attr_iso_usage, reg_usage); + ASET (attrs, coding_attr_iso_request, request); + ASET (attrs, coding_attr_iso_flags, flags); + setup_iso_safe_charsets (attrs); + + if (i & CODING_ISO_FLAG_SEVEN_BITS) + category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT + | CODING_ISO_FLAG_SINGLE_SHIFT)) + ? coding_category_iso_7_else + : EQ (args[coding_arg_charset_list], Qiso_2022) + ? coding_category_iso_7 + : coding_category_iso_7_tight); + else + { + int id = XINT (AREF (initial, 1)); + + category = (((i & (CODING_ISO_FLAG_LOCKING_SHIFT + | CODING_ISO_FLAG_SINGLE_SHIFT)) + || EQ (args[coding_arg_charset_list], Qiso_2022) + || id < 0) + ? coding_category_iso_8_else + : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1) + ? 
coding_category_iso_8_1 + : coding_category_iso_8_2); + } + } + else if (EQ (coding_type, Qemacs_mule)) + { + if (EQ (args[coding_arg_charset_list], Qemacs_mule)) + ASET (attrs, coding_attr_emacs_mule_full, Qt); + + category = coding_category_emacs_mule; + } + else if (EQ (coding_type, Qshift_jis)) + { + + struct charset *charset; + + if (XINT (Flength (charset_list)) != 3) + error ("There should be just three charsets"); + + charset = CHARSET_FROM_ID (XINT (XCAR (charset_list))); + if (CHARSET_DIMENSION (charset) != 1) + error ("Dimension of charset %s is not one", + XSYMBOL (CHARSET_NAME (charset))->name->data); + + charset_list = XCDR (charset_list); + charset = CHARSET_FROM_ID (XINT (XCAR (charset_list))); + if (CHARSET_DIMENSION (charset) != 1) + error ("Dimension of charset %s is not one", + XSYMBOL (CHARSET_NAME (charset))->name->data); + + charset_list = XCDR (charset_list); + charset = CHARSET_FROM_ID (XINT (XCAR (charset_list))); + if (CHARSET_DIMENSION (charset) != 2) + error ("Dimension of charset %s is not two", + XSYMBOL (CHARSET_NAME (charset))->name->data); + + category = coding_category_sjis; + Vsjis_coding_system = name; + } + else if (EQ (coding_type, Qbig5)) + { + struct charset *charset; + + if (XINT (Flength (charset_list)) != 2) + error ("There should be just two charsets"); + + charset = CHARSET_FROM_ID (XINT (XCAR (charset_list))); + if (CHARSET_DIMENSION (charset) != 1) + error ("Dimension of charset %s is not one", + XSYMBOL (CHARSET_NAME (charset))->name->data); + + charset_list = XCDR (charset_list); + charset = CHARSET_FROM_ID (XINT (XCAR (charset_list))); + if (CHARSET_DIMENSION (charset) != 2) + error ("Dimension of charset %s is not two", + XSYMBOL (CHARSET_NAME (charset))->name->data); + + category = coding_category_big5; + Vbig5_coding_system = name; + } + else if (EQ (coding_type, Qraw_text)) + category = coding_category_raw_text; + else if (EQ (coding_type, Qutf_8)) + category = coding_category_utf_8; + else if (EQ (coding_type, 
Qundecided)) + category = coding_category_undecided; + else + error ("Invalid coding system type: %s", + XSYMBOL (coding_type)->name->data); + + CODING_ATTR_CATEGORY (attrs) = make_number (category); + + eol_type = args[coding_arg_eol_type]; + if (! NILP (eol_type) + && ! EQ (eol_type, Qunix) + && ! EQ (eol_type, Qdos) + && ! EQ (eol_type, Qmac)) + error ("Invalid eol-type"); + + aliases = Fcons (name, Qnil); + + if (NILP (eol_type)) + { + eol_type = make_subsidiaries (name); + for (i = 0; i < 3; i++) + { + Lisp_Object this_spec, this_name, this_aliases, this_eol_type; + + this_name = AREF (eol_type, i); + this_aliases = Fcons (this_name, Qnil); + this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac); + this_spec = Fmake_vector (make_number (3), attrs); + ASET (this_spec, 1, this_aliases); + ASET (this_spec, 2, this_eol_type); + Fputhash (this_name, this_spec, Vcoding_system_hash_table); + Vcoding_system_list = Fcons (this_name, Vcoding_system_list); + Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (this_name), Qnil), + Vcoding_system_alist); + } + } + + spec_vec = Fmake_vector (make_number (3), attrs); + ASET (spec_vec, 1, aliases); + ASET (spec_vec, 2, eol_type); + + Fputhash (name, spec_vec, Vcoding_system_hash_table); + Vcoding_system_list = Fcons (name, Vcoding_system_list); + Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil), + Vcoding_system_alist); + + { + int id = coding_categories[category].id; + + if (id < 0 || EQ (name, CODING_ID_NAME (id))) + setup_coding_system (name, &coding_categories[category]); + } + + return Qnil; + + short_args: + return Fsignal (Qwrong_number_of_arguments, + Fcons (intern ("define-coding-system-internal"), + make_number (nargs))); +} + +DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, + Sdefine_coding_system_alias, 2, 2, 0, + doc: /* Define ALIAS as an alias for CODING-SYSTEM. 
*/) + (alias, coding_system) + Lisp_Object alias, coding_system; +{ + Lisp_Object spec, aliases, eol_type; + + CHECK_SYMBOL (alias); + CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec); + /* Append ALIAS to the end of the alias list kept in slot 1 of SPEC. */ + aliases = AREF (spec, 1); + while (!NILP (XCDR (aliases))) + aliases = XCDR (aliases); + XCDR (aliases) = Fcons (alias, Qnil); + + eol_type = AREF (spec, 2); + if (VECTORP (eol_type)) + { + /* CODING-SYSTEM has eol subsidiaries; alias each of them under + the corresponding subsidiary name of ALIAS. */ + Lisp_Object subsidiaries; + int i; + + subsidiaries = make_subsidiaries (alias); + for (i = 0; i < 3; i++) + Fdefine_coding_system_alias (AREF (subsidiaries, i), + AREF (eol_type, i)); + + /* Do not store SUBSIDIARIES into SPEC: SPEC is shared with + CODING-SYSTEM, whose eol-type slot must keep naming its own + subsidiaries. */ + } + + Fputhash (alias, spec, Vcoding_system_hash_table); + /* Register ALIAS in the same way Fdefine_coding_system_internal + registers a new name, as the docstrings of `coding-system-list' + and `coding-system-alist' state. */ + Vcoding_system_list = Fcons (alias, Vcoding_system_list); + Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil), + Vcoding_system_alist); + + return Qnil; +} + +DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base, + 1, 1, 0, + doc: /* Return the base of CODING-SYSTEM. +An alias or a subsidiary coding system is not a base coding system. +If CODING-SYSTEM is nil, the value is `no-conversion'. */) + (coding_system) + Lisp_Object coding_system; +{ + Lisp_Object spec, attrs; + + if (NILP (coding_system)) + return (Qno_conversion); + CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec); + attrs = AREF (spec, 0); + return CODING_ATTR_BASE_NAME (attrs); +} + +DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist, + 1, 1, 0, + doc: /* Return the property list of CODING-SYSTEM. +If CODING-SYSTEM is nil, `no-conversion' is assumed. */) + (coding_system) + Lisp_Object coding_system; +{ + Lisp_Object spec, attrs; + + if (NILP (coding_system)) + coding_system = Qno_conversion; + CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec); + attrs = AREF (spec, 0); + return CODING_ATTR_PLIST (attrs); +} + + +DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases, + 1, 1, 0, + doc: /* Return the list of aliases of CODING-SYSTEM. +The list begins with the name under which the coding system was defined; +names added by `define-coding-system-alias' follow in the order they +were defined. If CODING-SYSTEM is nil, `no-conversion' is assumed. 
*/) + (coding_system) + Lisp_Object coding_system; +{ + Lisp_Object spec; + + if (NILP (coding_system)) + coding_system = Qno_conversion; + CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec); + return AREF (spec, 1); /* Slot 1 is the alias list; slot 2 is the eol-type. */ +} + +DEFUN ("coding-system-eol-type", Fcoding_system_eol_type, + Scoding_system_eol_type, 1, 1, 0, + doc: /* Return eol-type of CODING-SYSTEM. +An eol-type is integer 0, 1, 2, or a vector of coding systems. + +Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF, +and CR respectively. + +A vector value indicates that a format of end-of-line should be +detected automatically. Nth element of the vector is the subsidiary +coding system whose eol-type is N. + +If CODING-SYSTEM is nil, `no-conversion' is assumed. Return nil if +CODING-SYSTEM is not a valid coding system. */) + (coding_system) + Lisp_Object coding_system; +{ + Lisp_Object spec, eol_type; + int n; + + if (NILP (coding_system)) + coding_system = Qno_conversion; + if (! CODING_SYSTEM_P (coding_system)) + return Qnil; + spec = CODING_SYSTEM_SPEC (coding_system); + eol_type = AREF (spec, 2); + if (VECTORP (eol_type)) + return Fcopy_sequence (eol_type); /* Hand out a copy, not the stored vector. */ + n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2; + return make_number (n); +} + +#endif /* emacs */ + + +/*** 9. Post-amble ***/ + +void +init_coding_once () +{ + int i; + + for (i = 0; i < coding_category_max; i++) + { + coding_categories[i].id = -1; + coding_priorities[i] = i; + } + + /* ISO2022 specific initialize routine. 
*/ + for (i = 0; i < 0x20; i++) + iso_code_class[i] = ISO_control_0; + for (i = 0x21; i < 0x7F; i++) + iso_code_class[i] = ISO_graphic_plane_0; + for (i = 0x80; i < 0xA0; i++) + iso_code_class[i] = ISO_control_1; + for (i = 0xA1; i < 0xFF; i++) + iso_code_class[i] = ISO_graphic_plane_1; + iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F; + iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF; + iso_code_class[ISO_CODE_CR] = ISO_carriage_return; + iso_code_class[ISO_CODE_SO] = ISO_shift_out; + iso_code_class[ISO_CODE_SI] = ISO_shift_in; + iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7; + iso_code_class[ISO_CODE_ESC] = ISO_escape; + iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2; + iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3; + iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer; + + inhibit_pre_post_conversion = 0; + + for (i = 0; i < 256; i++) + { + emacs_mule_bytes[i] = 1; + } +} + +#ifdef emacs + +void +syms_of_coding () +{ + staticpro (&Vcoding_system_hash_table); + Vcoding_system_hash_table = Fmakehash (Qeq); + + staticpro (&Vsjis_coding_system); + Vsjis_coding_system = Qnil; + + staticpro (&Vbig5_coding_system); + Vbig5_coding_system = Qnil; + + staticpro (&Vcode_conversion_work_buf_list); + Vcode_conversion_work_buf_list = Qnil; + + staticpro (&Vcode_conversion_reused_work_buf); + Vcode_conversion_reused_work_buf = Qnil; + + DEFSYM (Qcharset, "charset"); + DEFSYM (Qtarget_idx, "target-idx"); + DEFSYM (Qcoding_system_history, "coding-system-history"); + Fset (Qcoding_system_history, Qnil); + + /* Target FILENAME is the first argument. */ + Fput (Qinsert_file_contents, Qtarget_idx, make_number (0)); + /* Target FILENAME is the third argument. */ + Fput (Qwrite_region, Qtarget_idx, make_number (2)); + + DEFSYM (Qcall_process, "call-process"); + /* Target PROGRAM is the first argument. 
*/ + Fput (Qcall_process, Qtarget_idx, make_number (0)); + + DEFSYM (Qcall_process_region, "call-process-region"); + /* Target PROGRAM is the third argument. */ + Fput (Qcall_process_region, Qtarget_idx, make_number (2)); + + DEFSYM (Qstart_process, "start-process"); + /* Target PROGRAM is the third argument. */ + Fput (Qstart_process, Qtarget_idx, make_number (2)); + + DEFSYM (Qopen_network_stream, "open-network-stream"); + /* Target SERVICE is the fourth argument. */ + Fput (Qopen_network_stream, Qtarget_idx, make_number (3)); + + DEFSYM (Qcoding_system, "coding-system"); + DEFSYM (Qcoding_aliases, "coding-aliases"); + + DEFSYM (Qeol_type, "eol-type"); + DEFSYM (Qunix, "unix"); + DEFSYM (Qdos, "dos"); + DEFSYM (Qmac, "mac"); + + DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system"); + DEFSYM (Qpost_read_conversion, "post-read-conversion"); + DEFSYM (Qpre_write_conversion, "pre-write-conversion"); + DEFSYM (Qdefault_char, "default-char"); + DEFSYM (Qundecided, "undecided"); + DEFSYM (Qno_conversion, "no-conversion"); + DEFSYM (Qraw_text, "raw-text"); + + DEFSYM (Qiso_2022, "iso-2022"); + + DEFSYM (Qutf_8, "utf-8"); + + DEFSYM (Qutf_16, "utf-16"); + DEFSYM (Qutf_16_be, "utf-16-be"); + DEFSYM (Qutf_16_be_nosig, "utf-16-be-nosig"); + DEFSYM (Qutf_16_le, "utf-16-le"); + DEFSYM (Qutf_16_le_nosig, "utf-16-le-nosig"); + DEFSYM (Qsignature, "signature"); + DEFSYM (Qendian, "endian"); + DEFSYM (Qbig, "big"); + DEFSYM (Qlittle, "little"); + + DEFSYM (Qshift_jis, "shift-jis"); + DEFSYM (Qbig5, "big5"); + + DEFSYM (Qcoding_system_p, "coding-system-p"); + + DEFSYM (Qcoding_system_error, "coding-system-error"); + Fput (Qcoding_system_error, Qerror_conditions, + Fcons (Qcoding_system_error, Fcons (Qerror, Qnil))); + Fput (Qcoding_system_error, Qerror_message, + build_string ("Invalid coding system")); + + /* Intern this now in case it isn't already done. + Setting this variable twice is harmless. + But don't staticpro it here--that is done in alloc.c. 
*/ + Qchar_table_extra_slots = intern ("char-table-extra-slots"); + + DEFSYM (Qtranslation_table, "translation-table"); + Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1)); + DEFSYM (Qtranslation_table_id, "translation-table-id"); + DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode"); + DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode"); + + DEFSYM (Qchar_coding_system, "char-coding-system"); + + Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (2)); + + DEFSYM (Qvalid_codes, "valid-codes"); + + DEFSYM (Qemacs_mule, "emacs-mule"); + + Vcoding_category_table + = Fmake_vector (make_number (coding_category_max), Qnil); + staticpro (&Vcoding_category_table); + /* Followings are target of code detection. */ + ASET (Vcoding_category_table, coding_category_iso_7, + intern ("coding-category-iso-7")); + ASET (Vcoding_category_table, coding_category_iso_7_tight, + intern ("coding-category-iso-7-tight")); + ASET (Vcoding_category_table, coding_category_iso_8_1, + intern ("coding-category-iso-8-1")); + ASET (Vcoding_category_table, coding_category_iso_8_2, + intern ("coding-category-iso-8-2")); + ASET (Vcoding_category_table, coding_category_iso_7_else, + intern ("coding-category-iso-7-else")); + ASET (Vcoding_category_table, coding_category_iso_8_else, + intern ("coding-category-iso-8-else")); + ASET (Vcoding_category_table, coding_category_utf_8, + intern ("coding-category-utf-8")); + ASET (Vcoding_category_table, coding_category_utf_16_be, + intern ("coding-category-utf-16-be")); + ASET (Vcoding_category_table, coding_category_utf_16_le, + intern ("coding-category-utf-16-le")); + ASET (Vcoding_category_table, coding_category_utf_16_be_nosig, + intern ("coding-category-utf-16-be-nosig")); + ASET (Vcoding_category_table, coding_category_utf_16_le_nosig, + intern ("coding-category-utf-16-le-nosig")); + ASET (Vcoding_category_table, coding_category_charset, + intern ("coding-category-charset")); + ASET 
(Vcoding_category_table, coding_category_sjis, + intern ("coding-category-sjis")); + ASET (Vcoding_category_table, coding_category_big5, + intern ("coding-category-big5")); + ASET (Vcoding_category_table, coding_category_ccl, + intern ("coding-category-ccl")); + ASET (Vcoding_category_table, coding_category_emacs_mule, + intern ("coding-category-emacs-mule")); + /* Followings are NOT target of code detection. */ + ASET (Vcoding_category_table, coding_category_raw_text, + intern ("coding-category-raw-text")); + ASET (Vcoding_category_table, coding_category_undecided, + intern ("coding-category-undecided")); + + { + Lisp_Object args[coding_arg_max]; + Lisp_Object plist[14]; + int i; + + for (i = 0; i < coding_arg_max; i++) + args[i] = Qnil; + + plist[0] = intern (":name"); + plist[1] = args[coding_arg_name] = Qno_conversion; + plist[2] = intern (":mnemonic"); + plist[3] = args[coding_arg_mnemonic] = make_number ('='); + plist[4] = intern (":coding-type"); + plist[5] = args[coding_arg_coding_type] = Qraw_text; + plist[6] = intern (":ascii-compatible-p"); + plist[7] = args[coding_arg_ascii_compatible_p] = Qt; + plist[8] = intern (":default-char"); + plist[9] = args[coding_arg_default_char] = make_number (0); + plist[10] = intern (":docstring"); + plist[11] = build_string ("Do no conversion.\n\ +\n\ +When you visit a file with this coding, the file is read into a\n\ +unibyte buffer as is, thus each byte of a file is treated as a\n\ +character."); + plist[12] = intern (":eol-type"); + plist[13] = args[coding_arg_eol_type] = Qunix; + args[coding_arg_plist] = Flist (14, plist); + Fdefine_coding_system_internal (coding_arg_max, args); + } + + setup_coding_system (Qno_conversion, &keyboard_coding); + setup_coding_system (Qno_conversion, &terminal_coding); + setup_coding_system (Qno_conversion, &safe_terminal_coding); + + defsubr (&Scoding_system_p); + defsubr (&Sread_coding_system); + defsubr (&Sread_non_nil_coding_system); + defsubr (&Scheck_coding_system); + defsubr 
(&Sdetect_coding_region); + defsubr (&Sdetect_coding_string); + defsubr (&Sfind_coding_systems_region_internal); + defsubr (&Scheck_coding_systems_region); + defsubr (&Sdecode_coding_region); + defsubr (&Sencode_coding_region); + defsubr (&Sdecode_coding_string); + defsubr (&Sencode_coding_string); + defsubr (&Sdecode_sjis_char); + defsubr (&Sencode_sjis_char); + defsubr (&Sdecode_big5_char); + defsubr (&Sencode_big5_char); + defsubr (&Sset_terminal_coding_system_internal); + defsubr (&Sset_safe_terminal_coding_system_internal); + defsubr (&Sterminal_coding_system); + defsubr (&Sset_keyboard_coding_system_internal); + defsubr (&Skeyboard_coding_system); + defsubr (&Sfind_operation_coding_system); + defsubr (&Sset_coding_system_priority); + defsubr (&Sdefine_coding_system_internal); + defsubr (&Sdefine_coding_system_alias); + defsubr (&Scoding_system_base); + defsubr (&Scoding_system_plist); + defsubr (&Scoding_system_aliases); + defsubr (&Scoding_system_eol_type); + defsubr (&Scoding_system_priority_list); + + DEFVAR_LISP ("coding-system-list", &Vcoding_system_list, + doc: /* List of coding systems. + +Do not alter the value of this variable manually. This variable should be +updated by the functions `define-coding-system' and +`define-coding-system-alias'. */); + Vcoding_system_list = Qnil; + + DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist, + doc: /* Alist of coding system names. +Each element is a one-element list holding a coding system name. +This variable is given to `completing-read' as TABLE argument. + +Do not alter the value of this variable manually. This variable should be +updated by the functions `define-coding-system' and +`define-coding-system-alias'. */); + Vcoding_system_alist = Qnil; + + DEFVAR_LISP ("coding-category-list", &Vcoding_category_list, + doc: /* List of coding-categories (symbols) ordered by priority. + +On detecting a coding system, Emacs tries code detection algorithms +associated with each coding-category one by one in this order. 
When +one algorithm agrees with a byte sequence of source text, the coding +system bound to the corresponding coding-category is selected. */); + { + int i; + + Vcoding_category_list = Qnil; + for (i = coding_category_max - 1; i >= 0; i--) + Vcoding_category_list + = Fcons (XVECTOR (Vcoding_category_table)->contents[i], + Vcoding_category_list); + } + + DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read, + doc: /* Specify the coding system for read operations. +It is useful to bind this variable with `let', but do not set it globally. +If the value is a coding system, it is used for decoding on read operation. +If not, an appropriate element is used from one of the coding system alists: +There are three such tables, `file-coding-system-alist', +`process-coding-system-alist', and `network-coding-system-alist'. */); + Vcoding_system_for_read = Qnil; + + DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write, + doc: /* Specify the coding system for write operations. +Programs bind this variable with `let', but you should not set it globally. +If the value is a coding system, it is used for encoding of output, +when writing it to a file and when sending it to a file or subprocess. + +If this does not specify a coding system, an appropriate element +is used from one of the coding system alists: +There are three such tables, `file-coding-system-alist', +`process-coding-system-alist', and `network-coding-system-alist'. +For output to files, if the above procedure does not specify a coding system, +the value of `buffer-file-coding-system' is used. */); + Vcoding_system_for_write = Qnil; + + DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used, + doc: /* +Coding system used in the latest file or process I/O. */); + Vlast_coding_system_used = Qnil; + + DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion, + doc: /* +*Non-nil means always inhibit code conversion of end-of-line format. 
+See info node `Coding Systems' and info node `Text and Binary' concerning +such conversion. */); + inhibit_eol_conversion = 0; + + DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system, + doc: /* +Non-nil means process buffer inherits coding system of process output. +Bind it to t if the process output is to be treated as if it were a file +read from some filesystem. */); + inherit_process_coding_system = 0; + + DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist, + doc: /* +Alist to decide a coding system to use for a file I/O operation. +The format is ((PATTERN . VAL) ...), +where PATTERN is a regular expression matching a file name, +VAL is a coding system, a cons of coding systems, or a function symbol. +If VAL is a coding system, it is used for both decoding and encoding +the file contents. +If VAL is a cons of coding systems, the car part is used for decoding, +and the cdr part is used for encoding. +If VAL is a function symbol, the function must return a coding system +or a cons of coding systems which are used as above. The function gets +the arguments with which `find-operation-coding-system' was called. + +See also the function `find-operation-coding-system' +and the variable `auto-coding-alist'. */); + Vfile_coding_system_alist = Qnil; + + DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist, + doc: /* +Alist to decide a coding system to use for a process I/O operation. +The format is ((PATTERN . VAL) ...), +where PATTERN is a regular expression matching a program name, +VAL is a coding system, a cons of coding systems, or a function symbol. +If VAL is a coding system, it is used for both decoding what is received +from the program and encoding what is sent to the program. +If VAL is a cons of coding systems, the car part is used for decoding, +and the cdr part is used for encoding. 
+If VAL is a function symbol, the function must return a coding system +or a cons of coding systems which are used as above. + +See also the function `find-operation-coding-system'. */); + Vprocess_coding_system_alist = Qnil; + + DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist, + doc: /* +Alist to decide a coding system to use for a network I/O operation. +The format is ((PATTERN . VAL) ...), +where PATTERN is a regular expression matching a network service name +or is a port number to connect to, +VAL is a coding system, a cons of coding systems, or a function symbol. +If VAL is a coding system, it is used for both decoding what received +from the network stream and encoding what sent to the network stream. +If VAL is a cons of coding systems, the car part is used for decoding, +and the cdr part is used for encoding. +If VAL is a function symbol, the function must return a coding system +or a cons of coding systems which are used as above. + +See also the function `find-operation-coding-system'. */); + Vnetwork_coding_system_alist = Qnil; + + DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system, + doc: /* Coding system to use with system messages. +Also used for decoding keyboard input on X Window system. */); + Vlocale_coding_system = Qnil; + + /* The eol mnemonics are reset in startup.el system-dependently. */ + DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix, + doc: /* +*String displayed in mode line for UNIX-like (LF) end-of-line format. */); + eol_mnemonic_unix = build_string (":"); + + DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos, + doc: /* +*String displayed in mode line for DOS-like (CRLF) end-of-line format. */); + eol_mnemonic_dos = build_string ("\\"); + + DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac, + doc: /* +*String displayed in mode line for MAC-like (CR) end-of-line format. 
*/); + eol_mnemonic_mac = build_string ("/"); + + DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided, + doc: /* +*String displayed in mode line when end-of-line format is not yet determined. */); + eol_mnemonic_undecided = build_string (":"); + + DEFVAR_LISP ("enable-character-translation", &Venable_character_translation, + doc: /* +*Non-nil enables character translation while encoding and decoding. */); + Venable_character_translation = Qt; + + DEFVAR_LISP ("standard-translation-table-for-decode", + &Vstandard_translation_table_for_decode, + doc: /* Table for translating characters while decoding. */); + Vstandard_translation_table_for_decode = Qnil; + + DEFVAR_LISP ("standard-translation-table-for-encode", + &Vstandard_translation_table_for_encode, + doc: /* Table for translating characters while encoding. */); + Vstandard_translation_table_for_encode = Qnil; + + DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table, + doc: /* Alist of charsets vs revision numbers. +While encoding, if a charset (car part of an element) is found, +designate it with the escape sequence identifying revision (cdr part +of the element). */); + Vcharset_revision_table = Qnil; + + DEFVAR_LISP ("default-process-coding-system", + &Vdefault_process_coding_system, + doc: /* Cons of coding systems used for process I/O by default. +The car part is used for decoding a process output, +the cdr part is used for encoding a text to be sent to a process. */); + Vdefault_process_coding_system = Qnil; + + DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table, + doc: /* +Table of extra Latin codes in the range 128..159 (inclusive). +This is a vector of length 256. +If Nth element is non-nil, the existence of code N in a file +\(or output of subprocess) doesn't prevent it to be detected as +a coding system of ISO 2022 variant which has a flag +`accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file +or reading output of a subprocess. 
+Only the 128th through 159th elements have a meaning. */); + Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil); + + DEFVAR_LISP ("select-safe-coding-system-function", + &Vselect_safe_coding_system_function, + doc: /* +Function to call to select safe coding system for encoding a text. + +If set, this function is called to force a user to select a proper +coding system which can encode the text in the case that a default +coding system used in each operation can't encode the text. + +The default value is `select-safe-coding-system' (which see). */); + Vselect_safe_coding_system_function = Qnil; + + DEFVAR_LISP ("char-coding-system-table", &Vchar_coding_system_table, + doc: /* +Char-table containing safe coding systems of each character. +Each element doesn't include such generic coding systems that can +encode any characters. They are in the first extra slot. */); + Vchar_coding_system_table = Fmake_char_table (Qchar_coding_system, Qnil); + + DEFVAR_BOOL ("inhibit-iso-escape-detection", + &inhibit_iso_escape_detection, + doc: /* +If non-nil, Emacs ignores ISO2022's escape sequence on code detection. + +By default, on reading a file, Emacs tries to detect how the text is +encoded. This code detection is sensitive to escape sequences. If +the sequence is valid as ISO2022, the code is determined as one of +the ISO2022 encodings, and the file is decoded by the corresponding +coding system (e.g. `iso-2022-7bit'). + +However, there may be a case that you want to read escape sequences in +a file as is. In such a case, you can set this variable to non-nil. +Then, as the code detection ignores any escape sequences, no file is +detected as encoded in some ISO2022 encoding. The result is that all +escape sequences become visible in a buffer. + +The default value is nil, and it is strongly recommended not to change +it. 
That is because many Emacs Lisp source files that contain +non-ASCII characters are encoded by the coding system `iso-2022-7bit' +in Emacs's distribution, and they won't be decoded correctly on +reading if you suppress escape sequence detection. + +The other way to read escape sequences in a file without decoding is +to explicitly specify some coding system that doesn't use ISO2022's +escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */); + inhibit_iso_escape_detection = 0; +} + +char * +emacs_strerror (error_number) + int error_number; +{ + char *str; + + synchronize_system_messages_locale (); + str = strerror (error_number); + + if (! NILP (Vlocale_coding_system)) + { + Lisp_Object dec = code_convert_string_norecord (build_string (str), + Vlocale_coding_system, + 0); + str = (char *) XSTRING (dec)->data; + } + + return str; +} + +#endif /* emacs */ diff --git a/src/coding.h b/src/coding.h dissimilarity index 69% index 500f02acf3..cc115b6ef3 100644 --- a/src/coding.h +++ b/src/coding.h @@ -1,718 +1,663 @@ -/* Header for coding system handler. - Copyright (C) 1995, 1997 Electrotechnical Laboratory, JAPAN. - Licensed to the Free Software Foundation. - -This file is part of GNU Emacs. - -GNU Emacs is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2, or (at your option) -any later version. - -GNU Emacs is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with GNU Emacs; see the file COPYING. If not, write to -the Free Software Foundation, Inc., 59 Temple Place - Suite 330, -Boston, MA 02111-1307, USA. 
*/ - -#ifndef EMACS_CODING_H -#define EMACS_CODING_H - -#include "ccl.h" - -/*** EMACS' INTERNAL FORMAT (emacs-mule) section ***/ - -/* All code (1-byte) of Emacs' internal format is classified into one - of the followings. See also `charset.h'. */ -enum emacs_code_class_type - { - EMACS_control_code, /* Control codes in the range - 0x00..0x1F and 0x7F except for the - following two codes. */ - EMACS_linefeed_code, /* 0x0A (linefeed) to denote - end-of-line. */ - EMACS_carriage_return_code, /* 0x0D (carriage-return) to be used - in selective display mode. */ - EMACS_ascii_code, /* ASCII characters. */ - EMACS_leading_code_2, /* Base leading code of official - TYPE9N character. */ - EMACS_leading_code_3, /* Base leading code of private TYPE9N - or official TYPE9Nx9N character. */ - EMACS_leading_code_4, /* Base leading code of private - TYPE9Nx9N character. */ - EMACS_invalid_code /* Invalid code, i.e. a base leading - code not yet assigned to any - charset, or a code of the range - 0xA0..0xFF. */ - }; - -extern enum emacs_code_class_type emacs_code_class[256]; - -/*** ISO2022 section ***/ - -/* Macros to define code of control characters for ISO2022's functions. */ - /* code */ /* function */ -#define ISO_CODE_LF 0x0A /* line-feed */ -#define ISO_CODE_CR 0x0D /* carriage-return */ -#define ISO_CODE_SO 0x0E /* shift-out */ -#define ISO_CODE_SI 0x0F /* shift-in */ -#define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */ -#define ISO_CODE_ESC 0x1B /* escape */ -#define ISO_CODE_SS2 0x8E /* single-shift-2 */ -#define ISO_CODE_SS3 0x8F /* single-shift-3 */ -#define ISO_CODE_CSI 0x9B /* control-sequence-introduce */ - -/* All code (1-byte) of ISO2022 is classified into one of the - followings. */ -enum iso_code_class_type - { - ISO_control_0, /* Control codes in the range - 0x00..0x1F and 0x7F, except for the - following 5 codes. 
*/ - ISO_carriage_return, /* ISO_CODE_CR (0x0D) */ - ISO_shift_out, /* ISO_CODE_SO (0x0E) */ - ISO_shift_in, /* ISO_CODE_SI (0x0F) */ - ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */ - ISO_escape, /* ISO_CODE_SO (0x1B) */ - ISO_control_1, /* Control codes in the range - 0x80..0x9F, except for the - following 3 codes. */ - ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */ - ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */ - ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */ - ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */ - ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */ - ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */ - ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */ - }; - -/** The macros CODING_FLAG_ISO_XXX defines a flag bit of the `flags' - element in the structure `coding_system'. This information is used - while encoding a text to ISO2022. **/ - -/* If set, produce short-form designation sequence (e.g. ESC $ A) - instead of long-form sequence (e.g. ESC $ ( A). */ -#define CODING_FLAG_ISO_SHORT_FORM 0x0001 - -/* If set, reset graphic planes and registers at end-of-line to the - initial state. */ -#define CODING_FLAG_ISO_RESET_AT_EOL 0x0002 - -/* If set, reset graphic planes and registers before any control - characters to the initial state. */ -#define CODING_FLAG_ISO_RESET_AT_CNTL 0x0004 - -/* If set, encode by 7-bit environment. */ -#define CODING_FLAG_ISO_SEVEN_BITS 0x0008 - -/* If set, use locking-shift function. */ -#define CODING_FLAG_ISO_LOCKING_SHIFT 0x0010 - -/* If set, use single-shift function. Overwrite - CODING_FLAG_ISO_LOCKING_SHIFT. */ -#define CODING_FLAG_ISO_SINGLE_SHIFT 0x0020 - -/* If set, designate JISX0201-Roman instead of ASCII. */ -#define CODING_FLAG_ISO_USE_ROMAN 0x0040 - -/* If set, designate JISX0208-1978 instead of JISX0208-1983. */ -#define CODING_FLAG_ISO_USE_OLDJIS 0x0080 - -/* If set, do not produce ISO6429's direction specifying sequence. 
*/ -#define CODING_FLAG_ISO_NO_DIRECTION 0x0100 - -/* If set, assume designation states are reset at beginning of line on - output. */ -#define CODING_FLAG_ISO_INIT_AT_BOL 0x0200 - -/* If set, designation sequence should be placed at beginning of line - on output. */ -#define CODING_FLAG_ISO_DESIGNATE_AT_BOL 0x0400 - -/* If set, do not encode unsafe characters on output. */ -#define CODING_FLAG_ISO_SAFE 0x0800 - -/* If set, extra latin codes (128..159) are accepted as a valid code - on input. */ -#define CODING_FLAG_ISO_LATIN_EXTRA 0x1000 - -/* If set, use designation escape sequence. */ -#define CODING_FLAG_ISO_DESIGNATION 0x10000 - -/* A character to be produced on output if encoding of the original - character is prohibited by CODING_FLAG_ISO_SAFE. */ -#define CODING_INHIBIT_CHARACTER_SUBSTITUTION 077 /* 077 == `?' */ - -/* Structure of the field `spec.iso2022' in the structure `coding_system'. */ -struct iso2022_spec -{ - /* The current graphic register invoked to each graphic plane. */ - int current_invocation[2]; - - /* The current charset designated to each graphic register. */ - int current_designation[4]; - - /* A charset initially designated to each graphic register. */ - int initial_designation[4]; - - /* If not -1, it is a graphic register specified in an invalid - designation sequence. */ - int last_invalid_designation_register; - - /* A graphic register to which each charset should be designated. */ - unsigned char requested_designation[MAX_CHARSET + 1]; - - /* A revision number to be specified for each charset on encoding. - The value 255 means no revision number for the corresponding - charset. */ - unsigned char charset_revision_number[MAX_CHARSET + 1]; - - /* Set to 1 temporarily only when graphic register 2 or 3 is invoked - by single-shift while encoding. */ - int single_shifting; - - /* Set to 1 temporarily only when processing at beginning of line. */ - int bol; -}; - -/* Macros to access each field in the structure `spec.iso2022'. 
*/ -#define CODING_SPEC_ISO_INVOCATION(coding, plane) \ - (coding)->spec.iso2022.current_invocation[plane] -#define CODING_SPEC_ISO_DESIGNATION(coding, reg) \ - (coding)->spec.iso2022.current_designation[reg] -#define CODING_SPEC_ISO_INITIAL_DESIGNATION(coding, reg) \ - (coding)->spec.iso2022.initial_designation[reg] -#define CODING_SPEC_ISO_REQUESTED_DESIGNATION(coding, charset) \ - (coding)->spec.iso2022.requested_designation[charset] -#define CODING_SPEC_ISO_REVISION_NUMBER(coding, charset) \ - (coding)->spec.iso2022.charset_revision_number[charset] -#define CODING_SPEC_ISO_SINGLE_SHIFTING(coding) \ - (coding)->spec.iso2022.single_shifting -#define CODING_SPEC_ISO_BOL(coding) \ - (coding)->spec.iso2022.bol - -/* A value which may appear in - coding->spec.iso2022.requested_designation indicating that the - corresponding charset does not request any graphic register to be - designated. */ -#define CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION 4 - -/* Return a charset which is currently designated to the graphic plane - PLANE in the coding-system CODING. */ -#define CODING_SPEC_ISO_PLANE_CHARSET(coding, plane) \ - ((CODING_SPEC_ISO_INVOCATION (coding, plane) < 0) \ - ? -1 \ - : CODING_SPEC_ISO_DESIGNATION (coding, \ - CODING_SPEC_ISO_INVOCATION (coding, plane))) - -/*** BIG5 section ***/ - -/* Macros to denote each type of BIG5 coding system. */ -#define CODING_FLAG_BIG5_HKU 0x00 /* BIG5-HKU is one of variants of - BIG5 developed by Hong Kong - University. */ -#define CODING_FLAG_BIG5_ETEN 0x01 /* BIG5_ETen is one of variants - of BIG5 developed by the - company ETen in Taiwan. */ - -/*** GENERAL section ***/ - -/* Types of coding system. */ -enum coding_type - { - coding_type_no_conversion, /* A coding system which requires no - conversion for reading and writing - including end-of-line format. */ - coding_type_emacs_mule, /* A coding system used in Emacs' - buffer and string. Requires no - conversion for reading and writing - except for end-of-line format. 
*/ - coding_type_undecided, /* A coding system which requires - automatic detection of a real - coding system. */ - coding_type_sjis, /* SJIS coding system for Japanese. */ - coding_type_iso2022, /* Any coding system of ISO2022 - variants. */ - coding_type_big5, /* BIG5 coding system for Chinese. */ - coding_type_ccl, /* The coding system of which decoder - and encoder are written in CCL. */ - coding_type_raw_text /* A coding system for a text - containing random 8-bit code which - does not require code conversion - except for end-of-line format. */ - }; - -/* Formats of end-of-line. */ -#define CODING_EOL_LF 0 /* Line-feed only, same as Emacs' - internal format. */ -#define CODING_EOL_CRLF 1 /* Sequence of carriage-return and - line-feed. */ -#define CODING_EOL_CR 2 /* Carriage-return only. */ -#define CODING_EOL_UNDECIDED 3 /* This value is used to denote the - eol-type is not yet decided. */ -#define CODING_EOL_INCONSISTENT 4 /* This value is used to denote the - eol-type is not consistent - through the file. */ - -/* 1 iff composing. */ -#define COMPOSING_P(coding) ((int) coding->composing > (int) COMPOSITION_NO) - -#define COMPOSITION_DATA_SIZE 4080 -#define COMPOSITION_DATA_MAX_BUNCH_LENGTH (4 + MAX_COMPOSITION_COMPONENTS*2) - -/* Data structure to hold information about compositions of text that - is being decoded or encode. ISO 2022 base code conversion routines - handle special ESC sequences for composition specification. But, - they can't get/put such information directly from/to a buffer in - the deepest place. So, they store or retrieve the information - through this structure. - - The encoder stores the information in this structure when it meets - ESC sequences for composition while encoding codes, then, after all - text codes are encoded, puts `composition' properties on the text - by referring to the structure. 
- - The decoder at first stores the information of a text to be - decoded, then, while decoding codes, generates ESC sequences for - composition at proper places by referring to the structure. */ - -struct composition_data -{ - /* The character position of the first character to be encoded or - decoded. START and END (see below) are relative to this - position. */ - int char_offset; - - /* The composition data. These elements are repeated for each - composition: - LENGTH START END METHOD [ COMPONENT ... ] - where, - LENGTH is the number of elements for this composition. - - START and END are starting and ending character positions of - the composition relative to `char_offset'. - - METHOD is one of `enum composing_status' specifying the way of - composition. - - COMPONENT is a character or an encoded composition rule. */ - int data[COMPOSITION_DATA_SIZE]; - - /* The number of elements in `data' currently used. */ - int used; - - /* Pointers to the previous and next structures. When `data' is - filled up, another structure is allocated and linked in `next'. - The new structure has backward link to this structure in `prev'. - The number of chained structures depends on how many compositions - the text being encoded or decoded contains. */ - struct composition_data *prev, *next; -}; - -/* Macros used for the member `result' of the struct - coding_system. */ -#define CODING_FINISH_NORMAL 0 -#define CODING_FINISH_INSUFFICIENT_SRC 1 -#define CODING_FINISH_INSUFFICIENT_DST 2 -#define CODING_FINISH_INCONSISTENT_EOL 3 -#define CODING_FINISH_INSUFFICIENT_CMP 4 -#define CODING_FINISH_INTERRUPT 5 - -/* Macros used for the member `mode' of the struct coding_system. */ - -/* If set, recover the original CR or LF of the already decoded text - when the decoding routine encounters an inconsistent eol format. 
*/ -#define CODING_MODE_INHIBIT_INCONSISTENT_EOL 0x01 - -/* If set, the decoding/encoding routines treat the current data as - the last block of the whole text to be converted, and do - appropriate finishing job. */ -#define CODING_MODE_LAST_BLOCK 0x02 - -/* If set, it means that the current source text is in a buffer which - enables selective display. */ -#define CODING_MODE_SELECTIVE_DISPLAY 0x04 - -/* This flag is used by the decoding/encoding routines on the fly. If - set, it means that right-to-left text is being processed. */ -#define CODING_MODE_DIRECTION 0x08 - -struct coding_system -{ - /* Type of the coding system. */ - enum coding_type type; - - /* Type of end-of-line format (LF, CRLF, or CR) of the coding system. */ - int eol_type; - - /* Flag bits of the coding system. The meaning of each bit is common - to all types of coding systems. */ - unsigned int common_flags; - - /* Flag bits of the coding system. The meaning of each bit depends - on the type of the coding system. */ - unsigned int flags; - - /* Mode bits of the coding system. See the comments of the macros - CODING_MODE_XXX. */ - unsigned int mode; - - /* The current status of composition handling. */ - int composing; - - /* 1 iff the next character is a composition rule. */ - int composition_rule_follows; - - /* Information of compositions are stored here on decoding and set - in advance on encoding. */ - struct composition_data *cmp_data; - - /* Index to cmp_data->data for the first element for the current - composition. */ - int cmp_data_start; - - /* Index to cmp_data->data for the current element for the current - composition. */ - int cmp_data_index; - - /* Detailed information specific to each type of coding system. */ - union spec - { - struct iso2022_spec iso2022; - struct ccl_spec ccl; /* Defined in ccl.h. */ - } spec; - - /* Index number of coding category of the coding system. 
*/ - int category_idx; - - /* The following two members specify how characters 128..159 are - represented in source and destination text respectively. 1 means - they are represented by 2-byte sequence, 0 means they are - represented by 1-byte as is (see the comment in charset.h). */ - unsigned src_multibyte : 1; - unsigned dst_multibyte : 1; - - /* How may heading bytes we can skip for decoding. This is set to - -1 in setup_coding_system, and updated by detect_coding. So, - when this is equal to the byte length of the text being - converted, we can skip the actual conversion process. */ - int heading_ascii; - - /* The following members are set by encoding/decoding routine. */ - int produced, produced_char, consumed, consumed_char; - - /* Number of error source data found in a decoding routine. */ - int errors; - - /* Finish status of code conversion. It should be one of macros - CODING_FINISH_XXXX. */ - int result; - - /* If nonzero, suppress error notification. */ - int suppress_error; - - /* The following members are all Lisp symbols. We don't have to - protect them from GC because the current garbage collection - doesn't relocate Lisp symbols. But, when it is changed, we must - find a way to protect them. */ - - /* Backward pointer to the Lisp symbol of the coding system. */ - Lisp_Object symbol; - - /* Lisp function (symbol) to be called after decoding to do - additional conversion, or nil. */ - Lisp_Object post_read_conversion; - - /* Lisp function (symbol) to be called before encoding to do - additional conversion, or nil. */ - Lisp_Object pre_write_conversion; - - /* Character translation tables to look up, or nil. 
*/ - Lisp_Object translation_table_for_decode; - Lisp_Object translation_table_for_encode; -}; - -#define CODING_REQUIRE_FLUSHING_MASK 1 -#define CODING_REQUIRE_DECODING_MASK 2 -#define CODING_REQUIRE_ENCODING_MASK 4 -#define CODING_REQUIRE_DETECTION_MASK 8 - -/* Return 1 if the coding system CODING requires specific code to be - attached at the tail of converted text. */ -#define CODING_REQUIRE_FLUSHING(coding) \ - ((coding)->common_flags & CODING_REQUIRE_FLUSHING_MASK) - -/* Return 1 if the coding system CODING requires code conversion on - decoding. */ -#define CODING_REQUIRE_DECODING(coding) \ - ((coding)->dst_multibyte \ - || (coding)->common_flags & CODING_REQUIRE_DECODING_MASK) - -/* Return 1 if the coding system CODING requires code conversion on - encoding. */ -#define CODING_REQUIRE_ENCODING(coding) \ - ((coding)->src_multibyte \ - || (coding)->common_flags & CODING_REQUIRE_ENCODING_MASK) - -/* Return 1 if the coding system CODING requires some kind of code - detection. */ -#define CODING_REQUIRE_DETECTION(coding) \ - ((coding)->common_flags & CODING_REQUIRE_DETECTION_MASK) - -/* Return 1 if the coding system CODING requires code conversion on - decoding or some kind of code detection. 
*/ -#define CODING_MAY_REQUIRE_DECODING(coding) \ - (CODING_REQUIRE_DECODING (coding) \ - || CODING_REQUIRE_DETECTION (coding)) - -/* Index for each coding category in `coding_category_table' */ -#define CODING_CATEGORY_IDX_EMACS_MULE 0 -#define CODING_CATEGORY_IDX_SJIS 1 -#define CODING_CATEGORY_IDX_ISO_7 2 -#define CODING_CATEGORY_IDX_ISO_7_TIGHT 3 -#define CODING_CATEGORY_IDX_ISO_8_1 4 -#define CODING_CATEGORY_IDX_ISO_8_2 5 -#define CODING_CATEGORY_IDX_ISO_7_ELSE 6 -#define CODING_CATEGORY_IDX_ISO_8_ELSE 7 -#define CODING_CATEGORY_IDX_CCL 8 -#define CODING_CATEGORY_IDX_BIG5 9 -#define CODING_CATEGORY_IDX_UTF_8 10 -#define CODING_CATEGORY_IDX_UTF_16_BE 11 -#define CODING_CATEGORY_IDX_UTF_16_LE 12 -#define CODING_CATEGORY_IDX_RAW_TEXT 13 -#define CODING_CATEGORY_IDX_BINARY 14 -#define CODING_CATEGORY_IDX_MAX 15 - -/* Definitions of flag bits returned by the function - detect_coding_mask (). */ -#define CODING_CATEGORY_MASK_EMACS_MULE (1 << CODING_CATEGORY_IDX_EMACS_MULE) -#define CODING_CATEGORY_MASK_SJIS (1 << CODING_CATEGORY_IDX_SJIS) -#define CODING_CATEGORY_MASK_ISO_7 (1 << CODING_CATEGORY_IDX_ISO_7) -#define CODING_CATEGORY_MASK_ISO_7_TIGHT (1 << CODING_CATEGORY_IDX_ISO_7_TIGHT) -#define CODING_CATEGORY_MASK_ISO_8_1 (1 << CODING_CATEGORY_IDX_ISO_8_1) -#define CODING_CATEGORY_MASK_ISO_8_2 (1 << CODING_CATEGORY_IDX_ISO_8_2) -#define CODING_CATEGORY_MASK_ISO_7_ELSE (1 << CODING_CATEGORY_IDX_ISO_7_ELSE) -#define CODING_CATEGORY_MASK_ISO_8_ELSE (1 << CODING_CATEGORY_IDX_ISO_8_ELSE) -#define CODING_CATEGORY_MASK_CCL (1 << CODING_CATEGORY_IDX_CCL) -#define CODING_CATEGORY_MASK_BIG5 (1 << CODING_CATEGORY_IDX_BIG5) -#define CODING_CATEGORY_MASK_UTF_8 (1 << CODING_CATEGORY_IDX_UTF_8) -#define CODING_CATEGORY_MASK_UTF_16_BE (1 << CODING_CATEGORY_IDX_UTF_16_BE) -#define CODING_CATEGORY_MASK_UTF_16_LE (1 << CODING_CATEGORY_IDX_UTF_16_LE) -#define CODING_CATEGORY_MASK_RAW_TEXT (1 << CODING_CATEGORY_IDX_RAW_TEXT) -#define CODING_CATEGORY_MASK_BINARY (1 << 
CODING_CATEGORY_IDX_BINARY) - -/* This value is returned if detect_coding_mask () find nothing other - than ASCII characters. */ -#define CODING_CATEGORY_MASK_ANY \ - ( CODING_CATEGORY_MASK_EMACS_MULE \ - | CODING_CATEGORY_MASK_SJIS \ - | CODING_CATEGORY_MASK_ISO_7 \ - | CODING_CATEGORY_MASK_ISO_7_TIGHT \ - | CODING_CATEGORY_MASK_ISO_8_1 \ - | CODING_CATEGORY_MASK_ISO_8_2 \ - | CODING_CATEGORY_MASK_ISO_7_ELSE \ - | CODING_CATEGORY_MASK_ISO_8_ELSE \ - | CODING_CATEGORY_MASK_CCL \ - | CODING_CATEGORY_MASK_BIG5 \ - | CODING_CATEGORY_MASK_UTF_8 \ - | CODING_CATEGORY_MASK_UTF_16_BE \ - | CODING_CATEGORY_MASK_UTF_16_LE) - -#define CODING_CATEGORY_MASK_ISO_7BIT \ - (CODING_CATEGORY_MASK_ISO_7 | CODING_CATEGORY_MASK_ISO_7_TIGHT) - -#define CODING_CATEGORY_MASK_ISO_8BIT \ - (CODING_CATEGORY_MASK_ISO_8_1 | CODING_CATEGORY_MASK_ISO_8_2) - -#define CODING_CATEGORY_MASK_ISO_SHIFT \ - (CODING_CATEGORY_MASK_ISO_7_ELSE | CODING_CATEGORY_MASK_ISO_8_ELSE) - -#define CODING_CATEGORY_MASK_ISO \ - ( CODING_CATEGORY_MASK_ISO_7BIT \ - | CODING_CATEGORY_MASK_ISO_SHIFT \ - | CODING_CATEGORY_MASK_ISO_8BIT) - -#define CODING_CATEGORY_MASK_UTF_16_BE_LE \ - (CODING_CATEGORY_MASK_UTF_16_BE | CODING_CATEGORY_MASK_UTF_16_LE) - -/* Macros to decode or encode a character of JISX0208 in SJIS. S1 and - S2 are the 1st and 2nd position-codes of JISX0208 in SJIS coding - system. C1 and C2 are the 1st and 2nd position codes of Emacs' - internal format. */ - -#define DECODE_SJIS(s1, s2, c1, c2) \ - do { \ - if (s2 >= 0x9F) \ - c1 = s1 * 2 - (s1 >= 0xE0 ? 0x160 : 0xE0), \ - c2 = s2 - 0x7E; \ - else \ - c1 = s1 * 2 - ((s1 >= 0xE0) ? 0x161 : 0xE1), \ - c2 = s2 - ((s2 >= 0x7F) ? 0x20 : 0x1F); \ - } while (0) - -#define ENCODE_SJIS(c1, c2, s1, s2) \ - do { \ - if (c1 & 1) \ - s1 = c1 / 2 + ((c1 < 0x5F) ? 0x71 : 0xB1), \ - s2 = c2 + ((c2 >= 0x60) ? 0x20 : 0x1F); \ - else \ - s1 = c1 / 2 + ((c1 < 0x5F) ? 
0x70 : 0xB0), \ - s2 = c2 + 0x7E; \ - } while (0) - -/* Encode the file name NAME using the specified coding system - for file names, if any. */ -#define ENCODE_FILE(name) \ - (! NILP (Vfile_name_coding_system) \ - && XFASTINT (Vfile_name_coding_system) != 0 \ - ? code_convert_string_norecord (name, Vfile_name_coding_system, 1) \ - : (! NILP (Vdefault_file_name_coding_system) \ - && XFASTINT (Vdefault_file_name_coding_system) != 0 \ - ? code_convert_string_norecord (name, Vdefault_file_name_coding_system, 1) \ - : name)) - -/* Decode the file name NAME using the specified coding system - for file names, if any. */ -#define DECODE_FILE(name) \ - (! NILP (Vfile_name_coding_system) \ - && XFASTINT (Vfile_name_coding_system) != 0 \ - ? code_convert_string_norecord (name, Vfile_name_coding_system, 0) \ - : (! NILP (Vdefault_file_name_coding_system) \ - && XFASTINT (Vdefault_file_name_coding_system) != 0 \ - ? code_convert_string_norecord (name, Vdefault_file_name_coding_system, 0) \ - : name)) - -#ifdef WINDOWSNT -/* Encode the string STR using the specified coding system - for w32 system functions, if any. */ -#define ENCODE_SYSTEM(str) \ - (! NILP (Vlocale_coding_system) \ - && XFASTINT (Vlocale_coding_system) != 0 \ - ? code_convert_string_norecord (str, Vlocale_coding_system, 1) \ - : str) - -/* Decode the string STR using the specified coding system - for w32 system functions, if any. */ -#define DECODE_SYSTEM(name) \ - (! NILP (Vlocale_coding_system) \ - && XFASTINT (Vlocale_coding_system) != 0 \ - ? code_convert_string_norecord (str, Vlocale_coding_system, 0) \ - : str) - -#else /* WINDOWSNT */ - -#define ENCODE_SYSTEM(str) string_make_unibyte(str) -#define DECODE_SYSTEM(name) name - -#endif /* !WINDOWSNT */ - -/* Extern declarations. 
*/ -extern int decode_coding P_ ((struct coding_system *, unsigned char *, - unsigned char *, int, int)); -extern int encode_coding P_ ((struct coding_system *, unsigned char *, - unsigned char *, int, int)); -extern void coding_save_composition P_ ((struct coding_system *, int, int, - Lisp_Object)); -extern void coding_free_composition_data P_ ((struct coding_system *)); -extern void coding_adjust_composition_offset P_ ((struct coding_system *, - int)); -extern void coding_allocate_composition_data P_ ((struct coding_system *, - int)); -extern void coding_restore_composition P_ ((struct coding_system *, - Lisp_Object)); -extern int code_convert_region P_ ((int, int, int, int, struct coding_system *, - int, int)); -extern Lisp_Object run_pre_post_conversion_on_str P_ ((Lisp_Object, - struct coding_system *, - int)); -extern int decoding_buffer_size P_ ((struct coding_system *, int)); -extern int encoding_buffer_size P_ ((struct coding_system *, int)); -extern void detect_coding P_ ((struct coding_system *, unsigned char *, int)); -extern void detect_eol P_ ((struct coding_system *, unsigned char *, int)); -extern int setup_coding_system P_ ((Lisp_Object, struct coding_system *)); -extern Lisp_Object code_convert_string P_ ((Lisp_Object, - struct coding_system *, int, int)); -extern Lisp_Object code_convert_string1 P_ ((Lisp_Object, Lisp_Object, - Lisp_Object, int)); -extern Lisp_Object code_convert_string_norecord P_ ((Lisp_Object, Lisp_Object, - int)); -extern void setup_raw_text_coding_system P_ ((struct coding_system *)); -extern Lisp_Object encode_coding_string P_ ((Lisp_Object, - struct coding_system *, int)); -extern Lisp_Object decode_coding_string P_ ((Lisp_Object, - struct coding_system *, int)); -extern Lisp_Object Qcoding_system, Qeol_type, Qcoding_category_index; -extern Lisp_Object Qraw_text, Qemacs_mule; -extern Lisp_Object Qbuffer_file_coding_system; -extern Lisp_Object Vcoding_category_list; - -extern Lisp_Object Qtranslation_table; -extern 
Lisp_Object Qtranslation_table_id; - -/* Mnemonic strings to indicate each type of end-of-line. */ -extern Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac; -/* Mnemonic string to indicate type of end-of-line is not yet decided. */ -extern Lisp_Object eol_mnemonic_undecided; - -#ifdef emacs -extern Lisp_Object Qfile_coding_system; -extern Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument; -extern Lisp_Object Qstart_process, Qopen_network_stream; -extern Lisp_Object Qwrite_region; - -extern char *emacs_strerror P_ ((int)); - -/* Coding-system for reading files and receiving data from process. */ -extern Lisp_Object Vcoding_system_for_read; -/* Coding-system for writing files and sending data to process. */ -extern Lisp_Object Vcoding_system_for_write; -/* Coding-system actually used in the latest I/O. */ -extern Lisp_Object Vlast_coding_system_used; -/* Coding-system to use with system messages (e.g. strerror). */ -extern Lisp_Object Vlocale_coding_system; - -/* If non-zero, process buffer inherits the coding system used to decode - the subprocess output. */ -extern int inherit_process_coding_system; - -/* Coding-system to be used for encoding terminal output. This - structure contains information of a coding-system specified by the - function `set-terminal-coding-system'. */ -extern struct coding_system terminal_coding; - -/* Coding system to be used to encode text for terminal display when - terminal coding system is nil. */ -extern struct coding_system safe_terminal_coding; - -/* Coding-system of what is sent from terminal keyboard. This - structure contains information of a coding-system specified by the - function `set-keyboard-coding-system'. */ -extern struct coding_system keyboard_coding; - -/* Default coding system to be used to write a file. */ -extern struct coding_system default_buffer_file_coding; - -/* Default coding systems used for process I/O. 
*/ -extern Lisp_Object Vdefault_process_coding_system; - -/* Function to call to force a user to force select a proper coding - system. */ -extern Lisp_Object Vselect_safe_coding_system_function; - -/* Coding system for file names, or nil if none. */ -extern Lisp_Object Vfile_name_coding_system; - -/* Coding system for file names used only when - Vfile_name_coding_system is nil. */ -extern Lisp_Object Vdefault_file_name_coding_system; - -#endif - -/* Error signaled when there's a problem with detecting coding system */ -extern Lisp_Object Qcoding_system_error; - -#endif /* EMACS_CODING_H */ +/* Header for coding system handler. + Copyright (C) 1995, 1997 Electrotechnical Laboratory, JAPAN. + Licensed to the Free Software Foundation. + Copyright (C) 2001, 2002 + National Institute of Advanced Industrial Science and Technology (AIST) + Registration Number H13PRO009 + +This file is part of GNU Emacs. + +GNU Emacs is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2, or (at your option) +any later version. + +GNU Emacs is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GNU Emacs; see the file COPYING. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, +Boston, MA 02111-1307, USA. */ + +#ifndef EMACS_CODING_H +#define EMACS_CODING_H + +/* Index to arguments of Fdefine_coding_system_internal. 
*/
+
+/* Indices of the arguments shared by every coding system type.
+   coding_arg_max is the number of shared arguments; the type-specific
+   groups below are numbered starting from it.  */
+enum define_coding_system_arg_index
+  {
+    coding_arg_name,
+    coding_arg_mnemonic,
+    coding_arg_coding_type,
+    coding_arg_charset_list,
+    coding_arg_ascii_compatible_p,
+    coding_arg_decode_translation_table,
+    coding_arg_encode_translation_table,
+    coding_arg_post_read_conversion,
+    coding_arg_pre_write_conversion,
+    coding_arg_default_char,
+    coding_arg_plist,
+    coding_arg_eol_type,
+    coding_arg_max
+  };
+
+/* Extra arguments for ISO-2022 coding systems, numbered after the
+   shared ones.  */
+enum define_coding_iso2022_arg_index
+  {
+    coding_arg_iso2022_initial = coding_arg_max,
+    coding_arg_iso2022_reg_usage,
+    coding_arg_iso2022_request,
+    coding_arg_iso2022_flags,
+    coding_arg_iso2022_max
+  };
+
+/* Extra arguments for UTF-16 coding systems, numbered after the
+   shared ones.  */
+enum define_coding_utf16_arg_index
+  {
+    coding_arg_utf16_bom = coding_arg_max,
+    coding_arg_utf16_endian,
+    coding_arg_utf16_max
+  };
+
+/* Extra arguments for CCL-based coding systems.  Like the ISO-2022
+   and UTF-16 groups above, they are numbered from coding_arg_max;
+   numbering them from 0 would make coding_arg_ccl_decoder,
+   coding_arg_ccl_encoder and coding_arg_ccl_valids alias
+   coding_arg_name, coding_arg_mnemonic and coding_arg_coding_type.
+   NOTE(review): confirm that Fdefine_coding_system_internal expects
+   the CCL arguments after the shared ones.  */
+enum define_coding_ccl_arg_index
+  {
+    coding_arg_ccl_decoder = coding_arg_max,
+    coding_arg_ccl_encoder,
+    coding_arg_ccl_valids,
+    coding_arg_ccl_max
+  };
+
+extern Lisp_Object Vcoding_system_hash_table;
+
+/* Enumeration of coding system type.  */
+
+enum coding_system_type
+  {
+    coding_type_charset,
+    coding_type_utf_8,
+    coding_type_utf_16,
+    coding_type_iso_2022,
+    coding_type_emacs_mule,
+    coding_type_sjis,
+    coding_type_ccl,
+    coding_type_raw_text,
+    coding_type_undecided,
+    coding_type_max
+  };
+
+
+/* Enumeration of end-of-line format type.  */
+
+enum end_of_line_type
+  {
+    eol_lf,             /* Line-feed only, same as Emacs' internal
+                           format.  */
+    eol_crlf,           /* Sequence of carriage-return and
+                           line-feed.  */
+    eol_cr,             /* Carriage-return only.  */
+    eol_any,            /* Accept any of above.  Produce line-feed
+                           only.  */
+    eol_undecided,      /* This value is used to denote that the
+                           eol-type is not yet decided.  */
+    eol_type_max
+  };
+
+/* Enumeration of index to an attribute vector of a coding system.
*/
+
+enum coding_attr_index
+  {
+    coding_attr_base_name,
+    coding_attr_docstring,
+    coding_attr_mnemonic,
+    coding_attr_type,
+    coding_attr_charset_list,
+    coding_attr_ascii_compat,
+    coding_attr_decode_tbl,
+    coding_attr_encode_tbl,
+    coding_attr_post_read,
+    coding_attr_pre_write,
+    coding_attr_default_char,
+    coding_attr_plist,
+
+    coding_attr_category,
+    coding_attr_safe_charsets,
+
+    /* The following are extra attributes specific to each coding
+       system type.  */
+    coding_attr_charset_valids,
+
+    coding_attr_ccl_decoder,
+    coding_attr_ccl_encoder,
+    coding_attr_ccl_valids,
+
+    coding_attr_iso_initial,
+    coding_attr_iso_usage,
+    coding_attr_iso_request,
+    coding_attr_iso_flags,
+
+    coding_attr_utf_16_bom,
+    coding_attr_utf_16_endian,
+
+    coding_attr_emacs_mule_full,
+
+    /* Number of slots in an attribute vector.  */
+    coding_attr_last_index
+  };
+
+
+/* Accessors for the slots of the attribute vector ATTRS.  */
+#define CODING_ATTR_BASE_NAME(attrs)     AREF (attrs, coding_attr_base_name)
+#define CODING_ATTR_TYPE(attrs)          AREF (attrs, coding_attr_type)
+#define CODING_ATTR_CHARSET_LIST(attrs)  AREF (attrs, coding_attr_charset_list)
+#define CODING_ATTR_MNEMONIC(attrs)      AREF (attrs, coding_attr_mnemonic)
+#define CODING_ATTR_DOCSTRING(attrs)     AREF (attrs, coding_attr_docstring)
+#define CODING_ATTR_ASCII_COMPAT(attrs)  AREF (attrs, coding_attr_ascii_compat)
+#define CODING_ATTR_DECODE_TBL(attrs)    AREF (attrs, coding_attr_decode_tbl)
+#define CODING_ATTR_ENCODE_TBL(attrs)    AREF (attrs, coding_attr_encode_tbl)
+#define CODING_ATTR_POST_READ(attrs)     AREF (attrs, coding_attr_post_read)
+#define CODING_ATTR_PRE_WRITE(attrs)     AREF (attrs, coding_attr_pre_write)
+#define CODING_ATTR_DEFAULT_CHAR(attrs)  AREF (attrs, coding_attr_default_char)
+/* NOTE(review): the next two macros expand to coding_attr_direction
+   and coding_attr_flushing, neither of which is a member of enum
+   coding_attr_index above.  Any use of them fails to compile until
+   those enumerators are added (or the macros are dropped).  */
+#define CODING_ATTR_DIRECTION(attrs)     AREF (attrs, coding_attr_direction)
+#define CODING_ATTR_FLUSHING(attrs)      AREF (attrs, coding_attr_flushing)
+#define CODING_ATTR_PLIST(attrs)         AREF (attrs, coding_attr_plist)
+#define CODING_ATTR_CATEGORY(attrs)      AREF (attrs, coding_attr_category)
+#define CODING_ATTR_SAFE_CHARSETS(attrs) AREF (attrs, coding_attr_safe_charsets)
+
+
+/* ID is an index into Vcoding_system_hash_table.  The value stored
+   there is accessed as a vector: element 0 through CODING_ID_ATTRS,
+   element 1 through CODING_ID_ALIASES and element 2 through
+   CODING_ID_EOL_TYPE.  CODING_ID_NAME is the key of the entry.  */
+#define CODING_ID_ATTRS(id)     \
+  (AREF (HASH_VALUE (XHASH_TABLE (Vcoding_system_hash_table), id), 0))
+
+#define CODING_ID_ALIASES(id)   \
+  (AREF (HASH_VALUE (XHASH_TABLE (Vcoding_system_hash_table), id), 1))
+
+#define CODING_ID_EOL_TYPE(id)  \
+  (AREF (HASH_VALUE (XHASH_TABLE (Vcoding_system_hash_table), id), 2))
+
+#define CODING_ID_NAME(id)      \
+  (HASH_KEY (XHASH_TABLE (Vcoding_system_hash_table), id))
+
+/* Look CODING_SYSTEM_SYMBOL up in Vcoding_system_hash_table and
+   return the stored value, or Qnil if there is no entry.  */
+#define CODING_SYSTEM_SPEC(coding_system_symbol)        \
+  (Fgethash (coding_system_symbol, Vcoding_system_hash_table, Qnil))
+
+/* Return the result of hash_lookup for CODING_SYSTEM_SYMBOL in
+   Vcoding_system_hash_table.  CHECK_CODING_SYSTEM_GET_ID below treats
+   a negative result as "not a coding system".  The expansion is
+   parenthesized like the other expression macros here.  */
+#define CODING_SYSTEM_ID(coding_system_symbol)                  \
+  (hash_lookup (XHASH_TABLE (Vcoding_system_hash_table),        \
+                coding_system_symbol, NULL))
+
+/* Nonzero if CODING_SYSTEM_SYMBOL has an entry in
+   Vcoding_system_hash_table.  */
+#define CODING_SYSTEM_P(coding_system_symbol)   \
+  (! NILP (CODING_SYSTEM_SPEC (coding_system_symbol)))
+
+/* If X is not a defined coding system, call wrong_type_argument and
+   store whatever it returns back into X.  X must be an lvalue.  */
+#define CHECK_CODING_SYSTEM(x)                          \
+  do {                                                  \
+    if (!CODING_SYSTEM_P (x))                           \
+      x = wrong_type_argument (Qcoding_system_p, (x));  \
+  } while (0)
+
+
+/* Like CHECK_CODING_SYSTEM, but also set SPEC to the spec of X.
+   When wrong_type_argument hands back a replacement for X, SPEC is
+   looked up again so that it describes the replacement instead of
+   being left nil.  X and SPEC must be lvalues.  */
+#define CHECK_CODING_SYSTEM_GET_SPEC(x, spec)                   \
+  do {                                                          \
+    spec = CODING_SYSTEM_SPEC (x);                              \
+    if (NILP (spec))                                            \
+      {                                                         \
+        x = wrong_type_argument (Qcoding_system_p, (x));        \
+        spec = CODING_SYSTEM_SPEC (x);                          \
+      }                                                         \
+  } while (0)
+
+
+/* Like CHECK_CODING_SYSTEM, but also set ID to the id of X.  When
+   wrong_type_argument hands back a replacement for X, ID is looked up
+   again so that it is not left negative.  X and ID must be
+   lvalues.  */
+#define CHECK_CODING_SYSTEM_GET_ID(x, id)                       \
+  do                                                            \
+    {                                                           \
+      id = CODING_SYSTEM_ID (x);                                \
+      if (id < 0)                                               \
+        {                                                       \
+          x = wrong_type_argument (Qcoding_system_p, (x));      \
+          id = CODING_SYSTEM_ID (x);                            \
+        }                                                       \
+    } while (0)
+
+
+/*** GENERAL section ***/
+
+/* Enumeration of result code of code conversion.  */
+enum coding_result_code
+  {
+    CODING_RESULT_SUCCESS,
+    CODING_RESULT_INSUFFICIENT_SRC,
+    CODING_RESULT_INSUFFICIENT_DST,
+    CODING_RESULT_INCONSISTENT_EOL,
+    CODING_RESULT_INSUFFICIENT_CMP,
+    CODING_RESULT_INTERRUPT,
+    CODING_RESULT_INSUFFICIENT_MEM
+  };
+
+
+/* Macros used for the member `mode' of the struct coding_system.  */
+
+/* If set, recover the original CR or LF of the already decoded text
+   when the decoding routine encounters an inconsistent eol format.
*/
+#define CODING_MODE_INHIBIT_INCONSISTENT_EOL 0x01
+
+/* If set, the decoding/encoding routines treat the current data as
+   the last block of the whole text to be converted, and do the
+   appropriate finishing job.  */
+#define CODING_MODE_LAST_BLOCK 0x02
+
+/* If set, it means that the current source text is in a buffer which
+   enables selective display.  */
+#define CODING_MODE_SELECTIVE_DISPLAY 0x04
+
+/* This flag is used by the decoding/encoding routines on the fly.  If
+   set, it means that right-to-left text is being processed.  */
+#define CODING_MODE_DIRECTION 0x08
+/* NOTE(review): the next two mode bits have no description in this
+   header; their meaning has to be taken from their users.  */
+#define CODING_MODE_FIXED_DESTINATION 0x10
+
+#define CODING_MODE_SAFE_ENCODING 0x20
+
+/* Structure of the field `spec.iso_2022' in the structure
+   `coding_system'.  */
+struct iso_2022_spec
+{
+  /* NOTE(review): this member was left undocumented (the comment was
+     empty); presumably flag bits specific to ISO-2022 handling --
+     confirm against the code that sets it.  */
+  unsigned flags;
+
+  /* The current graphic register invoked to each graphic plane.  */
+  int current_invocation[2];
+
+  /* The current charset designated to each graphic register.  The
+     value -1 means that no charset is designated, -2 means that
+     there was an invalid designation previously.  */
+  int current_designation[4];
+
+  /* Set to 1 temporarily only when graphic register 2 or 3 is invoked
+     by single-shift while encoding.  */
+  int single_shifting;
+
+  /* Set to 1 temporarily only when processing at beginning of line.  */
+  int bol;
+};
+
+struct ccl_spec;
+/* Types of the members `bom' and `endian' of struct utf_16_spec
+   below.  */
+enum utf_16_bom_type
+  {
+    utf_16_detect_bom,
+    utf_16_without_bom,
+    utf_16_with_bom
+  };
+
+enum utf_16_endian_type
+  {
+    utf_16_big_endian,
+    utf_16_little_endian
+  };
+/* Structure of the field `spec.utf_16' in the structure
+   `coding_system'.  */
+struct utf_16_spec
+{
+  enum utf_16_bom_type bom;
+  enum utf_16_endian_type endian;
+  int surrogate;
+};
+/* Structure holding the information about one coding system together
+   with the source, destination and working-buffer state of a code
+   conversion that uses it.  */
+struct coding_system
+{
+  /* ID number of the coding system.  This is an index to
+     Vcoding_system_hash_table.  This value is set by
+     setup_coding_system.  At the early stage of building time, this
+     value is -1 in the array coding_categories to indicate that no
+     coding-system of that category is yet defined.  */
+  int id;
+
+  /* Flag bits of the coding system.  The meaning of each bit is common
+     to all types of coding systems.  See the macros CODING_XXX_MASK
+     defined after this structure.  */
+  int common_flags;
+
+  /* Mode bits of the coding system.  See the comments of the macros
+     CODING_MODE_XXX.  */
+  unsigned int mode;
+
+  /* Detailed information specific to each type of coding system.  */
+  union
+    {
+      struct iso_2022_spec iso_2022;
+      struct ccl_spec *ccl;     /* Defined in ccl.h.  */
+      struct utf_16_spec utf_16;
+      int emacs_mule_full_support;
+    } spec;
+
+  int max_charset_id;
+  char *safe_charsets;
+
+  /* The following two members specify how binary 8-bit codes 128..255
+     are represented in source and destination text respectively.  1
+     means they are represented by a 2-byte sequence, 0 means they are
+     represented by 1 byte as is (see the comment in character.h).  */
+  unsigned src_multibyte : 1;
+  unsigned dst_multibyte : 1;
+
+  /* How many leading bytes we can skip for decoding.  This is set to
+     -1 in setup_coding_system, and updated by detect_coding.  So,
+     when this is equal to the byte length of the text being
+     converted, we can skip the actual conversion process.  */
+  int head_ascii;
+
+  /* The following members are set by the encoding/decoding routines.  */
+  EMACS_INT produced, produced_char, consumed, consumed_char;
+
+  /* Number of error source data found in a decoding routine.  */
+  int errors;
+
+  /* Store the positions of error source data.  */
+  EMACS_INT *error_positions;
+
+  /* Finish status of code conversion.  */
+  enum coding_result_code result;
+
+  /* The following members are all Lisp symbols.  We don't have to
+     protect them from GC because the current garbage collection
+     doesn't relocate Lisp symbols.  But, when it is changed, we must
+     find a way to protect them.
+
+     NOTE(review): the comment above looks stale.  Of the members
+     declared below, only src_object and dst_object have type
+     Lisp_Object; the others are positions, sizes and raw pointers.
+     Confirm what, if anything, needs GC protection here.  */
+
+  EMACS_INT src_pos, src_pos_byte, src_chars, src_bytes;
+  Lisp_Object src_object;
+  unsigned char *source;
+
+  EMACS_INT dst_pos, dst_pos_byte, dst_bytes;
+  Lisp_Object dst_object;
+  unsigned char *destination;
+
+  int chars_at_source;
+
+  /* If an element is non-negative, it is a character code.
+
+     If it is in the range -128..-1, it is an 8-bit character code
+     minus 256.
+
+     If it is less than -128, it specifies the start of an annotation
+     chunk.  The length of the chunk is -128 minus the value of the
+     element.  The following elements are OFFSET, ANNOTATION-TYPE, and
+     a sequence of actual data for the annotation.  OFFSET is a
+     character position offset from dst_pos or src_pos,
+     ANNOTATION-TYPE specifies the meaning of the annotation and how to
+     handle the following data.  */
+  int *charbuf;
+  int charbuf_size, charbuf_used;
+
+  /* Set to 1 if charbuf contains an annotation.  */
+  int annotated;
+
+  unsigned char carryover[64];
+  int carryover_bytes;
+
+  int default_char;
+  /* Handler functions.  Each takes a pointer to a struct
+     coding_system; the detector takes an additional `int *'.  */
+  int (*detector) P_ ((struct coding_system *, int *));
+  void (*decoder) P_ ((struct coding_system *));
+  int (*encoder) P_ ((struct coding_system *));
+};
+
+/* Meanings of bits in the member `common_flags' of the structure
+   coding_system.  The lowest 8 bits are reserved for various kinds of
+   annotations (currently two of them are used).  */
+#define CODING_ANNOTATION_MASK 0x00FF
+#define CODING_ANNOTATE_COMPOSITION_MASK 0x0001
+#define CODING_ANNOTATE_DIRECTION_MASK 0x0002
+#define CODING_FOR_UNIBYTE_MASK 0x0100
+#define CODING_REQUIRE_FLUSHING_MASK 0x0200
+#define CODING_REQUIRE_DECODING_MASK 0x0400
+#define CODING_REQUIRE_ENCODING_MASK 0x0800
+#define CODING_REQUIRE_DETECTION_MASK 0x1000
+#define CODING_RESET_AT_BOL_MASK 0x2000
+
+/* Return non-zero (the masked bits of `common_flags', not
+   necessarily 1) if the coding context CODING requires annotation
+   handling.  */
+#define CODING_REQUIRE_ANNOTATION(coding) \
+  ((coding)->common_flags & CODING_ANNOTATION_MASK)
+
+/* Return non-zero (the masked bit, not necessarily 1) if the coding
+   context CODING prefers decoding into unibyte.  */
+#define CODING_FOR_UNIBYTE(coding) \
+  ((coding)->common_flags & CODING_FOR_UNIBYTE_MASK)
+
+/* Return 1 if the coding context CODING requires specific code to be
+   attached at the tail of converted text.
*/ +#define CODING_REQUIRE_FLUSHING(coding) \ + ((coding)->common_flags & CODING_REQUIRE_FLUSHING_MASK) + +/* Return 1 if the coding context CODING requires code conversion on + decoding. */ +#define CODING_REQUIRE_DECODING(coding) \ + ((coding)->dst_multibyte \ + || (coding)->common_flags & CODING_REQUIRE_DECODING_MASK) + + +/* Return 1 if the coding context CODING requires code conversion on + encoding. */ +#define CODING_REQUIRE_ENCODING(coding) \ + ((coding)->src_multibyte \ + || (coding)->common_flags & CODING_REQUIRE_ENCODING_MASK \ + || (coding)->mode & CODING_MODE_SELECTIVE_DISPLAY) + + +/* Return 1 if the coding context CODING requires some kind of code + detection. */ +#define CODING_REQUIRE_DETECTION(coding) \ + ((coding)->common_flags & CODING_REQUIRE_DETECTION_MASK) + +/* Return 1 if the coding context CODING requires code conversion on + decoding or some kind of code detection. */ +#define CODING_MAY_REQUIRE_DECODING(coding) \ + (CODING_REQUIRE_DECODING (coding) \ + || CODING_REQUIRE_DETECTION (coding)) + +/* Macros to decode or encode a character of JISX0208 in SJIS. S1 and + S2 are the 1st and 2nd position-codes of JISX0208 in SJIS coding + system. C1 and C2 are the 1st and 2nd position codes of Emacs' + internal format. */ + +#define SJIS_TO_JIS(code) \ + do { \ + int s1, s2, j1, j2; \ + \ + s1 = (code) >> 8, s2 = (code) & 0xFF; \ + \ + if (s2 >= 0x9F) \ + (j1 = s1 * 2 - (s1 >= 0xE0 ? 0x160 : 0xE0), \ + j2 = s2 - 0x7E); \ + else \ + (j1 = s1 * 2 - ((s1 >= 0xE0) ? 0x161 : 0xE1), \ + j2 = s2 - ((s2 >= 0x7F) ? 0x20 : 0x1F)); \ + (code) = (j1 << 8) | j2; \ + } while (0) + + +#define JIS_TO_SJIS(code) \ + do { \ + int s1, s2, j1, j2; \ + \ + j1 = (code) >> 8, j2 = (code) & 0xFF; \ + if (j1 & 1) \ + (s1 = j1 / 2 + ((j1 < 0x5F) ? 0x71 : 0xB1), \ + s2 = j2 + ((j2 >= 0x60) ? 0x20 : 0x1F)); \ + else \ + (s1 = j1 / 2 + ((j1 < 0x5F) ? 
0x70 : 0xB0), \ + s2 = j2 + 0x7E); \ + (code) = (j1 << 8) | j2; \ + } while (0) + + +/* Encode the file name NAME using the specified coding system + for file names, if any. */ +#define ENCODE_FILE(name) \ + (! NILP (Vfile_name_coding_system) \ + && XFASTINT (Vfile_name_coding_system) != 0 \ + ? code_convert_string_norecord (name, Vfile_name_coding_system, 1) \ + : (! NILP (Vdefault_file_name_coding_system) \ + && XFASTINT (Vdefault_file_name_coding_system) != 0 \ + ? code_convert_string_norecord (name, Vdefault_file_name_coding_system, 1) \ + : name)) + + +/* Decode the file name NAME using the specified coding system + for file names, if any. */ +#define DECODE_FILE(name) \ + (! NILP (Vfile_name_coding_system) \ + && XFASTINT (Vfile_name_coding_system) != 0 \ + ? code_convert_string_norecord (name, Vfile_name_coding_system, 0) \ + : (! NILP (Vdefault_file_name_coding_system) \ + && XFASTINT (Vdefault_file_name_coding_system) != 0 \ + ? code_convert_string_norecord (name, Vdefault_file_name_coding_system, 0) \ + : name)) + + +#ifdef WINDOWSNT +/* Encode the string STR using the specified coding system + for w32 system functions, if any. */ +#define ENCODE_SYSTEM(str) \ + (! NILP (Vlocale_coding_system) \ + && XFASTINT (Vlocale_coding_system) != 0 \ + ? code_convert_string_norecord (str, Vlocale_coding_system, 1) \ + : str) + +/* Decode the string STR using the specified coding system + for w32 system functions, if any. */ +#define DECODE_SYSTEM(name) \ + (! NILP (Vlocale_coding_system) \ + && XFASTINT (Vlocale_coding_system) != 0 \ + ? code_convert_string_norecord (str, Vlocale_coding_system, 0) \ + : str) + +#else /* WINDOWSNT */ + +#define ENCODE_SYSTEM(str) string_make_unibyte(str) +#define DECODE_SYSTEM(name) name + +#endif /* !WINDOWSNT */ + +/* Extern declarations. 
*/
+extern Lisp_Object make_conversion_work_buffer P_ ((int));
+extern Lisp_Object code_conversion_restore P_ ((Lisp_Object));
+extern int decoding_buffer_size P_ ((struct coding_system *, int));
+extern int encoding_buffer_size P_ ((struct coding_system *, int));
+extern void setup_coding_system P_ ((Lisp_Object, struct coding_system *));
+extern void detect_coding P_ ((struct coding_system *));
+extern Lisp_Object code_convert_region P_ ((EMACS_INT, EMACS_INT,
+                                            Lisp_Object, Lisp_Object,
+                                            int, int));
+extern Lisp_Object code_convert_string P_ ((Lisp_Object, Lisp_Object,
+                                            Lisp_Object, int, int, int));
+extern Lisp_Object code_convert_string_norecord P_ ((Lisp_Object, Lisp_Object,
+                                                     int));
+extern Lisp_Object raw_text_coding_system P_ ((Lisp_Object));
+extern Lisp_Object coding_inherit_eol_type P_ ((Lisp_Object, Lisp_Object));
+
+extern int decode_coding_gap P_ ((struct coding_system *,
+                                  EMACS_INT, EMACS_INT));
+extern int encode_coding_gap P_ ((struct coding_system *,
+                                  EMACS_INT, EMACS_INT));
+extern void decode_coding_object P_ ((struct coding_system *,
+                                      Lisp_Object, EMACS_INT, EMACS_INT,
+                                      EMACS_INT, EMACS_INT, Lisp_Object));
+extern void encode_coding_object P_ ((struct coding_system *,
+                                      Lisp_Object, EMACS_INT, EMACS_INT,
+                                      EMACS_INT, EMACS_INT, Lisp_Object));
+/* Call decode_coding_object on the current buffer for the character
+   positions FROM and TO: Fcurrent_buffer () is passed as both the
+   second and the last argument, and each position is passed together
+   with CHAR_TO_BYTE of it.  FROM and TO are therefore evaluated
+   twice, and none of the arguments is parenthesized in the
+   expansion.  */
+#define decode_coding_region(coding, from, to) \
+  decode_coding_object (coding, Fcurrent_buffer (), \
+                        from, CHAR_TO_BYTE (from), \
+                        to, CHAR_TO_BYTE (to), Fcurrent_buffer ())
+
+/* Likewise, but call encode_coding_object.  */
+#define encode_coding_region(coding, from, to) \
+  encode_coding_object (coding, Fcurrent_buffer (), \
+                        from, CHAR_TO_BYTE (from), \
+                        to, CHAR_TO_BYTE (to), Fcurrent_buffer ())
+
+/* Call decode_coding_object (or, for encode_coding_string,
+   encode_coding_object) on the whole of STRING, from 0 to its size
+   and its STRING_BYTES, with Qt as the last argument.  STRING is
+   evaluated more than once.  NOCOPY is accepted but is not used by
+   the expansion.  encode_coding_string is an expression whose value
+   is (coding)->dst_object after the call.  */
+#define decode_coding_string(coding, string, nocopy) \
+  decode_coding_object (coding, string, 0, 0, XSTRING (string)->size, \
+                        STRING_BYTES (XSTRING (string)), Qt)
+
+#define encode_coding_string(coding, string, nocopy) \
+  (encode_coding_object (coding, string, 0, 0, XSTRING (string)->size, \
+                         STRING_BYTES (XSTRING (string)), Qt), \
+   (coding)->dst_object)
+
+/* Decode the BYTES bytes at SRC.  The macro stores SRC in
+   (coding)->source and BYTES in both (coding)->src_chars and
+   (coding)->src_bytes, then calls decode_coding_object with Qnil as
+   the second argument and DST_OBJECT as the last one.  CODING and
+   BYTES are evaluated more than once.  */
+#define decode_coding_c_string(coding, src, bytes, dst_object) \
+  do { \
+    (coding)->source = (src); \
+    (coding)->src_chars = (coding)->src_bytes = (bytes); \
+    decode_coding_object ((coding), Qnil, 0, 0, (bytes), (bytes), \
+                          (dst_object)); \
+  } while (0)
+
+
+extern Lisp_Object preferred_coding_system P_ (());
+
+
+extern Lisp_Object Qcoding_system, Qeol_type, Qcoding_category_index;
+extern Lisp_Object Qcoding_system_p;
+extern Lisp_Object Qraw_text, Qemacs_mule, Qno_conversion, Qundecided;
+extern Lisp_Object Qiso_2022;
+extern Lisp_Object Qbuffer_file_coding_system;
+
+extern Lisp_Object Qunix, Qdos, Qmac;
+
+extern Lisp_Object Qtranslation_table;
+extern Lisp_Object Qtranslation_table_id;
+
+/* Mnemonic strings to indicate each type of end-of-line.  */
+extern Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
+/* Mnemonic string to indicate that the type of end-of-line is not yet
+   decided.  */
+extern Lisp_Object eol_mnemonic_undecided;
+
+#ifdef emacs
+extern Lisp_Object Qfile_coding_system;
+extern Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
+extern Lisp_Object Qstart_process, Qopen_network_stream;
+extern Lisp_Object Qwrite_region;
+
+extern char *emacs_strerror P_ ((int));
+
+/* Coding-system for reading files and receiving data from process.  */
+extern Lisp_Object Vcoding_system_for_read;
+/* Coding-system for writing files and sending data to process.  */
+extern Lisp_Object Vcoding_system_for_write;
+/* Coding-system actually used in the latest I/O.  */
+extern Lisp_Object Vlast_coding_system_used;
+/* Coding-system to use with system messages (e.g. strerror).  */
+extern Lisp_Object Vlocale_coding_system;
+
+/* If non-zero, process buffer inherits the coding system used to decode
+   the subprocess output.  */
+extern int inherit_process_coding_system;
+
+/* Coding-system to be used for encoding terminal output.
This
+   structure contains information of a coding-system specified by the
+   function `set-terminal-coding-system'.  */
+extern struct coding_system terminal_coding;
+
+/* Coding system to be used to encode text for terminal display when
+   the terminal coding system is nil.  */
+extern struct coding_system safe_terminal_coding;
+
+/* Coding-system of what is sent from terminal keyboard.  This
+   structure contains information of a coding-system specified by the
+   function `set-keyboard-coding-system'.  */
+extern struct coding_system keyboard_coding;
+
+/* Default coding systems used for process I/O.  */
+extern Lisp_Object Vdefault_process_coding_system;
+
+/* Function to call to force a user to select a proper coding
+   system.  */
+extern Lisp_Object Vselect_safe_coding_system_function;
+
+/* Coding system for file names, or nil if none.  */
+extern Lisp_Object Vfile_name_coding_system;
+
+/* Coding system for file names used only when
+   Vfile_name_coding_system is nil.  */
+extern Lisp_Object Vdefault_file_name_coding_system;
+
+#endif /* emacs */
+
+/* Error signaled when there's a problem with detecting a coding
+   system.  */
+extern Lisp_Object Qcoding_system_error;
+
+extern char emacs_mule_bytes[256];
+extern int emacs_mule_string_char P_ ((unsigned char *));
+
+#endif /* EMACS_CODING_H */
-- 2.20.1