* README: Add a note about ranges in copyright years.
[bpt/emacs.git] / src / coding.c
CommitLineData
9542cb1f 1/* Coding system handler (conversion, detection, etc).
aaef169d 2 Copyright (C) 2001, 2002, 2003, 2004, 2005,
5df4f04c 3 2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
7976eda0 4 Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
5df4f04c 5 2005, 2006, 2007, 2008, 2009, 2010, 2011
ce03bf76
KH
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H14PRO021
8f924df7 8 Copyright (C) 2003
df7492f9
KH
9 National Institute of Advanced Industrial Science and Technology (AIST)
10 Registration Number H13PRO009
4ed46869 11
369314dc
KH
12This file is part of GNU Emacs.
13
9ec0b715 14GNU Emacs is free software: you can redistribute it and/or modify
369314dc 15it under the terms of the GNU General Public License as published by
9ec0b715
GM
16the Free Software Foundation, either version 3 of the License, or
17(at your option) any later version.
4ed46869 18
369314dc
KH
19GNU Emacs is distributed in the hope that it will be useful,
20but WITHOUT ANY WARRANTY; without even the implied warranty of
21MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22GNU General Public License for more details.
4ed46869 23
369314dc 24You should have received a copy of the GNU General Public License
9ec0b715 25along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
4ed46869
KH
26
27/*** TABLE OF CONTENTS ***
28
b73bfc1c 29 0. General comments
4ed46869 30 1. Preamble
df7492f9
KH
31 2. Emacs' internal format (emacs-utf-8) handlers
32 3. UTF-8 handlers
33 4. UTF-16 handlers
34 5. Charset-base coding systems handlers
35 6. emacs-mule (old Emacs' internal format) handlers
36 7. ISO2022 handlers
37 8. Shift-JIS and BIG5 handlers
38 9. CCL handlers
39 10. C library functions
40 11. Emacs Lisp library functions
41 12. Postamble
4ed46869
KH
42
43*/
44
df7492f9 45/*** 0. General comments ***
b73bfc1c
KH
46
47
df7492f9 48CODING SYSTEM
4ed46869 49
5bad0796
DL
50 A coding system is an object for an encoding mechanism that contains
51 information about how to convert byte sequences to character
e19c3639
KH
52 sequences and vice versa. When we say "decode", it means converting
53 a byte sequence of a specific coding system into a character
54 sequence that is represented by Emacs' internal coding system
55 `emacs-utf-8', and when we say "encode", it means converting a
56 character sequence of emacs-utf-8 to a byte sequence of a specific
0ef69138 57 coding system.
4ed46869 58
e19c3639
KH
59 In Emacs Lisp, a coding system is represented by a Lisp symbol. At
60 the C level, a coding system is represented by a vector of attributes
5bad0796 61 stored in the hash table Vcoding_system_hash_table. The conversion
e19c3639
KH
62 from a coding system symbol to its attributes vector is done by
63 looking up the symbol in Vcoding_system_hash_table.
4ed46869 64
e19c3639 65 Coding systems are classified into the following types depending on
5bad0796 66 the encoding mechanism. Here's a brief description of the types.
4ed46869 67
df7492f9
KH
68 o UTF-8
69
70 o UTF-16
71
72 o Charset-base coding system
73
74 A coding system defined by one or more (coded) character sets.
5bad0796 75 Decoding and encoding are done by a code converter defined for each
df7492f9
KH
76 character set.
77
5bad0796 78 o Old Emacs internal format (emacs-mule)
df7492f9 79
5bad0796 80 The coding system adopted by old versions of Emacs (20 and 21).
4ed46869 81
df7492f9 82 o ISO2022-base coding system
4ed46869
KH
83
84 The most famous coding system for multiple character sets. X's
df7492f9
KH
85 Compound Text, various EUCs (Extended Unix Code), and coding systems
86 used in the Internet communication such as ISO-2022-JP are all
87 variants of ISO2022.
4ed46869 88
df7492f9 89 o SJIS (or Shift-JIS or MS-Kanji-Code)
93dec019 90
4ed46869
KH
91 A coding system to encode character sets: ASCII, JISX0201, and
92 JISX0208. Widely used for PC's in Japan. Details are described in
df7492f9 93 section 8.
4ed46869 94
df7492f9 95 o BIG5
4ed46869 96
df7492f9 97 A coding system to encode character sets: ASCII and Big5. Widely
cfb43547 98 used for Chinese (mainly in Taiwan and Hong Kong). Details are
df7492f9
KH
99 described in section 8. In this file, when we write "big5" (all
100 lowercase), we mean the coding system, and when we write "Big5"
101 (capitalized), we mean the character set.
4ed46869 102
df7492f9 103 o CCL
27901516 104
5bad0796 105 If a user wants to decode/encode text encoded in a coding system
df7492f9
KH
106 not listed above, he can supply a decoder and an encoder for it in
107 CCL (Code Conversion Language) programs. Emacs executes the CCL
108 program while decoding/encoding.
27901516 109
df7492f9 110 o Raw-text
4ed46869 111
5a936b46 112 A coding system for text containing raw eight-bit data. Emacs
5bad0796 113 treats each byte of source text as a character (except for
df7492f9 114 end-of-line conversion).
4ed46869 115
df7492f9
KH
116 o No-conversion
117
118 Like raw text, but don't do end-of-line conversion.
4ed46869 119
4ed46869 120
df7492f9 121END-OF-LINE FORMAT
4ed46869 122
5bad0796 123 How the end of a text line is encoded depends on the operating system.
df7492f9 124 For instance, Unix's format is just one byte of LF (line-feed) code,
f4dee582 125 whereas DOS's format is a two-byte sequence of `carriage-return' and
d46c5b12
KH
126 `line-feed' codes. MacOS's format is usually one byte of
127 `carriage-return'.
4ed46869 128
cfb43547 129 Since text character encoding and end-of-line encoding are
df7492f9
KH
130 independent, any coding system described above can take any format
131 of end-of-line (except for no-conversion).
4ed46869 132
e19c3639
KH
133STRUCT CODING_SYSTEM
134
135 Before using a coding system for code conversion (i.e. decoding and
136 encoding), we set up a structure of type `struct coding_system'.
137 This structure keeps various information about a specific code
5bad0796 138 conversion (e.g. the location of source and destination data).
4ed46869
KH
139
140*/
141
df7492f9
KH
142/* COMMON MACROS */
143
144
4ed46869
KH
145/*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
146
df7492f9 147 These functions check if a byte sequence specified as a source in
ff0dacd7
KH
148 CODING conforms to the format of XXX, and update the members of
149 DETECT_INFO.
df7492f9 150
ff0dacd7 151 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
df7492f9
KH
152
153 Below is the template of these functions. */
154
4ed46869 155#if 0
df7492f9 156static int
cf84bb53
JB
157detect_coding_XXX (struct coding_system *coding,
158 struct coding_detection_info *detect_info)
4ed46869 159{
f1d34bca
MB
160 const unsigned char *src = coding->source;
161 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9 162 int multibytep = coding->src_multibyte;
ff0dacd7 163 int consumed_chars = 0;
df7492f9
KH
164 int found = 0;
165 ...;
166
167 while (1)
168 {
ad1746f5 169 /* Get one byte from the source. If the source is exhausted, jump
df7492f9
KH
170 to no_more_source:. */
171 ONE_MORE_BYTE (c);
ff0dacd7
KH
172
173 if (! __C_conforms_to_XXX___ (c))
174 break;
175 if (__C_strongly_suggests_XXX__ (c))
176 found = CATEGORY_MASK_XXX; /* Remember positive evidence for XXX. */
df7492f9 177 }
ff0dacd7
KH
178 /* The byte sequence is invalid for XXX; reject this category. */
179 detect_info->rejected |= CATEGORY_MASK_XXX;
df7492f9 180 return 0;
ff0dacd7 181
df7492f9 182 no_more_source:
ad1746f5 183 /* The source was exhausted without meeting a non-conforming byte. */
ff0dacd7 184 detect_info->found |= found;
df7492f9 185 return 1;
4ed46869
KH
186}
187#endif
188
189/*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
190
df7492f9
KH
191 These functions decode a byte sequence specified as a source by
192 CODING. The resulting multibyte text goes to a place pointed to by
193 CODING->charbuf, the length of which should not exceed
194 CODING->charbuf_size.
d46c5b12 195
df7492f9
KH
196 These functions set the information of original and decoded texts in
197 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
198 They also set CODING->result to one of CODING_RESULT_XXX indicating
199 how the decoding is finished.
d46c5b12 200
df7492f9 201 Below is the template of these functions. */
d46c5b12 202
4ed46869 203#if 0
b73bfc1c 204static void
cf84bb53 205decode_coding_XXXX (struct coding_system *coding)
4ed46869 206{
f1d34bca
MB
207 const unsigned char *src = coding->source + coding->consumed;
208 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
209 /* SRC_BASE remembers the start position in source in each loop.
210 The loop will be exited when there's not enough source text, or
211 when there's no room in CHARBUF for a decoded character. */
f1d34bca 212 const unsigned char *src_base;
df7492f9 213 /* A buffer to produce decoded characters. */
69a80ea3
KH
214 int *charbuf = coding->charbuf + coding->charbuf_used;
215 int *charbuf_end = coding->charbuf + coding->charbuf_size;
df7492f9
KH
216 int multibytep = coding->src_multibyte;
217
218 while (1)
219 {
220 src_base = src;
221 if (charbuf >= charbuf_end)
222 /* No more room to produce a decoded character. */
223 break;
224 ONE_MORE_BYTE (c);
225 /* Decode it. */
226 }
227
228 no_more_source:
229 if (src_base < src_end
230 && coding->mode & CODING_MODE_LAST_BLOCK)
231 /* If the source ends by partial bytes to construct a character,
232 treat them as eight-bit raw data. */
233 while (src_base < src_end && charbuf < charbuf_end)
234 *charbuf++ = *src_base++;
235 /* Remember how many bytes and characters we consumed. If the
236 source is multibyte, the bytes and chars are not identical. */
237 coding->consumed = coding->consumed_char = src_base - coding->source;
238 /* Remember how many characters we produced. */
239 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
240}
241#endif
242
243/*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
244
df7492f9
KH
245 These functions encode SRC_BYTES length text at SOURCE of Emacs'
246 internal multibyte format by CODING. The resulting byte sequence
b73bfc1c
KH
247 goes to a place pointed to by DESTINATION, the length of which
248 should not exceed DST_BYTES.
d46c5b12 249
df7492f9
KH
250 These functions set the information of original and encoded texts in
251 the members produced, produced_char, consumed, and consumed_char of
252 the structure *CODING. They also set the member result to one of
253 CODING_RESULT_XXX indicating how the encoding finished.
d46c5b12 254
df7492f9
KH
255 DST_BYTES zero means that the source area and the destination area
256 overlap, which means that we can produce encoded text until it
257 reaches the head of the not-yet-encoded source text.
d46c5b12 258
df7492f9 259 Below is a template of these functions. */
4ed46869 260#if 0
b73bfc1c 261static void
cf84bb53 262encode_coding_XXX (struct coding_system *coding)
4ed46869 263{
df7492f9
KH
264 int multibytep = coding->dst_multibyte;
265 int *charbuf = coding->charbuf;
266 int *charbuf_end = coding->charbuf + coding->charbuf_used;
267 unsigned char *dst = coding->destination + coding->produced;
268 unsigned char *dst_end = coding->destination + coding->dst_bytes;
269 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
270 int produced_chars = 0;
271
272 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
273 {
274 int c = *charbuf;
275 /* Encode C into DST, and increment DST. */
276 }
277 label_no_more_destination:
278 /* Record how many chars and bytes we produced. */
279 coding->produced_char += produced_chars;
280 coding->produced = dst - coding->destination;
4ed46869
KH
281}
282#endif
283
4ed46869
KH
284\f
285/*** 1. Preamble ***/
286
68c45bf0 287#include <config.h>
4ed46869 288#include <stdio.h>
d7306fe6 289#include <setjmp.h>
4ed46869 290
4ed46869
KH
291#include "lisp.h"
292#include "buffer.h"
df7492f9 293#include "character.h"
4ed46869
KH
294#include "charset.h"
295#include "ccl.h"
df7492f9 296#include "composite.h"
4ed46869
KH
297#include "coding.h"
298#include "window.h"
b8299c66
KL
299#include "frame.h"
300#include "termhooks.h"
4ed46869 301
df7492f9 302Lisp_Object Vcoding_system_hash_table;
4ed46869 303
df7492f9 304Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
1965cb73 305Lisp_Object Qunix, Qdos;
4ed46869
KH
306Lisp_Object Qbuffer_file_coding_system;
307Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
df7492f9 308Lisp_Object Qdefault_char;
27901516 309Lisp_Object Qno_conversion, Qundecided;
df7492f9 310Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
b49a1807 311Lisp_Object Qbig, Qlittle;
bb0115a2 312Lisp_Object Qcoding_system_history;
1397dc18 313Lisp_Object Qvalid_codes;
2133e2d1 314Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
a6f87d34
KH
315Lisp_Object QCdecode_translation_table, QCencode_translation_table;
316Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
35befdaa 317Lisp_Object QCascii_compatible_p;
4ed46869 318
387f6ba5 319Lisp_Object Qcall_process, Qcall_process_region;
4ed46869
KH
320Lisp_Object Qstart_process, Qopen_network_stream;
321Lisp_Object Qtarget_idx;
322
065e3595
KH
323Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
324Lisp_Object Qinterrupted, Qinsufficient_memory;
325
44e8490d
KH
326/* If a symbol has this property, evaluate the value to define the
327 symbol as a coding system. */
328static Lisp_Object Qcoding_system_define_form;
329
fcbcfb64
KH
330/* Format of end-of-line decided by system. This is Qunix on
331 Unix and Mac, Qdos on DOS/Windows.
332 This has an effect only for external encoding (i.e. for output to
333 file and process), not for in-buffer or Lisp string encoding. */
334static Lisp_Object system_eol_type;
335
4ed46869
KH
336#ifdef emacs
337
4608c386 338Lisp_Object Qcoding_system_p, Qcoding_system_error;
4ed46869 339
d46c5b12
KH
340/* Coding system emacs-mule and raw-text are for converting only
341 end-of-line format. */
342Lisp_Object Qemacs_mule, Qraw_text;
8f924df7 343Lisp_Object Qutf_8_emacs;
ecf488bc 344
4ed46869
KH
345/* Coding-systems are handed between Emacs Lisp programs and C internal
346 routines by the following three variables. */
c4825358
KH
347/* Coding system to be used to encode text for terminal display when
348 terminal coding system is nil. */
349struct coding_system safe_terminal_coding;
350
4ed46869
KH
351#endif /* emacs */
352
f967223b
KH
353Lisp_Object Qtranslation_table;
354Lisp_Object Qtranslation_table_id;
355Lisp_Object Qtranslation_table_for_decode;
356Lisp_Object Qtranslation_table_for_encode;
4ed46869 357
df7492f9
KH
358/* Two special coding systems. */
359Lisp_Object Vsjis_coding_system;
360Lisp_Object Vbig5_coding_system;
361
df7492f9
KH
362/* ISO2022 section */
363
364#define CODING_ISO_INITIAL(coding, reg) \
365 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
366 coding_attr_iso_initial), \
367 reg)))
368
369
1b3b981b
AS
/* Return the value stored for CHARSET_ID in CODING's safe_charsets
   table, or -1 if CHARSET_ID is greater than CODING's max_charset_id
   or the stored value is 255.  */
370#define CODING_ISO_REQUEST(coding, charset_id) \
371 (((charset_id) <= (coding)->max_charset_id \
372 ? ((coding)->safe_charsets[charset_id] != 255 \
373 ? (coding)->safe_charsets[charset_id] \
374 : -1) \
df7492f9
KH
375 : -1))
376
377
378#define CODING_ISO_FLAGS(coding) \
379 ((coding)->spec.iso_2022.flags)
380#define CODING_ISO_DESIGNATION(coding, reg) \
381 ((coding)->spec.iso_2022.current_designation[reg])
382#define CODING_ISO_INVOCATION(coding, plane) \
383 ((coding)->spec.iso_2022.current_invocation[plane])
384#define CODING_ISO_SINGLE_SHIFTING(coding) \
385 ((coding)->spec.iso_2022.single_shifting)
386#define CODING_ISO_BOL(coding) \
387 ((coding)->spec.iso_2022.bol)
388#define CODING_ISO_INVOKED_CHARSET(coding, plane) \
389 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
e951386e
KH
390#define CODING_ISO_CMP_STATUS(coding) \
391 (&(coding)->spec.iso_2022.cmp_status)
392#define CODING_ISO_EXTSEGMENT_LEN(coding) \
393 ((coding)->spec.iso_2022.ctext_extended_segment_len)
394#define CODING_ISO_EMBEDDED_UTF_8(coding) \
395 ((coding)->spec.iso_2022.embedded_utf_8)
df7492f9
KH
396
397/* Control characters of ISO2022. */
398 /* code */ /* function */
399#define ISO_CODE_LF 0x0A /* line-feed */
400#define ISO_CODE_CR 0x0D /* carriage-return */
401#define ISO_CODE_SO 0x0E /* shift-out */
402#define ISO_CODE_SI 0x0F /* shift-in */
403#define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
404#define ISO_CODE_ESC 0x1B /* escape */
405#define ISO_CODE_SS2 0x8E /* single-shift-2 */
406#define ISO_CODE_SS3 0x8F /* single-shift-3 */
407#define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
408
409/* Every one-byte code of ISO2022 is classified into one of the
410 following classes. */
411enum iso_code_class_type
412 {
413 ISO_control_0, /* Control codes in the range
414 0x00..0x1F and 0x7F, except for the
415 following 4 codes. */
df7492f9
KH
416 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
417 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
418 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
419 ISO_escape, /* ISO_CODE_ESC (0x1B) */
420 ISO_control_1, /* Control codes in the range
421 0x80..0x9F, except for the
422 following 3 codes. */
423 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
424 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
425 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
426 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
427 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
428 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
429 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
430 };
05e6f5dc 431
df7492f9
KH
432/** Each macro CODING_ISO_FLAG_XXX defines a flag bit of the
433 `iso-flags' attribute of an iso2022 coding system. */
05e6f5dc 434
df7492f9
KH
435/* If set, produce long-form designation sequence (e.g. ESC $ ( A)
436 instead of the correct short-form sequence (e.g. ESC $ A). */
437#define CODING_ISO_FLAG_LONG_FORM 0x0001
93dec019 438
df7492f9
KH
439/* If set, reset graphic planes and registers at end-of-line to the
440 initial state. */
441#define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
05e6f5dc 442
df7492f9
KH
443/* If set, reset graphic planes and registers before any control
444 characters to the initial state. */
445#define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
05e6f5dc 446
df7492f9
KH
447/* If set, encode by 7-bit environment. */
448#define CODING_ISO_FLAG_SEVEN_BITS 0x0008
4ed46869 449
df7492f9
KH
450/* If set, use locking-shift function. */
451#define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
b73bfc1c 452
df7492f9
KH
453/* If set, use single-shift function. Overwrite
454 CODING_ISO_FLAG_LOCKING_SHIFT. */
455#define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
b73bfc1c 456
df7492f9
KH
457/* If set, use designation escape sequence. */
458#define CODING_ISO_FLAG_DESIGNATION 0x0040
b73bfc1c 459
df7492f9
KH
460/* If set, produce revision number sequence. */
461#define CODING_ISO_FLAG_REVISION 0x0080
b73bfc1c 462
df7492f9
KH
463/* If set, produce ISO6429's direction specifying sequence. */
464#define CODING_ISO_FLAG_DIRECTION 0x0100
f4dee582 465
df7492f9
KH
466/* If set, assume designation states are reset at beginning of line on
467 output. */
468#define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
4ed46869 469
df7492f9
KH
470/* If set, designation sequence should be placed at beginning of line
471 on output. */
472#define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
aa72b389 473
ad1746f5 474/* If set, do not encode unsafe characters on output. */
df7492f9 475#define CODING_ISO_FLAG_SAFE 0x0800
aa72b389 476
df7492f9
KH
477/* If set, extra latin codes (128..159) are accepted as a valid code
478 on input. */
479#define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
aa72b389 480
df7492f9 481#define CODING_ISO_FLAG_COMPOSITION 0x2000
aa72b389 482
df7492f9 483#define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
aa72b389 484
bf16eb23 485#define CODING_ISO_FLAG_USE_ROMAN 0x8000
aa72b389 486
bf16eb23 487#define CODING_ISO_FLAG_USE_OLDJIS 0x10000
aa72b389 488
bf16eb23 489#define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
aa72b389 490
df7492f9
KH
491/* A character to be produced on output if encoding of the original
492 character is prohibited by CODING_ISO_FLAG_SAFE. */
493#define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
aa72b389 494
a470d443
KH
495/* UTF-8 section */
496#define CODING_UTF_8_BOM(coding) \
497 ((coding)->spec.utf_8_bom)
4ed46869 498
df7492f9
KH
499/* UTF-16 section */
500#define CODING_UTF_16_BOM(coding) \
501 ((coding)->spec.utf_16.bom)
4ed46869 502
df7492f9
KH
503#define CODING_UTF_16_ENDIAN(coding) \
504 ((coding)->spec.utf_16.endian)
4ed46869 505
df7492f9
KH
506#define CODING_UTF_16_SURROGATE(coding) \
507 ((coding)->spec.utf_16.surrogate)
4ed46869 508
4ed46869 509
df7492f9
KH
510/* CCL section */
511#define CODING_CCL_DECODER(coding) \
512 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
513#define CODING_CCL_ENCODER(coding) \
514 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
515#define CODING_CCL_VALIDS(coding) \
8f924df7 516 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
4ed46869 517
5a936b46 518/* Index for each coding category in `coding_categories'.
   Each value is also used as the bit position of the corresponding
   CATEGORY_MASK_XXX flag (1 << coding_category_XXX).  */
4ed46869 519
df7492f9
KH
520enum coding_category
521 {
522 coding_category_iso_7,
523 coding_category_iso_7_tight,
524 coding_category_iso_8_1,
525 coding_category_iso_8_2,
526 coding_category_iso_7_else,
527 coding_category_iso_8_else,
a470d443
KH
528 coding_category_utf_8_auto,
529 coding_category_utf_8_nosig,
530 coding_category_utf_8_sig,
df7492f9
KH
531 coding_category_utf_16_auto,
532 coding_category_utf_16_be,
533 coding_category_utf_16_le,
534 coding_category_utf_16_be_nosig,
535 coding_category_utf_16_le_nosig,
536 coding_category_charset,
537 coding_category_sjis,
538 coding_category_big5,
539 coding_category_ccl,
540 coding_category_emacs_mule,
541 /* All above are targets of code detection. */
542 coding_category_raw_text,
543 coding_category_undecided,
544 coding_category_max /* Number of categories; used as an array size. */
545 };
546
547/* Definitions of flag bits used in detect_coding_XXXX. */
548#define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
549#define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
550#define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
551#define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
552#define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
553#define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
a470d443
KH
554#define CATEGORY_MASK_UTF_8_AUTO (1 << coding_category_utf_8_auto)
555#define CATEGORY_MASK_UTF_8_NOSIG (1 << coding_category_utf_8_nosig)
556#define CATEGORY_MASK_UTF_8_SIG (1 << coding_category_utf_8_sig)
b49a1807 557#define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
df7492f9
KH
558#define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
559#define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
560#define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
561#define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
562#define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
563#define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
564#define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
565#define CATEGORY_MASK_CCL (1 << coding_category_ccl)
566#define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
ff0dacd7 567#define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
df7492f9
KH
568
569/* This value is returned if detect_coding_mask () finds nothing other
570 than ASCII characters. */
571#define CATEGORY_MASK_ANY \
572 (CATEGORY_MASK_ISO_7 \
573 | CATEGORY_MASK_ISO_7_TIGHT \
574 | CATEGORY_MASK_ISO_8_1 \
575 | CATEGORY_MASK_ISO_8_2 \
576 | CATEGORY_MASK_ISO_7_ELSE \
577 | CATEGORY_MASK_ISO_8_ELSE \
a470d443
KH
578 | CATEGORY_MASK_UTF_8_AUTO \
579 | CATEGORY_MASK_UTF_8_NOSIG \
580 | CATEGORY_MASK_UTF_8_SIG \
2f3cbb32 581 | CATEGORY_MASK_UTF_16_AUTO \
df7492f9
KH
582 | CATEGORY_MASK_UTF_16_BE \
583 | CATEGORY_MASK_UTF_16_LE \
584 | CATEGORY_MASK_UTF_16_BE_NOSIG \
585 | CATEGORY_MASK_UTF_16_LE_NOSIG \
586 | CATEGORY_MASK_CHARSET \
587 | CATEGORY_MASK_SJIS \
588 | CATEGORY_MASK_BIG5 \
589 | CATEGORY_MASK_CCL \
590 | CATEGORY_MASK_EMACS_MULE)
591
592
593#define CATEGORY_MASK_ISO_7BIT \
594 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
595
596#define CATEGORY_MASK_ISO_8BIT \
597 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
598
599#define CATEGORY_MASK_ISO_ELSE \
600 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
601
602#define CATEGORY_MASK_ISO_ESCAPE \
603 (CATEGORY_MASK_ISO_7 \
604 | CATEGORY_MASK_ISO_7_TIGHT \
605 | CATEGORY_MASK_ISO_7_ELSE \
606 | CATEGORY_MASK_ISO_8_ELSE)
607
608#define CATEGORY_MASK_ISO \
609 ( CATEGORY_MASK_ISO_7BIT \
610 | CATEGORY_MASK_ISO_8BIT \
611 | CATEGORY_MASK_ISO_ELSE)
612
613#define CATEGORY_MASK_UTF_16 \
2f3cbb32
KH
614 (CATEGORY_MASK_UTF_16_AUTO \
615 | CATEGORY_MASK_UTF_16_BE \
df7492f9
KH
616 | CATEGORY_MASK_UTF_16_LE \
617 | CATEGORY_MASK_UTF_16_BE_NOSIG \
618 | CATEGORY_MASK_UTF_16_LE_NOSIG)
619
a470d443
KH
620#define CATEGORY_MASK_UTF_8 \
621 (CATEGORY_MASK_UTF_8_AUTO \
622 | CATEGORY_MASK_UTF_8_NOSIG \
623 | CATEGORY_MASK_UTF_8_SIG)
df7492f9 624
df7492f9 625/* Table of coding categories (Lisp symbols). This variable is for
ad1746f5 626 internal use only. */
df7492f9
KH
627static Lisp_Object Vcoding_category_table;
628
629/* Table of coding-categories ordered by priority. */
630static enum coding_category coding_priorities[coding_category_max];
631
632/* Nth element is a coding context for the coding system bound to the
633 Nth coding category. */
634static struct coding_system coding_categories[coding_category_max];
635
df7492f9
KH
636/*** Commonly used macros and functions ***/
637
638#ifndef min
639#define min(a, b) ((a) < (b) ? (a) : (b))
640#endif
641#ifndef max
642#define max(a, b) ((a) > (b) ? (a) : (b))
643#endif
4ed46869 644
24a73b0a
KH
/* Set ATTRS to the attribute vector of CODING (looked up by its id)
   and CHARSET_LIST to the charset list stored in those attributes.  */
645#define CODING_GET_INFO(coding, attrs, charset_list) \
646 do { \
647 (attrs) = CODING_ID_ATTRS ((coding)->id); \
648 (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); \
df7492f9 649 } while (0)
4ed46869 650
4ed46869 651
df7492f9
KH
652/* Safely get one byte from the source text pointed to by SRC, which ends
653 at SRC_END, and set C to that byte. If the source is exhausted, jump
065e3595
KH
654 to `no_more_source', first recording CODING_RESULT_INSUFFICIENT_SRC if
655 SRC_BASE < SRC. If multibytep is nonzero and a multibyte character is
656 found at SRC, set C to the negative value of the character code and
657 record CODING_RESULT_INVALID_SRC. The caller should declare and set:
658 src, src_end, src_base, multibytep, coding, consumed_chars */
aa72b389 659
065e3595
KH
660#define ONE_MORE_BYTE(c) \
661 do { \
662 if (src == src_end) \
663 { \
664 if (src_base < src) \
665 record_conversion_result \
666 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
667 goto no_more_source; \
668 } \
669 c = *src++; \
670 if (multibytep && (c & 0x80)) \
671 { \
672 if ((c & 0xFE) == 0xC0) \
673 c = ((c & 1) << 6) | *src++; \
674 else \
675 { \
35befdaa
KH
676 src--; \
677 c = - string_char (src, &src, NULL); \
065e3595
KH
678 record_conversion_result \
679 (coding, CODING_RESULT_INVALID_SRC); \
680 } \
681 } \
682 consumed_chars++; \
aa72b389
KH
683 } while (0)
684
f56a4450 685/* Safely get two bytes from the source text pointed to by SRC, which
220eeac9
KH
686 ends at SRC_END, and set C1 and C2 to those bytes while skipping the
687 heading multibyte characters. If there are not enough bytes in the
688 source, it jumps to `no_more_source'. If multibytep is nonzero and
689 a multibyte character is found for C2, set C2 to -1 (the character
690 code itself is not computed). The caller should declare and set these
691 variables appropriately in advance:
f56a4450
KH
692 src, src_end, multibytep
693 It is intended that this macro is used in detect_coding_utf_16. */
694
220eeac9
KH
695#define TWO_MORE_BYTES(c1, c2) \
696 do { \
697 do { \
698 if (src == src_end) \
699 goto no_more_source; \
700 c1 = *src++; \
701 if (multibytep && (c1 & 0x80)) \
702 { \
703 if ((c1 & 0xFE) == 0xC0) \
704 c1 = ((c1 & 1) << 6) | *src++; \
705 else \
706 { \
707 src += BYTES_BY_CHAR_HEAD (c1) - 1; \
708 c1 = -1; \
709 } \
710 } \
711 } while (c1 < 0); \
712 if (src == src_end) \
713 goto no_more_source; \
714 c2 = *src++; \
715 if (multibytep && (c2 & 0x80)) \
716 { \
717 if ((c2 & 0xFE) == 0xC0) \
718 c2 = ((c2 & 1) << 6) | *src++; \
719 else \
720 c2 = -1; \
721 } \
f56a4450
KH
722 } while (0)
723
aa72b389 724
065e3595
KH
/* Like ONE_MORE_BYTE, but do not test whether SRC has reached SRC_END,
   so this never jumps to `no_more_source'.  The caller should declare
   and set these variables in advance:
     src, multibytep, coding, consumed_chars
   NOTE(review): presumably the caller has already verified that a byte
   is available at SRC -- confirm at the call sites.  */
725#define ONE_MORE_BYTE_NO_CHECK(c) \
726 do { \
727 c = *src++; \
728 if (multibytep && (c & 0x80)) \
729 { \
730 if ((c & 0xFE) == 0xC0) \
731 c = ((c & 1) << 6) | *src++; \
732 else \
733 { \
35befdaa
KH
734 src--; \
735 c = - string_char (src, &src, NULL); \
065e3595
KH
736 record_conversion_result \
737 (coding, CODING_RESULT_INVALID_SRC); \
738 } \
739 } \
740 consumed_chars++; \
aa72b389
KH
741 } while (0)
742
aa72b389 743
df7492f9
KH
744/* Store a byte C in the place pointed to by DST, increment DST to the
745 next free point, and increment PRODUCED_CHARS. The caller should
746 ensure that C is 0..127, and declare and set the variables `dst' and
747 `produced_chars' appropriately in advance.
748*/
aa72b389
KH
749
750
df7492f9
KH
751#define EMIT_ONE_ASCII_BYTE(c) \
752 do { \
753 produced_chars++; \
754 *dst++ = (c); \
b6871cc7 755 } while (0)
aa72b389
KH
756
757
ad1746f5 758/* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2. */
aa72b389 759
df7492f9
KH
760#define EMIT_TWO_ASCII_BYTES(c1, c2) \
761 do { \
762 produced_chars += 2; \
763 *dst++ = (c1), *dst++ = (c2); \
764 } while (0)
aa72b389
KH
765
766
df7492f9
KH
767/* Store a byte C in the place pointed to by DST, increment DST to the
768 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
769 nonzero, store it in an appropriate multibyte form. The caller should
770 declare and set the variables `dst', `multibytep' and `produced_chars'
771 appropriately in advance. */
772
773#define EMIT_ONE_BYTE(c) \
774 do { \
775 produced_chars++; \
776 if (multibytep) \
777 { \
778 int ch = (c); \
779 if (ch >= 0x80) \
780 ch = BYTE8_TO_CHAR (ch); \
781 CHAR_STRING_ADVANCE (ch, dst); \
782 } \
783 else \
784 *dst++ = (c); \
aa72b389 785 } while (0)
aa72b389 786
aa72b389 787
df7492f9 788/* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
aa72b389 789
e19c3639
KH
790#define EMIT_TWO_BYTES(c1, c2) \
791 do { \
792 produced_chars += 2; \
793 if (multibytep) \
794 { \
795 int ch; \
796 \
797 ch = (c1); \
798 if (ch >= 0x80) \
799 ch = BYTE8_TO_CHAR (ch); \
800 CHAR_STRING_ADVANCE (ch, dst); \
801 ch = (c2); \
802 if (ch >= 0x80) \
803 ch = BYTE8_TO_CHAR (ch); \
804 CHAR_STRING_ADVANCE (ch, dst); \
805 } \
806 else \
807 { \
808 *dst++ = (c1); \
809 *dst++ = (c2); \
810 } \
aa72b389
KH
811 } while (0)
812
813
df7492f9
KH
814#define EMIT_THREE_BYTES(c1, c2, c3) \
815 do { \
816 EMIT_ONE_BYTE (c1); \
817 EMIT_TWO_BYTES (c2, c3); \
818 } while (0)
aa72b389 819
aa72b389 820
df7492f9
KH
821#define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
822 do { \
823 EMIT_TWO_BYTES (c1, c2); \
824 EMIT_TWO_BYTES (c3, c4); \
825 } while (0)
aa72b389 826
aa72b389 827
f6cbaf43 828/* Prototypes for static functions. */
f57e2426
J
829static void record_conversion_result (struct coding_system *coding,
830 enum coding_result_code result);
831static int detect_coding_utf_8 (struct coding_system *,
832 struct coding_detection_info *info);
833static void decode_coding_utf_8 (struct coding_system *);
834static int encode_coding_utf_8 (struct coding_system *);
835
836static int detect_coding_utf_16 (struct coding_system *,
837 struct coding_detection_info *info);
838static void decode_coding_utf_16 (struct coding_system *);
839static int encode_coding_utf_16 (struct coding_system *);
840
841static int detect_coding_iso_2022 (struct coding_system *,
842 struct coding_detection_info *info);
843static void decode_coding_iso_2022 (struct coding_system *);
844static int encode_coding_iso_2022 (struct coding_system *);
845
846static int detect_coding_emacs_mule (struct coding_system *,
847 struct coding_detection_info *info);
848static void decode_coding_emacs_mule (struct coding_system *);
849static int encode_coding_emacs_mule (struct coding_system *);
850
851static int detect_coding_sjis (struct coding_system *,
852 struct coding_detection_info *info);
853static void decode_coding_sjis (struct coding_system *);
854static int encode_coding_sjis (struct coding_system *);
855
856static int detect_coding_big5 (struct coding_system *,
857 struct coding_detection_info *info);
858static void decode_coding_big5 (struct coding_system *);
859static int encode_coding_big5 (struct coding_system *);
860
861static int detect_coding_ccl (struct coding_system *,
862 struct coding_detection_info *info);
863static void decode_coding_ccl (struct coding_system *);
864static int encode_coding_ccl (struct coding_system *);
865
866static void decode_coding_raw_text (struct coding_system *);
867static int encode_coding_raw_text (struct coding_system *);
868
869static void coding_set_source (struct coding_system *);
870static void coding_set_destination (struct coding_system *);
871static void coding_alloc_by_realloc (struct coding_system *, EMACS_INT);
872static void coding_alloc_by_making_gap (struct coding_system *,
873 EMACS_INT, EMACS_INT);
874static unsigned char *alloc_destination (struct coding_system *,
875 EMACS_INT, unsigned char *);
876static void setup_iso_safe_charsets (Lisp_Object);
877static unsigned char *encode_designation_at_bol (struct coding_system *,
878 int *, int *,
879 unsigned char *);
880static int detect_eol (const unsigned char *,
881 EMACS_INT, enum coding_category);
882static Lisp_Object adjust_coding_eol_type (struct coding_system *, int);
883static void decode_eol (struct coding_system *);
884static Lisp_Object get_translation_table (Lisp_Object, int, int *);
885static Lisp_Object get_translation (Lisp_Object, int *, int *);
886static int produce_chars (struct coding_system *, Lisp_Object, int);
887static INLINE void produce_charset (struct coding_system *, int *,
888 EMACS_INT);
889static void produce_annotation (struct coding_system *, EMACS_INT);
890static int decode_coding (struct coding_system *);
891static INLINE int *handle_composition_annotation (EMACS_INT, EMACS_INT,
892 struct coding_system *,
893 int *, EMACS_INT *);
894static INLINE int *handle_charset_annotation (EMACS_INT, EMACS_INT,
895 struct coding_system *,
896 int *, EMACS_INT *);
897static void consume_chars (struct coding_system *, Lisp_Object, int);
898static int encode_coding (struct coding_system *);
899static Lisp_Object make_conversion_work_buffer (int);
900static Lisp_Object code_conversion_restore (Lisp_Object);
901static INLINE int char_encodable_p (int, Lisp_Object);
902static Lisp_Object make_subsidiaries (Lisp_Object);
f6cbaf43 903
065e3595
KH
904static void
905record_conversion_result (struct coding_system *coding,
906 enum coding_result_code result)
907{
908 coding->result = result;
909 switch (result)
910 {
911 case CODING_RESULT_INSUFFICIENT_SRC:
912 Vlast_code_conversion_error = Qinsufficient_source;
913 break;
914 case CODING_RESULT_INCONSISTENT_EOL:
915 Vlast_code_conversion_error = Qinconsistent_eol;
916 break;
917 case CODING_RESULT_INVALID_SRC:
918 Vlast_code_conversion_error = Qinvalid_source;
919 break;
920 case CODING_RESULT_INTERRUPT:
921 Vlast_code_conversion_error = Qinterrupted;
922 break;
923 case CODING_RESULT_INSUFFICIENT_MEM:
924 Vlast_code_conversion_error = Qinsufficient_memory;
925 break;
ebaf11b6
KH
926 case CODING_RESULT_INSUFFICIENT_DST:
927 /* Don't record this error in Vlast_code_conversion_error
928 because it happens just temporarily and is resolved when the
929 whole conversion is finished. */
930 break;
409ea3a1
AS
931 case CODING_RESULT_SUCCESS:
932 break;
35befdaa
KH
933 default:
934 Vlast_code_conversion_error = intern ("Unknown error");
065e3595
KH
935 }
936}
937
/* Decode CODE of CHARSET into C while keeping SRC, SRC_BASE and
   SRC_END valid.  decode_char could cause relocation of buffers if it
   loads a charset map, because loading a charset map allocates large
   structures; when that happens, CODING->source is recomputed and the
   three pointers are shifted by the same distance.  */

#define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
  do {                                                                       \
    charset_map_loaded = 0;                                                  \
    c = DECODE_CHAR (charset, code);                                         \
    if (charset_map_loaded)                                                  \
      {                                                                      \
        const unsigned char *old_source = coding->source;                    \
        EMACS_INT shift;                                                     \
                                                                             \
        coding_set_source (coding);                                          \
        shift = coding->source - old_source;                                 \
        src += shift;                                                        \
        src_base += shift;                                                   \
        src_end += shift;                                                    \
      }                                                                      \
  } while (0)
958
959
119852e7
KH
/* If there is not at least BYTES length of room at dst, allocate
   memory for coding->destination and update dst and dst_end.  We
   don't have to take care of coding->source which will be relocated.
   It is handled by calling coding_set_source in encode_coding.  */
964
/* Grow the destination when fewer than BYTES + 1 bytes remain between
   dst and dst_end.  The request passed to alloc_destination is BYTES
   plus the number of elements still pending in charbuf; dst and
   dst_end are refreshed afterwards.  */
#define ASSURE_DESTINATION(bytes)                               \
  do {                                                          \
    if (dst_end <= dst + (bytes))                               \
      {                                                         \
        int extra = (charbuf_end - charbuf) + (bytes);          \
                                                                \
        dst = alloc_destination (coding, extra, dst);           \
        dst_end = coding->destination + coding->dst_bytes;      \
      }                                                         \
  } while (0)
aa72b389 975
aa72b389 976
/* Store multibyte form of the character C in P, and advance P to the
   end of the multibyte form.  This is like CHAR_STRING_ADVANCE but it
   never calls MAYBE_UNIFY_CHAR.

   C and P are evaluated more than once, so they must be free of side
   effects.  Every use of C and P is parenthesized so that an argument
   containing a low-precedence operator (e.g. `a | b') is encoded
   correctly in all branches.  */

#define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)              \
  do {                                                  \
    if ((c) <= MAX_1_BYTE_CHAR)                         \
      *(p)++ = (c);                                     \
    else if ((c) <= MAX_2_BYTE_CHAR)                    \
      *(p)++ = (0xC0 | ((c) >> 6)),                     \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else if ((c) <= MAX_3_BYTE_CHAR)                    \
      *(p)++ = (0xE0 | ((c) >> 12)),                    \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else if ((c) <= MAX_4_BYTE_CHAR)                    \
      *(p)++ = (0xF0 | ((c) >> 18)),                    \
        *(p)++ = (0x80 | (((c) >> 12) & 0x3F)),         \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else if ((c) <= MAX_5_BYTE_CHAR)                    \
      *(p)++ = 0xF8,                                    \
        *(p)++ = (0x80 | (((c) >> 18) & 0x0F)),         \
        *(p)++ = (0x80 | (((c) >> 12) & 0x3F)),         \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else                                                \
      (p) += BYTE8_STRING ((c) - 0x3FFF80, (p));        \
  } while (0)
1006
1007
/* Yield the character code whose multibyte form starts at P and move
   P past that form.  Same job as STRING_CHAR_ADVANCE, except that
   MAYBE_UNIFY_CHAR is never called.  The length (1 to 5 bytes) is
   chosen from the high bits of the first byte; a two-byte form whose
   first byte is below 0xC2 yields a code offset by 0x3FFF80.  P is
   evaluated several times.  */

#define STRING_CHAR_ADVANCE_NO_UNIFY(p)                                 \
  ((((p)[0] & 0x80) == 0)                                               \
   ? ((p) += 1, (p)[-1])                                                \
   : (((p)[0] & 0x20) == 0)                                             \
   ? ((p) += 2,                                                         \
      (((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0)                \
       | (((p)[-2] & 0x1F) << 6)                                        \
       | ((p)[-1] & 0x3F)))                                             \
   : (((p)[0] & 0x10) == 0)                                             \
   ? ((p) += 3,                                                         \
      ((((p)[-3] & 0x0F) << 12)                                         \
       | (((p)[-2] & 0x3F) << 6)                                        \
       | ((p)[-1] & 0x3F)))                                             \
   : (((p)[0] & 0x08) == 0)                                             \
   ? ((p) += 4,                                                         \
      ((((p)[-4] & 0xF) << 18)                                          \
       | (((p)[-3] & 0x3F) << 12)                                       \
       | (((p)[-2] & 0x3F) << 6)                                        \
       | ((p)[-1] & 0x3F)))                                             \
   : ((p) += 5,                                                         \
      ((((p)[-4] & 0x3F) << 18)                                         \
       | (((p)[-3] & 0x3F) << 12)                                       \
       | (((p)[-2] & 0x3F) << 6)                                        \
       | ((p)[-1] & 0x3F))))
1036
aa72b389 1037
df7492f9 1038static void
971de7fb 1039coding_set_source (struct coding_system *coding)
aa72b389 1040{
df7492f9
KH
1041 if (BUFFERP (coding->src_object))
1042 {
2cb26057 1043 struct buffer *buf = XBUFFER (coding->src_object);
aa72b389 1044
df7492f9 1045 if (coding->src_pos < 0)
2cb26057 1046 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
df7492f9 1047 else
2cb26057 1048 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
aa72b389 1049 }
df7492f9 1050 else if (STRINGP (coding->src_object))
aa72b389 1051 {
8f924df7 1052 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
aa72b389 1053 }
df7492f9
KH
1054 else
1055 /* Otherwise, the source is C string and is never relocated
1056 automatically. Thus we don't have to update anything. */
1057 ;
1058}
aa72b389 1059
df7492f9 1060static void
971de7fb 1061coding_set_destination (struct coding_system *coding)
df7492f9
KH
1062{
1063 if (BUFFERP (coding->dst_object))
aa72b389 1064 {
df7492f9 1065 if (coding->src_pos < 0)
aa72b389 1066 {
13818c30 1067 coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
28f67a95
KH
1068 coding->dst_bytes = (GAP_END_ADDR
1069 - (coding->src_bytes - coding->consumed)
1070 - coding->destination);
aa72b389 1071 }
df7492f9 1072 else
28f67a95
KH
1073 {
1074 /* We are sure that coding->dst_pos_byte is before the gap
1075 of the buffer. */
1076 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
13818c30 1077 + coding->dst_pos_byte - BEG_BYTE);
28f67a95
KH
1078 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1079 - coding->destination);
1080 }
df7492f9
KH
1081 }
1082 else
1083 /* Otherwise, the destination is C string and is never relocated
1084 automatically. Thus we don't have to update anything. */
1085 ;
1086}
1087
1088
1089static void
971de7fb 1090coding_alloc_by_realloc (struct coding_system *coding, EMACS_INT bytes)
df7492f9
KH
1091{
1092 coding->destination = (unsigned char *) xrealloc (coding->destination,
1093 coding->dst_bytes + bytes);
1094 coding->dst_bytes += bytes;
1095}
1096
1097static void
cf84bb53
JB
1098coding_alloc_by_making_gap (struct coding_system *coding,
1099 EMACS_INT gap_head_used, EMACS_INT bytes)
df7492f9 1100{
db274c7a 1101 if (EQ (coding->src_object, coding->dst_object))
df7492f9 1102 {
db274c7a
KH
1103 /* The gap may contain the produced data at the head and not-yet
1104 consumed data at the tail. To preserve those data, we at
1105 first make the gap size to zero, then increase the gap
1106 size. */
1107 EMACS_INT add = GAP_SIZE;
1108
1109 GPT += gap_head_used, GPT_BYTE += gap_head_used;
1110 GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
df7492f9
KH
1111 make_gap (bytes);
1112 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
db274c7a 1113 GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
df7492f9 1114 }
730fff51 1115 else
df7492f9 1116 {
2c78b7e1
KH
1117 Lisp_Object this_buffer;
1118
1119 this_buffer = Fcurrent_buffer ();
df7492f9
KH
1120 set_buffer_internal (XBUFFER (coding->dst_object));
1121 make_gap (bytes);
1122 set_buffer_internal (XBUFFER (this_buffer));
aa72b389 1123 }
df7492f9 1124}
8f924df7 1125
df7492f9
KH
1126
1127static unsigned char *
cf84bb53
JB
1128alloc_destination (struct coding_system *coding, EMACS_INT nbytes,
1129 unsigned char *dst)
df7492f9
KH
1130{
1131 EMACS_INT offset = dst - coding->destination;
1132
1133 if (BUFFERP (coding->dst_object))
db274c7a
KH
1134 {
1135 struct buffer *buf = XBUFFER (coding->dst_object);
1136
1137 coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1138 }
aa72b389 1139 else
df7492f9 1140 coding_alloc_by_realloc (coding, nbytes);
df7492f9
KH
1141 coding_set_destination (coding);
1142 dst = coding->destination + offset;
1143 return dst;
1144}
aa72b389 1145
ff0dacd7
KH
1146/** Macros for annotations. */
1147
ff0dacd7
KH
1148/* An annotation data is stored in the array coding->charbuf in this
1149 format:
69a80ea3 1150 [ -LENGTH ANNOTATION_MASK NCHARS ... ]
ff0dacd7
KH
1151 LENGTH is the number of elements in the annotation.
1152 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
69a80ea3 1153 NCHARS is the number of characters in the text annotated.
ff0dacd7
KH
1154
1155 The format of the following elements depend on ANNOTATION_MASK.
1156
1157 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1158 follows:
e951386e
KH
1159 ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1160
1161 NBYTES is the number of bytes specified in the header part of
1162 old-style emacs-mule encoding, or 0 for the other kind of
1163 composition.
1164
ff0dacd7 1165 METHOD is one of enum composition_method.
e951386e 1166
ad1746f5 1167 Optional COMPOSITION-COMPONENTS are characters and composition
ff0dacd7
KH
1168 rules.
1169
1170 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
e951386e
KH
1171 follows.
1172
1173 If ANNOTATION_MASK is 0, this annotation is just a space holder to
1174 recover from an invalid annotation, and should be skipped by
1175 produce_annotation. */
1176
/* Maximum length of the header of annotation data.  */
#define MAX_ANNOTATION_LENGTH 5

/* Store the three-element annotation header [ -LEN MASK NCHARS ] at
   BUF, advance BUF past it, and mark CODING as annotated.  Expects a
   variable named `coding' in the calling scope.

   The expansion deliberately has no trailing semicolon after
   `while (0)': the caller supplies it, which keeps the macro usable
   as the body of an un-braced `if' that has an `else'.  */
#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1187
/* Store a composition annotation at BUF and advance BUF past it: the
   common header (length 5, CODING_ANNOTATE_COMPOSITION_MASK, NCHARS)
   followed by NBYTES and METHOD.  BUF is parenthesized at every use,
   as in ADD_ANNOTATION_DATA, so an lvalue expression such as `*pp'
   advances the intended pointer.  */
#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)
1194
1195
/* Store a charset annotation at BUF and advance BUF past it: the
   common header (length 4, CODING_ANNOTATE_CHARSET_MASK, NCHARS)
   followed by the charset ID.  BUF is parenthesized at every use, as
   in ADD_ANNOTATION_DATA, so an lvalue expression such as `*pp'
   advances the intended pointer.  */
#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1201
df7492f9
KH
1202\f
1203/*** 2. Emacs' internal format (emacs-utf-8) ***/
1204
1205
1206
1207\f
1208/*** 3. UTF-8 ***/
1209
1210/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
ff0dacd7
KH
1211 Check if a text is encoded in UTF-8. If it is, return 1, else
1212 return 0. */
df7492f9
KH
1213
/* Classify one octet C of a UTF-8 style sequence by its high bits:
   0xxxxxxx single octet, 10xxxxxx trailing octet, 110xxxxx /
   1110xxxx / 11110xxx / 111110xx leading octet of a 2/3/4/5-octet
   sequence.  */
#define UTF_8_1_OCTET_P(c)              (! ((c) >= 0x80))
#define UTF_8_EXTRA_OCTET_P(c)          (0x80 == ((c) & 0xC0))
#define UTF_8_2_OCTET_LEADING_P(c)      (0xC0 == ((c) & 0xE0))
#define UTF_8_3_OCTET_LEADING_P(c)      (0xE0 == ((c) & 0xF0))
#define UTF_8_4_OCTET_LEADING_P(c)      (0xF0 == ((c) & 0xF8))
#define UTF_8_5_OCTET_LEADING_P(c)      (0xF8 == ((c) & 0xFC))

/* The byte order mark character, and the three octets of its UTF-8
   form.  */
#define UTF_BOM         0xFEFF
#define UTF_8_BOM_1     0xEF
#define UTF_8_BOM_2     0xBB
#define UTF_8_BOM_3     0xBF
1225
/* Scan the source of CODING for well-formed UTF-8 style sequences of
   up to five octets and record the verdict in DETECT_INFO.

   Returns 0 after marking CATEGORY_MASK_UTF_8 as rejected when a
   malformed sequence is seen, or when the last block ends in the
   middle of a sequence.  Otherwise returns 1; in that case the
   signature (BOM) category is marked found or rejected, and the
   no-signature category is marked found only if at least one
   multi-octet sequence was seen.

   The label no_more_source is reached only from inside ONE_MORE_BYTE,
   which is defined outside this part of the file -- presumably when
   the input is exhausted (TODO confirm).  consumed_chars and
   multibytep are not referenced directly below; presumably they are
   used by that macro.  */
static int
detect_coding_utf_8 (struct coding_system *coding,
                     struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  int multibytep = coding->src_multibyte;
  int consumed_chars = 0;
  int bom_found = 0;
  int found = 0;

  detect_info->checked |= CATEGORY_MASK_UTF_8;
  /* A coding system of this category is always ASCII compatible.  */
  src += coding->head_ascii;

  while (1)
    {
      int c, c1, c2, c3, c4;

      /* Remember where this sequence starts.  */
      src_base = src;
      ONE_MORE_BYTE (c);
      if (c < 0 || UTF_8_1_OCTET_P (c))
        continue;
      ONE_MORE_BYTE (c1);
      if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
        break;
      if (UTF_8_2_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      ONE_MORE_BYTE (c2);
      if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
        break;
      if (UTF_8_3_OCTET_LEADING_P (c))
        {
          found = 1;
          /* A BOM counts only at the very start of the source.  */
          if (src_base == coding->source
              && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
            bom_found = 1;
          continue;
        }
      ONE_MORE_BYTE (c3);
      if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
        break;
      if (UTF_8_4_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      ONE_MORE_BYTE (c4);
      if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
        break;
      if (UTF_8_5_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      break;
    }
  /* The loop is left by `break' only for a malformed sequence.  */
  detect_info->rejected |= CATEGORY_MASK_UTF_8;
  return 0;

 no_more_source:
  /* A sequence cut short at the end of the last block is an error.  */
  if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8;
      return 0;
    }
  if (bom_found)
    {
      /* The first character 0xFEFF doesn't necessarily mean a BOM.  */
      detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
    }
  else
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
      if (found)
        detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
    }
  return 1;
}
1308
4ed46869 1309
b73bfc1c 1310static void
971de7fb 1311decode_coding_utf_8 (struct coding_system *coding)
b73bfc1c 1312{
8f924df7
KH
1313 const unsigned char *src = coding->source + coding->consumed;
1314 const unsigned char *src_end = coding->source + coding->src_bytes;
1315 const unsigned char *src_base;
69a80ea3
KH
1316 int *charbuf = coding->charbuf + coding->charbuf_used;
1317 int *charbuf_end = coding->charbuf + coding->charbuf_size;
453b38f0 1318 int consumed_chars = 0, consumed_chars_base = 0;
df7492f9 1319 int multibytep = coding->src_multibyte;
a470d443 1320 enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
24a73b0a 1321 Lisp_Object attr, charset_list;
0a9564cb
EZ
1322 int eol_crlf =
1323 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 1324 int byte_after_cr = -1;
4ed46869 1325
24a73b0a 1326 CODING_GET_INFO (coding, attr, charset_list);
df7492f9 1327
a470d443
KH
1328 if (bom != utf_without_bom)
1329 {
1330 int c1, c2, c3;
1331
1332 src_base = src;
1333 ONE_MORE_BYTE (c1);
1334 if (! UTF_8_3_OCTET_LEADING_P (c1))
1335 src = src_base;
1336 else
1337 {
159bd5a2 1338 ONE_MORE_BYTE (c2);
a470d443
KH
1339 if (! UTF_8_EXTRA_OCTET_P (c2))
1340 src = src_base;
1341 else
1342 {
159bd5a2 1343 ONE_MORE_BYTE (c3);
a470d443
KH
1344 if (! UTF_8_EXTRA_OCTET_P (c3))
1345 src = src_base;
1346 else
1347 {
1348 if ((c1 != UTF_8_BOM_1)
1349 || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1350 src = src_base;
1351 else
1352 CODING_UTF_8_BOM (coding) = utf_without_bom;
1353 }
1354 }
1355 }
1356 }
1357 CODING_UTF_8_BOM (coding) = utf_without_bom;
1358
df7492f9 1359 while (1)
b73bfc1c 1360 {
df7492f9 1361 int c, c1, c2, c3, c4, c5;
ec6d2bb8 1362
df7492f9
KH
1363 src_base = src;
1364 consumed_chars_base = consumed_chars;
4af310db 1365
df7492f9 1366 if (charbuf >= charbuf_end)
b71f6f73
KH
1367 {
1368 if (byte_after_cr >= 0)
1369 src_base--;
1370 break;
1371 }
df7492f9 1372
119852e7
KH
1373 if (byte_after_cr >= 0)
1374 c1 = byte_after_cr, byte_after_cr = -1;
1375 else
1376 ONE_MORE_BYTE (c1);
065e3595
KH
1377 if (c1 < 0)
1378 {
1379 c = - c1;
1380 }
1a4990fb 1381 else if (UTF_8_1_OCTET_P (c1))
df7492f9 1382 {
119852e7
KH
1383 if (eol_crlf && c1 == '\r')
1384 ONE_MORE_BYTE (byte_after_cr);
df7492f9 1385 c = c1;
4af310db 1386 }
df7492f9 1387 else
4af310db 1388 {
df7492f9 1389 ONE_MORE_BYTE (c2);
065e3595 1390 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
df7492f9
KH
1391 goto invalid_code;
1392 if (UTF_8_2_OCTET_LEADING_P (c1))
4af310db 1393 {
b0edb2c5
DL
1394 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1395 /* Reject overlong sequences here and below. Encoders
1396 producing them are incorrect, they can be misleading,
1397 and they mess up read/write invariance. */
1398 if (c < 128)
1399 goto invalid_code;
4af310db 1400 }
df7492f9 1401 else
aa72b389 1402 {
df7492f9 1403 ONE_MORE_BYTE (c3);
065e3595 1404 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
df7492f9
KH
1405 goto invalid_code;
1406 if (UTF_8_3_OCTET_LEADING_P (c1))
b0edb2c5
DL
1407 {
1408 c = (((c1 & 0xF) << 12)
1409 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
72fe1301
DL
1410 if (c < 0x800
1411 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
b0edb2c5
DL
1412 goto invalid_code;
1413 }
df7492f9
KH
1414 else
1415 {
1416 ONE_MORE_BYTE (c4);
065e3595 1417 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
df7492f9
KH
1418 goto invalid_code;
1419 if (UTF_8_4_OCTET_LEADING_P (c1))
b0edb2c5 1420 {
df7492f9
KH
1421 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1422 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
b0edb2c5
DL
1423 if (c < 0x10000)
1424 goto invalid_code;
1425 }
df7492f9
KH
1426 else
1427 {
1428 ONE_MORE_BYTE (c5);
065e3595 1429 if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
df7492f9
KH
1430 goto invalid_code;
1431 if (UTF_8_5_OCTET_LEADING_P (c1))
1432 {
1433 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1434 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1435 | (c5 & 0x3F));
b0edb2c5 1436 if ((c > MAX_CHAR) || (c < 0x200000))
df7492f9
KH
1437 goto invalid_code;
1438 }
1439 else
1440 goto invalid_code;
1441 }
1442 }
aa72b389 1443 }
b73bfc1c 1444 }
df7492f9
KH
1445
1446 *charbuf++ = c;
1447 continue;
1448
1449 invalid_code:
1450 src = src_base;
1451 consumed_chars = consumed_chars_base;
1452 ONE_MORE_BYTE (c);
1453 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1454 coding->errors++;
aa72b389
KH
1455 }
1456
df7492f9
KH
1457 no_more_source:
1458 coding->consumed_char += consumed_chars_base;
1459 coding->consumed = src_base - coding->source;
1460 coding->charbuf_used = charbuf - coding->charbuf;
1461}
1462
1463
1464static int
971de7fb 1465encode_coding_utf_8 (struct coding_system *coding)
df7492f9
KH
1466{
1467 int multibytep = coding->dst_multibyte;
1468 int *charbuf = coding->charbuf;
1469 int *charbuf_end = charbuf + coding->charbuf_used;
1470 unsigned char *dst = coding->destination + coding->produced;
1471 unsigned char *dst_end = coding->destination + coding->dst_bytes;
e19c3639 1472 int produced_chars = 0;
df7492f9
KH
1473 int c;
1474
a470d443
KH
1475 if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1476 {
1477 ASSURE_DESTINATION (3);
1478 EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1479 CODING_UTF_8_BOM (coding) = utf_without_bom;
1480 }
1481
df7492f9 1482 if (multibytep)
aa72b389 1483 {
df7492f9
KH
1484 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1485
1486 while (charbuf < charbuf_end)
b73bfc1c 1487 {
df7492f9 1488 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
8f924df7 1489
df7492f9
KH
1490 ASSURE_DESTINATION (safe_room);
1491 c = *charbuf++;
28f67a95
KH
1492 if (CHAR_BYTE8_P (c))
1493 {
1494 c = CHAR_TO_BYTE8 (c);
1495 EMIT_ONE_BYTE (c);
1496 }
1497 else
1498 {
db274c7a 1499 CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
28f67a95
KH
1500 for (p = str; p < pend; p++)
1501 EMIT_ONE_BYTE (*p);
1502 }
b73bfc1c 1503 }
aa72b389 1504 }
df7492f9
KH
1505 else
1506 {
1507 int safe_room = MAX_MULTIBYTE_LENGTH;
1508
1509 while (charbuf < charbuf_end)
b73bfc1c 1510 {
df7492f9
KH
1511 ASSURE_DESTINATION (safe_room);
1512 c = *charbuf++;
f03caae0
KH
1513 if (CHAR_BYTE8_P (c))
1514 *dst++ = CHAR_TO_BYTE8 (c);
1515 else
db274c7a 1516 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
df7492f9 1517 produced_chars++;
4ed46869
KH
1518 }
1519 }
065e3595 1520 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
1521 coding->produced_char += produced_chars;
1522 coding->produced = dst - coding->destination;
1523 return 0;
4ed46869
KH
1524}
1525
b73bfc1c 1526
df7492f9 1527/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
ff0dacd7
KH
1528 Check if a text is encoded in one of UTF-16 based coding systems.
1529 If it is, return 1, else return 0. */
aa72b389 1530
/* Nonzero if bits 10..15 of VAL match the high-surrogate pattern
   (0xD800..0xDBFF).  Bits above bit 15 are not examined.  */
#define UTF_16_HIGH_SURROGATE_P(val)    (0xD800 == ((val) & 0xFC00))

/* Nonzero if bits 10..15 of VAL match the low-surrogate pattern
   (0xDC00..0xDFFF).  Bits above bit 15 are not examined.  */
#define UTF_16_LOW_SURROGATE_P(val)     (0xDC00 == ((val) & 0xFC00))

/* Nonzero if VAL is 0xFFFE, 0xFFFF, or a low surrogate.  */
#define UTF_16_INVALID_P(val)           \
  ((0xFFFE == (val))                    \
   || (0xFFFF == (val))                 \
   || UTF_16_LOW_SURROGATE_P (val))
aa72b389 1541
aa72b389 1542
/* Examine the source of CODING for UTF-16 and record the verdict in
   DETECT_INFO.

   The whole UTF-16 category is rejected (return 0) when the last
   block has an odd src_chars, or when the second value of the first
   pair is negative.  If the first two bytes are a little- or
   big-endian BOM, the matching signed category and the auto category
   are marked found and the others rejected, and 1 is returned.
   Otherwise the signed categories are rejected and the remaining
   bytes are sampled: once 128 distinct values have been seen at even
   (resp. odd) positions, the big-endian (resp. little-endian)
   no-signature category is rejected too; that path returns 0.

   The label no_more_source is reached by falling through after a BOM
   is recognized, and from inside TWO_MORE_BYTES, which is defined
   outside this part of the file -- presumably when the input is
   exhausted (TODO confirm).  */
static int
detect_coding_utf_16 (struct coding_system *coding,
                      struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  int multibytep = coding->src_multibyte;
  int c1, c2;

  detect_info->checked |= CATEGORY_MASK_UTF_16;
  if (coding->mode & CODING_MODE_LAST_BLOCK
      && (coding->src_chars & 1))
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_16;
      return 0;
    }

  TWO_MORE_BYTES (c1, c2);
  if ((c1 == 0xFF) && (c2 == 0xFE))
    {
      /* Little-endian BOM.  */
      detect_info->found |= (CATEGORY_MASK_UTF_16_LE
                             | CATEGORY_MASK_UTF_16_AUTO);
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
                                | CATEGORY_MASK_UTF_16_BE_NOSIG
                                | CATEGORY_MASK_UTF_16_LE_NOSIG);
    }
  else if ((c1 == 0xFE) && (c2 == 0xFF))
    {
      /* Big-endian BOM.  */
      detect_info->found |= (CATEGORY_MASK_UTF_16_BE
                             | CATEGORY_MASK_UTF_16_AUTO);
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
                                | CATEGORY_MASK_UTF_16_BE_NOSIG
                                | CATEGORY_MASK_UTF_16_LE_NOSIG);
    }
  else if (c2 < 0)
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_16;
      return 0;
    }
  else
    {
      /* We check the dispersion of Eth and Oth bytes where E is even and
         O is odd.  If both are high, we assume binary data.  */
      unsigned char e[256], o[256];
      unsigned e_num = 1, o_num = 1;

      memset (e, 0, 256);
      memset (o, 0, 256);
      e[c1] = 1;
      o[c2] = 1;

      /* No BOM, so the categories that require one are out.  */
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
                                |CATEGORY_MASK_UTF_16_BE
                                | CATEGORY_MASK_UTF_16_LE);

      /* Keep sampling until every UTF-16 category is rejected.  */
      while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
             != CATEGORY_MASK_UTF_16)
        {
          TWO_MORE_BYTES (c1, c2);
          if (c2 < 0)
            break;
          if (! e[c1])
            {
              e[c1] = 1;
              e_num++;
              if (e_num >= 128)
                detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
            }
          if (! o[c2])
            {
              o[c2] = 1;
              o_num++;
              if (o_num >= 128)
                detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
            }
        }
      return 0;
    }

 no_more_source:
  return 1;
}
aa72b389 1625
/* Decode the UTF-16 source of CODING, starting at CODING->consumed,
   into character codes appended to CODING->charbuf.

   With utf_with_bom the first two bytes are checked against the BOM
   for the configured endianness; if they do not match they are
   decoded as ordinary data and CODING->errors is incremented.  A
   pending high surrogate is kept in the coding system (via
   CODING_UTF_16_SURROGATE) until the next 16-bit unit arrives.

   On return CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used describe how far decoding got.  The label
   no_more_source is also reached from inside ONE_MORE_BYTE, which is
   defined outside this part of the file -- presumably when the input
   is exhausted (TODO confirm).  */
static void
decode_coding_utf_16 (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  /* We may produce at most 3 chars in one loop.  */
  int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
  int consumed_chars = 0, consumed_chars_base = 0;
  int multibytep = coding->src_multibyte;
  enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
  enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
  int surrogate = CODING_UTF_16_SURROGATE (coding);
  Lisp_Object attr, charset_list;
  int eol_crlf =
    !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* The two bytes read ahead after a CR when eol_crlf is set, or -1.  */
  int byte_after_cr1 = -1, byte_after_cr2 = -1;

  CODING_GET_INFO (coding, attr, charset_list);

  if (bom == utf_with_bom)
    {
      int c, c1, c2;

      src_base = src;
      ONE_MORE_BYTE (c1);
      ONE_MORE_BYTE (c2);
      c = (c1 << 8) | c2;

      if (endian == utf_16_big_endian
          ? c != 0xFEFF : c != 0xFFFE)
        {
          /* The first two bytes are not BOM.  Treat them as bytes
             for a normal character.  */
          src = src_base;
          coding->errors++;
        }
      CODING_UTF_16_BOM (coding) = utf_without_bom;
    }
  else if (bom == utf_detect_bom)
    {
      /* We have already tried to detect BOM and failed in
         detect_coding.  */
      CODING_UTF_16_BOM (coding) = utf_without_bom;
    }

  while (1)
    {
      int c, c1, c2;

      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* Give back the two bytes that were read ahead after a CR.  */
          if (byte_after_cr1 >= 0)
            src_base -= 2;
          break;
        }

      if (byte_after_cr1 >= 0)
        c1 = byte_after_cr1, byte_after_cr1 = -1;
      else
        ONE_MORE_BYTE (c1);
      if (c1 < 0)
        {
          /* A negative value is passed through negated.  */
          *charbuf++ = -c1;
          continue;
        }
      if (byte_after_cr2 >= 0)
        c2 = byte_after_cr2, byte_after_cr2 = -1;
      else
        ONE_MORE_BYTE (c2);
      if (c2 < 0)
        {
          *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
          *charbuf++ = -c2;
          continue;
        }
      /* Combine the two bytes according to the byte order.  */
      c = (endian == utf_16_big_endian
           ? ((c1 << 8) | c2) : ((c2 << 8) | c1));

      if (surrogate)
        {
          if (! UTF_16_LOW_SURROGATE_P (c))
            {
              /* The pending high surrogate is not followed by a low
                 one: emit its two bytes and count an error.  */
              if (endian == utf_16_big_endian)
                c1 = surrogate >> 8, c2 = surrogate & 0xFF;
              else
                c1 = surrogate & 0xFF, c2 = surrogate >> 8;
              *charbuf++ = c1;
              *charbuf++ = c2;
              coding->errors++;
              if (UTF_16_HIGH_SURROGATE_P (c))
                CODING_UTF_16_SURROGATE (coding) = surrogate = c;
              else
                *charbuf++ = c;
            }
          else
            {
              /* Complete pair: combine into one code above 0xFFFF.  */
              c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
              CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
              *charbuf++ = 0x10000 + c;
            }
        }
      else
        {
          if (UTF_16_HIGH_SURROGATE_P (c))
            CODING_UTF_16_SURROGATE (coding) = surrogate = c;
          else
            {
              if (eol_crlf && c == '\r')
                {
                  /* Read the next unit ahead before emitting the CR.  */
                  ONE_MORE_BYTE (byte_after_cr1);
                  ONE_MORE_BYTE (byte_after_cr2);
                }
              *charbuf++ = c;
            }
        }
    }

 no_more_source:
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
b73bfc1c 1753
df7492f9 1754static int
971de7fb 1755encode_coding_utf_16 (struct coding_system *coding)
df7492f9
KH
1756{
1757 int multibytep = coding->dst_multibyte;
1758 int *charbuf = coding->charbuf;
1759 int *charbuf_end = charbuf + coding->charbuf_used;
1760 unsigned char *dst = coding->destination + coding->produced;
1761 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1762 int safe_room = 8;
a470d443 1763 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
df7492f9
KH
1764 int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1765 int produced_chars = 0;
24a73b0a 1766 Lisp_Object attrs, charset_list;
df7492f9 1767 int c;
4ed46869 1768
24a73b0a 1769 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9 1770
a470d443 1771 if (bom != utf_without_bom)
df7492f9
KH
1772 {
1773 ASSURE_DESTINATION (safe_room);
1774 if (big_endian)
df7492f9 1775 EMIT_TWO_BYTES (0xFE, 0xFF);
880cf180
KH
1776 else
1777 EMIT_TWO_BYTES (0xFF, 0xFE);
a470d443 1778 CODING_UTF_16_BOM (coding) = utf_without_bom;
df7492f9
KH
1779 }
1780
1781 while (charbuf < charbuf_end)
1782 {
1783 ASSURE_DESTINATION (safe_room);
1784 c = *charbuf++;
60afa08d 1785 if (c > MAX_UNICODE_CHAR)
e19c3639 1786 c = coding->default_char;
df7492f9
KH
1787
1788 if (c < 0x10000)
1789 {
1790 if (big_endian)
1791 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1792 else
1793 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1794 }
1795 else
1796 {
1797 int c1, c2;
1798
1799 c -= 0x10000;
1800 c1 = (c >> 10) + 0xD800;
1801 c2 = (c & 0x3FF) + 0xDC00;
1802 if (big_endian)
1803 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1804 else
1805 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1806 }
1807 }
065e3595 1808 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
1809 coding->produced = dst - coding->destination;
1810 coding->produced_char += produced_chars;
1811 return 0;
1812}
1813
1814\f
1815/*** 6. Old Emacs' internal format (emacs-mule) ***/
1816
1817/* Emacs' internal format for representation of multiple character
1818 sets is a kind of multi-byte encoding, i.e. characters are
1819 represented by variable-length sequences of one-byte codes.
1820
1821 ASCII characters and control characters (e.g. `tab', `newline') are
1822 represented by one-byte sequences which are their ASCII codes, in
1823 the range 0x00 through 0x7F.
1824
1825 8-bit characters of the range 0x80..0x9F are represented by
1826 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1827 code + 0x20).
1828
1829 8-bit characters of the range 0xA0..0xFF are represented by
1830 one-byte sequences which are their 8-bit code.
1831
1832 The other characters are represented by a sequence of `base
1833 leading-code', optional `extended leading-code', and one or two
1834 `position-code's. The length of the sequence is determined by the
1835 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1836 whereas extended leading-code and position-code take the range 0xA0
1837 through 0xFF. See `charset.h' for more details about leading-code
1838 and position-code.
1839
1840 --- CODE RANGE of Emacs' internal format ---
1841 character set range
1842 ------------- -----
1843 ascii 0x00..0x7F
1844 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1845 eight-bit-graphic 0xA0..0xFF
1846 ELSE 0x81..0x9D + [0xA0..0xFF]+
1847 ---------------------------------------------
1848
1849 As this is the internal character representation, the format is
1850 usually not used externally (i.e. in a file or in a data sent to a
1851 process). But, it is possible to have a text externally in this
1852 format (i.e. by encoding by the coding system `emacs-mule').
1853
1854 In that case, a sequence of one-byte codes has a slightly different
1855 form.
1856
1857 At first, all characters in eight-bit-control are represented by
1858 one-byte sequences which are their 8-bit code.
1859
1860 Next, character composition data are represented by the byte
1861 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1862 where,
e951386e 1863 METHOD is 0xF2 plus one of the composition methods (enum
df7492f9
KH
1864 composition_method),
1865
1866 BYTES is 0xA0 plus a byte length of this composition data,
1867
e951386e 1868 CHARS is 0xA0 plus a number of characters composed by this
df7492f9
KH
1869 data,
1870
ad1746f5 1871 COMPONENTs are characters of multibyte form or composition
df7492f9
KH
1872 rules encoded by two-byte of ASCII codes.
1873
1874 In addition, for backward compatibility, the following formats are
1875 also recognized as composition data on decoding.
1876
1877 0x80 MSEQ ...
1878 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1879
1880 Here,
1881 MSEQ is a multibyte form, but in one of these special formats:
1882 ASCII: 0xA0 ASCII_CODE+0x80,
1883 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1884 RULE is a one byte code of the range 0xA0..0xF0 that
1885 represents a composition rule.
1886 */
1887
1888char emacs_mule_bytes[256];
1889
e951386e
KH
1890
1891/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1892 Check if a text is encoded in `emacs-mule'. If it is, return 1,
1893 else return 0. */
1894
1895static int
cf84bb53
JB
1896detect_coding_emacs_mule (struct coding_system *coding,
1897 struct coding_detection_info *detect_info)
e951386e
KH
1898{
1899 const unsigned char *src = coding->source, *src_base;
1900 const unsigned char *src_end = coding->source + coding->src_bytes;
1901 int multibytep = coding->src_multibyte;
1902 int consumed_chars = 0;
1903 int c;
1904 int found = 0;
1905
1906 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1907 /* A coding system of this category is always ASCII compatible. */
1908 src += coding->head_ascii;
1909
1910 while (1)
1911 {
1912 src_base = src;
1913 ONE_MORE_BYTE (c);
1914 if (c < 0)
1915 continue;
1916 if (c == 0x80)
1917 {
1918 /* Perhaps the start of composite character. We simply skip
1919 it because analyzing it is too heavy for detecting. But,
1920 at least, we check that the composite character
1921 constitutes of more than 4 bytes. */
1922 const unsigned char *src_base;
1923
1924 repeat:
1925 src_base = src;
1926 do
1927 {
1928 ONE_MORE_BYTE (c);
1929 }
1930 while (c >= 0xA0);
1931
1932 if (src - src_base <= 4)
1933 break;
1934 found = CATEGORY_MASK_EMACS_MULE;
1935 if (c == 0x80)
1936 goto repeat;
1937 }
1938
1939 if (c < 0x80)
1940 {
1941 if (c < 0x20
1942 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1943 break;
1944 }
1945 else
1946 {
396475b7 1947 int more_bytes = emacs_mule_bytes[c] - 1;
e951386e
KH
1948
1949 while (more_bytes > 0)
1950 {
1951 ONE_MORE_BYTE (c);
1952 if (c < 0xA0)
1953 {
1954 src--; /* Unread the last byte. */
1955 break;
1956 }
1957 more_bytes--;
1958 }
1959 if (more_bytes != 0)
1960 break;
1961 found = CATEGORY_MASK_EMACS_MULE;
1962 }
1963 }
1964 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1965 return 0;
1966
1967 no_more_source:
1968 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1969 {
1970 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1971 return 0;
1972 }
1973 detect_info->found |= found;
1974 return 1;
1975}
1976
1977
1978/* Parse emacs-mule multibyte sequence at SRC and return the decoded
1979 character. If CMP_STATUS indicates that we must expect MSEQ or
1980 RULE described above, decode it and return the negative value of
685ebdc8 1981 the decoded character or rule. If an invalid byte is found, return
e951386e
KH
1982 -1. If SRC is too short, return -2. */
1983
df7492f9 1984int
cf84bb53
JB
1985emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1986 int *nbytes, int *nchars, int *id,
1987 struct composition_status *cmp_status)
df7492f9 1988{
8f924df7
KH
1989 const unsigned char *src_end = coding->source + coding->src_bytes;
1990 const unsigned char *src_base = src;
df7492f9 1991 int multibytep = coding->src_multibyte;
b84ae584 1992 int charset_id;
df7492f9
KH
1993 unsigned code;
1994 int c;
1995 int consumed_chars = 0;
e951386e 1996 int mseq_found = 0;
df7492f9
KH
1997
1998 ONE_MORE_BYTE (c);
065e3595 1999 if (c < 0)
df7492f9 2000 {
065e3595 2001 c = -c;
b84ae584 2002 charset_id = emacs_mule_charset[0];
065e3595
KH
2003 }
2004 else
2005 {
4d41e8b7
KH
2006 if (c >= 0xA0)
2007 {
e951386e
KH
2008 if (cmp_status->state != COMPOSING_NO
2009 && cmp_status->old_form)
4d41e8b7 2010 {
e951386e
KH
2011 if (cmp_status->state == COMPOSING_CHAR)
2012 {
2013 if (c == 0xA0)
2014 {
2015 ONE_MORE_BYTE (c);
2016 c -= 0x80;
2017 if (c < 0)
2018 goto invalid_code;
2019 }
2020 else
2021 c -= 0x20;
2022 mseq_found = 1;
2023 }
2024 else
2025 {
2026 *nbytes = src - src_base;
2027 *nchars = consumed_chars;
2028 return -c;
2029 }
4d41e8b7
KH
2030 }
2031 else
e951386e 2032 goto invalid_code;
4d41e8b7
KH
2033 }
2034
065e3595 2035 switch (emacs_mule_bytes[c])
b73bfc1c 2036 {
065e3595 2037 case 2:
b84ae584 2038 if ((charset_id = emacs_mule_charset[c]) < 0)
df7492f9
KH
2039 goto invalid_code;
2040 ONE_MORE_BYTE (c);
9ffd559c 2041 if (c < 0xA0)
065e3595 2042 goto invalid_code;
df7492f9 2043 code = c & 0x7F;
065e3595
KH
2044 break;
2045
2046 case 3:
2047 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2048 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2049 {
2050 ONE_MORE_BYTE (c);
b84ae584 2051 if (c < 0xA0 || (charset_id = emacs_mule_charset[c]) < 0)
065e3595
KH
2052 goto invalid_code;
2053 ONE_MORE_BYTE (c);
9ffd559c 2054 if (c < 0xA0)
065e3595
KH
2055 goto invalid_code;
2056 code = c & 0x7F;
2057 }
2058 else
2059 {
b84ae584 2060 if ((charset_id = emacs_mule_charset[c]) < 0)
065e3595
KH
2061 goto invalid_code;
2062 ONE_MORE_BYTE (c);
9ffd559c 2063 if (c < 0xA0)
065e3595
KH
2064 goto invalid_code;
2065 code = (c & 0x7F) << 8;
2066 ONE_MORE_BYTE (c);
9ffd559c 2067 if (c < 0xA0)
065e3595
KH
2068 goto invalid_code;
2069 code |= c & 0x7F;
2070 }
2071 break;
2072
2073 case 4:
2074 ONE_MORE_BYTE (c);
b84ae584 2075 if (c < 0 || (charset_id = emacs_mule_charset[c]) < 0)
df7492f9
KH
2076 goto invalid_code;
2077 ONE_MORE_BYTE (c);
9ffd559c 2078 if (c < 0xA0)
065e3595 2079 goto invalid_code;
781d7a48 2080 code = (c & 0x7F) << 8;
df7492f9 2081 ONE_MORE_BYTE (c);
9ffd559c 2082 if (c < 0xA0)
065e3595 2083 goto invalid_code;
df7492f9 2084 code |= c & 0x7F;
065e3595 2085 break;
df7492f9 2086
065e3595
KH
2087 case 1:
2088 code = c;
b84ae584 2089 charset_id = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
065e3595 2090 break;
df7492f9 2091
065e3595
KH
2092 default:
2093 abort ();
2094 }
b84ae584
KH
2095 CODING_DECODE_CHAR (coding, src, src_base, src_end,
2096 CHARSET_FROM_ID (charset_id), code, c);
065e3595
KH
2097 if (c < 0)
2098 goto invalid_code;
df7492f9 2099 }
df7492f9
KH
2100 *nbytes = src - src_base;
2101 *nchars = consumed_chars;
ff0dacd7 2102 if (id)
b84ae584 2103 *id = charset_id;
e951386e 2104 return (mseq_found ? -c : c);
df7492f9
KH
2105
2106 no_more_source:
2107 return -2;
2108
2109 invalid_code:
2110 return -1;
2111}
2112
2113
e951386e 2114/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
df7492f9 2115
e951386e
KH
2116/* Handle these composition sequences ('|': the end of header elements,
2117 BYTES and CHARS >= 0xA0):
df7492f9 2118
e951386e
KH
2119 (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2120 (2) altchar composition: 0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2121 (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
df7492f9 2122
e951386e 2123 and these old forms:
1a4990fb 2124
e951386e
KH
2125 (4) relative composition: 0x80 | MSEQ ... MSEQ
2126 (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
df7492f9 2127
e951386e
KH
2128 When the starter 0x80 and the following header elements are found,
2129 this annotation header is produced.
df7492f9 2130
e951386e 2131 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
df7492f9 2132
e951386e
KH
2133 NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2134 NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
df7492f9 2135
e951386e
KH
2136 Then, upon reading the following elements, these codes are produced
2137 until the composition end is found:
df7492f9 2138
e951386e
KH
2139 (1) CHAR ... CHAR
2140 (2) ALT ... ALT CHAR ... CHAR
2141 (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2142 (4) CHAR ... CHAR
2143 (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
4ed46869 2144
e951386e
KH
2145 When the composition end is found, LENGTH and NCHARS in the
2146 annotation header are updated as below:
b73bfc1c 2147
e951386e
KH
2148 (1) LENGTH: unchanged, NCHARS: unchanged
2149 (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2150 (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2151 (4) LENGTH: unchanged, NCHARS: number of CHARs
2152 (5) LENGTH: unchanged, NCHARS: number of CHARs
df7492f9 2153
e951386e
KH
2154 If an error is found while composing, the annotation header is
2155 changed to the original composition header (plus filler -1s) as
2156 below:
2157
2158 (1),(2),(3) [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2159 (5) [ 0x80 0xFF -1 -1 -1 ]
2160
2161 and the sequence [ -2 DECODED-RULE ] is changed to the original
2162 byte sequence as below:
2163 o the original byte sequence is B: [ B -1 ]
2164 o the original byte sequence is B1 B2: [ B1 B2 ]
2165
2166 Most of the routines are implemented by macros because many
2167 variables and labels in the caller decode_coding_emacs_mule must be
2168 accessible, and they are usually called just once (thus they don't
2169 increase the size of the compiled object). */
2170
/* Decode the Emacs 20 style composition rule byte held in C and set
   RULE to the result.  C is modified: 0xA0 is subtracted from it, and
   control jumps to the caller's `invalid_code' label unless the
   result is in the range 0..80.  The quotient and remainder of that
   value divided by 9 give the two reference points, where the value
   4 stands for 10.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)          \
  do {                                                          \
    int ref_g, ref_n;                                           \
                                                                \
    c -= 0xA0;                                                  \
    if (c < 0 || c >= 81)                                       \
      goto invalid_code;                                        \
    ref_g = c / 9;                                              \
    ref_n = c % 9;                                              \
    if (ref_g == 4)                                             \
      ref_g = 10;                                               \
    if (ref_n == 4)                                             \
      ref_n = 10;                                               \
    rule = COMPOSITION_ENCODE_RULE (ref_g, ref_n);              \
  } while (0)
2187
2188
e951386e
KH
/* Decode an Emacs 21 style composition rule and set RULE to the
   result.  The rule is two bytes: the first is already in C, the
   second is read from SRC with ONE_MORE_BYTE (so C is overwritten).
   Each byte minus 0x20 must be in the range 0..80; otherwise control
   jumps to the caller's `invalid_code' label.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)          \
  do {                                                          \
    int ref_g = c - 0x20;                                       \
    int ref_n;                                                  \
                                                                \
    if (ref_g < 0 || ref_g >= 81)                               \
      goto invalid_code;                                        \
    ONE_MORE_BYTE (c);                                          \
    ref_n = c - 0x20;                                           \
    if (ref_n < 0 || ref_n >= 81)                               \
      goto invalid_code;                                        \
    rule = COMPOSITION_ENCODE_RULE (ref_g, ref_n);              \
  } while (0)
2206
2207
e951386e
KH
/* Start of Emacs 21 style format.  The method byte (0xF2 + METHOD)
   has already been read into C.  The next two bytes at SRC are
   (0xA0 + BYTES) and (0xA0 + CHARS), where BYTES is the byte length
   of this composition information and CHARS is the number of
   characters composed by this composition.

   Jump to `invalid_code' if BYTES is less than 3 (or is not 4 for a
   relative composition), or if CHARS is not in the range
   1..MAX_COMPOSITION_COMPONENTS-1.  Otherwise initialize CMP_STATUS
   and add the composition annotation header to CHARBUF.  */

#define DECODE_EMACS_MULE_21_COMPOSITION()                              \
  do {                                                                  \
    enum composition_method cmp_method = c - 0xF2;                      \
    int cmp_bytes, cmp_chars;                                           \
                                                                        \
    ONE_MORE_BYTE (c);                                                  \
    if (c < 0)                                                          \
      goto invalid_code;                                                \
    cmp_bytes = c - 0xA0;                                               \
    if (cmp_bytes < 3                                                   \
        || (cmp_method == COMPOSITION_RELATIVE && cmp_bytes != 4))      \
      goto invalid_code;                                                \
    ONE_MORE_BYTE (c);                                                  \
    cmp_chars = c - 0xA0;                                               \
    if (cmp_chars <= 0 || cmp_chars >= MAX_COMPOSITION_COMPONENTS)      \
      goto invalid_code;                                                \
    cmp_status->old_form = 0;                                           \
    cmp_status->method = cmp_method;                                    \
    cmp_status->state = (cmp_method == COMPOSITION_RELATIVE             \
                         ? COMPOSING_CHAR                               \
                         : COMPOSING_COMPONENT_CHAR);                   \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
    cmp_status->nchars = cmp_chars;                                     \
    cmp_status->ncomps = cmp_bytes - 4;                                 \
    ADD_COMPOSITION_DATA (charbuf, cmp_chars, cmp_bytes, cmp_method);   \
  } while (0)
93dec019 2239
aa72b389 2240
e951386e
KH
/* Start of Emacs 20 style (old form) relative composition: reset
   CMP_STATUS for it and add an annotation header with zero NCHARS and
   NBYTES to CHARBUF.  */

#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()                     \
  do {                                                                  \
    cmp_status->old_form = 1;                                           \
    cmp_status->method = COMPOSITION_RELATIVE;                          \
    cmp_status->state = COMPOSING_CHAR;                                 \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
    cmp_status->ncomps = 0;                                             \
    cmp_status->nchars = 0;                                             \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);           \
  } while (0)
2252
2253
/* Start of Emacs 20 style (old form) rule-base composition: reset
   CMP_STATUS for it and add an annotation header with zero NCHARS and
   NBYTES to CHARBUF.  */

#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()                     \
  do {                                                                  \
    cmp_status->old_form = 1;                                           \
    cmp_status->method = COMPOSITION_WITH_RULE;                         \
    cmp_status->state = COMPOSING_CHAR;                                 \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
    cmp_status->ncomps = 0;                                             \
    cmp_status->nchars = 0;                                             \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);           \
  } while (0)
2265
2266
e951386e
KH
/* Called just after the composition starter 0x80 was read.  Read the
   next byte into C and start the kind of composition it announces:
     0xF2 + METHOD   Emacs 21 style composition,
     0xA0..0xBF      Emacs 20 style relative composition (the byte is
                     then re-read as the first component),
     0xFF            Emacs 20 style rule-base composition.
   Any other byte makes control jump to `invalid_code'.  */

#define DECODE_EMACS_MULE_COMPOSITION_START()                           \
  do {                                                                  \
    const unsigned char *component_start = src;                         \
                                                                        \
    ONE_MORE_BYTE (c);                                                  \
    if (c < 0)                                                          \
      goto invalid_code;                                                \
    if (c - 0xF2 >= COMPOSITION_RELATIVE                                \
        && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)                  \
      DECODE_EMACS_MULE_21_COMPOSITION ();                              \
    else if (c == 0xFF)                                                 \
      DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();                     \
    else if (c >= 0xA0 && c < 0xC0)                                     \
      {                                                                 \
        DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();                   \
        /* Re-read C as a composition component.  */                    \
        src = component_start;                                          \
      }                                                                 \
    else                                                                \
      goto invalid_code;                                                \
  } while (0)
2290
/* Finish the current composition normally.  The annotation header
   starts CMP_STATUS->length elements before CHARBUF.  For an old-form
   composition, store the number of characters in the header's NCHARS
   slot; for a new-form composition other than a relative one, set the
   header's LENGTH slot to NCHARS minus the annotation length.  Then
   leave the composing state.  */

#define EMACS_MULE_COMPOSITION_END()                                    \
  do {                                                                  \
    int *cmp_header = charbuf - cmp_status->length;                     \
                                                                        \
    if (cmp_status->old_form)                                           \
      cmp_header[2] = cmp_status->nchars;                               \
    else if (cmp_status->method > COMPOSITION_RELATIVE)                 \
      cmp_header[0] = cmp_header[2] - cmp_status->length;               \
    cmp_status->state = COMPOSING_NO;                                   \
  } while (0)
2301
2302
2303static int
cf84bb53
JB
2304emacs_mule_finish_composition (int *charbuf,
2305 struct composition_status *cmp_status)
e951386e
KH
2306{
2307 int idx = - cmp_status->length;
2308 int new_chars;
2309
2310 if (cmp_status->old_form && cmp_status->nchars > 0)
2311 {
2312 charbuf[idx + 2] = cmp_status->nchars;
2313 new_chars = 0;
2314 if (cmp_status->method == COMPOSITION_WITH_RULE
2315 && cmp_status->state == COMPOSING_CHAR)
2316 {
2317 /* The last rule was invalid. */
2318 int rule = charbuf[-1] + 0xA0;
2319
2320 charbuf[-2] = BYTE8_TO_CHAR (rule);
2321 charbuf[-1] = -1;
2322 new_chars = 1;
2323 }
2324 }
2325 else
2326 {
2327 charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2328
2329 if (cmp_status->method == COMPOSITION_WITH_RULE)
2330 {
2331 charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2332 charbuf[idx++] = -3;
2333 charbuf[idx++] = 0;
2334 new_chars = 1;
2335 }
2336 else
2337 {
2338 int nchars = charbuf[idx + 1] + 0xA0;
2339 int nbytes = charbuf[idx + 2] + 0xA0;
2340
2341 charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2342 charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2343 charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2344 charbuf[idx++] = -1;
2345 new_chars = 4;
2346 }
2347 }
2348 cmp_status->state = COMPOSING_NO;
2349 return new_chars;
2350}
2351
/* If a composition is in progress, terminate it with
   emacs_mule_finish_composition and add the number of characters it
   reports to the caller's CHAR_OFFSET.  */

#define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                           \
  do {                                                                  \
    if (cmp_status->state != COMPOSING_NO)                              \
      {                                                                 \
        char_offset                                                     \
          += emacs_mule_finish_composition (charbuf, cmp_status);       \
      }                                                                 \
  } while (0)
2357
aa72b389
KH
2358
2359static void
971de7fb 2360decode_coding_emacs_mule (struct coding_system *coding)
aa72b389 2361{
8f924df7
KH
2362 const unsigned char *src = coding->source + coding->consumed;
2363 const unsigned char *src_end = coding->source + coding->src_bytes;
2364 const unsigned char *src_base;
69a80ea3 2365 int *charbuf = coding->charbuf + coding->charbuf_used;
ad1746f5
KH
2366 /* We may produce two annotations (charset and composition) in one
2367 loop and one more charset annotation at the end. */
69a80ea3 2368 int *charbuf_end
df80c7f0 2369 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
df7492f9 2370 int consumed_chars = 0, consumed_chars_base;
df7492f9 2371 int multibytep = coding->src_multibyte;
24a73b0a 2372 Lisp_Object attrs, charset_list;
ff0dacd7
KH
2373 int char_offset = coding->produced_char;
2374 int last_offset = char_offset;
2375 int last_id = charset_ascii;
0a9564cb
EZ
2376 int eol_crlf =
2377 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 2378 int byte_after_cr = -1;
e951386e 2379 struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
aa72b389 2380
24a73b0a 2381 CODING_GET_INFO (coding, attrs, charset_list);
aa72b389 2382
e951386e
KH
2383 if (cmp_status->state != COMPOSING_NO)
2384 {
2385 int i;
2386
2387 for (i = 0; i < cmp_status->length; i++)
2388 *charbuf++ = cmp_status->carryover[i];
2389 coding->annotated = 1;
2390 }
2391
aa72b389
KH
2392 while (1)
2393 {
e951386e 2394 int c, id;
df7492f9 2395
aa72b389 2396 src_base = src;
df7492f9
KH
2397 consumed_chars_base = consumed_chars;
2398
2399 if (charbuf >= charbuf_end)
b71f6f73
KH
2400 {
2401 if (byte_after_cr >= 0)
2402 src_base--;
2403 break;
2404 }
aa72b389 2405
119852e7
KH
2406 if (byte_after_cr >= 0)
2407 c = byte_after_cr, byte_after_cr = -1;
2408 else
2409 ONE_MORE_BYTE (c);
e951386e
KH
2410
2411 if (c < 0 || c == 0x80)
065e3595 2412 {
e951386e
KH
2413 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2414 if (c < 0)
2415 {
2416 *charbuf++ = -c;
2417 char_offset++;
2418 }
2419 else
2420 DECODE_EMACS_MULE_COMPOSITION_START ();
2421 continue;
065e3595 2422 }
e951386e
KH
2423
2424 if (c < 0x80)
aa72b389 2425 {
119852e7
KH
2426 if (eol_crlf && c == '\r')
2427 ONE_MORE_BYTE (byte_after_cr);
e951386e
KH
2428 id = charset_ascii;
2429 if (cmp_status->state != COMPOSING_NO)
2430 {
2431 if (cmp_status->old_form)
2432 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2433 else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2434 cmp_status->ncomps--;
2435 }
2436 }
2437 else
2438 {
2439 int nchars, nbytes;
75f80e63
EZ
2440 /* emacs_mule_char can load a charset map from a file, which
2441 allocates a large structure and might cause buffer text
2442 to be relocated as result. Thus, we need to remember the
ad1746f5 2443 original pointer to buffer text, and fix up all related
75f80e63
EZ
2444 pointers after the call. */
2445 const unsigned char *orig = coding->source;
2446 EMACS_INT offset;
e951386e
KH
2447
2448 c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2449 cmp_status);
75f80e63
EZ
2450 offset = coding->source - orig;
2451 if (offset)
2452 {
2453 src += offset;
2454 src_base += offset;
2455 src_end += offset;
2456 }
e951386e
KH
2457 if (c < 0)
2458 {
2459 if (c == -1)
2460 goto invalid_code;
2461 if (c == -2)
2462 break;
2463 }
2464 src = src_base + nbytes;
2465 consumed_chars = consumed_chars_base + nchars;
2466 if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2467 cmp_status->ncomps -= nchars;
2468 }
2469
ad1746f5 2470 /* Now if C >= 0, we found a normally encoded character, if C <
e951386e
KH
2471 0, we found an old-style composition component character or
2472 rule. */
2473
2474 if (cmp_status->state == COMPOSING_NO)
2475 {
2476 if (last_id != id)
2477 {
2478 if (last_id != charset_ascii)
2479 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2480 last_id);
2481 last_id = id;
2482 last_offset = char_offset;
2483 }
df7492f9
KH
2484 *charbuf++ = c;
2485 char_offset++;
aa72b389 2486 }
e951386e 2487 else if (cmp_status->state == COMPOSING_CHAR)
df7492f9 2488 {
e951386e
KH
2489 if (cmp_status->old_form)
2490 {
2491 if (c >= 0)
2492 {
2493 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2494 *charbuf++ = c;
2495 char_offset++;
2496 }
2497 else
2498 {
2499 *charbuf++ = -c;
2500 cmp_status->nchars++;
2501 cmp_status->length++;
2502 if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2503 EMACS_MULE_COMPOSITION_END ();
2504 else if (cmp_status->method == COMPOSITION_WITH_RULE)
2505 cmp_status->state = COMPOSING_RULE;
2506 }
2507 }
df7492f9 2508 else
e951386e
KH
2509 {
2510 *charbuf++ = c;
2511 cmp_status->length++;
2512 cmp_status->nchars--;
2513 if (cmp_status->nchars == 0)
2514 EMACS_MULE_COMPOSITION_END ();
2515 }
df7492f9 2516 }
e951386e 2517 else if (cmp_status->state == COMPOSING_RULE)
df7492f9 2518 {
e951386e 2519 int rule;
ff0dacd7 2520
e951386e 2521 if (c >= 0)
df7492f9 2522 {
e951386e
KH
2523 EMACS_MULE_COMPOSITION_END ();
2524 *charbuf++ = c;
2525 char_offset++;
df7492f9 2526 }
e951386e 2527 else
ff0dacd7 2528 {
e951386e
KH
2529 c = -c;
2530 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2531 if (rule < 0)
2532 goto invalid_code;
2533 *charbuf++ = -2;
2534 *charbuf++ = rule;
2535 cmp_status->length += 2;
2536 cmp_status->state = COMPOSING_CHAR;
ff0dacd7 2537 }
e951386e
KH
2538 }
2539 else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2540 {
df7492f9 2541 *charbuf++ = c;
e951386e
KH
2542 cmp_status->length++;
2543 if (cmp_status->ncomps == 0)
2544 cmp_status->state = COMPOSING_CHAR;
2545 else if (cmp_status->ncomps > 0)
2546 {
2547 if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2548 cmp_status->state = COMPOSING_COMPONENT_RULE;
2549 }
2550 else
2551 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
df7492f9 2552 }
e951386e
KH
2553 else /* COMPOSING_COMPONENT_RULE */
2554 {
2555 int rule;
2556
2557 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2558 if (rule < 0)
2559 goto invalid_code;
2560 *charbuf++ = -2;
2561 *charbuf++ = rule;
2562 cmp_status->length += 2;
2563 cmp_status->ncomps--;
2564 if (cmp_status->ncomps > 0)
2565 cmp_status->state = COMPOSING_COMPONENT_CHAR;
2566 else
2567 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2568 }
2569 continue;
2570
df7492f9 2571 invalid_code:
e951386e 2572 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
df7492f9
KH
2573 src = src_base;
2574 consumed_chars = consumed_chars_base;
2575 ONE_MORE_BYTE (c);
2576 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
ff0dacd7 2577 char_offset++;
df7492f9
KH
2578 coding->errors++;
2579 }
2580
2581 no_more_source:
e951386e
KH
2582 if (cmp_status->state != COMPOSING_NO)
2583 {
2584 if (coding->mode & CODING_MODE_LAST_BLOCK)
2585 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2586 else
2587 {
2588 int i;
2589
2590 charbuf -= cmp_status->length;
2591 for (i = 0; i < cmp_status->length; i++)
2592 cmp_status->carryover[i] = charbuf[i];
2593 }
2594 }
ff0dacd7 2595 if (last_id != charset_ascii)
69a80ea3 2596 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
2597 coding->consumed_char += consumed_chars_base;
2598 coding->consumed = src_base - coding->source;
2599 coding->charbuf_used = charbuf - coding->charbuf;
2600}
2601
2602
/* Store in CODES (an array of at least two elements) the leading
   code(s) to emit for the emacs-mule charset ID.

   If ID is less than 0xA0, ID itself is the only leading code and
   CODES[1] is set to 0.  Otherwise CODES[0] is 0x9A, 0x9B, 0x9C or
   0x9D, depending on the range ID falls in, and CODES[1] is ID.

   The definition deliberately does not end with a semicolon, so that
   the macro behaves like a single statement and can be used in
   `if (...) EMACS_MULE_LEADING_CODES (...); else ...'.  ID is
   evaluated more than once; do not pass an expression with side
   effects.  */

#define EMACS_MULE_LEADING_CODES(id, codes)             \
  do {                                                  \
    if ((id) < 0xA0)                                    \
      (codes)[0] = (id), (codes)[1] = 0;                \
    else if ((id) < 0xE0)                               \
      (codes)[0] = 0x9A, (codes)[1] = (id);             \
    else if ((id) < 0xF0)                               \
      (codes)[0] = 0x9B, (codes)[1] = (id);             \
    else if ((id) < 0xF5)                               \
      (codes)[0] = 0x9C, (codes)[1] = (id);             \
    else                                                \
      (codes)[0] = 0x9D, (codes)[1] = (id);             \
  } while (0)
2616
aa72b389 2617
df7492f9 2618static int
971de7fb 2619encode_coding_emacs_mule (struct coding_system *coding)
df7492f9
KH
2620{
2621 int multibytep = coding->dst_multibyte;
2622 int *charbuf = coding->charbuf;
2623 int *charbuf_end = charbuf + coding->charbuf_used;
2624 unsigned char *dst = coding->destination + coding->produced;
2625 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2626 int safe_room = 8;
df7492f9 2627 int produced_chars = 0;
24a73b0a 2628 Lisp_Object attrs, charset_list;
df7492f9 2629 int c;
ff0dacd7 2630 int preferred_charset_id = -1;
df7492f9 2631
24a73b0a 2632 CODING_GET_INFO (coding, attrs, charset_list);
eccb6815
KH
2633 if (! EQ (charset_list, Vemacs_mule_charset_list))
2634 {
2635 CODING_ATTR_CHARSET_LIST (attrs)
2636 = charset_list = Vemacs_mule_charset_list;
2637 }
df7492f9
KH
2638
2639 while (charbuf < charbuf_end)
2640 {
2641 ASSURE_DESTINATION (safe_room);
2642 c = *charbuf++;
ff0dacd7
KH
2643
2644 if (c < 0)
2645 {
2646 /* Handle an annotation. */
2647 switch (*charbuf)
2648 {
2649 case CODING_ANNOTATE_COMPOSITION_MASK:
2650 /* Not yet implemented. */
2651 break;
2652 case CODING_ANNOTATE_CHARSET_MASK:
2653 preferred_charset_id = charbuf[3];
2654 if (preferred_charset_id >= 0
2655 && NILP (Fmemq (make_number (preferred_charset_id),
2656 charset_list)))
2657 preferred_charset_id = -1;
2658 break;
2659 default:
2660 abort ();
2661 }
2662 charbuf += -c - 1;
2663 continue;
2664 }
2665
df7492f9
KH
2666 if (ASCII_CHAR_P (c))
2667 EMIT_ONE_ASCII_BYTE (c);
16eafb5d
KH
2668 else if (CHAR_BYTE8_P (c))
2669 {
2670 c = CHAR_TO_BYTE8 (c);
2671 EMIT_ONE_BYTE (c);
2672 }
df7492f9 2673 else
aa72b389 2674 {
df7492f9
KH
2675 struct charset *charset;
2676 unsigned code;
2677 int dimension;
2678 int emacs_mule_id;
2679 unsigned char leading_codes[2];
2680
ff0dacd7
KH
2681 if (preferred_charset_id >= 0)
2682 {
2683 charset = CHARSET_FROM_ID (preferred_charset_id);
905ca9d2
KH
2684 if (CHAR_CHARSET_P (c, charset))
2685 code = ENCODE_CHAR (charset, c);
2686 else
2687 charset = char_charset (c, charset_list, &code);
ff0dacd7
KH
2688 }
2689 else
2690 charset = char_charset (c, charset_list, &code);
df7492f9
KH
2691 if (! charset)
2692 {
2693 c = coding->default_char;
2694 if (ASCII_CHAR_P (c))
2695 {
2696 EMIT_ONE_ASCII_BYTE (c);
2697 continue;
2698 }
2699 charset = char_charset (c, charset_list, &code);
2700 }
2701 dimension = CHARSET_DIMENSION (charset);
2702 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2703 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2704 EMIT_ONE_BYTE (leading_codes[0]);
2705 if (leading_codes[1])
2706 EMIT_ONE_BYTE (leading_codes[1]);
2707 if (dimension == 1)
1fa663f9 2708 EMIT_ONE_BYTE (code | 0x80);
aa72b389 2709 else
df7492f9 2710 {
1fa663f9 2711 code |= 0x8080;
df7492f9
KH
2712 EMIT_ONE_BYTE (code >> 8);
2713 EMIT_ONE_BYTE (code & 0xFF);
2714 }
aa72b389 2715 }
aa72b389 2716 }
065e3595 2717 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
2718 coding->produced_char += produced_chars;
2719 coding->produced = dst - coding->destination;
2720 return 0;
aa72b389 2721}
b73bfc1c 2722
4ed46869 2723\f
df7492f9 2724/*** 7. ISO2022 handlers ***/
4ed46869
KH
2725
2726/* The following note describes the coding system ISO2022 briefly.
39787efd 2727 Since the intention of this note is to help understand the
5a936b46 2728 functions in this file, some parts are NOT ACCURATE or are OVERLY
39787efd 2729 SIMPLIFIED. For thorough understanding, please refer to the
5a936b46 2730 original document of ISO2022. This is equivalent to the standard
cfb43547 2731 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
4ed46869
KH
2732
2733 ISO2022 provides many mechanisms to encode several character sets
cfb43547 2734 in 7-bit and 8-bit environments. For 7-bit environments, all text
39787efd
KH
2735 is encoded using bytes less than 128. This may make the encoded
2736 text a little bit longer, but the text passes more easily through
cfb43547 2737 several types of gateway, some of which strip off the MSB (Most
8ca3766a 2738 Significant Bit).
b73bfc1c 2739
cfb43547
DL
2740 There are two kinds of character sets: control character sets and
2741 graphic character sets. The former contain control characters such
4ed46869 2742 as `newline' and `escape' to provide control functions (control
39787efd 2743 functions are also provided by escape sequences). The latter
cfb43547 2744 contain graphic characters such as 'A' and '-'. Emacs recognizes
4ed46869
KH
2745 two control character sets and many graphic character sets.
2746
2747 Graphic character sets are classified into one of the following
39787efd
KH
2748 four classes, according to the number of bytes (DIMENSION) and
2749 number of characters in one dimension (CHARS) of the set:
2750 - DIMENSION1_CHARS94
2751 - DIMENSION1_CHARS96
2752 - DIMENSION2_CHARS94
2753 - DIMENSION2_CHARS96
2754
2755 In addition, each character set is assigned an identification tag,
cfb43547 2756 unique for each set, called the "final character" (denoted as <F>
39787efd
KH
2757 hereafter). The <F> of each character set is decided by ECMA(*)
2758 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2759 (0x30..0x3F are for private use only).
4ed46869
KH
2760
2761 Note (*): ECMA = European Computer Manufacturers Association
2762
cfb43547 2763 Here are examples of graphic character sets [NAME(<F>)]:
4ed46869
KH
2764 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2765 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2766 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2767 o DIMENSION2_CHARS96 -- none for the moment
2768
39787efd 2769 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4ed46869
KH
2770 C0 [0x00..0x1F] -- control character plane 0
2771 GL [0x20..0x7F] -- graphic character plane 0
2772 C1 [0x80..0x9F] -- control character plane 1
2773 GR [0xA0..0xFF] -- graphic character plane 1
2774
2775 A control character set is directly designated and invoked to C0 or
39787efd
KH
2776 C1 by an escape sequence. The most common case is that:
2777 - ISO646's control character set is designated/invoked to C0, and
2778 - ISO6429's control character set is designated/invoked to C1,
2779 and usually these designations/invocations are omitted in encoded
2780 text. In a 7-bit environment, only C0 can be used, and a control
2781 character for C1 is encoded by an appropriate escape sequence to
2782 fit into the environment. All control characters for C1 are
2783 defined to have corresponding escape sequences.
4ed46869
KH
2784
2785 A graphic character set is at first designated to one of four
2786 graphic registers (G0 through G3), then these graphic registers are
2787 invoked to GL or GR. These designations and invocations can be
2788 done independently. The most common case is that G0 is invoked to
39787efd
KH
2789 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2790 these invocations and designations are omitted in encoded text.
2791 In a 7-bit environment, only GL can be used.
4ed46869 2792
39787efd
KH
2793 When a graphic character set of CHARS94 is invoked to GL, codes
2794 0x20 and 0x7F of the GL area work as control characters SPACE and
2795 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2796 be used.
4ed46869
KH
2797
2798 There are two ways of invocation: locking-shift and single-shift.
2799 With locking-shift, the invocation lasts until the next different
39787efd
KH
2800 invocation, whereas with single-shift, the invocation affects the
2801 following character only and doesn't affect the locking-shift
2802 state. Invocations are done by the following control characters or
2803 escape sequences:
4ed46869
KH
2804
2805 ----------------------------------------------------------------------
39787efd 2806 abbrev function cntrl escape seq description
4ed46869 2807 ----------------------------------------------------------------------
39787efd
KH
2808 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2809 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2810 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2811 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2812 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2813 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2814 LS3R (locking-shift-3 right) none ESC '|' invoke G3 into GR (*)
2815 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2816 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4ed46869 2817 ----------------------------------------------------------------------
39787efd
KH
2818 (*) These are not used by any known coding system.
2819
2820 Control characters for these functions are defined by macros
2821 ISO_CODE_XXX in `coding.h'.
4ed46869 2822
39787efd 2823 Designations are done by the following escape sequences:
4ed46869
KH
2824 ----------------------------------------------------------------------
2825 escape sequence description
2826 ----------------------------------------------------------------------
2827 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2828 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2829 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2830 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2831 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2832 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2833 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2834 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2835 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2836 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2837 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2838 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2839 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2840 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2841 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2842 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2843 ----------------------------------------------------------------------
2844
2845 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
39787efd 2846 of dimension 1, chars 94, and final character <F>, etc...
4ed46869
KH
2847
2848 Note (*): Although these designations are not allowed in ISO2022,
2849 Emacs accepts them on decoding, and produces them on encoding
39787efd 2850 CHARS96 character sets in a coding system which is characterized as
4ed46869
KH
2851 7-bit environment, non-locking-shift, and non-single-shift.
2852
2853 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
df7492f9 2854 '(' must be omitted. We refer to this as "short-form" hereafter.
4ed46869 2855
cfb43547 2856 Now you may notice that there are a lot of ways of encoding the
39787efd
KH
2857 same multilingual text in ISO2022. Actually, there exist many
2858 coding systems such as Compound Text (used in X11's inter client
8ca3766a
DL
2859 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2860 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
4ed46869
KH
2861 localized platforms), and all of these are variants of ISO2022.
2862
2863 In addition to the above, Emacs handles two more kinds of escape
2864 sequences: ISO6429's direction specification and Emacs' private
2865 sequence for specifying character composition.
2866
39787efd 2867 ISO6429's direction specification takes the following form:
4ed46869
KH
2868 o CSI ']' -- end of the current direction
2869 o CSI '0' ']' -- end of the current direction
2870 o CSI '1' ']' -- start of left-to-right text
2871 o CSI '2' ']' -- start of right-to-left text
2872 The control character CSI (0x9B: control sequence introducer) is
39787efd
KH
2873 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2874
2875 Character composition specification takes the following form:
ec6d2bb8
KH
2876 o ESC '0' -- start relative composition
2877 o ESC '1' -- end composition
2878 o ESC '2' -- start rule-base composition (*)
2879 o ESC '3' -- start relative composition with alternate chars (**)
2880 o ESC '4' -- start rule-base composition with alternate chars (**)
b73bfc1c 2881 Since these are not standard escape sequences of any ISO standard,
cfb43547 2882 the use of them with these meanings is restricted to Emacs only.
ec6d2bb8 2883
5a936b46
DL
2884 (*) This form is used only in Emacs 20.7 and older versions,
2885 but newer versions can safely decode it.
cfb43547 2886 (**) This form is used only in Emacs 21.1 and newer versions,
5a936b46 2887 and older versions can't decode it.
ec6d2bb8 2888
cfb43547 2889 Here's a list of example usages of these composition escape
b73bfc1c 2890 sequences (categorized by `enum composition_method').
ec6d2bb8 2891
b73bfc1c 2892 COMPOSITION_RELATIVE:
ec6d2bb8 2893 ESC 0 CHAR [ CHAR ] ESC 1
8ca3766a 2894 COMPOSITION_WITH_RULE:
ec6d2bb8 2895 ESC 2 CHAR [ RULE CHAR ] ESC 1
b73bfc1c 2896 COMPOSITION_WITH_ALTCHARS:
ec6d2bb8 2897 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
b73bfc1c 2898 COMPOSITION_WITH_RULE_ALTCHARS:
ec6d2bb8 2899 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
4ed46869
KH
2900
/* Table with one entry per possible byte value (256 entries), giving
   the ISO-2022 code class of that byte.  decode_coding_iso_2022
   dispatches on iso_code_class[c1] for each source byte.  */
enum iso_code_class_type iso_code_class[256];
2902
df7492f9
KH
/* Return nonzero if the charset whose id is ID may be used with
   CODING: ID must lie in the range 0..CODING->max_charset_id and its
   slot in CODING->safe_charsets must not hold 255, the value used to
   mark a charset as not safe.

   The lower-bound test keeps a negative ID (e.g. the result of a
   failed iso_charset_table lookup) from indexing before the start of
   the table.  ID is evaluated more than once, so it must not have
   side effects.  */
#define SAFE_CHARSET_P(coding, id)		\
  ((id) >= 0					\
   && (id) <= (coding)->max_charset_id		\
   && (coding)->safe_charsets[id] != 255)
df7492f9
KH
2906
2907
/* Nonzero if, for the coding system registered under CATEGORY in
   coding_categories, CODING_ISO_INITIAL for register 1 is not
   negative (presumably: some charset is initially designated to G1,
   so a shift-out has something to invoke -- confirm against
   CODING_ISO_INITIAL's definition).  */
#define SHIFT_OUT_OK(category)	\
  (0 <= CODING_ISO_INITIAL (&coding_categories[category], 1))
2910
2911static void
971de7fb 2912setup_iso_safe_charsets (Lisp_Object attrs)
df7492f9
KH
2913{
2914 Lisp_Object charset_list, safe_charsets;
2915 Lisp_Object request;
2916 Lisp_Object reg_usage;
2917 Lisp_Object tail;
2918 int reg94, reg96;
2919 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2920 int max_charset_id;
2921
2922 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2923 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2924 && ! EQ (charset_list, Viso_2022_charset_list))
2925 {
2926 CODING_ATTR_CHARSET_LIST (attrs)
2927 = charset_list = Viso_2022_charset_list;
2928 ASET (attrs, coding_attr_safe_charsets, Qnil);
2929 }
2930
2931 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2932 return;
2933
2934 max_charset_id = 0;
2935 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2936 {
2937 int id = XINT (XCAR (tail));
2938 if (max_charset_id < id)
2939 max_charset_id = id;
2940 }
d46c5b12 2941
1b3b981b
AS
2942 safe_charsets = make_uninit_string (max_charset_id + 1);
2943 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
df7492f9
KH
2944 request = AREF (attrs, coding_attr_iso_request);
2945 reg_usage = AREF (attrs, coding_attr_iso_usage);
2946 reg94 = XINT (XCAR (reg_usage));
2947 reg96 = XINT (XCDR (reg_usage));
2948
2949 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2950 {
2951 Lisp_Object id;
2952 Lisp_Object reg;
2953 struct charset *charset;
2954
2955 id = XCAR (tail);
2956 charset = CHARSET_FROM_ID (XINT (id));
bf16eb23 2957 reg = Fcdr (Fassq (id, request));
df7492f9 2958 if (! NILP (reg))
8f924df7 2959 SSET (safe_charsets, XINT (id), XINT (reg));
df7492f9
KH
2960 else if (charset->iso_chars_96)
2961 {
2962 if (reg96 < 4)
8f924df7 2963 SSET (safe_charsets, XINT (id), reg96);
df7492f9
KH
2964 }
2965 else
2966 {
2967 if (reg94 < 4)
8f924df7 2968 SSET (safe_charsets, XINT (id), reg94);
df7492f9
KH
2969 }
2970 }
2971 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2972}
d46c5b12 2973
b6871cc7 2974
4ed46869 2975/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
ad1746f5 2976 Check if a text is encoded in one of ISO-2022 based coding systems.
ff0dacd7 2977 If it is, return 1, else return 0. */
4ed46869 2978
0a28aafb 2979static int
cf84bb53
JB
2980detect_coding_iso_2022 (struct coding_system *coding,
2981 struct coding_detection_info *detect_info)
4ed46869 2982{
8f924df7
KH
2983 const unsigned char *src = coding->source, *src_base = src;
2984 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9 2985 int multibytep = coding->src_multibyte;
ff0dacd7 2986 int single_shifting = 0;
df7492f9
KH
2987 int id;
2988 int c, c1;
2989 int consumed_chars = 0;
2990 int i;
ff0dacd7
KH
2991 int rejected = 0;
2992 int found = 0;
cee53ed4 2993 int composition_count = -1;
ff0dacd7
KH
2994
2995 detect_info->checked |= CATEGORY_MASK_ISO;
df7492f9
KH
2996
2997 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2998 {
2999 struct coding_system *this = &(coding_categories[i]);
3000 Lisp_Object attrs, val;
3001
c6b278e7
KH
3002 if (this->id < 0)
3003 continue;
df7492f9
KH
3004 attrs = CODING_ID_ATTRS (this->id);
3005 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
1b3b981b 3006 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
df7492f9
KH
3007 setup_iso_safe_charsets (attrs);
3008 val = CODING_ATTR_SAFE_CHARSETS (attrs);
8f924df7 3009 this->max_charset_id = SCHARS (val) - 1;
1b3b981b 3010 this->safe_charsets = SDATA (val);
df7492f9
KH
3011 }
3012
3013 /* A coding system of this category is always ASCII compatible. */
3014 src += coding->head_ascii;
3f003981 3015
ff0dacd7 3016 while (rejected != CATEGORY_MASK_ISO)
4ed46869 3017 {
065e3595 3018 src_base = src;
df7492f9 3019 ONE_MORE_BYTE (c);
4ed46869
KH
3020 switch (c)
3021 {
3022 case ISO_CODE_ESC:
74383408
KH
3023 if (inhibit_iso_escape_detection)
3024 break;
f46869e4 3025 single_shifting = 0;
df7492f9 3026 ONE_MORE_BYTE (c);
d46c5b12 3027 if (c >= '(' && c <= '/')
4ed46869 3028 {
bf9cdd4e 3029 /* Designation sequence for a charset of dimension 1. */
df7492f9 3030 ONE_MORE_BYTE (c1);
d46c5b12 3031 if (c1 < ' ' || c1 >= 0x80
df7492f9 3032 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
d46c5b12
KH
3033 /* Invalid designation sequence. Just ignore. */
3034 break;
bf9cdd4e
KH
3035 }
3036 else if (c == '$')
3037 {
3038 /* Designation sequence for a charset of dimension 2. */
df7492f9 3039 ONE_MORE_BYTE (c);
bf9cdd4e
KH
3040 if (c >= '@' && c <= 'B')
3041 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
ff0dacd7 3042 id = iso_charset_table[1][0][c];
bf9cdd4e 3043 else if (c >= '(' && c <= '/')
bcf26d6a 3044 {
df7492f9 3045 ONE_MORE_BYTE (c1);
d46c5b12 3046 if (c1 < ' ' || c1 >= 0x80
df7492f9 3047 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
d46c5b12
KH
3048 /* Invalid designation sequence. Just ignore. */
3049 break;
bcf26d6a 3050 }
bf9cdd4e 3051 else
ff0dacd7 3052 /* Invalid designation sequence. Just ignore it. */
d46c5b12
KH
3053 break;
3054 }
ae9ff118 3055 else if (c == 'N' || c == 'O')
d46c5b12 3056 {
ae9ff118 3057 /* ESC <Fe> for SS2 or SS3. */
ff0dacd7
KH
3058 single_shifting = 1;
3059 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
d46c5b12 3060 break;
4ed46869 3061 }
cee53ed4
KH
3062 else if (c == '1')
3063 {
3064 /* End of composition. */
3065 if (composition_count < 0
3066 || composition_count > MAX_COMPOSITION_COMPONENTS)
3067 /* Invalid */
3068 break;
3069 composition_count = -1;
3070 found |= CATEGORY_MASK_ISO;
3071 }
ec6d2bb8
KH
3072 else if (c >= '0' && c <= '4')
3073 {
3074 /* ESC <Fp> for start/end composition. */
cee53ed4 3075 composition_count = 0;
ec6d2bb8
KH
3076 break;
3077 }
bf9cdd4e 3078 else
df7492f9 3079 {
ff0dacd7 3080 /* Invalid escape sequence. Just ignore it. */
df7492f9
KH
3081 break;
3082 }
d46c5b12
KH
3083
3084 /* We found a valid designation sequence for CHARSET. */
ff0dacd7 3085 rejected |= CATEGORY_MASK_ISO_8BIT;
df7492f9
KH
3086 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3087 id))
ff0dacd7 3088 found |= CATEGORY_MASK_ISO_7;
d46c5b12 3089 else
ff0dacd7 3090 rejected |= CATEGORY_MASK_ISO_7;
df7492f9
KH
3091 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3092 id))
ff0dacd7 3093 found |= CATEGORY_MASK_ISO_7_TIGHT;
d46c5b12 3094 else
ff0dacd7 3095 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
df7492f9
KH
3096 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3097 id))
ff0dacd7 3098 found |= CATEGORY_MASK_ISO_7_ELSE;
ae9ff118 3099 else
ff0dacd7 3100 rejected |= CATEGORY_MASK_ISO_7_ELSE;
df7492f9
KH
3101 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3102 id))
ff0dacd7 3103 found |= CATEGORY_MASK_ISO_8_ELSE;
ae9ff118 3104 else
ff0dacd7 3105 rejected |= CATEGORY_MASK_ISO_8_ELSE;
4ed46869
KH
3106 break;
3107
4ed46869 3108 case ISO_CODE_SO:
d46c5b12 3109 case ISO_CODE_SI:
ff0dacd7 3110 /* Locking shift out/in. */
74383408
KH
3111 if (inhibit_iso_escape_detection)
3112 break;
f46869e4 3113 single_shifting = 0;
ff0dacd7 3114 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
d46c5b12
KH
3115 break;
3116
4ed46869 3117 case ISO_CODE_CSI:
ff0dacd7 3118 /* Control sequence introducer. */
f46869e4 3119 single_shifting = 0;
ff0dacd7
KH
3120 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3121 found |= CATEGORY_MASK_ISO_8_ELSE;
3122 goto check_extra_latin;
3123
4ed46869
KH
3124 case ISO_CODE_SS2:
3125 case ISO_CODE_SS3:
ff0dacd7
KH
3126 /* Single shift. */
3127 if (inhibit_iso_escape_detection)
3128 break;
75e2a253 3129 single_shifting = 0;
ff0dacd7
KH
3130 rejected |= CATEGORY_MASK_ISO_7BIT;
3131 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3132 & CODING_ISO_FLAG_SINGLE_SHIFT)
75e2a253 3133 found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
ff0dacd7
KH
3134 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3135 & CODING_ISO_FLAG_SINGLE_SHIFT)
75e2a253
KH
3136 found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
3137 if (single_shifting)
3138 break;
ff0dacd7 3139 goto check_extra_latin;
4ed46869
KH
3140
3141 default:
065e3595
KH
3142 if (c < 0)
3143 continue;
4ed46869 3144 if (c < 0x80)
f46869e4 3145 {
cee53ed4
KH
3146 if (composition_count >= 0)
3147 composition_count++;
f46869e4
KH
3148 single_shifting = 0;
3149 break;
3150 }
ff0dacd7 3151 if (c >= 0xA0)
c4825358 3152 {
ff0dacd7
KH
3153 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3154 found |= CATEGORY_MASK_ISO_8_1;
f46869e4 3155 /* Check the length of succeeding codes of the range
ff0dacd7
KH
3156 0xA0..0FF. If the byte length is even, we include
3157 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
3158 only when we are not single shifting. */
3159 if (! single_shifting
3160 && ! (rejected & CATEGORY_MASK_ISO_8_2))
f46869e4 3161 {
e17de821 3162 int i = 1;
b73bfc1c
KH
3163 while (src < src_end)
3164 {
d12bd917 3165 src_base = src;
df7492f9 3166 ONE_MORE_BYTE (c);
b73bfc1c 3167 if (c < 0xA0)
d12bd917
KH
3168 {
3169 src = src_base;
3170 break;
3171 }
b73bfc1c
KH
3172 i++;
3173 }
3174
3175 if (i & 1 && src < src_end)
cee53ed4
KH
3176 {
3177 rejected |= CATEGORY_MASK_ISO_8_2;
3178 if (composition_count >= 0)
3179 composition_count += i;
3180 }
f46869e4 3181 else
cee53ed4
KH
3182 {
3183 found |= CATEGORY_MASK_ISO_8_2;
3184 if (composition_count >= 0)
3185 composition_count += i / 2;
3186 }
f46869e4 3187 }
ff0dacd7 3188 break;
4ed46869 3189 }
ff0dacd7
KH
3190 check_extra_latin:
3191 single_shifting = 0;
3192 if (! VECTORP (Vlatin_extra_code_table)
3193 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3194 {
3195 rejected = CATEGORY_MASK_ISO;
3196 break;
3197 }
3198 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3199 & CODING_ISO_FLAG_LATIN_EXTRA)
3200 found |= CATEGORY_MASK_ISO_8_1;
3201 else
3202 rejected |= CATEGORY_MASK_ISO_8_1;
75e2a253 3203 rejected |= CATEGORY_MASK_ISO_8_2;
4ed46869
KH
3204 }
3205 }
ff0dacd7
KH
3206 detect_info->rejected |= CATEGORY_MASK_ISO;
3207 return 0;
4ed46869 3208
df7492f9 3209 no_more_source:
ff0dacd7
KH
3210 detect_info->rejected |= rejected;
3211 detect_info->found |= (found & ~rejected);
df7492f9 3212 return 1;
4ed46869 3213}
ec6d2bb8 3214
4ed46869 3215
134b9549
KH
/* Record in CODING that the charset selected by DIM, CHARS_96 and
   FINAL is designated to graphic register REG.

   If FINAL is outside '0'..127, or the charset is unknown or not safe
   for CODING, the designation of REG is set to -2 instead.  CHARS_96
   is overwritten with -1 whenever the caller should keep the escape
   sequence: on such a failed designation, and when ASCII is
   designated to a register whose previous designation was -2.

   The expansion uses `coding' from the enclosing scope.  */
#define DECODE_DESIGNATION(reg, dim, chars_96, final)			\
  do {									\
    int designated = -1;						\
									\
    if ((final) >= '0' && (final) < 128)				\
      designated = ISO_CHARSET_TABLE (dim, chars_96, final);		\
    if (designated < 0 || ! SAFE_CHARSET_P (coding, designated))	\
      {									\
	/* Unknown or unsafe charset.  */				\
	CODING_ISO_DESIGNATION (coding, reg) = -2;			\
	chars_96 = -1;							\
      }									\
    else								\
      {									\
	int previous = CODING_ISO_DESIGNATION (coding, reg);		\
									\
	/* Substitute charsets according to the coding flags.  */	\
	if (designated == charset_jisx0201_roman			\
	    && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN))	\
	  designated = charset_ascii;					\
	else if (designated == charset_jisx0208_1978			\
		 && (CODING_ISO_FLAGS (coding)				\
		     & CODING_ISO_FLAG_USE_OLDJIS))			\
	  designated = charset_jisx0208;				\
	CODING_ISO_DESIGNATION (coding, reg) = designated;		\
	/* If there was an invalid designation to REG previously, and	\
	   this designation is ASCII to REG, we should keep this	\
	   designation sequence.  */					\
	if (previous == -2 && designated == charset_ascii)		\
	  chars_96 = -1;						\
      }									\
  } while (0)
3248
d46c5b12 3249
e951386e
KH
3250/* Handle these composition sequences (ALT: alternate char):
3251
3252 (1) relative composition: ESC 0 CHAR ... ESC 1
3253 (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3254 (3) altchar composition: ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3255 (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3256
3257 When the start sequence (ESC 0/2/3/4) is found, this annotation
3258 header is produced.
3259
3260 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3261
3262 Then, upon reading CHAR or RULE (one or two bytes), these codes are
3263 produced until the end sequence (ESC 1) is found:
3264
3265 (1) CHAR ... CHAR
3266 (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3267 (3) ALT ... ALT -1 -1 CHAR ... CHAR
3268 (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3269
3270 When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3271 annotation header are updated as below:
3272
3273 (1) LENGTH: unchanged, NCHARS: number of CHARs
3274 (2) LENGTH: unchanged, NCHARS: number of CHARs
3275 (3) LENGTH: += number of ALTs + 2, NCHARS: number of CHARs
3276 (4) LENGTH: += number of ALTs * 3, NCHARS: number of CHARs
3277
3278 If an error is found while composing, the annotation header is
3279 changed to:
3280
3281 [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3282
3283 and the sequence [ -2 DECODED-RULE ] is changed to the original
3284 byte sequence as below:
3285 o the original byte sequence is B: [ B -1 ]
3286 o the original byte sequence is B1 B2: [ B1 B2 ]
3287 and the sequence [ -1 -1 ] is changed to the original byte
3288 sequence:
3289 [ ESC '0' ]
3290*/
3291
/* Decode the composition rule that starts with the byte in `c1'
   (taken from the enclosing scope), reading one more source byte when
   the rule uses the two-byte format.  Set RULE to the encoded
   composition rule and NBYTES to the number of source bytes the rule
   occupies (1 or 2).  If the rule is invalid, RULE is set to some
   negative value; when `c1' itself is below 32, NBYTES is left
   untouched.  */

#define DECODE_COMPOSITION_RULE(rule, nbytes)				\
  do {									\
    rule = c1 - 32;							\
    if (rule >= 81)		/* new format (after ver.21) */		\
      {									\
	int second_byte;						\
									\
	ONE_MORE_BYTE (second_byte);					\
	rule = COMPOSITION_ENCODE_RULE (rule - 81, second_byte - 32);	\
	if (rule >= 0)							\
	  rule += 0x100;   /* to distinguish it from the old format */	\
	nbytes = 2;							\
      }									\
    else if (rule >= 0)		/* old format (before ver.21) */	\
      {									\
	int gref = (rule) / 9;						\
	int nref = (rule) % 9;						\
									\
	/* Reference point 4 of the old format is stored as 10.  */	\
	gref = (gref == 4 ? 10 : gref);					\
	nref = (nref == 4 ? 10 : nref);					\
	rule = COMPOSITION_ENCODE_RULE (gref, nref);			\
	nbytes = 1;							\
      }									\
  } while (0)
3322
/* Turn the encoded composition rule RULE back into the bytes it was
   decoded from, storing them at charbuf[idx] and charbuf[idx + 1]
   (`charbuf', `idx' and `new_chars' come from the enclosing scope).

   A rule below 0x100 is in the old one-byte format: the byte goes to
   charbuf[idx], charbuf[idx + 1] becomes -1, and new_chars grows by
   one.  Otherwise the rule is in the new two-byte format: both slots
   receive a byte and new_chars grows by two.  */
#define ENCODE_COMPOSITION_RULE(rule)					\
  do {									\
    int encoded_rule = (rule);						\
    int gref = (encoded_rule % 0x100) / 12;				\
    int nref = (encoded_rule % 0x100) % 12;				\
									\
    if (encoded_rule >= 0x100)	/* new format */			\
      {									\
	charbuf[idx] = 32 + 81 + gref;					\
	charbuf[idx + 1] = 32 + nref;					\
	new_chars += 2;							\
      }									\
    else			/* old format */			\
      {									\
	/* Reference point 10 is written as 4 in the old format.  */	\
	gref = (gref == 10 ? 4 : gref);					\
	nref = (nref == 10 ? 4 : nref);					\
	charbuf[idx] = 32 + gref * 9 + nref;				\
	charbuf[idx + 1] = -1;						\
	new_chars++;							\
      }									\
  } while (0)
3342
e951386e
KH
3343/* Finish the current composition as invalid. */
3344
f57e2426 3345static int finish_composition (int *, struct composition_status *);
e951386e
KH
3346
3347static int
971de7fb 3348finish_composition (int *charbuf, struct composition_status *cmp_status)
e951386e
KH
3349{
3350 int idx = - cmp_status->length;
3351 int new_chars;
3352
3353 /* Recover the original ESC sequence */
3354 charbuf[idx++] = ISO_CODE_ESC;
3355 charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3356 : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3357 : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3358 /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3359 : '4');
3360 charbuf[idx++] = -2;
3361 charbuf[idx++] = 0;
3362 charbuf[idx++] = -1;
3363 new_chars = cmp_status->nchars;
3364 if (cmp_status->method >= COMPOSITION_WITH_RULE)
3365 for (; idx < 0; idx++)
3366 {
3367 int elt = charbuf[idx];
3368
3369 if (elt == -2)
3370 {
3371 ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3372 idx++;
3373 }
3374 else if (elt == -1)
3375 {
3376 charbuf[idx++] = ISO_CODE_ESC;
3377 charbuf[idx] = '0';
3378 new_chars += 2;
3379 }
3380 }
3381 cmp_status->state = COMPOSING_NO;
3382 return new_chars;
3383}
3384
/* If a composition is in progress (cmp_status->state is not
   COMPOSING_NO), close it with finish_composition and advance
   char_offset by the count that function returns.  Uses `cmp_status',
   `charbuf' and `char_offset' from the enclosing scope.  */
#define MAYBE_FINISH_COMPOSITION()				\
  do {								\
    if (cmp_status->state == COMPOSING_NO)			\
      break;							\
    char_offset += finish_composition (charbuf, cmp_status);	\
  } while (0)
d46c5b12 3391
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4;
   C1 is the character following ESC.

   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

   An ESC 0 that arrives while the alternate characters of an ESC 3 or
   ESC 4 composition are being read only ends that part: two -1
   markers are stored and the state becomes COMPOSING_CHAR.

   In every other case any composition in progress is finished first,
   then a new one is opened: the method and state are set from C1, an
   annotation header is produced with
   ADD_COMPOSITION_DATA (charbuf, 0, 0, METHOD), the length is reset
   to MAX_ANNOTATION_LENGTH and the counters to zero, and
   coding->annotated is set.  */

#define DECODE_COMPOSITION_START(c1)					\
  do {									\
    int end_of_altchars							\
      = (cmp_status->method == COMPOSITION_WITH_ALTCHARS		\
	 ? cmp_status->state == COMPOSING_COMPONENT_CHAR		\
	 : (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS	\
	    && cmp_status->state == COMPOSING_COMPONENT_RULE));		\
									\
    if ((c1) == '0' && end_of_altchars)					\
      {									\
	charbuf[0] = -1;						\
	charbuf[1] = -1;						\
	charbuf += 2;							\
	cmp_status->length += 2;					\
	cmp_status->state = COMPOSING_CHAR;				\
      }									\
    else								\
      {									\
	/* This must precede the assignment of the new method,		\
	   because finishing reads the old one.  */			\
	MAYBE_FINISH_COMPOSITION ();					\
	if ((c1) == '0')						\
	  cmp_status->method = COMPOSITION_RELATIVE;			\
	else if ((c1) == '2')						\
	  cmp_status->method = COMPOSITION_WITH_RULE;			\
	else if ((c1) == '3')						\
	  cmp_status->method = COMPOSITION_WITH_ALTCHARS;		\
	else								\
	  cmp_status->method = COMPOSITION_WITH_RULE_ALTCHARS;		\
	if ((c1) <= '2')						\
	  cmp_status->state = COMPOSING_CHAR;				\
	else								\
	  cmp_status->state = COMPOSING_COMPONENT_CHAR;			\
	ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);	\
	cmp_status->ncomps = 0;						\
	cmp_status->nchars = 0;						\
	cmp_status->length = MAX_ANNOTATION_LENGTH;			\
	coding->annotated = 1;						\
      }									\
  } while (0)
3432
ec6d2bb8 3433
/* Handle composition end sequence ESC 1.

   The sequence is invalid when no character has been composed, or
   when the state does not fit the method: COMPOSITION_WITH_RULE must
   not end in state COMPOSING_CHAR, and every other method must.  In
   that case the composition is finished as invalid and control jumps
   to `invalid_code'.

   Otherwise the annotation header, found cmp_status->length elements
   before charbuf, is updated: its first element is decreased by
   NCOMPS + 2 for COMPOSITION_WITH_ALTCHARS or by NCOMPS * 3 for
   COMPOSITION_WITH_RULE_ALTCHARS, and its third element receives the
   number of composed characters.  char_offset advances by that
   number and the state returns to COMPOSING_NO.  */

#define DECODE_COMPOSITION_END()					\
  do {									\
    int in_char_state = (cmp_status->state == COMPOSING_CHAR);		\
    int rule_method = (cmp_status->method == COMPOSITION_WITH_RULE);	\
									\
    if (cmp_status->nchars == 0 || in_char_state == rule_method)	\
      {									\
	MAYBE_FINISH_COMPOSITION ();					\
	goto invalid_code;						\
      }									\
    switch (cmp_status->method)						\
      {									\
      case COMPOSITION_WITH_ALTCHARS:					\
	charbuf[- cmp_status->length] -= cmp_status->ncomps + 2;	\
	break;								\
      case COMPOSITION_WITH_RULE_ALTCHARS:				\
	charbuf[- cmp_status->length] -= cmp_status->ncomps * 3;	\
	break;								\
      default:								\
	break;								\
      }									\
    charbuf[- cmp_status->length + 2] = cmp_status->nchars;		\
    char_offset += cmp_status->nchars;					\
    cmp_status->state = COMPOSING_NO;					\
  } while (0)
3453
/* Append the pair [ -2 RULE ] to charbuf, advancing it by two
   elements, and update cmp_status: its length grows by two and its
   state steps back by one.  */

#define STORE_COMPOSITION_RULE(rule)	\
  do {					\
    charbuf[0] = -2;			\
    charbuf[1] = (rule);		\
    charbuf += 2;			\
    cmp_status->state--;		\
    cmp_status->length += 2;		\
  } while (0)
ec6d2bb8 3463
e951386e
KH
/* Append C, a composed char or a component char, to charbuf and
   update cmp_status: the length grows by one, and C is counted in
   nchars when the state is COMPOSING_CHAR and in ncomps otherwise.
   The state then steps forward by one when a rule is expected next,
   i.e. for COMPOSITION_WITH_RULE, and for
   COMPOSITION_WITH_RULE_ALTCHARS while in state
   COMPOSING_COMPONENT_CHAR.  */

#define STORE_COMPOSITION_CHAR(c)					\
  do {									\
    charbuf[0] = (c);							\
    charbuf++;								\
    cmp_status->length += 1;						\
    if (cmp_status->state != COMPOSING_CHAR)				\
      cmp_status->ncomps++;						\
    else								\
      cmp_status->nchars++;						\
    if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS		\
	? cmp_status->state == COMPOSING_COMPONENT_CHAR			\
	: cmp_status->method == COMPOSITION_WITH_RULE)			\
      cmp_status->state++;						\
  } while (0)
88993dfd 3480
d46c5b12 3481
4ed46869
KH
3482/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
3483
b73bfc1c 3484static void
971de7fb 3485decode_coding_iso_2022 (struct coding_system *coding)
4ed46869 3486{
8f924df7
KH
3487 const unsigned char *src = coding->source + coding->consumed;
3488 const unsigned char *src_end = coding->source + coding->src_bytes;
3489 const unsigned char *src_base;
69a80ea3 3490 int *charbuf = coding->charbuf + coding->charbuf_used;
ad1746f5
KH
3491 /* We may produce two annotations (charset and composition) in one
3492 loop and one more charset annotation at the end. */
ff0dacd7 3493 int *charbuf_end
df80c7f0 3494 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
df7492f9 3495 int consumed_chars = 0, consumed_chars_base;
df7492f9 3496 int multibytep = coding->src_multibyte;
4ed46869 3497 /* Charsets invoked to graphic plane 0 and 1 respectively. */
df7492f9
KH
3498 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3499 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
134b9549 3500 int charset_id_2, charset_id_3;
df7492f9
KH
3501 struct charset *charset;
3502 int c;
e951386e 3503 struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
24a73b0a 3504 Lisp_Object attrs, charset_list;
ff0dacd7
KH
3505 int char_offset = coding->produced_char;
3506 int last_offset = char_offset;
3507 int last_id = charset_ascii;
0a9564cb
EZ
3508 int eol_crlf =
3509 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 3510 int byte_after_cr = -1;
e951386e 3511 int i;
df7492f9 3512
24a73b0a 3513 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9 3514 setup_iso_safe_charsets (attrs);
287c57d7
KH
3515 /* Charset list may have been changed. */
3516 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
1b3b981b 3517 coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
b73bfc1c 3518
e951386e
KH
3519 if (cmp_status->state != COMPOSING_NO)
3520 {
3521 for (i = 0; i < cmp_status->length; i++)
3522 *charbuf++ = cmp_status->carryover[i];
3523 coding->annotated = 1;
3524 }
3525
b73bfc1c 3526 while (1)
4ed46869 3527 {
cf299835 3528 int c1, c2, c3;
b73bfc1c
KH
3529
3530 src_base = src;
df7492f9
KH
3531 consumed_chars_base = consumed_chars;
3532
3533 if (charbuf >= charbuf_end)
b71f6f73
KH
3534 {
3535 if (byte_after_cr >= 0)
3536 src_base--;
3537 break;
3538 }
df7492f9 3539
119852e7
KH
3540 if (byte_after_cr >= 0)
3541 c1 = byte_after_cr, byte_after_cr = -1;
3542 else
3543 ONE_MORE_BYTE (c1);
065e3595
KH
3544 if (c1 < 0)
3545 goto invalid_code;
4ed46869 3546
e951386e 3547 if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
4ed46869 3548 {
e951386e
KH
3549 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3550 char_offset++;
3551 CODING_ISO_EXTSEGMENT_LEN (coding)--;
3552 continue;
3553 }
3554
3555 if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3556 {
3557 if (c1 == ISO_CODE_ESC)
ec6d2bb8 3558 {
e951386e
KH
3559 if (src + 1 >= src_end)
3560 goto no_more_source;
3561 *charbuf++ = ISO_CODE_ESC;
3562 char_offset++;
3563 if (src[0] == '%' && src[1] == '@')
df7492f9 3564 {
e951386e
KH
3565 src += 2;
3566 consumed_chars += 2;
3567 char_offset += 2;
3568 /* We are sure charbuf can contain two more chars. */
3569 *charbuf++ = '%';
3570 *charbuf++ = '@';
3571 CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
df7492f9 3572 }
4ed46869 3573 }
e951386e
KH
3574 else
3575 {
3576 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3577 char_offset++;
3578 }
3579 continue;
3580 }
3581
3582 if ((cmp_status->state == COMPOSING_RULE
3583 || cmp_status->state == COMPOSING_COMPONENT_RULE)
3584 && c1 != ISO_CODE_ESC)
3585 {
3586 int rule, nbytes;
3587
3588 DECODE_COMPOSITION_RULE (rule, nbytes);
3589 if (rule < 0)
3590 goto invalid_code;
3591 STORE_COMPOSITION_RULE (rule);
3592 continue;
3593 }
3594
3595 /* We produce at most one character. */
3596 switch (iso_code_class [c1])
3597 {
3598 case ISO_0x20_or_0x7F:
df7492f9
KH
3599 if (charset_id_0 < 0
3600 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
781d7a48
KH
3601 /* This is SPACE or DEL. */
3602 charset = CHARSET_FROM_ID (charset_ascii);
3603 else
3604 charset = CHARSET_FROM_ID (charset_id_0);
3605 break;
4ed46869
KH
3606
3607 case ISO_graphic_plane_0:
134b9549
KH
3608 if (charset_id_0 < 0)
3609 charset = CHARSET_FROM_ID (charset_ascii);
3610 else
3611 charset = CHARSET_FROM_ID (charset_id_0);
4ed46869
KH
3612 break;
3613
3614 case ISO_0xA0_or_0xFF:
df7492f9
KH
3615 if (charset_id_1 < 0
3616 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3617 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3618 goto invalid_code;
4ed46869
KH
3619 /* This is a graphic character, we fall down ... */
3620
3621 case ISO_graphic_plane_1:
df7492f9
KH
3622 if (charset_id_1 < 0)
3623 goto invalid_code;
3624 charset = CHARSET_FROM_ID (charset_id_1);
4ed46869
KH
3625 break;
3626
df7492f9 3627 case ISO_control_0:
119852e7
KH
3628 if (eol_crlf && c1 == '\r')
3629 ONE_MORE_BYTE (byte_after_cr);
df7492f9
KH
3630 MAYBE_FINISH_COMPOSITION ();
3631 charset = CHARSET_FROM_ID (charset_ascii);
4ed46869
KH
3632 break;
3633
df7492f9 3634 case ISO_control_1:
df7492f9
KH
3635 goto invalid_code;
3636
4ed46869 3637 case ISO_shift_out:
df7492f9
KH
3638 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3639 || CODING_ISO_DESIGNATION (coding, 1) < 0)
3640 goto invalid_code;
3641 CODING_ISO_INVOCATION (coding, 0) = 1;
3642 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 3643 continue;
4ed46869
KH
3644
3645 case ISO_shift_in:
df7492f9
KH
3646 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3647 goto invalid_code;
3648 CODING_ISO_INVOCATION (coding, 0) = 0;
3649 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 3650 continue;
4ed46869
KH
3651
3652 case ISO_single_shift_2_7:
a63dba42
KH
3653 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3654 goto invalid_code;
4ed46869 3655 case ISO_single_shift_2:
df7492f9
KH
3656 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3657 goto invalid_code;
4ed46869
KH
3658 /* SS2 is handled as an escape sequence of ESC 'N' */
3659 c1 = 'N';
3660 goto label_escape_sequence;
3661
3662 case ISO_single_shift_3:
df7492f9
KH
3663 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3664 goto invalid_code;
4ed46869
KH
3665 /* SS2 is handled as an escape sequence of ESC 'O' */
3666 c1 = 'O';
3667 goto label_escape_sequence;
3668
3669 case ISO_control_sequence_introducer:
3670 /* CSI is handled as an escape sequence of ESC '[' ... */
3671 c1 = '[';
3672 goto label_escape_sequence;
3673
3674 case ISO_escape:
3675 ONE_MORE_BYTE (c1);
3676 label_escape_sequence:
df7492f9 3677 /* Escape sequences handled here are invocation,
4ed46869
KH
3678 designation, direction specification, and character
3679 composition specification. */
3680 switch (c1)
3681 {
3682 case '&': /* revision of following character set */
3683 ONE_MORE_BYTE (c1);
3684 if (!(c1 >= '@' && c1 <= '~'))
df7492f9 3685 goto invalid_code;
4ed46869
KH
3686 ONE_MORE_BYTE (c1);
3687 if (c1 != ISO_CODE_ESC)
df7492f9 3688 goto invalid_code;
4ed46869
KH
3689 ONE_MORE_BYTE (c1);
3690 goto label_escape_sequence;
3691
3692 case '$': /* designation of 2-byte character set */
df7492f9
KH
3693 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3694 goto invalid_code;
134b9549
KH
3695 {
3696 int reg, chars96;
3697
3698 ONE_MORE_BYTE (c1);
3699 if (c1 >= '@' && c1 <= 'B')
3700 { /* designation of JISX0208.1978, GB2312.1980,
88993dfd 3701 or JISX0208.1980 */
134b9549
KH
3702 reg = 0, chars96 = 0;
3703 }
3704 else if (c1 >= 0x28 && c1 <= 0x2B)
3705 { /* designation of DIMENSION2_CHARS94 character set */
3706 reg = c1 - 0x28, chars96 = 0;
3707 ONE_MORE_BYTE (c1);
3708 }
3709 else if (c1 >= 0x2C && c1 <= 0x2F)
3710 { /* designation of DIMENSION2_CHARS96 character set */
3711 reg = c1 - 0x2C, chars96 = 1;
3712 ONE_MORE_BYTE (c1);
3713 }
3714 else
3715 goto invalid_code;
3716 DECODE_DESIGNATION (reg, 2, chars96, c1);
3717 /* We must update these variables now. */
3718 if (reg == 0)
3719 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3720 else if (reg == 1)
3721 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3722 if (chars96 < 0)
3723 goto invalid_code;
3724 }
b73bfc1c 3725 continue;
4ed46869
KH
3726
3727 case 'n': /* invocation of locking-shift-2 */
df7492f9
KH
3728 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3729 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3730 goto invalid_code;
3731 CODING_ISO_INVOCATION (coding, 0) = 2;
3732 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 3733 continue;
4ed46869
KH
3734
3735 case 'o': /* invocation of locking-shift-3 */
df7492f9
KH
3736 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3737 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3738 goto invalid_code;
3739 CODING_ISO_INVOCATION (coding, 0) = 3;
3740 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 3741 continue;
4ed46869
KH
3742
3743 case 'N': /* invocation of single-shift-2 */
df7492f9
KH
3744 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3745 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3746 goto invalid_code;
134b9549
KH
3747 charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3748 if (charset_id_2 < 0)
3749 charset = CHARSET_FROM_ID (charset_ascii);
3750 else
3751 charset = CHARSET_FROM_ID (charset_id_2);
b73bfc1c 3752 ONE_MORE_BYTE (c1);
e7046a18 3753 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
df7492f9 3754 goto invalid_code;
4ed46869
KH
3755 break;
3756
3757 case 'O': /* invocation of single-shift-3 */
df7492f9
KH
3758 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3759 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3760 goto invalid_code;
134b9549
KH
3761 charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3762 if (charset_id_3 < 0)
3763 charset = CHARSET_FROM_ID (charset_ascii);
3764 else
3765 charset = CHARSET_FROM_ID (charset_id_3);
b73bfc1c 3766 ONE_MORE_BYTE (c1);
e7046a18 3767 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
df7492f9 3768 goto invalid_code;
4ed46869
KH
3769 break;
3770
ec6d2bb8 3771 case '0': case '2': case '3': case '4': /* start composition */
df7492f9
KH
3772 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3773 goto invalid_code;
e951386e
KH
3774 if (last_id != charset_ascii)
3775 {
3776 ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3777 last_id = charset_ascii;
3778 last_offset = char_offset;
3779 }
ec6d2bb8 3780 DECODE_COMPOSITION_START (c1);
b73bfc1c 3781 continue;
4ed46869 3782
ec6d2bb8 3783 case '1': /* end composition */
e951386e 3784 if (cmp_status->state == COMPOSING_NO)
df7492f9
KH
3785 goto invalid_code;
3786 DECODE_COMPOSITION_END ();
b73bfc1c 3787 continue;
4ed46869
KH
3788
3789 case '[': /* specification of direction */
de59072a 3790 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
df7492f9 3791 goto invalid_code;
4ed46869 3792 /* For the moment, nested direction is not supported.
d46c5b12 3793 So, `coding->mode & CODING_MODE_DIRECTION' zero means
ad1746f5 3794 left-to-right, and nonzero means right-to-left. */
4ed46869
KH
3795 ONE_MORE_BYTE (c1);
3796 switch (c1)
3797 {
3798 case ']': /* end of the current direction */
d46c5b12 3799 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869
KH
3800
3801 case '0': /* end of the current direction */
3802 case '1': /* start of left-to-right direction */
3803 ONE_MORE_BYTE (c1);
3804 if (c1 == ']')
d46c5b12 3805 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869 3806 else
df7492f9 3807 goto invalid_code;
4ed46869
KH
3808 break;
3809
3810 case '2': /* start of right-to-left direction */
3811 ONE_MORE_BYTE (c1);
3812 if (c1 == ']')
d46c5b12 3813 coding->mode |= CODING_MODE_DIRECTION;
4ed46869 3814 else
df7492f9 3815 goto invalid_code;
4ed46869
KH
3816 break;
3817
3818 default:
df7492f9 3819 goto invalid_code;
4ed46869 3820 }
b73bfc1c 3821 continue;
4ed46869 3822
103e0180 3823 case '%':
103e0180
KH
3824 ONE_MORE_BYTE (c1);
3825 if (c1 == '/')
3826 {
3827 /* CTEXT extended segment:
3828 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3829 We keep these bytes as is for the moment.
3830 They may be decoded by post-read-conversion. */
3831 int dim, M, L;
4776e638 3832 int size;
8f924df7 3833
103e0180 3834 ONE_MORE_BYTE (dim);
7a84eee5 3835 if (dim < '0' || dim > '4')
e951386e 3836 goto invalid_code;
103e0180 3837 ONE_MORE_BYTE (M);
e951386e
KH
3838 if (M < 128)
3839 goto invalid_code;
103e0180 3840 ONE_MORE_BYTE (L);
e951386e
KH
3841 if (L < 128)
3842 goto invalid_code;
103e0180 3843 size = ((M - 128) * 128) + (L - 128);
e951386e 3844 if (charbuf + 6 > charbuf_end)
4776e638
KH
3845 goto break_loop;
3846 *charbuf++ = ISO_CODE_ESC;
3847 *charbuf++ = '%';
3848 *charbuf++ = '/';
3849 *charbuf++ = dim;
3850 *charbuf++ = BYTE8_TO_CHAR (M);
3851 *charbuf++ = BYTE8_TO_CHAR (L);
e951386e 3852 CODING_ISO_EXTSEGMENT_LEN (coding) = size;
103e0180
KH
3853 }
3854 else if (c1 == 'G')
3855 {
103e0180
KH
3856 /* XFree86 extension for embedding UTF-8 in CTEXT:
3857 ESC % G --UTF-8-BYTES-- ESC % @
3858 We keep these bytes as is for the moment.
3859 They may be decoded by post-read-conversion. */
e951386e 3860 if (charbuf + 3 > charbuf_end)
4776e638 3861 goto break_loop;
e951386e
KH
3862 *charbuf++ = ISO_CODE_ESC;
3863 *charbuf++ = '%';
3864 *charbuf++ = 'G';
3865 CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
103e0180
KH
3866 }
3867 else
4776e638 3868 goto invalid_code;
103e0180 3869 continue;
4776e638 3870 break;
103e0180 3871
4ed46869 3872 default:
df7492f9
KH
3873 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3874 goto invalid_code;
134b9549
KH
3875 {
3876 int reg, chars96;
3877
3878 if (c1 >= 0x28 && c1 <= 0x2B)
3879 { /* designation of DIMENSION1_CHARS94 character set */
3880 reg = c1 - 0x28, chars96 = 0;
3881 ONE_MORE_BYTE (c1);
3882 }
3883 else if (c1 >= 0x2C && c1 <= 0x2F)
3884 { /* designation of DIMENSION1_CHARS96 character set */
3885 reg = c1 - 0x2C, chars96 = 1;
3886 ONE_MORE_BYTE (c1);
3887 }
3888 else
3889 goto invalid_code;
3890 DECODE_DESIGNATION (reg, 1, chars96, c1);
3891 /* We must update these variables now. */
3892 if (reg == 0)
3893 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3894 else if (reg == 1)
3895 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3896 if (chars96 < 0)
3897 goto invalid_code;
3898 }
b73bfc1c 3899 continue;
4ed46869 3900 }
b73bfc1c 3901 }
4ed46869 3902
e951386e
KH
3903 if (cmp_status->state == COMPOSING_NO
3904 && charset->id != charset_ascii
ff0dacd7
KH
3905 && last_id != charset->id)
3906 {
3907 if (last_id != charset_ascii)
69a80ea3 3908 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
ff0dacd7
KH
3909 last_id = charset->id;
3910 last_offset = char_offset;
3911 }
3912
b73bfc1c 3913 /* Now we know CHARSET and 1st position code C1 of a character.
cf299835
KH
3914 Produce a decoded character while getting 2nd and 3rd
3915 position codes C2, C3 if necessary. */
df7492f9 3916 if (CHARSET_DIMENSION (charset) > 1)
b73bfc1c
KH
3917 {
3918 ONE_MORE_BYTE (c2);
cf299835
KH
3919 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3920 || ((c1 & 0x80) != (c2 & 0x80)))
b73bfc1c 3921 /* C2 is not in a valid range. */
df7492f9 3922 goto invalid_code;
cf299835
KH
3923 if (CHARSET_DIMENSION (charset) == 2)
3924 c1 = (c1 << 8) | c2;
3925 else
df7492f9 3926 {
cf299835
KH
3927 ONE_MORE_BYTE (c3);
3928 if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3929 || ((c1 & 0x80) != (c3 & 0x80)))
3930 /* C3 is not in a valid range. */
df7492f9 3931 goto invalid_code;
cf299835 3932 c1 = (c1 << 16) | (c2 << 8) | c2;
df7492f9
KH
3933 }
3934 }
cf299835 3935 c1 &= 0x7F7F7F;
df7492f9
KH
3936 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3937 if (c < 0)
3938 {
3939 MAYBE_FINISH_COMPOSITION ();
3940 for (; src_base < src; src_base++, char_offset++)
3941 {
3942 if (ASCII_BYTE_P (*src_base))
3943 *charbuf++ = *src_base;
3944 else
3945 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3946 }
3947 }
e951386e 3948 else if (cmp_status->state == COMPOSING_NO)
df7492f9
KH
3949 {
3950 *charbuf++ = c;
3951 char_offset++;
4ed46869 3952 }
e951386e
KH
3953 else if ((cmp_status->state == COMPOSING_CHAR
3954 ? cmp_status->nchars
3955 : cmp_status->ncomps)
3956 >= MAX_COMPOSITION_COMPONENTS)
781d7a48 3957 {
e951386e
KH
3958 /* Too long composition. */
3959 MAYBE_FINISH_COMPOSITION ();
3960 *charbuf++ = c;
3961 char_offset++;
4ed46869 3962 }
e951386e
KH
3963 else
3964 STORE_COMPOSITION_CHAR (c);
4ed46869
KH
3965 continue;
3966
df7492f9
KH
3967 invalid_code:
3968 MAYBE_FINISH_COMPOSITION ();
4ed46869 3969 src = src_base;
df7492f9
KH
3970 consumed_chars = consumed_chars_base;
3971 ONE_MORE_BYTE (c);
065e3595 3972 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
ff0dacd7 3973 char_offset++;
df7492f9 3974 coding->errors++;
4776e638
KH
3975 continue;
3976
3977 break_loop:
3978 break;
4ed46869 3979 }
fb88bf2d 3980
df7492f9 3981 no_more_source:
e951386e
KH
3982 if (cmp_status->state != COMPOSING_NO)
3983 {
3984 if (coding->mode & CODING_MODE_LAST_BLOCK)
3985 MAYBE_FINISH_COMPOSITION ();
3986 else
3987 {
3988 charbuf -= cmp_status->length;
3989 for (i = 0; i < cmp_status->length; i++)
3990 cmp_status->carryover[i] = charbuf[i];
3991 }
3992 }
3993 else if (last_id != charset_ascii)
69a80ea3 3994 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
3995 coding->consumed_char += consumed_chars_base;
3996 coding->consumed = src_base - coding->source;
3997 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
3998}
3999
b73bfc1c 4000
f4dee582 4001/* ISO2022 encoding stuff. */
4ed46869
KH
4002
4003/*
f4dee582 4004 It is not enough to say just "ISO2022" on encoding, we have to
df7492f9 4005 specify more details. In Emacs, each coding system of ISO2022
4ed46869 4006 variant has the following specifications:
df7492f9 4007 1. Initial designation to G0 thru G3.
4ed46869
KH
4008 2. Allows short-form designation?
4009 3. ASCII should be designated to G0 before control characters?
4010 4. ASCII should be designated to G0 at end of line?
4011 5. 7-bit environment or 8-bit environment?
4012 6. Use locking-shift?
4013 7. Use Single-shift?
4014 And the following two are only for Japanese:
4015 8. Use ASCII in place of JIS0201-1976-Roman?
4016 9. Use JISX0208-1983 in place of JISX0208-1978?
df7492f9
KH
4017 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4018 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
f4dee582 4019 details.
4ed46869
KH
4020*/
4021
/* Emit at DST the escape sequence that designates CHARSET to graphic
   register REG, advancing DST, and record the new designation in
   CODING.  If CODING has CODING_ISO_FLAG_REVISION set and CHARSET has
   a non-negative revision, an `ESC & <revision>' prefix is emitted
   first.  For a multi-byte 94-char CHARSET designated to register 0
   whose <final-char> is '@', 'A', or 'B', the intermediate byte is
   omitted (short form) unless CODING has CODING_ISO_FLAG_LONG_FORM.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    const char *chars94_intermediates = "()*+";                         \
    const char *chars96_intermediates = ",-./";                         \
    unsigned char final_byte = CHARSET_ISO_FINAL (charset);             \
    int is_chars96 = CHARSET_ISO_CHARS_96 (charset) != 0;               \
    int revision = -1;                                                  \
                                                                        \
    /* Optional revision announcement.  */                              \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
      revision = CHARSET_ISO_REVISION (charset);                        \
    if (revision >= 0)                                                  \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + revision);                                 \
      }                                                                 \
                                                                        \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        /* Multi-byte charset: `$' then (maybe) the intermediate.  */   \
        EMIT_ONE_ASCII_BYTE ('$');                                      \
        if (is_chars96)                                                 \
          EMIT_ONE_ASCII_BYTE (chars96_intermediates[reg]);             \
        else if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM) \
                 || reg != 0                                            \
                 || ! (final_byte >= '@' && final_byte <= 'B'))         \
          EMIT_ONE_ASCII_BYTE (chars94_intermediates[reg]);             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* Single-byte charset: the intermediate is always sent.  */    \
        int intermediate = (is_chars96                                  \
                            ? chars96_intermediates[reg]                \
                            : chars94_intermediates[reg]);              \
                                                                        \
        EMIT_ONE_ASCII_BYTE (intermediate);                             \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (final_byte);                                   \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
4069
df7492f9 4070
4ed46869
KH
/* The following two macros emit the ISO2022 single-shift functions
   (single-shift-2 and single-shift-3).  When CODING has
   CODING_ISO_FLAG_SEVEN_BITS set the two-byte escape sequence is
   produced, otherwise the single control byte.  Both macros also mark
   in CODING that a single shift is in effect.  */

#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      {                                                                 \
        EMIT_ONE_BYTE (ISO_CODE_SS2);                                   \
      }                                                                 \
    else                                                                \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                       \
      }                                                                 \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)


#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      {                                                                 \
        EMIT_ONE_BYTE (ISO_CODE_SS3);                                   \
      }                                                                 \
    else                                                                \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                       \
      }                                                                 \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4093
df7492f9 4094
4ed46869
KH
/* Macros for the ISO2022 locking-shift functions (shift-in,
   shift-out, locking-shift-2, and locking-shift-3).  Each one records
   in CODING which graphic register is now invoked to graphic plane 0
   and emits the corresponding control character or escape
   sequence.  */

/* Shift-in: invoke graphic register 0.  */
#define ENCODE_SHIFT_IN                         \
  do {                                          \
    CODING_ISO_INVOCATION (coding, 0) = 0;      \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);          \
  } while (0)


/* Shift-out: invoke graphic register 1.  */
#define ENCODE_SHIFT_OUT                        \
  do {                                          \
    CODING_ISO_INVOCATION (coding, 0) = 1;      \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);          \
  } while (0)


/* Locking-shift-2 (ESC 'n'): invoke graphic register 2.  */
#define ENCODE_LOCKING_SHIFT_2                  \
  do {                                          \
    CODING_ISO_INVOCATION (coding, 0) = 2;      \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');   \
  } while (0)
4118
df7492f9
KH
4119
/* Locking-shift-3: emit ESC 'o' and record that graphic register 3 is
   now invoked to graphic plane 0.  The final byte must be 'o'; ESC 'n'
   is locking-shift-2, and the ISO2022 decoder in this file treats
   ESC 'o' as the invocation of locking-shift-3.  */
#define ENCODE_LOCKING_SHIFT_3                  \
  do {                                          \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');   \
    CODING_ISO_INVOCATION (coding, 0) = 3;      \
  } while (0)
4125
df7492f9 4126
f4dee582
RS
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be a modifiable lvalue: when CODING has
   CODING_ISO_FLAG_USE_ROMAN set and CHARSET is ASCII, it is replaced
   by the charset whose id is charset_jisx0201_roman.  C1 may be any
   integer expression; it is parenthesized at every use.  The macro
   relies on `coding', `dst' and `produced_chars' being in scope at the
   point of use.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A single shift was just emitted; it covers exactly this      \
           one character.  */                                           \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        /* Invoked to graphic plane 0: 8th bit clear.  */               \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        /* Invoked to graphic plane 1: 8th bit set.  */                 \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
4169
df7492f9 4170
f4dee582
RS
/* Emit the two position-codes C1 and C2 of a DIMENSION2 character of
   CHARSET, first producing whatever designation and invocation codes
   are needed to make CHARSET usable.

   CHARSET must be a modifiable lvalue: when CODING has
   CODING_ISO_FLAG_USE_OLDJIS set and CHARSET's id is charset_jisx0208,
   it is replaced by the charset whose id is charset_jisx0208_1978.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int charset_id = CHARSET_ID (charset);                              \
                                                                        \
    if (charset_id == charset_jisx0208                                  \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        charset_id = charset_jisx0208_1978;                             \
        charset = CHARSET_FROM_ID (charset_id);                         \
      }                                                                 \
                                                                        \
    /* A pending single shift applies to this character only.  */       \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
        else                                                            \
          EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* Already invoked to graphic plane 0.  */                          \
    if (charset_id == CODING_ISO_INVOKED_CHARSET (coding, 0))           \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* Already invoked to graphic plane 1.  */                          \
    if (charset_id == CODING_ISO_INVOKED_CHARSET (coding, 1))           \
      {                                                                 \
        EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* CHARSET is not invoked to any graphic plane yet.  Invoke it      \
       (designating it to a graphic register first if needed), then     \
       go around the loop again to emit the character itself.  */       \
    dst = encode_invocation_designation (charset, coding, dst,          \
                                         &produced_chars);              \
  } while (1)
4213
05e6f5dc 4214
df7492f9
KH
/* Encode character C as a member of CHARSET.  The code obtained from
   ENCODE_CHAR is emitted as one position-code when CHARSET is of
   dimension 1; otherwise it is split into bits 8 and above, and the
   low 8 bits, and emitted as two position-codes.  */
#define ENCODE_ISO_CHARACTER(charset, c)                                \
  do {                                                                  \
    int code_point = ENCODE_CHAR ((charset), (c));                      \
                                                                        \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code_point >> 8,      \
                                       code_point & 0xFF);              \
    else                                                                \
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code_point);          \
  } while (0)
bdd9fb48 4224
05e6f5dc 4225
4ed46869 4226/* Produce designation and invocation codes at a place pointed by DST
df7492f9 4227 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
4ed46869
KH
4228 Return new DST. */
4229
4230unsigned char *
cf84bb53
JB
4231encode_invocation_designation (struct charset *charset,
4232 struct coding_system *coding,
4233 unsigned char *dst, int *p_nchars)
4ed46869 4234{
df7492f9
KH
4235 int multibytep = coding->dst_multibyte;
4236 int produced_chars = *p_nchars;
4ed46869 4237 int reg; /* graphic register number */
df7492f9 4238 int id = CHARSET_ID (charset);
4ed46869
KH
4239
4240 /* At first, check designations. */
4241 for (reg = 0; reg < 4; reg++)
df7492f9 4242 if (id == CODING_ISO_DESIGNATION (coding, reg))
4ed46869
KH
4243 break;
4244
4245 if (reg >= 4)
4246 {
4247 /* CHARSET is not yet designated to any graphic registers. */
4248 /* At first check the requested designation. */
df7492f9
KH
4249 reg = CODING_ISO_REQUEST (coding, id);
4250 if (reg < 0)
1ba9e4ab
KH
4251 /* Since CHARSET requests no special designation, designate it
4252 to graphic register 0. */
4ed46869
KH
4253 reg = 0;
4254
4255 ENCODE_DESIGNATION (charset, reg, coding);
4256 }
4257
df7492f9
KH
4258 if (CODING_ISO_INVOCATION (coding, 0) != reg
4259 && CODING_ISO_INVOCATION (coding, 1) != reg)
4ed46869
KH
4260 {
4261 /* Since the graphic register REG is not invoked to any graphic
4262 planes, invoke it to graphic plane 0. */
4263 switch (reg)
4264 {
4265 case 0: /* graphic register 0 */
4266 ENCODE_SHIFT_IN;
4267 break;
4268
4269 case 1: /* graphic register 1 */
4270 ENCODE_SHIFT_OUT;
4271 break;
4272
4273 case 2: /* graphic register 2 */
df7492f9 4274 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4ed46869
KH
4275 ENCODE_SINGLE_SHIFT_2;
4276 else
4277 ENCODE_LOCKING_SHIFT_2;
4278 break;
4279
4280 case 3: /* graphic register 3 */
df7492f9 4281 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4ed46869
KH
4282 ENCODE_SINGLE_SHIFT_3;
4283 else
4284 ENCODE_LOCKING_SHIFT_3;
4285 break;
4286 }
4287 }
b73bfc1c 4288
df7492f9 4289 *p_nchars = produced_chars;
4ed46869
KH
4290 return dst;
4291}
4292
df7492f9
KH
/* The following three macros produce codes for indicating direction
   of text.  */

/* Emit the control sequence introducer: ESC '[' when CODING has
   CODING_ISO_FLAG_SEVEN_BITS set, the single byte ISO_CODE_CSI
   otherwise.  The flag word holds several independent bits, so the
   flag is tested with `&' as everywhere else in this file; comparing
   the whole word with `==' would match only when no other flag is
   set.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
  do {                                                                  \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
    else                                                                \
      EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
  } while (0)
4302
ec6d2bb8 4303
df7492f9
KH
/* Emit the sequence that starts right-to-left text: the control
   sequence introducer followed by '2' ']'.
   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro, so it
   must be used without an argument list.  */
#define ENCODE_DIRECTION_R2L()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('2', ']');            \
  } while (0)
4309
ec6d2bb8 4310
/* Emit the sequence that ends the current direction (back to
   left-to-right): the control sequence introducer followed by
   '0' ']'.  ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like
   macro, so it must be used without an argument list.  */
#define ENCODE_DIRECTION_L2R()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('0', ']');            \
  } while (0)
4ed46869 4316
4ed46869
KH
4317
/* Bring the graphic planes and registers back to their initial state:
   emit a shift-in if graphic plane 0 does not currently hold register
   0, then re-designate every register whose current designation
   differs from its (non-negative) initial one.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    struct charset *charset;                                            \
    int reg = 0;                                                        \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
                                                                        \
    while (reg < 4)                                                     \
      {                                                                 \
        int initial_id = CODING_ISO_INITIAL (coding, reg);              \
                                                                        \
        if (initial_id >= 0                                             \
            && initial_id != CODING_ISO_DESIGNATION (coding, reg))      \
          {                                                             \
            charset = CHARSET_FROM_ID (initial_id);                     \
            ENCODE_DESIGNATION (charset, reg, coding);                  \
          }                                                             \
        reg++;                                                          \
      }                                                                 \
  } while (0)
4336
df7492f9 4337
bdd9fb48 4338/* Produce designation sequences of charsets in the line started from
b73bfc1c 4339 SRC to a place pointed by DST, and return updated DST.
bdd9fb48
KH
4340
4341 If the current block ends before any end-of-line, we may fail to
d46c5b12
KH
4342 find all the necessary designations. */
4343
b73bfc1c 4344static unsigned char *
cf84bb53
JB
4345encode_designation_at_bol (struct coding_system *coding, int *charbuf,
4346 int *charbuf_end, unsigned char *dst)
e0e989f6 4347{
df7492f9 4348 struct charset *charset;
bdd9fb48
KH
4349 /* Table of charsets to be designated to each graphic register. */
4350 int r[4];
df7492f9
KH
4351 int c, found = 0, reg;
4352 int produced_chars = 0;
4353 int multibytep = coding->dst_multibyte;
4354 Lisp_Object attrs;
4355 Lisp_Object charset_list;
4356
4357 attrs = CODING_ID_ATTRS (coding->id);
4358 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4359 if (EQ (charset_list, Qiso_2022))
4360 charset_list = Viso_2022_charset_list;
bdd9fb48
KH
4361
4362 for (reg = 0; reg < 4; reg++)
4363 r[reg] = -1;
4364
b73bfc1c 4365 while (found < 4)
e0e989f6 4366 {
df7492f9
KH
4367 int id;
4368
4369 c = *charbuf++;
b73bfc1c
KH
4370 if (c == '\n')
4371 break;
df7492f9
KH
4372 charset = char_charset (c, charset_list, NULL);
4373 id = CHARSET_ID (charset);
4374 reg = CODING_ISO_REQUEST (coding, id);
4375 if (reg >= 0 && r[reg] < 0)
bdd9fb48
KH
4376 {
4377 found++;
df7492f9 4378 r[reg] = id;
bdd9fb48 4379 }
bdd9fb48
KH
4380 }
4381
4382 if (found)
4383 {
4384 for (reg = 0; reg < 4; reg++)
4385 if (r[reg] >= 0
df7492f9
KH
4386 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4387 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
e0e989f6 4388 }
b73bfc1c
KH
4389
4390 return dst;
e0e989f6
KH
4391}
4392
4ed46869
KH
4393/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
4394
df7492f9 4395static int
971de7fb 4396encode_coding_iso_2022 (struct coding_system *coding)
4ed46869 4397{
df7492f9
KH
4398 int multibytep = coding->dst_multibyte;
4399 int *charbuf = coding->charbuf;
4400 int *charbuf_end = charbuf + coding->charbuf_used;
4401 unsigned char *dst = coding->destination + coding->produced;
4402 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4403 int safe_room = 16;
4404 int bol_designation
4405 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4406 && CODING_ISO_BOL (coding));
4407 int produced_chars = 0;
4408 Lisp_Object attrs, eol_type, charset_list;
4409 int ascii_compatible;
b73bfc1c 4410 int c;
ff0dacd7 4411 int preferred_charset_id = -1;
05e6f5dc 4412
24a73b0a 4413 CODING_GET_INFO (coding, attrs, charset_list);
0a9564cb 4414 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
24a73b0a
KH
4415 if (VECTORP (eol_type))
4416 eol_type = Qunix;
4417
004068e4 4418 setup_iso_safe_charsets (attrs);
ff0dacd7 4419 /* Charset list may have been changed. */
287c57d7 4420 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
1b3b981b 4421 coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
0eecad43 4422
a552b35a
KH
4423 ascii_compatible
4424 = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4425 && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4426 | CODING_ISO_FLAG_LOCKING_SHIFT)));
bdd9fb48 4427
df7492f9 4428 while (charbuf < charbuf_end)
4ed46869 4429 {
df7492f9 4430 ASSURE_DESTINATION (safe_room);
b73bfc1c 4431
df7492f9 4432 if (bol_designation)
b73bfc1c 4433 {
df7492f9 4434 unsigned char *dst_prev = dst;
4ed46869 4435
bdd9fb48 4436 /* We have to produce designation sequences if any now. */
df7492f9
KH
4437 dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
4438 bol_designation = 0;
4439 /* We are sure that designation sequences are all ASCII bytes. */
4440 produced_chars += dst - dst_prev;
e0e989f6
KH
4441 }
4442
df7492f9 4443 c = *charbuf++;
ec6d2bb8 4444
ff0dacd7
KH
4445 if (c < 0)
4446 {
4447 /* Handle an annotation. */
4448 switch (*charbuf)
ec6d2bb8 4449 {
ff0dacd7
KH
4450 case CODING_ANNOTATE_COMPOSITION_MASK:
4451 /* Not yet implemented. */
4452 break;
4453 case CODING_ANNOTATE_CHARSET_MASK:
cf7dfdf5 4454 preferred_charset_id = charbuf[2];
ff0dacd7
KH
4455 if (preferred_charset_id >= 0
4456 && NILP (Fmemq (make_number (preferred_charset_id),
4457 charset_list)))
4458 preferred_charset_id = -1;
4459 break;
4460 default:
4461 abort ();
4ed46869 4462 }
ff0dacd7
KH
4463 charbuf += -c - 1;
4464 continue;
4ed46869 4465 }
ec6d2bb8 4466
b73bfc1c
KH
4467 /* Now encode the character C. */
4468 if (c < 0x20 || c == 0x7F)
4469 {
df7492f9
KH
4470 if (c == '\n'
4471 || (c == '\r' && EQ (eol_type, Qmac)))
19a8d9e0 4472 {
df7492f9
KH
4473 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4474 ENCODE_RESET_PLANE_AND_REGISTER ();
4475 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
b73bfc1c 4476 {
df7492f9
KH
4477 int i;
4478
4479 for (i = 0; i < 4; i++)
4480 CODING_ISO_DESIGNATION (coding, i)
4481 = CODING_ISO_INITIAL (coding, i);
b73bfc1c 4482 }
df7492f9
KH
4483 bol_designation
4484 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
19a8d9e0 4485 }
df7492f9
KH
4486 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4487 ENCODE_RESET_PLANE_AND_REGISTER ();
4488 EMIT_ONE_ASCII_BYTE (c);
4ed46869 4489 }
df7492f9 4490 else if (ASCII_CHAR_P (c))
88993dfd 4491 {
df7492f9
KH
4492 if (ascii_compatible)
4493 EMIT_ONE_ASCII_BYTE (c);
93dec019 4494 else
19a8d9e0 4495 {
bf16eb23
KH
4496 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4497 ENCODE_ISO_CHARACTER (charset, c);
19a8d9e0 4498 }
4ed46869 4499 }
16eafb5d 4500 else if (CHAR_BYTE8_P (c))
88993dfd 4501 {
16eafb5d
KH
4502 c = CHAR_TO_BYTE8 (c);
4503 EMIT_ONE_BYTE (c);
88993dfd 4504 }
b73bfc1c 4505 else
df7492f9 4506 {
ff0dacd7 4507 struct charset *charset;
b73bfc1c 4508
ff0dacd7
KH
4509 if (preferred_charset_id >= 0)
4510 {
4511 charset = CHARSET_FROM_ID (preferred_charset_id);
4512 if (! CHAR_CHARSET_P (c, charset))
4513 charset = char_charset (c, charset_list, NULL);
4514 }
4515 else
4516 charset = char_charset (c, charset_list, NULL);
df7492f9
KH
4517 if (!charset)
4518 {
41cbe562
KH
4519 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4520 {
4521 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4522 charset = CHARSET_FROM_ID (charset_ascii);
4523 }
4524 else
4525 {
4526 c = coding->default_char;
4527 charset = char_charset (c, charset_list, NULL);
4528 }
df7492f9
KH
4529 }
4530 ENCODE_ISO_CHARACTER (charset, c);
4531 }
84fbb8a0 4532 }
b73bfc1c 4533
df7492f9
KH
4534 if (coding->mode & CODING_MODE_LAST_BLOCK
4535 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4536 {
4537 ASSURE_DESTINATION (safe_room);
4538 ENCODE_RESET_PLANE_AND_REGISTER ();
4539 }
065e3595 4540 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
4541 CODING_ISO_BOL (coding) = bol_designation;
4542 coding->produced_char += produced_chars;
4543 coding->produced = dst - coding->destination;
4544 return 0;
4ed46869
KH
4545}
4546
4547\f
df7492f9 4548/*** 8,9. SJIS and BIG5 handlers ***/
4ed46869 4549
df7492f9 4550/* Although SJIS and BIG5 are not ISO's coding system, they are used
4ed46869
KH
4551 quite widely. So, for the moment, Emacs supports them in the bare
4552 C code. But, in the future, they may be supported only by CCL. */
4553
4554/* SJIS is a coding system encoding three character sets: ASCII, right
4555 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
4556 as is. A character of charset katakana-jisx0201 is encoded by
4557 "position-code + 0x80". A character of charset japanese-jisx0208
4558 is encoded in 2-byte but two position-codes are divided and shifted
df7492f9 4559 so that it fits in the range below.
4ed46869
KH
4560
4561 --- CODE RANGE of SJIS ---
4562 (character set) (range)
4563 ASCII 0x00 .. 0x7F
df7492f9 4564 KATAKANA-JISX0201 0xA0 .. 0xDF
c28a9453 4565 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
d14d03ac 4566 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4ed46869
KH
4567 -------------------------------
4568
4569*/
4570
/* BIG5 is a coding system encoding two character sets: ASCII and
   Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
   character set and is encoded in two bytes.
4ed46869
KH
4574
4575 --- CODE RANGE of BIG5 ---
4576 (character set) (range)
4577 ASCII 0x00 .. 0x7F
4578 Big5 (1st byte) 0xA1 .. 0xFE
4579 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
4580 --------------------------
4581
df7492f9 4582 */
4ed46869
KH
4583
4584/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4585 Check if a text is encoded in SJIS. If it is, return
df7492f9 4586 CATEGORY_MASK_SJIS, else return 0. */
4ed46869 4587
0a28aafb 4588static int
cf84bb53
JB
4589detect_coding_sjis (struct coding_system *coding,
4590 struct coding_detection_info *detect_info)
4ed46869 4591{
065e3595 4592 const unsigned char *src = coding->source, *src_base;
8f924df7 4593 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
4594 int multibytep = coding->src_multibyte;
4595 int consumed_chars = 0;
4596 int found = 0;
b73bfc1c 4597 int c;
f07190ca
KH
4598 Lisp_Object attrs, charset_list;
4599 int max_first_byte_of_2_byte_code;
4600
4601 CODING_GET_INFO (coding, attrs, charset_list);
4602 max_first_byte_of_2_byte_code
4603 = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
df7492f9 4604
ff0dacd7 4605 detect_info->checked |= CATEGORY_MASK_SJIS;
df7492f9
KH
4606 /* A coding system of this category is always ASCII compatible. */
4607 src += coding->head_ascii;
4ed46869 4608
b73bfc1c 4609 while (1)
4ed46869 4610 {
065e3595 4611 src_base = src;
df7492f9 4612 ONE_MORE_BYTE (c);
682169fe
KH
4613 if (c < 0x80)
4614 continue;
f07190ca
KH
4615 if ((c >= 0x81 && c <= 0x9F)
4616 || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4ed46869 4617 {
df7492f9 4618 ONE_MORE_BYTE (c);
682169fe 4619 if (c < 0x40 || c == 0x7F || c > 0xFC)
df7492f9 4620 break;
ff0dacd7 4621 found = CATEGORY_MASK_SJIS;
4ed46869 4622 }
df7492f9 4623 else if (c >= 0xA0 && c < 0xE0)
ff0dacd7 4624 found = CATEGORY_MASK_SJIS;
df7492f9
KH
4625 else
4626 break;
4ed46869 4627 }
ff0dacd7 4628 detect_info->rejected |= CATEGORY_MASK_SJIS;
df7492f9
KH
4629 return 0;
4630
4631 no_more_source:
065e3595 4632 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
89528eb3 4633 {
ff0dacd7 4634 detect_info->rejected |= CATEGORY_MASK_SJIS;
89528eb3 4635 return 0;
4ed46869 4636 }
ff0dacd7
KH
4637 detect_info->found |= found;
4638 return 1;
4ed46869
KH
4639}
4640
4641/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4642 Check if a text is encoded in BIG5. If it is, return
df7492f9 4643 CATEGORY_MASK_BIG5, else return 0. */
4ed46869 4644
0a28aafb 4645static int
cf84bb53
JB
4646detect_coding_big5 (struct coding_system *coding,
4647 struct coding_detection_info *detect_info)
4ed46869 4648{
065e3595 4649 const unsigned char *src = coding->source, *src_base;
8f924df7 4650 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
4651 int multibytep = coding->src_multibyte;
4652 int consumed_chars = 0;
4653 int found = 0;
b73bfc1c 4654 int c;
fa42c37f 4655
ff0dacd7 4656 detect_info->checked |= CATEGORY_MASK_BIG5;
df7492f9
KH
4657 /* A coding system of this category is always ASCII compatible. */
4658 src += coding->head_ascii;
fa42c37f 4659
b73bfc1c 4660 while (1)
fa42c37f 4661 {
065e3595 4662 src_base = src;
df7492f9
KH
4663 ONE_MORE_BYTE (c);
4664 if (c < 0x80)
fa42c37f 4665 continue;
df7492f9 4666 if (c >= 0xA1)
fa42c37f 4667 {
df7492f9
KH
4668 ONE_MORE_BYTE (c);
4669 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
fa42c37f 4670 return 0;
ff0dacd7 4671 found = CATEGORY_MASK_BIG5;
fa42c37f 4672 }
df7492f9
KH
4673 else
4674 break;
fa42c37f 4675 }
ff0dacd7 4676 detect_info->rejected |= CATEGORY_MASK_BIG5;
fa42c37f 4677 return 0;
fa42c37f 4678
df7492f9 4679 no_more_source:
065e3595 4680 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
89528eb3 4681 {
ff0dacd7 4682 detect_info->rejected |= CATEGORY_MASK_BIG5;
89528eb3
KH
4683 return 0;
4684 }
ff0dacd7
KH
4685 detect_info->found |= found;
4686 return 1;
fa42c37f
KH
4687}
4688
4ed46869
KH
4689/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4690 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
fa42c37f 4691
b73bfc1c 4692static void
971de7fb 4693decode_coding_sjis (struct coding_system *coding)
4ed46869 4694{
8f924df7
KH
4695 const unsigned char *src = coding->source + coding->consumed;
4696 const unsigned char *src_end = coding->source + coding->src_bytes;
4697 const unsigned char *src_base;
69a80ea3 4698 int *charbuf = coding->charbuf + coding->charbuf_used;
ad1746f5 4699 /* We may produce one charset annotation in one loop and one more at
df80c7f0 4700 the end. */
69a80ea3 4701 int *charbuf_end
df80c7f0 4702 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
df7492f9
KH
4703 int consumed_chars = 0, consumed_chars_base;
4704 int multibytep = coding->src_multibyte;
4705 struct charset *charset_roman, *charset_kanji, *charset_kana;
57a47f8a 4706 struct charset *charset_kanji2;
24a73b0a 4707 Lisp_Object attrs, charset_list, val;
ff0dacd7
KH
4708 int char_offset = coding->produced_char;
4709 int last_offset = char_offset;
4710 int last_id = charset_ascii;
0a9564cb
EZ
4711 int eol_crlf =
4712 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 4713 int byte_after_cr = -1;
a5d301df 4714
24a73b0a 4715 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4716
4717 val = charset_list;
4718 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
89528eb3 4719 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
57a47f8a
KH
4720 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4721 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
fa42c37f 4722
b73bfc1c 4723 while (1)
4ed46869 4724 {
df7492f9 4725 int c, c1;
24a73b0a 4726 struct charset *charset;
fa42c37f 4727
b73bfc1c 4728 src_base = src;
df7492f9 4729 consumed_chars_base = consumed_chars;
fa42c37f 4730
df7492f9 4731 if (charbuf >= charbuf_end)
b71f6f73
KH
4732 {
4733 if (byte_after_cr >= 0)
4734 src_base--;
4735 break;
4736 }
df7492f9 4737
119852e7
KH
4738 if (byte_after_cr >= 0)
4739 c = byte_after_cr, byte_after_cr = -1;
4740 else
4741 ONE_MORE_BYTE (c);
065e3595
KH
4742 if (c < 0)
4743 goto invalid_code;
24a73b0a 4744 if (c < 0x80)
119852e7
KH
4745 {
4746 if (eol_crlf && c == '\r')
4747 ONE_MORE_BYTE (byte_after_cr);
4748 charset = charset_roman;
4749 }
57a47f8a 4750 else if (c == 0x80 || c == 0xA0)
8e921c4b 4751 goto invalid_code;
57a47f8a
KH
4752 else if (c >= 0xA1 && c <= 0xDF)
4753 {
4754 /* SJIS -> JISX0201-Kana */
4755 c &= 0x7F;
4756 charset = charset_kana;
4757 }
4758 else if (c <= 0xEF)
df7492f9 4759 {
57a47f8a
KH
4760 /* SJIS -> JISX0208 */
4761 ONE_MORE_BYTE (c1);
4762 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
24a73b0a 4763 goto invalid_code;
57a47f8a
KH
4764 c = (c << 8) | c1;
4765 SJIS_TO_JIS (c);
4766 charset = charset_kanji;
4767 }
4768 else if (c <= 0xFC && charset_kanji2)
4769 {
c6876370 4770 /* SJIS -> JISX0213-2 */
57a47f8a
KH
4771 ONE_MORE_BYTE (c1);
4772 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
24a73b0a 4773 goto invalid_code;
57a47f8a
KH
4774 c = (c << 8) | c1;
4775 SJIS_TO_JIS2 (c);
4776 charset = charset_kanji2;
df7492f9 4777 }
57a47f8a
KH
4778 else
4779 goto invalid_code;
24a73b0a
KH
4780 if (charset->id != charset_ascii
4781 && last_id != charset->id)
4782 {
4783 if (last_id != charset_ascii)
69a80ea3 4784 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
24a73b0a
KH
4785 last_id = charset->id;
4786 last_offset = char_offset;
4787 }
4788 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
df7492f9 4789 *charbuf++ = c;
ff0dacd7 4790 char_offset++;
df7492f9 4791 continue;
b73bfc1c 4792
df7492f9
KH
4793 invalid_code:
4794 src = src_base;
4795 consumed_chars = consumed_chars_base;
4796 ONE_MORE_BYTE (c);
065e3595 4797 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
ff0dacd7 4798 char_offset++;
df7492f9
KH
4799 coding->errors++;
4800 }
fa42c37f 4801
df7492f9 4802 no_more_source:
ff0dacd7 4803 if (last_id != charset_ascii)
69a80ea3 4804 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
4805 coding->consumed_char += consumed_chars_base;
4806 coding->consumed = src_base - coding->source;
4807 coding->charbuf_used = charbuf - coding->charbuf;
fa42c37f
KH
4808}
4809
b73bfc1c 4810static void
971de7fb 4811decode_coding_big5 (struct coding_system *coding)
4ed46869 4812{
8f924df7
KH
4813 const unsigned char *src = coding->source + coding->consumed;
4814 const unsigned char *src_end = coding->source + coding->src_bytes;
4815 const unsigned char *src_base;
69a80ea3 4816 int *charbuf = coding->charbuf + coding->charbuf_used;
ad1746f5 4817 /* We may produce one charset annotation in one loop and one more at
df80c7f0 4818 the end. */
69a80ea3 4819 int *charbuf_end
df80c7f0 4820 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
df7492f9
KH
4821 int consumed_chars = 0, consumed_chars_base;
4822 int multibytep = coding->src_multibyte;
4823 struct charset *charset_roman, *charset_big5;
24a73b0a 4824 Lisp_Object attrs, charset_list, val;
ff0dacd7
KH
4825 int char_offset = coding->produced_char;
4826 int last_offset = char_offset;
4827 int last_id = charset_ascii;
0a9564cb
EZ
4828 int eol_crlf =
4829 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 4830 int byte_after_cr = -1;
df7492f9 4831
24a73b0a 4832 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4833 val = charset_list;
4834 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4835 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4ed46869 4836
b73bfc1c 4837 while (1)
4ed46869 4838 {
df7492f9 4839 int c, c1;
24a73b0a 4840 struct charset *charset;
b73bfc1c
KH
4841
4842 src_base = src;
df7492f9
KH
4843 consumed_chars_base = consumed_chars;
4844
4845 if (charbuf >= charbuf_end)
b71f6f73
KH
4846 {
4847 if (byte_after_cr >= 0)
4848 src_base--;
4849 break;
4850 }
df7492f9 4851
119852e7 4852 if (byte_after_cr >= 0)
14daee73 4853 c = byte_after_cr, byte_after_cr = -1;
119852e7
KH
4854 else
4855 ONE_MORE_BYTE (c);
b73bfc1c 4856
065e3595
KH
4857 if (c < 0)
4858 goto invalid_code;
24a73b0a 4859 if (c < 0x80)
119852e7 4860 {
14daee73 4861 if (eol_crlf && c == '\r')
119852e7
KH
4862 ONE_MORE_BYTE (byte_after_cr);
4863 charset = charset_roman;
4864 }
24a73b0a 4865 else
4ed46869 4866 {
24a73b0a
KH
4867 /* BIG5 -> Big5 */
4868 if (c < 0xA1 || c > 0xFE)
4869 goto invalid_code;
4870 ONE_MORE_BYTE (c1);
4871 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4872 goto invalid_code;
4873 c = c << 8 | c1;
4874 charset = charset_big5;
4ed46869 4875 }
24a73b0a
KH
4876 if (charset->id != charset_ascii
4877 && last_id != charset->id)
df7492f9 4878 {
24a73b0a 4879 if (last_id != charset_ascii)
69a80ea3 4880 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
24a73b0a
KH
4881 last_id = charset->id;
4882 last_offset = char_offset;
4ed46869 4883 }
24a73b0a 4884 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
df7492f9 4885 *charbuf++ = c;
ff0dacd7 4886 char_offset++;
fb88bf2d
KH
4887 continue;
4888
df7492f9 4889 invalid_code:
4ed46869 4890 src = src_base;
df7492f9
KH
4891 consumed_chars = consumed_chars_base;
4892 ONE_MORE_BYTE (c);
065e3595 4893 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
ff0dacd7 4894 char_offset++;
df7492f9 4895 coding->errors++;
fb88bf2d 4896 }
d46c5b12 4897
df7492f9 4898 no_more_source:
ff0dacd7 4899 if (last_id != charset_ascii)
69a80ea3 4900 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
4901 coding->consumed_char += consumed_chars_base;
4902 coding->consumed = src_base - coding->source;
4903 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
4904}
4905
4906/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
b73bfc1c
KH
4907 This function can encode charsets `ascii', `katakana-jisx0201',
4908 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4909 are sure that all these charsets are registered as official charset
4ed46869
KH
4910 (i.e. do not have extended leading-codes). Characters of other
4911 charsets are produced without any encoding. If SJIS_P is 1, encode
4912 SJIS text, else encode BIG5 text. */
4913
df7492f9 4914static int
971de7fb 4915encode_coding_sjis (struct coding_system *coding)
4ed46869 4916{
df7492f9
KH
4917 int multibytep = coding->dst_multibyte;
4918 int *charbuf = coding->charbuf;
4919 int *charbuf_end = charbuf + coding->charbuf_used;
4920 unsigned char *dst = coding->destination + coding->produced;
4921 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4922 int safe_room = 4;
4923 int produced_chars = 0;
24a73b0a 4924 Lisp_Object attrs, charset_list, val;
df7492f9
KH
4925 int ascii_compatible;
4926 struct charset *charset_roman, *charset_kanji, *charset_kana;
57a47f8a 4927 struct charset *charset_kanji2;
df7492f9 4928 int c;
a5d301df 4929
24a73b0a 4930 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4931 val = charset_list;
4932 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4933 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
57a47f8a
KH
4934 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4935 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4ed46869 4936
df7492f9 4937 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
93dec019 4938
df7492f9
KH
4939 while (charbuf < charbuf_end)
4940 {
4941 ASSURE_DESTINATION (safe_room);
4942 c = *charbuf++;
b73bfc1c 4943 /* Now encode the character C. */
df7492f9
KH
4944 if (ASCII_CHAR_P (c) && ascii_compatible)
4945 EMIT_ONE_ASCII_BYTE (c);
16eafb5d
KH
4946 else if (CHAR_BYTE8_P (c))
4947 {
4948 c = CHAR_TO_BYTE8 (c);
4949 EMIT_ONE_BYTE (c);
4950 }
df7492f9 4951 else
b73bfc1c 4952 {
df7492f9
KH
4953 unsigned code;
4954 struct charset *charset = char_charset (c, charset_list, &code);
4955
4956 if (!charset)
4ed46869 4957 {
41cbe562 4958 if (coding->mode & CODING_MODE_SAFE_ENCODING)
b73bfc1c 4959 {
41cbe562
KH
4960 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4961 charset = CHARSET_FROM_ID (charset_ascii);
b73bfc1c 4962 }
41cbe562 4963 else
b73bfc1c 4964 {
41cbe562
KH
4965 c = coding->default_char;
4966 charset = char_charset (c, charset_list, &code);
b73bfc1c 4967 }
b73bfc1c 4968 }
df7492f9
KH
4969 if (code == CHARSET_INVALID_CODE (charset))
4970 abort ();
4971 if (charset == charset_kanji)
4972 {
4973 int c1, c2;
4974 JIS_TO_SJIS (code);
4975 c1 = code >> 8, c2 = code & 0xFF;
4976 EMIT_TWO_BYTES (c1, c2);
4977 }
4978 else if (charset == charset_kana)
4979 EMIT_ONE_BYTE (code | 0x80);
57a47f8a
KH
4980 else if (charset_kanji2 && charset == charset_kanji2)
4981 {
4982 int c1, c2;
4983
4984 c1 = code >> 8;
f07190ca
KH
4985 if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4986 || c1 == 0x28
57a47f8a
KH
4987 || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4988 {
4989 JIS_TO_SJIS2 (code);
4990 c1 = code >> 8, c2 = code & 0xFF;
4991 EMIT_TWO_BYTES (c1, c2);
4992 }
4993 else
4994 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4995 }
df7492f9
KH
4996 else
4997 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4998 }
4999 }
065e3595 5000 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
5001 coding->produced_char += produced_chars;
5002 coding->produced = dst - coding->destination;
5003 return 0;
5004}
5005
5006static int
971de7fb 5007encode_coding_big5 (struct coding_system *coding)
df7492f9
KH
5008{
5009 int multibytep = coding->dst_multibyte;
5010 int *charbuf = coding->charbuf;
5011 int *charbuf_end = charbuf + coding->charbuf_used;
5012 unsigned char *dst = coding->destination + coding->produced;
5013 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5014 int safe_room = 4;
5015 int produced_chars = 0;
24a73b0a 5016 Lisp_Object attrs, charset_list, val;
df7492f9
KH
5017 int ascii_compatible;
5018 struct charset *charset_roman, *charset_big5;
5019 int c;
5020
24a73b0a 5021 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
5022 val = charset_list;
5023 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5024 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5025 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5026
5027 while (charbuf < charbuf_end)
5028 {
5029 ASSURE_DESTINATION (safe_room);
5030 c = *charbuf++;
5031 /* Now encode the character C. */
5032 if (ASCII_CHAR_P (c) && ascii_compatible)
5033 EMIT_ONE_ASCII_BYTE (c);
16eafb5d
KH
5034 else if (CHAR_BYTE8_P (c))
5035 {
5036 c = CHAR_TO_BYTE8 (c);
5037 EMIT_ONE_BYTE (c);
b73bfc1c
KH
5038 }
5039 else
5040 {
df7492f9
KH
5041 unsigned code;
5042 struct charset *charset = char_charset (c, charset_list, &code);
5043
5044 if (! charset)
b73bfc1c 5045 {
41cbe562 5046 if (coding->mode & CODING_MODE_SAFE_ENCODING)
b73bfc1c 5047 {
41cbe562
KH
5048 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5049 charset = CHARSET_FROM_ID (charset_ascii);
b73bfc1c 5050 }
41cbe562 5051 else
0eecad43 5052 {
41cbe562
KH
5053 c = coding->default_char;
5054 charset = char_charset (c, charset_list, &code);
0eecad43 5055 }
4ed46869 5056 }
df7492f9
KH
5057 if (code == CHARSET_INVALID_CODE (charset))
5058 abort ();
5059 if (charset == charset_big5)
b73bfc1c 5060 {
df7492f9
KH
5061 int c1, c2;
5062
5063 c1 = code >> 8, c2 = code & 0xFF;
5064 EMIT_TWO_BYTES (c1, c2);
b73bfc1c 5065 }
df7492f9
KH
5066 else
5067 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4ed46869 5068 }
4ed46869 5069 }
065e3595 5070 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
5071 coding->produced_char += produced_chars;
5072 coding->produced = dst - coding->destination;
5073 return 0;
4ed46869
KH
5074}
5075
5076\f
df7492f9 5077/*** 10. CCL handlers ***/
1397dc18
KH
5078
5079/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5080 Check if a text is encoded in a coding system of which
5081 encoder/decoder are written in CCL program. If it is, return
df7492f9 5082 CATEGORY_MASK_CCL, else return 0. */
1397dc18 5083
0a28aafb 5084static int
cf84bb53
JB
5085detect_coding_ccl (struct coding_system *coding,
5086 struct coding_detection_info *detect_info)
1397dc18 5087{
065e3595 5088 const unsigned char *src = coding->source, *src_base;
8f924df7 5089 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
5090 int multibytep = coding->src_multibyte;
5091 int consumed_chars = 0;
5092 int found = 0;
0e219d54 5093 unsigned char *valids;
df7492f9
KH
5094 int head_ascii = coding->head_ascii;
5095 Lisp_Object attrs;
5096
ff0dacd7
KH
5097 detect_info->checked |= CATEGORY_MASK_CCL;
5098
df7492f9 5099 coding = &coding_categories[coding_category_ccl];
0e219d54 5100 valids = CODING_CCL_VALIDS (coding);
df7492f9
KH
5101 attrs = CODING_ID_ATTRS (coding->id);
5102 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5103 src += head_ascii;
1397dc18 5104
b73bfc1c 5105 while (1)
1397dc18 5106 {
df7492f9 5107 int c;
065e3595
KH
5108
5109 src_base = src;
df7492f9 5110 ONE_MORE_BYTE (c);
065e3595 5111 if (c < 0 || ! valids[c])
df7492f9 5112 break;
ff0dacd7
KH
5113 if ((valids[c] > 1))
5114 found = CATEGORY_MASK_CCL;
df7492f9 5115 }
ff0dacd7 5116 detect_info->rejected |= CATEGORY_MASK_CCL;
df7492f9
KH
5117 return 0;
5118
5119 no_more_source:
ff0dacd7
KH
5120 detect_info->found |= found;
5121 return 1;
df7492f9
KH
5122}
5123
5124static void
971de7fb 5125decode_coding_ccl (struct coding_system *coding)
df7492f9 5126{
7c78e542 5127 const unsigned char *src = coding->source + coding->consumed;
8f924df7 5128 const unsigned char *src_end = coding->source + coding->src_bytes;
69a80ea3
KH
5129 int *charbuf = coding->charbuf + coding->charbuf_used;
5130 int *charbuf_end = coding->charbuf + coding->charbuf_size;
df7492f9
KH
5131 int consumed_chars = 0;
5132 int multibytep = coding->src_multibyte;
d0396581 5133 struct ccl_program *ccl = &coding->spec.ccl->ccl;
df7492f9 5134 int source_charbuf[1024];
fbdc1721 5135 int source_byteidx[1025];
24a73b0a 5136 Lisp_Object attrs, charset_list;
df7492f9 5137
24a73b0a 5138 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9 5139
d0396581 5140 while (1)
df7492f9 5141 {
7c78e542 5142 const unsigned char *p = src;
df7492f9
KH
5143 int i = 0;
5144
5145 if (multibytep)
fbdc1721
KH
5146 {
5147 while (i < 1024 && p < src_end)
5148 {
5149 source_byteidx[i] = p - src;
5150 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5151 }
5152 source_byteidx[i] = p - src;
5153 }
df7492f9
KH
5154 else
5155 while (i < 1024 && p < src_end)
5156 source_charbuf[i++] = *p++;
8f924df7 5157
df7492f9 5158 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
d0396581
KH
5159 ccl->last_block = 1;
5160 ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5161 charset_list);
5162 charbuf += ccl->produced;
fbdc1721 5163 if (multibytep)
d0396581 5164 src += source_byteidx[ccl->consumed];
df7492f9 5165 else
d0396581
KH
5166 src += ccl->consumed;
5167 consumed_chars += ccl->consumed;
5168 if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
df7492f9
KH
5169 break;
5170 }
5171
d0396581 5172 switch (ccl->status)
df7492f9
KH
5173 {
5174 case CCL_STAT_SUSPEND_BY_SRC:
065e3595 5175 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
df7492f9
KH
5176 break;
5177 case CCL_STAT_SUSPEND_BY_DST:
d0396581 5178 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
df7492f9
KH
5179 break;
5180 case CCL_STAT_QUIT:
5181 case CCL_STAT_INVALID_CMD:
065e3595 5182 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
df7492f9
KH
5183 break;
5184 default:
065e3595 5185 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
5186 break;
5187 }
5188 coding->consumed_char += consumed_chars;
5189 coding->consumed = src - coding->source;
5190 coding->charbuf_used = charbuf - coding->charbuf;
5191}
5192
5193static int
971de7fb 5194encode_coding_ccl (struct coding_system *coding)
df7492f9 5195{
fb608df3 5196 struct ccl_program *ccl = &coding->spec.ccl->ccl;
df7492f9
KH
5197 int multibytep = coding->dst_multibyte;
5198 int *charbuf = coding->charbuf;
5199 int *charbuf_end = charbuf + coding->charbuf_used;
5200 unsigned char *dst = coding->destination + coding->produced;
5201 unsigned char *dst_end = coding->destination + coding->dst_bytes;
df7492f9
KH
5202 int destination_charbuf[1024];
5203 int i, produced_chars = 0;
24a73b0a 5204 Lisp_Object attrs, charset_list;
df7492f9 5205
24a73b0a 5206 CODING_GET_INFO (coding, attrs, charset_list);
fb608df3
KH
5207 if (coding->consumed_char == coding->src_chars
5208 && coding->mode & CODING_MODE_LAST_BLOCK)
5209 ccl->last_block = 1;
df7492f9 5210
8cffd3e7 5211 while (charbuf < charbuf_end)
df7492f9 5212 {
fb608df3 5213 ccl_driver (ccl, charbuf, destination_charbuf,
8cffd3e7 5214 charbuf_end - charbuf, 1024, charset_list);
df7492f9 5215 if (multibytep)
8cffd3e7 5216 {
fb608df3
KH
5217 ASSURE_DESTINATION (ccl->produced * 2);
5218 for (i = 0; i < ccl->produced; i++)
8cffd3e7
KH
5219 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5220 }
df7492f9
KH
5221 else
5222 {
fb608df3
KH
5223 ASSURE_DESTINATION (ccl->produced);
5224 for (i = 0; i < ccl->produced; i++)
df7492f9 5225 *dst++ = destination_charbuf[i] & 0xFF;
fb608df3 5226 produced_chars += ccl->produced;
df7492f9 5227 }
fb608df3
KH
5228 charbuf += ccl->consumed;
5229 if (ccl->status == CCL_STAT_QUIT
5230 || ccl->status == CCL_STAT_INVALID_CMD)
8cffd3e7 5231 break;
df7492f9
KH
5232 }
5233
fb608df3 5234 switch (ccl->status)
df7492f9
KH
5235 {
5236 case CCL_STAT_SUSPEND_BY_SRC:
065e3595 5237 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
df7492f9
KH
5238 break;
5239 case CCL_STAT_SUSPEND_BY_DST:
065e3595 5240 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
df7492f9
KH
5241 break;
5242 case CCL_STAT_QUIT:
5243 case CCL_STAT_INVALID_CMD:
065e3595 5244 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
df7492f9
KH
5245 break;
5246 default:
065e3595 5247 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9 5248 break;
1397dc18 5249 }
df7492f9
KH
5250
5251 coding->produced_char += produced_chars;
5252 coding->produced = dst - coding->destination;
5253 return 0;
1397dc18
KH
5254}
5255
df7492f9 5256
1397dc18 5257\f
df7492f9 5258/*** 10, 11. no-conversion handlers ***/
4ed46869 5259
b73bfc1c 5260/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4ed46869 5261
b73bfc1c 5262static void
971de7fb 5263decode_coding_raw_text (struct coding_system *coding)
4ed46869 5264{
0a9564cb
EZ
5265 int eol_crlf =
5266 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 5267
df7492f9 5268 coding->chars_at_source = 1;
119852e7
KH
5269 coding->consumed_char = coding->src_chars;
5270 coding->consumed = coding->src_bytes;
5271 if (eol_crlf && coding->source[coding->src_bytes - 1] == '\r')
5272 {
5273 coding->consumed_char--;
5274 coding->consumed--;
5275 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5276 }
5277 else
5278 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9 5279}
4ed46869 5280
df7492f9 5281static int
971de7fb 5282encode_coding_raw_text (struct coding_system *coding)
df7492f9
KH
5283{
5284 int multibytep = coding->dst_multibyte;
5285 int *charbuf = coding->charbuf;
5286 int *charbuf_end = coding->charbuf + coding->charbuf_used;
5287 unsigned char *dst = coding->destination + coding->produced;
5288 unsigned char *dst_end = coding->destination + coding->dst_bytes;
a0ed9b27 5289 int produced_chars = 0;
b73bfc1c
KH
5290 int c;
5291
df7492f9 5292 if (multibytep)
b73bfc1c 5293 {
df7492f9 5294 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4ed46869 5295
df7492f9
KH
5296 if (coding->src_multibyte)
5297 while (charbuf < charbuf_end)
5298 {
5299 ASSURE_DESTINATION (safe_room);
5300 c = *charbuf++;
5301 if (ASCII_CHAR_P (c))
5302 EMIT_ONE_ASCII_BYTE (c);
5303 else if (CHAR_BYTE8_P (c))
5304 {
5305 c = CHAR_TO_BYTE8 (c);
5306 EMIT_ONE_BYTE (c);
5307 }
5308 else
5309 {
5310 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
93dec019 5311
df7492f9
KH
5312 CHAR_STRING_ADVANCE (c, p1);
5313 while (p0 < p1)
9d123124
KH
5314 {
5315 EMIT_ONE_BYTE (*p0);
5316 p0++;
5317 }
df7492f9
KH
5318 }
5319 }
b73bfc1c 5320 else
df7492f9
KH
5321 while (charbuf < charbuf_end)
5322 {
5323 ASSURE_DESTINATION (safe_room);
5324 c = *charbuf++;
5325 EMIT_ONE_BYTE (c);
5326 }
5327 }
5328 else
4ed46869 5329 {
df7492f9 5330 if (coding->src_multibyte)
d46c5b12 5331 {
df7492f9
KH
5332 int safe_room = MAX_MULTIBYTE_LENGTH;
5333
5334 while (charbuf < charbuf_end)
d46c5b12 5335 {
df7492f9
KH
5336 ASSURE_DESTINATION (safe_room);
5337 c = *charbuf++;
5338 if (ASCII_CHAR_P (c))
5339 *dst++ = c;
5340 else if (CHAR_BYTE8_P (c))
5341 *dst++ = CHAR_TO_BYTE8 (c);
b73bfc1c 5342 else
df7492f9 5343 CHAR_STRING_ADVANCE (c, dst);
d46c5b12
KH
5344 }
5345 }
df7492f9
KH
5346 else
5347 {
5348 ASSURE_DESTINATION (charbuf_end - charbuf);
5349 while (charbuf < charbuf_end && dst < dst_end)
5350 *dst++ = *charbuf++;
8f924df7 5351 }
319a3947 5352 produced_chars = dst - (coding->destination + coding->produced);
4ed46869 5353 }
065e3595 5354 record_conversion_result (coding, CODING_RESULT_SUCCESS);
a0ed9b27 5355 coding->produced_char += produced_chars;
df7492f9
KH
5356 coding->produced = dst - coding->destination;
5357 return 0;
4ed46869
KH
5358}
5359
ff0dacd7
KH
5360/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5361 Check if a text is encoded in a charset-based coding system. If it
5362 is, return 1, else return 0. */
5363
0a28aafb 5364static int
cf84bb53
JB
5365detect_coding_charset (struct coding_system *coding,
5366 struct coding_detection_info *detect_info)
1397dc18 5367{
065e3595 5368 const unsigned char *src = coding->source, *src_base;
8f924df7 5369 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
5370 int multibytep = coding->src_multibyte;
5371 int consumed_chars = 0;
07295713 5372 Lisp_Object attrs, valids, name;
584948ac 5373 int found = 0;
716b3fa0 5374 int head_ascii = coding->head_ascii;
07295713 5375 int check_latin_extra = 0;
1397dc18 5376
ff0dacd7
KH
5377 detect_info->checked |= CATEGORY_MASK_CHARSET;
5378
df7492f9
KH
5379 coding = &coding_categories[coding_category_charset];
5380 attrs = CODING_ID_ATTRS (coding->id);
5381 valids = AREF (attrs, coding_attr_charset_valids);
07295713 5382 name = CODING_ID_NAME (coding->id);
51b59d79 5383 if (strncmp (SSDATA (SYMBOL_NAME (name)),
237aabf4 5384 "iso-8859-", sizeof ("iso-8859-") - 1) == 0
51b59d79 5385 || strncmp (SSDATA (SYMBOL_NAME (name)),
237aabf4 5386 "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
07295713 5387 check_latin_extra = 1;
237aabf4 5388
df7492f9 5389 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
716b3fa0 5390 src += head_ascii;
1397dc18 5391
b73bfc1c 5392 while (1)
1397dc18 5393 {
df7492f9 5394 int c;
716b3fa0
KH
5395 Lisp_Object val;
5396 struct charset *charset;
5397 int dim, idx;
1397dc18 5398
065e3595 5399 src_base = src;
df7492f9 5400 ONE_MORE_BYTE (c);
065e3595
KH
5401 if (c < 0)
5402 continue;
716b3fa0
KH
5403 val = AREF (valids, c);
5404 if (NILP (val))
df7492f9 5405 break;
584948ac 5406 if (c >= 0x80)
07295713
KH
5407 {
5408 if (c < 0xA0
237aabf4
JR
5409 && check_latin_extra
5410 && (!VECTORP (Vlatin_extra_code_table)
9f0526cb 5411 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])))
07295713
KH
5412 break;
5413 found = CATEGORY_MASK_CHARSET;
5414 }
716b3fa0
KH
5415 if (INTEGERP (val))
5416 {
5417 charset = CHARSET_FROM_ID (XFASTINT (val));
5418 dim = CHARSET_DIMENSION (charset);
5419 for (idx = 1; idx < dim; idx++)
5420 {
5421 if (src == src_end)
5422 goto too_short;
5423 ONE_MORE_BYTE (c);
3ed051d4 5424 if (c < charset->code_space[(dim - 1 - idx) * 2]
716b3fa0
KH
5425 || c > charset->code_space[(dim - 1 - idx) * 2 + 1])
5426 break;
5427 }
5428 if (idx < dim)
5429 break;
5430 }
5431 else
5432 {
5433 idx = 1;
5434 for (; CONSP (val); val = XCDR (val))
5435 {
5436 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5437 dim = CHARSET_DIMENSION (charset);
5438 while (idx < dim)
5439 {
5440 if (src == src_end)
5441 goto too_short;
5442 ONE_MORE_BYTE (c);
5443 if (c < charset->code_space[(dim - 1 - idx) * 4]
5444 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5445 break;
5446 idx++;
5447 }
5448 if (idx == dim)
5449 {
5450 val = Qnil;
5451 break;
5452 }
5453 }
5454 if (CONSP (val))
5455 break;
5456 }
df7492f9 5457 }
716b3fa0 5458 too_short:
ff0dacd7 5459 detect_info->rejected |= CATEGORY_MASK_CHARSET;
df7492f9 5460 return 0;
4ed46869 5461
df7492f9 5462 no_more_source:
ff0dacd7
KH
5463 detect_info->found |= found;
5464 return 1;
df7492f9 5465}
b73bfc1c 5466
b73bfc1c 5467static void
971de7fb 5468decode_coding_charset (struct coding_system *coding)
4ed46869 5469{
8f924df7
KH
5470 const unsigned char *src = coding->source + coding->consumed;
5471 const unsigned char *src_end = coding->source + coding->src_bytes;
5472 const unsigned char *src_base;
69a80ea3 5473 int *charbuf = coding->charbuf + coding->charbuf_used;
ad1746f5 5474 /* We may produce one charset annotation in one loop and one more at
df80c7f0 5475 the end. */
69a80ea3 5476 int *charbuf_end
df80c7f0 5477 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
df7492f9
KH
5478 int consumed_chars = 0, consumed_chars_base;
5479 int multibytep = coding->src_multibyte;
24a73b0a 5480 Lisp_Object attrs, charset_list, valids;
ff0dacd7
KH
5481 int char_offset = coding->produced_char;
5482 int last_offset = char_offset;
5483 int last_id = charset_ascii;
0a9564cb
EZ
5484 int eol_crlf =
5485 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 5486 int byte_after_cr = -1;
df7492f9 5487
24a73b0a 5488 CODING_GET_INFO (coding, attrs, charset_list);
4eb6d3f1 5489 valids = AREF (attrs, coding_attr_charset_valids);
b73bfc1c 5490
df7492f9 5491 while (1)
4ed46869 5492 {
4eb6d3f1 5493 int c;
24a73b0a
KH
5494 Lisp_Object val;
5495 struct charset *charset;
5496 int dim;
5497 int len = 1;
5498 unsigned code;
df7492f9
KH
5499
5500 src_base = src;
5501 consumed_chars_base = consumed_chars;
b73bfc1c 5502
df7492f9 5503 if (charbuf >= charbuf_end)
b71f6f73
KH
5504 {
5505 if (byte_after_cr >= 0)
5506 src_base--;
5507 break;
5508 }
df7492f9 5509
119852e7
KH
5510 if (byte_after_cr >= 0)
5511 {
5512 c = byte_after_cr;
5513 byte_after_cr = -1;
5514 }
5515 else
5516 {
5517 ONE_MORE_BYTE (c);
5518 if (eol_crlf && c == '\r')
5519 ONE_MORE_BYTE (byte_after_cr);
5520 }
065e3595
KH
5521 if (c < 0)
5522 goto invalid_code;
24a73b0a
KH
5523 code = c;
5524
5525 val = AREF (valids, c);
1b17adfd 5526 if (! INTEGERP (val) && ! CONSP (val))
24a73b0a
KH
5527 goto invalid_code;
5528 if (INTEGERP (val))
d46c5b12 5529 {
24a73b0a
KH
5530 charset = CHARSET_FROM_ID (XFASTINT (val));
5531 dim = CHARSET_DIMENSION (charset);
5532 while (len < dim)
b73bfc1c 5533 {
24a73b0a
KH
5534 ONE_MORE_BYTE (c);
5535 code = (code << 8) | c;
5536 len++;
b73bfc1c 5537 }
24a73b0a
KH
5538 CODING_DECODE_CHAR (coding, src, src_base, src_end,
5539 charset, code, c);
d46c5b12 5540 }
df7492f9 5541 else
d46c5b12 5542 {
24a73b0a
KH
5543 /* VAL is a list of charset IDs. It is assured that the
5544 list is sorted by charset dimensions (smaller one
5545 comes first). */
5546 while (CONSP (val))
4eb6d3f1 5547 {
24a73b0a 5548 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
c7c66a95 5549 dim = CHARSET_DIMENSION (charset);
f9d71dcd 5550 while (len < dim)
4eb6d3f1 5551 {
acb2a965
KH
5552 ONE_MORE_BYTE (c);
5553 code = (code << 8) | c;
f9d71dcd 5554 len++;
4eb6d3f1 5555 }
24a73b0a
KH
5556 CODING_DECODE_CHAR (coding, src, src_base,
5557 src_end, charset, code, c);
5558 if (c >= 0)
5559 break;
5560 val = XCDR (val);
ff0dacd7 5561 }
d46c5b12 5562 }
24a73b0a
KH
5563 if (c < 0)
5564 goto invalid_code;
5565 if (charset->id != charset_ascii
5566 && last_id != charset->id)
5567 {
5568 if (last_id != charset_ascii)
69a80ea3 5569 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
24a73b0a
KH
5570 last_id = charset->id;
5571 last_offset = char_offset;
5572 }
5573
df7492f9 5574 *charbuf++ = c;
ff0dacd7 5575 char_offset++;
df7492f9
KH
5576 continue;
5577
5578 invalid_code:
5579 src = src_base;
5580 consumed_chars = consumed_chars_base;
5581 ONE_MORE_BYTE (c);
065e3595 5582 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
ff0dacd7 5583 char_offset++;
df7492f9 5584 coding->errors++;
4ed46869
KH
5585 }
5586
df7492f9 5587 no_more_source:
ff0dacd7 5588 if (last_id != charset_ascii)
69a80ea3 5589 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
5590 coding->consumed_char += consumed_chars_base;
5591 coding->consumed = src_base - coding->source;
5592 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
5593}
5594
df7492f9 5595static int
971de7fb 5596encode_coding_charset (struct coding_system *coding)
4ed46869 5597{
df7492f9
KH
5598 int multibytep = coding->dst_multibyte;
5599 int *charbuf = coding->charbuf;
5600 int *charbuf_end = charbuf + coding->charbuf_used;
5601 unsigned char *dst = coding->destination + coding->produced;
5602 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5603 int safe_room = MAX_MULTIBYTE_LENGTH;
5604 int produced_chars = 0;
24a73b0a 5605 Lisp_Object attrs, charset_list;
df7492f9 5606 int ascii_compatible;
b73bfc1c 5607 int c;
b73bfc1c 5608
24a73b0a 5609 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9 5610 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
fb88bf2d 5611
df7492f9 5612 while (charbuf < charbuf_end)
4ed46869 5613 {
4eb6d3f1 5614 struct charset *charset;
df7492f9 5615 unsigned code;
8f924df7 5616
df7492f9
KH
5617 ASSURE_DESTINATION (safe_room);
5618 c = *charbuf++;
5619 if (ascii_compatible && ASCII_CHAR_P (c))
5620 EMIT_ONE_ASCII_BYTE (c);
16eafb5d 5621 else if (CHAR_BYTE8_P (c))
4ed46869 5622 {
16eafb5d
KH
5623 c = CHAR_TO_BYTE8 (c);
5624 EMIT_ONE_BYTE (c);
d46c5b12 5625 }
d46c5b12 5626 else
b73bfc1c 5627 {
4eb6d3f1
KH
5628 charset = char_charset (c, charset_list, &code);
5629 if (charset)
5630 {
5631 if (CHARSET_DIMENSION (charset) == 1)
5632 EMIT_ONE_BYTE (code);
5633 else if (CHARSET_DIMENSION (charset) == 2)
5634 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5635 else if (CHARSET_DIMENSION (charset) == 3)
5636 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5637 else
5638 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5639 (code >> 8) & 0xFF, code & 0xFF);
5640 }
5641 else
41cbe562
KH
5642 {
5643 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5644 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5645 else
5646 c = coding->default_char;
5647 EMIT_ONE_BYTE (c);
5648 }
4ed46869 5649 }
4ed46869
KH
5650 }
5651
065e3595 5652 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
5653 coding->produced_char += produced_chars;
5654 coding->produced = dst - coding->destination;
5655 return 0;
4ed46869
KH
5656}
5657
5658\f
1397dc18 5659/*** 10. C library functions ***/
4ed46869 5660
df7492f9
KH
5661/* Setup coding context CODING from information about CODING_SYSTEM.
5662 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
5663 CODING_SYSTEM is invalid, signal an error. */
4ed46869 5664
ec6d2bb8 5665void
971de7fb 5666setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
4ed46869 5667{
df7492f9
KH
5668 Lisp_Object attrs;
5669 Lisp_Object eol_type;
5670 Lisp_Object coding_type;
4608c386 5671 Lisp_Object val;
4ed46869 5672
df7492f9 5673 if (NILP (coding_system))
ae6f73fa 5674 coding_system = Qundecided;
c07c8e12 5675
df7492f9 5676 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
1f5dbf34 5677
df7492f9 5678 attrs = CODING_ID_ATTRS (coding->id);
0a9564cb 5679 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
1f5dbf34 5680
df7492f9
KH
5681 coding->mode = 0;
5682 coding->head_ascii = -1;
4a015c45
KH
5683 if (VECTORP (eol_type))
5684 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5685 | CODING_REQUIRE_DETECTION_MASK);
5686 else if (! EQ (eol_type, Qunix))
5687 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5688 | CODING_REQUIRE_ENCODING_MASK);
5689 else
5690 coding->common_flags = 0;
5e5c78be
KH
5691 if (! NILP (CODING_ATTR_POST_READ (attrs)))
5692 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5693 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5694 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
8f924df7
KH
5695 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5696 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
1f5dbf34 5697
df7492f9 5698 val = CODING_ATTR_SAFE_CHARSETS (attrs);
8f924df7 5699 coding->max_charset_id = SCHARS (val) - 1;
1b3b981b 5700 coding->safe_charsets = SDATA (val);
df7492f9 5701 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
624bda09 5702 coding->carryover_bytes = 0;
4608c386 5703
df7492f9
KH
5704 coding_type = CODING_ATTR_TYPE (attrs);
5705 if (EQ (coding_type, Qundecided))
d46c5b12 5706 {
df7492f9
KH
5707 coding->detector = NULL;
5708 coding->decoder = decode_coding_raw_text;
5709 coding->encoder = encode_coding_raw_text;
5710 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
d46c5b12 5711 }
df7492f9 5712 else if (EQ (coding_type, Qiso_2022))
d46c5b12 5713 {
df7492f9
KH
5714 int i;
5715 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5716
5717 /* Invoke graphic register 0 to plane 0. */
5718 CODING_ISO_INVOCATION (coding, 0) = 0;
5719 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
5720 CODING_ISO_INVOCATION (coding, 1)
5721 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5722 /* Setup the initial status of designation. */
5723 for (i = 0; i < 4; i++)
5724 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5725 /* Not single shifting initially. */
5726 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5727 /* Beginning of buffer should also be regarded as bol. */
5728 CODING_ISO_BOL (coding) = 1;
5729 coding->detector = detect_coding_iso_2022;
5730 coding->decoder = decode_coding_iso_2022;
5731 coding->encoder = encode_coding_iso_2022;
5732 if (flags & CODING_ISO_FLAG_SAFE)
5733 coding->mode |= CODING_MODE_SAFE_ENCODING;
d46c5b12 5734 coding->common_flags
df7492f9
KH
5735 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5736 | CODING_REQUIRE_FLUSHING_MASK);
5737 if (flags & CODING_ISO_FLAG_COMPOSITION)
5738 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
ff0dacd7
KH
5739 if (flags & CODING_ISO_FLAG_DESIGNATION)
5740 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
df7492f9
KH
5741 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5742 {
5743 setup_iso_safe_charsets (attrs);
5744 val = CODING_ATTR_SAFE_CHARSETS (attrs);
8f924df7 5745 coding->max_charset_id = SCHARS (val) - 1;
1b3b981b 5746 coding->safe_charsets = SDATA (val);
df7492f9
KH
5747 }
5748 CODING_ISO_FLAGS (coding) = flags;
e951386e
KH
5749 CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5750 CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5751 CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5752 CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
d46c5b12 5753 }
df7492f9 5754 else if (EQ (coding_type, Qcharset))
d46c5b12 5755 {
df7492f9
KH
5756 coding->detector = detect_coding_charset;
5757 coding->decoder = decode_coding_charset;
5758 coding->encoder = encode_coding_charset;
d46c5b12 5759 coding->common_flags
df7492f9 5760 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
d46c5b12 5761 }
df7492f9 5762 else if (EQ (coding_type, Qutf_8))
d46c5b12 5763 {
a470d443
KH
5764 val = AREF (attrs, coding_attr_utf_bom);
5765 CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5766 : EQ (val, Qt) ? utf_with_bom
5767 : utf_without_bom);
df7492f9
KH
5768 coding->detector = detect_coding_utf_8;
5769 coding->decoder = decode_coding_utf_8;
5770 coding->encoder = encode_coding_utf_8;
5771 coding->common_flags
5772 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
a470d443
KH
5773 if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5774 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
df7492f9
KH
5775 }
5776 else if (EQ (coding_type, Qutf_16))
5777 {
a470d443
KH
5778 val = AREF (attrs, coding_attr_utf_bom);
5779 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5780 : EQ (val, Qt) ? utf_with_bom
5781 : utf_without_bom);
df7492f9 5782 val = AREF (attrs, coding_attr_utf_16_endian);
b49a1807 5783 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
df7492f9 5784 : utf_16_little_endian);
e19c3639 5785 CODING_UTF_16_SURROGATE (coding) = 0;
df7492f9
KH
5786 coding->detector = detect_coding_utf_16;
5787 coding->decoder = decode_coding_utf_16;
5788 coding->encoder = encode_coding_utf_16;
5789 coding->common_flags
5790 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
a470d443 5791 if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
b49a1807 5792 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
d46c5b12 5793 }
df7492f9 5794 else if (EQ (coding_type, Qccl))
4ed46869 5795 {
df7492f9
KH
5796 coding->detector = detect_coding_ccl;
5797 coding->decoder = decode_coding_ccl;
5798 coding->encoder = encode_coding_ccl;
c952af22 5799 coding->common_flags
df7492f9
KH
5800 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5801 | CODING_REQUIRE_FLUSHING_MASK);
5802 }
5803 else if (EQ (coding_type, Qemacs_mule))
5804 {
5805 coding->detector = detect_coding_emacs_mule;
5806 coding->decoder = decode_coding_emacs_mule;
5807 coding->encoder = encode_coding_emacs_mule;
c952af22 5808 coding->common_flags
df7492f9 5809 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
e951386e 5810 coding->spec.emacs_mule.full_support = 1;
df7492f9
KH
5811 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5812 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5813 {
5814 Lisp_Object tail, safe_charsets;
5815 int max_charset_id = 0;
5816
5817 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5818 tail = XCDR (tail))
5819 if (max_charset_id < XFASTINT (XCAR (tail)))
5820 max_charset_id = XFASTINT (XCAR (tail));
1b3b981b
AS
5821 safe_charsets = make_uninit_string (max_charset_id + 1);
5822 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
df7492f9
KH
5823 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5824 tail = XCDR (tail))
8f924df7 5825 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
df7492f9 5826 coding->max_charset_id = max_charset_id;
1b3b981b 5827 coding->safe_charsets = SDATA (safe_charsets);
e951386e 5828 coding->spec.emacs_mule.full_support = 1;
df7492f9 5829 }
e951386e
KH
5830 coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5831 coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
df7492f9
KH
5832 }
5833 else if (EQ (coding_type, Qshift_jis))
5834 {
5835 coding->detector = detect_coding_sjis;
5836 coding->decoder = decode_coding_sjis;
5837 coding->encoder = encode_coding_sjis;
c952af22 5838 coding->common_flags
df7492f9
KH
5839 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5840 }
5841 else if (EQ (coding_type, Qbig5))
5842 {
5843 coding->detector = detect_coding_big5;
5844 coding->decoder = decode_coding_big5;
5845 coding->encoder = encode_coding_big5;
c952af22 5846 coding->common_flags
df7492f9
KH
5847 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5848 }
5849 else /* EQ (coding_type, Qraw_text) */
ec6d2bb8 5850 {
df7492f9
KH
5851 coding->detector = NULL;
5852 coding->decoder = decode_coding_raw_text;
5853 coding->encoder = encode_coding_raw_text;
ea29edf2
KH
5854 if (! EQ (eol_type, Qunix))
5855 {
5856 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5857 if (! VECTORP (eol_type))
5858 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5859 }
5860
4ed46869 5861 }
4ed46869 5862
df7492f9 5863 return;
4ed46869
KH
5864}
5865
0ff61e78
KH
5866/* Return a list of charsets supported by CODING. */
5867
5868Lisp_Object
971de7fb 5869coding_charset_list (struct coding_system *coding)
0ff61e78 5870{
35befdaa 5871 Lisp_Object attrs, charset_list;
0ff61e78
KH
5872
5873 CODING_GET_INFO (coding, attrs, charset_list);
5874 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5875 {
5876 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5877
5878 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5879 charset_list = Viso_2022_charset_list;
5880 }
5881 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5882 {
5883 charset_list = Vemacs_mule_charset_list;
5884 }
5885 return charset_list;
5886}
5887
5888
e9f91ece
KH
5889/* Return a list of charsets supported by CODING-SYSTEM. */
5890
5891Lisp_Object
971de7fb 5892coding_system_charset_list (Lisp_Object coding_system)
e9f91ece
KH
5893{
5894 int id;
5895 Lisp_Object attrs, charset_list;
5896
5897 CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5898 attrs = CODING_ID_ATTRS (id);
5899
5900 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5901 {
5902 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5903
5904 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5905 charset_list = Viso_2022_charset_list;
5906 else
5907 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5908 }
5909 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5910 {
5911 charset_list = Vemacs_mule_charset_list;
5912 }
5913 else
5914 {
5915 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5916 }
5917 return charset_list;
5918}
5919
5920
df7492f9
KH
5921/* Return raw-text or one of its subsidiaries that has the same
5922 eol_type as CODING-SYSTEM. */
ec6d2bb8 5923
df7492f9 5924Lisp_Object
971de7fb 5925raw_text_coding_system (Lisp_Object coding_system)
ec6d2bb8 5926{
0be8721c 5927 Lisp_Object spec, attrs;
df7492f9 5928 Lisp_Object eol_type, raw_text_eol_type;
ec6d2bb8 5929
d3e4cb56
KH
5930 if (NILP (coding_system))
5931 return Qraw_text;
df7492f9
KH
5932 spec = CODING_SYSTEM_SPEC (coding_system);
5933 attrs = AREF (spec, 0);
ec6d2bb8 5934
df7492f9
KH
5935 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5936 return coding_system;
ec6d2bb8 5937
df7492f9
KH
5938 eol_type = AREF (spec, 2);
5939 if (VECTORP (eol_type))
5940 return Qraw_text;
5941 spec = CODING_SYSTEM_SPEC (Qraw_text);
5942 raw_text_eol_type = AREF (spec, 2);
5943 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5944 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5945 : AREF (raw_text_eol_type, 2));
ec6d2bb8
KH
5946}
5947
54f78171 5948
1911a33b
KH
5949/* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5950 the subsidiary that has the same eol-spec as PARENT (if it is not
5951 nil and specifies end-of-line format) or the system's setting
fcbcfb64 5952 (system_eol_type). */
df7492f9
KH
5953
5954Lisp_Object
971de7fb 5955coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
54f78171 5956{
3e139625 5957 Lisp_Object spec, eol_type;
54f78171 5958
d3e4cb56
KH
5959 if (NILP (coding_system))
5960 coding_system = Qraw_text;
df7492f9 5961 spec = CODING_SYSTEM_SPEC (coding_system);
df7492f9 5962 eol_type = AREF (spec, 2);
fcbcfb64 5963 if (VECTORP (eol_type))
df7492f9 5964 {
df7492f9
KH
5965 Lisp_Object parent_eol_type;
5966
fcbcfb64
KH
5967 if (! NILP (parent))
5968 {
5969 Lisp_Object parent_spec;
5970
4a015c45 5971 parent_spec = CODING_SYSTEM_SPEC (parent);
fcbcfb64 5972 parent_eol_type = AREF (parent_spec, 2);
1911a33b 5973 if (VECTORP (parent_eol_type))
4628bef1 5974 parent_eol_type = system_eol_type;
fcbcfb64
KH
5975 }
5976 else
5977 parent_eol_type = system_eol_type;
df7492f9
KH
5978 if (EQ (parent_eol_type, Qunix))
5979 coding_system = AREF (eol_type, 0);
5980 else if (EQ (parent_eol_type, Qdos))
5981 coding_system = AREF (eol_type, 1);
5982 else if (EQ (parent_eol_type, Qmac))
5983 coding_system = AREF (eol_type, 2);
54f78171 5984 }
df7492f9 5985 return coding_system;
54f78171
KH
5986}
5987
fcaf8878
KH
5988
5989/* Check if text-conversion and eol-conversion of CODING_SYSTEM are
5990 decided for writing to a process. If not, complement them, and
5991 return a new coding system. */
5992
5993Lisp_Object
4628bef1 5994complement_process_encoding_system (Lisp_Object coding_system)
fcaf8878 5995{
5886ec9c
KH
5996 Lisp_Object coding_base = Qnil, eol_base = Qnil;
5997 Lisp_Object spec, attrs;
93d50df8 5998 int i;
fcaf8878 5999
93d50df8 6000 for (i = 0; i < 3; i++)
fcaf8878 6001 {
93d50df8
KH
6002 if (i == 1)
6003 coding_system = CDR_SAFE (Vdefault_process_coding_system);
6004 else if (i == 2)
6005 coding_system = preferred_coding_system ();
6006 spec = CODING_SYSTEM_SPEC (coding_system);
6007 if (NILP (spec))
6008 continue;
6009 attrs = AREF (spec, 0);
6010 if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6011 coding_base = CODING_ATTR_BASE_NAME (attrs);
6012 if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6013 eol_base = coding_system;
6014 if (! NILP (coding_base) && ! NILP (eol_base))
6015 break;
fcaf8878 6016 }
fcaf8878 6017
93d50df8
KH
6018 if (i > 0)
6019 /* The original CODING_SYSTEM didn't specify text-conversion or
6020 eol-conversion. Be sure that we return a fully complemented
6021 coding system. */
6022 coding_system = coding_inherit_eol_type (coding_base, eol_base);
6023 return coding_system;
fcaf8878
KH
6024}
6025
6026
4ed46869
KH
6027/* Emacs has a mechanism to automatically detect a coding system if it
6028 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
6029 it's impossible to distinguish some coding systems accurately
6030 because they use the same range of codes. So, at first, coding
6031 systems are categorized as follows:
6032
0ef69138 6033 o coding-category-emacs-mule
4ed46869
KH
6034
6035 The category for a coding system which has the same code range
6036 as Emacs' internal format. Assigned the coding-system (Lisp
0ef69138 6037 symbol) `emacs-mule' by default.
4ed46869
KH
6038
6039 o coding-category-sjis
6040
6041 The category for a coding system which has the same code range
6042 as SJIS. Assigned the coding-system (Lisp
7717c392 6043 symbol) `japanese-shift-jis' by default.
4ed46869
KH
6044
6045 o coding-category-iso-7
6046
6047 The category for a coding system which has the same code range
7717c392 6048 as ISO2022 of 7-bit environment. This doesn't use any locking
d46c5b12
KH
6049 shift and single shift functions. This can encode/decode all
6050 charsets. Assigned the coding-system (Lisp symbol)
6051 `iso-2022-7bit' by default.
6052
6053 o coding-category-iso-7-tight
6054
6055 Same as coding-category-iso-7 except that this can
6056 encode/decode only the specified charsets.
4ed46869
KH
6057
6058 o coding-category-iso-8-1
6059
6060 The category for a coding system which has the same code range
6061 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
6062 for DIMENSION1 charset. This doesn't use any locking shift
6063 and single shift functions. Assigned the coding-system (Lisp
6064 symbol) `iso-latin-1' by default.
4ed46869
KH
6065
6066 o coding-category-iso-8-2
6067
6068 The category for a coding system which has the same code range
6069 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
6070 for DIMENSION2 charset. This doesn't use any locking shift
6071 and single shift functions. Assigned the coding-system (Lisp
6072 symbol) `japanese-iso-8bit' by default.
4ed46869 6073
7717c392 6074 o coding-category-iso-7-else
4ed46869
KH
6075
6076 The category for a coding system which has the same code range
ad1746f5 6077 as ISO2022 of 7-bit environment but uses locking shift or
7717c392
KH
6078 single shift functions. Assigned the coding-system (Lisp
6079 symbol) `iso-2022-7bit-lock' by default.
6080
6081 o coding-category-iso-8-else
6082
6083 The category for a coding system which has the same code range
ad1746f5 6084 as ISO2022 of 8-bit environment but uses locking shift or
7717c392
KH
6085 single shift functions. Assigned the coding-system (Lisp
6086 symbol) `iso-2022-8bit-ss2' by default.
4ed46869
KH
6087
6088 o coding-category-big5
6089
6090 The category for a coding system which has the same code range
6091 as BIG5. Assigned the coding-system (Lisp symbol)
e0e989f6 6092 `cn-big5' by default.
4ed46869 6093
fa42c37f
KH
6094 o coding-category-utf-8
6095
6096 The category for a coding system which has the same code range
6e76ae91 6097 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
fa42c37f
KH
6098 symbol) `utf-8' by default.
6099
6100 o coding-category-utf-16-be
6101
6102 The category for a coding system in which a text has an
6103 Unicode signature (cf. Unicode Standard) in the order of BIG
6104 endian at the head. Assigned the coding-system (Lisp symbol)
6105 `utf-16-be' by default.
6106
6107 o coding-category-utf-16-le
6108
6109 The category for a coding system in which a text has an
6110 Unicode signature (cf. Unicode Standard) in the order of
6111 LITTLE endian at the head. Assigned the coding-system (Lisp
6112 symbol) `utf-16-le' by default.
6113
1397dc18
KH
6114 o coding-category-ccl
6115
6116 The category for a coding system of which encoder/decoder is
6117 written in CCL programs. The default value is nil, i.e., no
6118 coding system is assigned.
6119
4ed46869
KH
6120 o coding-category-binary
6121
6122 The category for a coding system not categorized in any of the
6123 above. Assigned the coding-system (Lisp symbol)
e0e989f6 6124 `no-conversion' by default.
4ed46869
KH
6125
6126 Each of them is a Lisp symbol and the value is an actual
df7492f9 6127 `coding-system's (this is also a Lisp symbol) assigned by a user.
4ed46869
KH
6128 What Emacs does actually is to detect a category of coding system.
6129 Then, it uses a `coding-system' assigned to it. If Emacs can't
df7492f9 6130 decide only one possible category, it selects a category of the
4ed46869
KH
6131 highest priority. Priorities of categories are also specified by a
6132 user in a Lisp variable `coding-category-list'.
6133
6134*/
6135
df7492f9
KH
6136#define EOL_SEEN_NONE 0
6137#define EOL_SEEN_LF 1
6138#define EOL_SEEN_CR 2
6139#define EOL_SEEN_CRLF 4
66cfb530 6140
ff0dacd7
KH
6141/* Detect how end-of-line of a text of length SRC_BYTES pointed by
6142 SOURCE is encoded. If CATEGORY is one of
6143 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6144 two-byte, else they are encoded by one-byte.
6145
6146 Return one of EOL_SEEN_XXX. */
4ed46869 6147
bc4bc72a 6148#define MAX_EOL_CHECK_COUNT 3
d46c5b12
KH
6149
6150static int
cf84bb53
JB
6151detect_eol (const unsigned char *source, EMACS_INT src_bytes,
6152 enum coding_category category)
4ed46869 6153{
f6cbaf43 6154 const unsigned char *src = source, *src_end = src + src_bytes;
4ed46869 6155 unsigned char c;
df7492f9
KH
6156 int total = 0;
6157 int eol_seen = EOL_SEEN_NONE;
4ed46869 6158
89528eb3 6159 if ((1 << category) & CATEGORY_MASK_UTF_16)
4ed46869 6160 {
df7492f9 6161 int msb, lsb;
fa42c37f 6162
89528eb3
KH
6163 msb = category == (coding_category_utf_16_le
6164 | coding_category_utf_16_le_nosig);
df7492f9 6165 lsb = 1 - msb;
fa42c37f 6166
df7492f9 6167 while (src + 1 < src_end)
fa42c37f 6168 {
df7492f9
KH
6169 c = src[lsb];
6170 if (src[msb] == 0 && (c == '\n' || c == '\r'))
fa42c37f 6171 {
df7492f9
KH
6172 int this_eol;
6173
6174 if (c == '\n')
6175 this_eol = EOL_SEEN_LF;
6176 else if (src + 3 >= src_end
6177 || src[msb + 2] != 0
6178 || src[lsb + 2] != '\n')
6179 this_eol = EOL_SEEN_CR;
fa42c37f 6180 else
75f4f1ac
EZ
6181 {
6182 this_eol = EOL_SEEN_CRLF;
6183 src += 2;
6184 }
df7492f9
KH
6185
6186 if (eol_seen == EOL_SEEN_NONE)
6187 /* This is the first end-of-line. */
6188 eol_seen = this_eol;
6189 else if (eol_seen != this_eol)
fa42c37f 6190 {
75f4f1ac
EZ
6191 /* The found type is different from what found before.
6192 Allow for stray ^M characters in DOS EOL files. */
ef1b0ba7
SM
6193 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6194 || (eol_seen == EOL_SEEN_CRLF
6195 && this_eol == EOL_SEEN_CR))
75f4f1ac
EZ
6196 eol_seen = EOL_SEEN_CRLF;
6197 else
6198 {
6199 eol_seen = EOL_SEEN_LF;
6200 break;
6201 }
fa42c37f 6202 }
df7492f9
KH
6203 if (++total == MAX_EOL_CHECK_COUNT)
6204 break;
fa42c37f 6205 }
df7492f9 6206 src += 2;
fa42c37f 6207 }
bcf26d6a 6208 }
d46c5b12 6209 else
ef1b0ba7
SM
6210 while (src < src_end)
6211 {
6212 c = *src++;
6213 if (c == '\n' || c == '\r')
6214 {
6215 int this_eol;
d46c5b12 6216
ef1b0ba7
SM
6217 if (c == '\n')
6218 this_eol = EOL_SEEN_LF;
6219 else if (src >= src_end || *src != '\n')
6220 this_eol = EOL_SEEN_CR;
6221 else
6222 this_eol = EOL_SEEN_CRLF, src++;
d46c5b12 6223
ef1b0ba7
SM
6224 if (eol_seen == EOL_SEEN_NONE)
6225 /* This is the first end-of-line. */
6226 eol_seen = this_eol;
6227 else if (eol_seen != this_eol)
6228 {
6229 /* The found type is different from what found before.
6230 Allow for stray ^M characters in DOS EOL files. */
6231 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6232 || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6233 eol_seen = EOL_SEEN_CRLF;
6234 else
6235 {
6236 eol_seen = EOL_SEEN_LF;
6237 break;
6238 }
6239 }
6240 if (++total == MAX_EOL_CHECK_COUNT)
6241 break;
6242 }
6243 }
df7492f9 6244 return eol_seen;
73be902c
KH
6245}
6246
df7492f9 6247
24a73b0a 6248static Lisp_Object
971de7fb 6249adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
73be902c 6250{
0be8721c 6251 Lisp_Object eol_type;
8f924df7 6252
df7492f9
KH
6253 eol_type = CODING_ID_EOL_TYPE (coding->id);
6254 if (eol_seen & EOL_SEEN_LF)
24a73b0a
KH
6255 {
6256 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6257 eol_type = Qunix;
6258 }
6f197c07 6259 else if (eol_seen & EOL_SEEN_CRLF)
24a73b0a
KH
6260 {
6261 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6262 eol_type = Qdos;
6263 }
6f197c07 6264 else if (eol_seen & EOL_SEEN_CR)
24a73b0a
KH
6265 {
6266 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6267 eol_type = Qmac;
6268 }
6269 return eol_type;
d46c5b12 6270}
4ed46869 6271
df7492f9
KH
6272/* Detect how a text specified in CODING is encoded. If a coding
6273 system is detected, update fields of CODING by the detected coding
6274 system. */
0a28aafb 6275
df7492f9 6276void
971de7fb 6277detect_coding (struct coding_system *coding)
d46c5b12 6278{
8f924df7 6279 const unsigned char *src, *src_end;
73cce38d 6280 int saved_mode = coding->mode;
d46c5b12 6281
df7492f9
KH
6282 coding->consumed = coding->consumed_char = 0;
6283 coding->produced = coding->produced_char = 0;
6284 coding_set_source (coding);
1c3478b0 6285
df7492f9 6286 src_end = coding->source + coding->src_bytes;
c0e16b14 6287 coding->head_ascii = 0;
1c3478b0 6288
df7492f9
KH
6289 /* If we have not yet decided the text encoding type, detect it
6290 now. */
6291 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
b73bfc1c 6292 {
df7492f9 6293 int c, i;
6cb21a4f 6294 struct coding_detection_info detect_info;
2f3cbb32 6295 int null_byte_found = 0, eight_bit_found = 0;
df7492f9 6296
6cb21a4f 6297 detect_info.checked = detect_info.found = detect_info.rejected = 0;
2f3cbb32 6298 for (src = coding->source; src < src_end; src++)
d46c5b12 6299 {
df7492f9 6300 c = *src;
6cb21a4f 6301 if (c & 0x80)
6cb21a4f 6302 {
2f3cbb32 6303 eight_bit_found = 1;
2f3cbb32
KH
6304 if (null_byte_found)
6305 break;
6306 }
6307 else if (c < 0x20)
6308 {
6309 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6310 && ! inhibit_iso_escape_detection
6311 && ! detect_info.checked)
6cb21a4f 6312 {
2f3cbb32
KH
6313 if (detect_coding_iso_2022 (coding, &detect_info))
6314 {
6315 /* We have scanned the whole data. */
6316 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
c0e16b14
KH
6317 {
6318 /* We didn't find an 8-bit code. We may
6319 have found a null-byte, but it's very
ce5b453a 6320 rare that a binary file conforms to
c0e16b14
KH
6321 ISO-2022. */
6322 src = src_end;
6323 coding->head_ascii = src - coding->source;
6324 }
6325 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
2f3cbb32
KH
6326 break;
6327 }
6328 }
97b1b294 6329 else if (! c && !inhibit_null_byte_detection)
2f3cbb32
KH
6330 {
6331 null_byte_found = 1;
6332 if (eight_bit_found)
6333 break;
6cb21a4f 6334 }
c006c0c8
KH
6335 if (! eight_bit_found)
6336 coding->head_ascii++;
6cb21a4f 6337 }
c006c0c8 6338 else if (! eight_bit_found)
c0e16b14 6339 coding->head_ascii++;
d46c5b12 6340 }
df7492f9 6341
2f3cbb32
KH
6342 if (null_byte_found || eight_bit_found
6343 || coding->head_ascii < coding->src_bytes
6cb21a4f 6344 || detect_info.found)
d46c5b12 6345 {
ff0dacd7
KH
6346 enum coding_category category;
6347 struct coding_system *this;
df7492f9 6348
6cb21a4f
KH
6349 if (coding->head_ascii == coding->src_bytes)
6350 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
6351 for (i = 0; i < coding_category_raw_text; i++)
6352 {
6353 category = coding_priorities[i];
6354 this = coding_categories + category;
6355 if (detect_info.found & (1 << category))
24a73b0a 6356 break;
6cb21a4f
KH
6357 }
6358 else
2f3cbb32
KH
6359 {
6360 if (null_byte_found)
ff0dacd7 6361 {
2f3cbb32
KH
6362 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6363 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
ff0dacd7 6364 }
2f3cbb32
KH
6365 for (i = 0; i < coding_category_raw_text; i++)
6366 {
6367 category = coding_priorities[i];
6368 this = coding_categories + category;
6369 if (this->id < 0)
6370 {
6371 /* No coding system of this category is defined. */
6372 detect_info.rejected |= (1 << category);
6373 }
6374 else if (category >= coding_category_raw_text)
6375 continue;
6376 else if (detect_info.checked & (1 << category))
6377 {
6378 if (detect_info.found & (1 << category))
6379 break;
6380 }
6381 else if ((*(this->detector)) (coding, &detect_info)
6382 && detect_info.found & (1 << category))
6383 {
6384 if (category == coding_category_utf_16_auto)
6385 {
6386 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6387 category = coding_category_utf_16_le;
6388 else
6389 category = coding_category_utf_16_be;
6390 }
6391 break;
6392 }
6393 }
2f3cbb32 6394 }
c0e16b14
KH
6395
6396 if (i < coding_category_raw_text)
6397 setup_coding_system (CODING_ID_NAME (this->id), coding);
6398 else if (null_byte_found)
6399 setup_coding_system (Qno_conversion, coding);
6400 else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6401 == CATEGORY_MASK_ANY)
6402 setup_coding_system (Qraw_text, coding);
6403 else if (detect_info.rejected)
6404 for (i = 0; i < coding_category_raw_text; i++)
6405 if (! (detect_info.rejected & (1 << coding_priorities[i])))
6406 {
6407 this = coding_categories + coding_priorities[i];
6408 setup_coding_system (CODING_ID_NAME (this->id), coding);
6409 break;
6410 }
d46c5b12 6411 }
b73bfc1c 6412 }
a470d443
KH
6413 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6414 == coding_category_utf_8_auto)
6415 {
6416 Lisp_Object coding_systems;
6417 struct coding_detection_info detect_info;
6418
6419 coding_systems
6420 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6421 detect_info.found = detect_info.rejected = 0;
6422 coding->head_ascii = 0;
6423 if (CONSP (coding_systems)
6424 && detect_coding_utf_8 (coding, &detect_info))
6425 {
6426 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6427 setup_coding_system (XCAR (coding_systems), coding);
6428 else
6429 setup_coding_system (XCDR (coding_systems), coding);
6430 }
6431 }
24a73b0a
KH
6432 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6433 == coding_category_utf_16_auto)
b49a1807
KH
6434 {
6435 Lisp_Object coding_systems;
6436 struct coding_detection_info detect_info;
6437
6438 coding_systems
a470d443 6439 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
b49a1807 6440 detect_info.found = detect_info.rejected = 0;
a470d443 6441 coding->head_ascii = 0;
b49a1807 6442 if (CONSP (coding_systems)
24a73b0a 6443 && detect_coding_utf_16 (coding, &detect_info))
b49a1807
KH
6444 {
6445 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6446 setup_coding_system (XCAR (coding_systems), coding);
24a73b0a 6447 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
b49a1807
KH
6448 setup_coding_system (XCDR (coding_systems), coding);
6449 }
6450 }
73cce38d 6451 coding->mode = saved_mode;
4ed46869 6452}
4ed46869 6453
d46c5b12 6454
aaaf0b1e 6455static void
971de7fb 6456decode_eol (struct coding_system *coding)
aaaf0b1e 6457{
24a73b0a
KH
6458 Lisp_Object eol_type;
6459 unsigned char *p, *pbeg, *pend;
3ed051d4 6460
24a73b0a 6461 eol_type = CODING_ID_EOL_TYPE (coding->id);
0a9564cb 6462 if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
24a73b0a
KH
6463 return;
6464
6465 if (NILP (coding->dst_object))
6466 pbeg = coding->destination;
6467 else
6468 pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6469 pend = pbeg + coding->produced;
6470
6471 if (VECTORP (eol_type))
aaaf0b1e 6472 {
df7492f9 6473 int eol_seen = EOL_SEEN_NONE;
4ed46869 6474
24a73b0a 6475 for (p = pbeg; p < pend; p++)
aaaf0b1e 6476 {
df7492f9
KH
6477 if (*p == '\n')
6478 eol_seen |= EOL_SEEN_LF;
6479 else if (*p == '\r')
aaaf0b1e 6480 {
df7492f9 6481 if (p + 1 < pend && *(p + 1) == '\n')
aaaf0b1e 6482 {
df7492f9
KH
6483 eol_seen |= EOL_SEEN_CRLF;
6484 p++;
aaaf0b1e 6485 }
aaaf0b1e 6486 else
df7492f9 6487 eol_seen |= EOL_SEEN_CR;
aaaf0b1e 6488 }
aaaf0b1e 6489 }
75f4f1ac
EZ
6490 /* Handle DOS-style EOLs in a file with stray ^M characters. */
6491 if ((eol_seen & EOL_SEEN_CRLF) != 0
6492 && (eol_seen & EOL_SEEN_CR) != 0
6493 && (eol_seen & EOL_SEEN_LF) == 0)
6494 eol_seen = EOL_SEEN_CRLF;
6495 else if (eol_seen != EOL_SEEN_NONE
24a73b0a
KH
6496 && eol_seen != EOL_SEEN_LF
6497 && eol_seen != EOL_SEEN_CRLF
6498 && eol_seen != EOL_SEEN_CR)
6499 eol_seen = EOL_SEEN_LF;
df7492f9 6500 if (eol_seen != EOL_SEEN_NONE)
24a73b0a 6501 eol_type = adjust_coding_eol_type (coding, eol_seen);
aaaf0b1e 6502 }
d46c5b12 6503
24a73b0a 6504 if (EQ (eol_type, Qmac))
27901516 6505 {
24a73b0a 6506 for (p = pbeg; p < pend; p++)
df7492f9
KH
6507 if (*p == '\r')
6508 *p = '\n';
4ed46869 6509 }
24a73b0a 6510 else if (EQ (eol_type, Qdos))
df7492f9 6511 {
24a73b0a 6512 int n = 0;
b73bfc1c 6513
24a73b0a
KH
6514 if (NILP (coding->dst_object))
6515 {
4347441b
KH
6516 /* Start deleting '\r' from the tail to minimize the memory
6517 movement. */
24a73b0a
KH
6518 for (p = pend - 2; p >= pbeg; p--)
6519 if (*p == '\r')
6520 {
72af86bd 6521 memmove (p, p + 1, pend-- - p - 1);
24a73b0a
KH
6522 n++;
6523 }
6524 }
6525 else
6526 {
4347441b
KH
6527 int pos_byte = coding->dst_pos_byte;
6528 int pos = coding->dst_pos;
6529 int pos_end = pos + coding->produced_char - 1;
6530
6531 while (pos < pos_end)
6532 {
6533 p = BYTE_POS_ADDR (pos_byte);
6534 if (*p == '\r' && p[1] == '\n')
6535 {
6536 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6537 n++;
6538 pos_end--;
6539 }
6540 pos++;
69b8522d
KH
6541 if (coding->dst_multibyte)
6542 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6543 else
6544 pos_byte++;
4347441b 6545 }
24a73b0a
KH
6546 }
6547 coding->produced -= n;
6548 coding->produced_char -= n;
aaaf0b1e 6549 }
4ed46869
KH
6550}
6551
7d64c6ad 6552
a6f87d34
KH
6553/* Return a translation table (or list of them) from coding system
6554 attribute vector ATTRS for encoding (ENCODEP is nonzero) or
6555 decoding (ENCODEP is zero). */
7d64c6ad 6556
e6a54062 6557static Lisp_Object
971de7fb 6558get_translation_table (Lisp_Object attrs, int encodep, int *max_lookup)
7d64c6ad
KH
6559{
6560 Lisp_Object standard, translation_table;
09ee6fdd 6561 Lisp_Object val;
7d64c6ad 6562
4bed5909
CY
6563 if (NILP (Venable_character_translation))
6564 {
6565 if (max_lookup)
6566 *max_lookup = 0;
6567 return Qnil;
6568 }
7d64c6ad
KH
6569 if (encodep)
6570 translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6571 standard = Vstandard_translation_table_for_encode;
6572 else
6573 translation_table = CODING_ATTR_DECODE_TBL (attrs),
6574 standard = Vstandard_translation_table_for_decode;
7d64c6ad 6575 if (NILP (translation_table))
09ee6fdd
KH
6576 translation_table = standard;
6577 else
a6f87d34 6578 {
09ee6fdd
KH
6579 if (SYMBOLP (translation_table))
6580 translation_table = Fget (translation_table, Qtranslation_table);
6581 else if (CONSP (translation_table))
6582 {
6583 translation_table = Fcopy_sequence (translation_table);
6584 for (val = translation_table; CONSP (val); val = XCDR (val))
6585 if (SYMBOLP (XCAR (val)))
6586 XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6587 }
6588 if (CHAR_TABLE_P (standard))
6589 {
6590 if (CONSP (translation_table))
6591 translation_table = nconc2 (translation_table,
6592 Fcons (standard, Qnil));
6593 else
6594 translation_table = Fcons (translation_table,
6595 Fcons (standard, Qnil));
6596 }
a6f87d34 6597 }
2170c8f0
KH
6598
6599 if (max_lookup)
09ee6fdd 6600 {
2170c8f0
KH
6601 *max_lookup = 1;
6602 if (CHAR_TABLE_P (translation_table)
6603 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6604 {
6605 val = XCHAR_TABLE (translation_table)->extras[1];
6606 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6607 *max_lookup = XFASTINT (val);
6608 }
6609 else if (CONSP (translation_table))
6610 {
6611 Lisp_Object tail, val;
09ee6fdd 6612
2170c8f0
KH
6613 for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6614 if (CHAR_TABLE_P (XCAR (tail))
6615 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6616 {
6617 val = XCHAR_TABLE (XCAR (tail))->extras[1];
6618 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6619 *max_lookup = XFASTINT (val);
6620 }
6621 }
a6f87d34 6622 }
7d64c6ad
KH
6623 return translation_table;
6624}
6625
/* Look up character C in TABLE, which is a char-table or a list of
   char-tables, and set TRANS to the result.

   Whenever a table maps C to a character, C is replaced by that
   character and TRANS is reset to Qnil; with a list of tables the
   lookup then continues in the next table.  The first non-nil,
   non-character entry ends the lookup and is left in TRANS.  Both C
   and TRANS must be lvalues; TABLE and C are evaluated several
   times.  */

#define LOOKUP_TRANSLATION_TABLE(table, c, trans)                       \
  do {                                                                  \
    trans = Qnil;                                                       \
    if (CHAR_TABLE_P (table))                                           \
      {                                                                 \
        trans = CHAR_TABLE_REF (table, c);                              \
        if (CHARACTERP (trans))                                         \
          {                                                             \
            c = XFASTINT (trans);                                       \
            trans = Qnil;                                               \
          }                                                             \
      }                                                                 \
    else if (CONSP (table))                                             \
      {                                                                 \
        Lisp_Object tail;                                               \
                                                                        \
        for (tail = table; CONSP (tail); tail = XCDR (tail))            \
          {                                                             \
            if (! CHAR_TABLE_P (XCAR (tail)))                           \
              continue;                                                 \
            trans = CHAR_TABLE_REF (XCAR (tail), c);                    \
            if (CHARACTERP (trans))                                     \
              {                                                         \
                c = XFASTINT (trans);                                   \
                trans = Qnil;                                           \
              }                                                         \
            else if (! NILP (trans))                                    \
              break;                                                    \
          }                                                             \
      }                                                                 \
  } while (0)
6650
7d64c6ad 6651
e951386e
KH
6652/* Return a translation of character(s) at BUF according to TRANS.
6653 TRANS is TO-CHAR or ((FROM . TO) ...) where
6654 FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6655 The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6656 translation is found, and Qnil if not found..
6657 If BUF is too short to lookup characters in FROM, return Qt. */
6658
69a80ea3 6659static Lisp_Object
971de7fb 6660get_translation (Lisp_Object trans, int *buf, int *buf_end)
69a80ea3 6661{
e951386e
KH
6662
6663 if (INTEGERP (trans))
6664 return trans;
6665 for (; CONSP (trans); trans = XCDR (trans))
69a80ea3 6666 {
e951386e
KH
6667 Lisp_Object val = XCAR (trans);
6668 Lisp_Object from = XCAR (val);
6669 int len = ASIZE (from);
6670 int i;
69a80ea3 6671
e951386e 6672 for (i = 0; i < len; i++)
69a80ea3 6673 {
e951386e
KH
6674 if (buf + i == buf_end)
6675 return Qt;
6676 if (XINT (AREF (from, i)) != buf[i])
6677 break;
69a80ea3 6678 }
e951386e
KH
6679 if (i == len)
6680 return val;
69a80ea3 6681 }
e951386e 6682 return Qnil;
69a80ea3
KH
6683}
6684
6685
d46c5b12 6686static int
cf84bb53
JB
6687produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6688 int last_block)
4ed46869 6689{
df7492f9
KH
6690 unsigned char *dst = coding->destination + coding->produced;
6691 unsigned char *dst_end = coding->destination + coding->dst_bytes;
119852e7
KH
6692 EMACS_INT produced;
6693 EMACS_INT produced_chars = 0;
69a80ea3 6694 int carryover = 0;
4ed46869 6695
df7492f9 6696 if (! coding->chars_at_source)
4ed46869 6697 {
119852e7 6698 /* Source characters are in coding->charbuf. */
fba4576f
AS
6699 int *buf = coding->charbuf;
6700 int *buf_end = buf + coding->charbuf_used;
4ed46869 6701
db274c7a
KH
6702 if (EQ (coding->src_object, coding->dst_object))
6703 {
6704 coding_set_source (coding);
6705 dst_end = ((unsigned char *) coding->source) + coding->consumed;
6706 }
4ed46869 6707
df7492f9 6708 while (buf < buf_end)
4ed46869 6709 {
69a80ea3 6710 int c = *buf, i;
bc4bc72a 6711
df7492f9
KH
6712 if (c >= 0)
6713 {
69a80ea3
KH
6714 int from_nchars = 1, to_nchars = 1;
6715 Lisp_Object trans = Qnil;
6716
09ee6fdd 6717 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
e6a54062 6718 if (! NILP (trans))
69a80ea3 6719 {
e951386e
KH
6720 trans = get_translation (trans, buf, buf_end);
6721 if (INTEGERP (trans))
6722 c = XINT (trans);
6723 else if (CONSP (trans))
6724 {
6725 from_nchars = ASIZE (XCAR (trans));
6726 trans = XCDR (trans);
6727 if (INTEGERP (trans))
6728 c = XINT (trans);
6729 else
6730 {
6731 to_nchars = ASIZE (trans);
6732 c = XINT (AREF (trans, 0));
6733 }
6734 }
6735 else if (EQ (trans, Qt) && ! last_block)
69a80ea3 6736 break;
69a80ea3
KH
6737 }
6738
6739 if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
6740 {
6741 dst = alloc_destination (coding,
6742 buf_end - buf
6743 + MAX_MULTIBYTE_LENGTH * to_nchars,
6744 dst);
db274c7a
KH
6745 if (EQ (coding->src_object, coding->dst_object))
6746 {
6747 coding_set_source (coding);
e951386e
KH
6748 dst_end = (((unsigned char *) coding->source)
6749 + coding->consumed);
db274c7a
KH
6750 }
6751 else
6752 dst_end = coding->destination + coding->dst_bytes;
69a80ea3
KH
6753 }
6754
433f7f87 6755 for (i = 0; i < to_nchars; i++)
69a80ea3 6756 {
433f7f87
KH
6757 if (i > 0)
6758 c = XINT (AREF (trans, i));
69a80ea3
KH
6759 if (coding->dst_multibyte
6760 || ! CHAR_BYTE8_P (c))
db274c7a 6761 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
69a80ea3
KH
6762 else
6763 *dst++ = CHAR_TO_BYTE8 (c);
6764 }
6765 produced_chars += to_nchars;
e951386e 6766 buf += from_nchars;
d46c5b12 6767 }
df7492f9 6768 else
69a80ea3
KH
6769 /* This is an annotation datum. (-C) is the length. */
6770 buf += -c;
4ed46869 6771 }
69a80ea3 6772 carryover = buf_end - buf;
4ed46869 6773 }
fa42c37f 6774 else
fa42c37f 6775 {
119852e7 6776 /* Source characters are at coding->source. */
8f924df7 6777 const unsigned char *src = coding->source;
119852e7 6778 const unsigned char *src_end = src + coding->consumed;
4ed46869 6779
db274c7a
KH
6780 if (EQ (coding->dst_object, coding->src_object))
6781 dst_end = (unsigned char *) src;
df7492f9 6782 if (coding->src_multibyte != coding->dst_multibyte)
fa42c37f 6783 {
df7492f9 6784 if (coding->src_multibyte)
fa42c37f 6785 {
71c81426 6786 int multibytep = 1;
4533845d 6787 EMACS_INT consumed_chars = 0;
d46c5b12 6788
df7492f9
KH
6789 while (1)
6790 {
8f924df7 6791 const unsigned char *src_base = src;
df7492f9 6792 int c;
b73bfc1c 6793
df7492f9 6794 ONE_MORE_BYTE (c);
119852e7 6795 if (dst == dst_end)
df7492f9 6796 {
119852e7
KH
6797 if (EQ (coding->src_object, coding->dst_object))
6798 dst_end = (unsigned char *) src;
6799 if (dst == dst_end)
df7492f9 6800 {
119852e7
KH
6801 EMACS_INT offset = src - coding->source;
6802
6803 dst = alloc_destination (coding, src_end - src + 1,
6804 dst);
6805 dst_end = coding->destination + coding->dst_bytes;
6806 coding_set_source (coding);
6807 src = coding->source + offset;
6808 src_end = coding->source + coding->src_bytes;
db274c7a
KH
6809 if (EQ (coding->src_object, coding->dst_object))
6810 dst_end = (unsigned char *) src;
df7492f9 6811 }
df7492f9
KH
6812 }
6813 *dst++ = c;
6814 produced_chars++;
6815 }
6816 no_more_source:
6817 ;
fa42c37f
KH
6818 }
6819 else
df7492f9
KH
6820 while (src < src_end)
6821 {
71c81426 6822 int multibytep = 1;
df7492f9 6823 int c = *src++;
b73bfc1c 6824
df7492f9
KH
6825 if (dst >= dst_end - 1)
6826 {
2c78b7e1 6827 if (EQ (coding->src_object, coding->dst_object))
8f924df7 6828 dst_end = (unsigned char *) src;
2c78b7e1
KH
6829 if (dst >= dst_end - 1)
6830 {
119852e7 6831 EMACS_INT offset = src - coding->source;
db274c7a 6832 EMACS_INT more_bytes;
119852e7 6833
db274c7a
KH
6834 if (EQ (coding->src_object, coding->dst_object))
6835 more_bytes = ((src_end - src) / 2) + 2;
6836 else
6837 more_bytes = src_end - src + 2;
6838 dst = alloc_destination (coding, more_bytes, dst);
2c78b7e1
KH
6839 dst_end = coding->destination + coding->dst_bytes;
6840 coding_set_source (coding);
119852e7 6841 src = coding->source + offset;
2c78b7e1 6842 src_end = coding->source + coding->src_bytes;
db274c7a
KH
6843 if (EQ (coding->src_object, coding->dst_object))
6844 dst_end = (unsigned char *) src;
2c78b7e1 6845 }
df7492f9
KH
6846 }
6847 EMIT_ONE_BYTE (c);
6848 }
d46c5b12 6849 }
df7492f9
KH
6850 else
6851 {
6852 if (!EQ (coding->src_object, coding->dst_object))
fa42c37f 6853 {
119852e7 6854 EMACS_INT require = coding->src_bytes - coding->dst_bytes;
4ed46869 6855
df7492f9 6856 if (require > 0)
fa42c37f 6857 {
df7492f9
KH
6858 EMACS_INT offset = src - coding->source;
6859
6860 dst = alloc_destination (coding, require, dst);
6861 coding_set_source (coding);
6862 src = coding->source + offset;
6863 src_end = coding->source + coding->src_bytes;
fa42c37f
KH
6864 }
6865 }
119852e7 6866 produced_chars = coding->consumed_char;
df7492f9 6867 while (src < src_end)
14daee73 6868 *dst++ = *src++;
fa42c37f
KH
6869 }
6870 }
6871
df7492f9 6872 produced = dst - (coding->destination + coding->produced);
284201e4 6873 if (BUFFERP (coding->dst_object) && produced_chars > 0)
df7492f9
KH
6874 insert_from_gap (produced_chars, produced);
6875 coding->produced += produced;
6876 coding->produced_char += produced_chars;
69a80ea3 6877 return carryover;
fa42c37f
KH
6878}
6879
ff0dacd7
KH
6880/* Compose text in CODING->object according to the annotation data at
6881 CHARBUF. CHARBUF is an array:
e951386e 6882 [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
df7492f9 6883 */
4ed46869 6884
df7492f9 6885static INLINE void
971de7fb 6886produce_composition (struct coding_system *coding, int *charbuf, EMACS_INT pos)
4ed46869 6887{
df7492f9 6888 int len;
69a80ea3 6889 EMACS_INT to;
df7492f9 6890 enum composition_method method;
df7492f9 6891 Lisp_Object components;
fa42c37f 6892
e951386e 6893 len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
69a80ea3 6894 to = pos + charbuf[2];
e951386e 6895 method = (enum composition_method) (charbuf[4]);
d46c5b12 6896
df7492f9
KH
6897 if (method == COMPOSITION_RELATIVE)
6898 components = Qnil;
e951386e 6899 else
d46c5b12 6900 {
df7492f9 6901 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
e951386e 6902 int i, j;
b73bfc1c 6903
e951386e
KH
6904 if (method == COMPOSITION_WITH_RULE)
6905 len = charbuf[2] * 3 - 2;
6906 charbuf += MAX_ANNOTATION_LENGTH;
6907 /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
6908 for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
9ffd559c 6909 {
e951386e
KH
6910 if (charbuf[i] >= 0)
6911 args[j] = make_number (charbuf[i]);
6912 else
6913 {
6914 i++;
6915 args[j] = make_number (charbuf[i] % 0x100);
6916 }
9ffd559c 6917 }
e951386e 6918 components = (i == j ? Fstring (j, args) : Fvector (j, args));
d46c5b12 6919 }
69a80ea3 6920 compose_text (pos, to, components, Qnil, coding->dst_object);
d46c5b12
KH
6921}
6922
d46c5b12 6923
ff0dacd7
KH
6924/* Put `charset' property on text in CODING->object according to
6925 the annotation data at CHARBUF. CHARBUF is an array:
69a80ea3 6926 [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
ff0dacd7 6927 */
d46c5b12 6928
ff0dacd7 6929static INLINE void
971de7fb 6930produce_charset (struct coding_system *coding, int *charbuf, EMACS_INT pos)
d46c5b12 6931{
69a80ea3
KH
6932 EMACS_INT from = pos - charbuf[2];
6933 struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
b73bfc1c 6934
69a80ea3 6935 Fput_text_property (make_number (from), make_number (pos),
ff0dacd7
KH
6936 Qcharset, CHARSET_NAME (charset),
6937 coding->dst_object);
d46c5b12
KH
6938}
6939
d46c5b12 6940
/* Number of `int' elements tried first for the conversion work
   area.  */
#define CHARBUF_SIZE 0x4000

/* Allocate CODING->charbuf on the stack of the calling function and
   record its element count in CODING->charbuf_size.

   The allocation starts at CHARBUF_SIZE elements and is halved while
   alloca yields NULL and the size is still above 1024.  If no buffer
   was obtained, CODING_RESULT_INSUFFICIENT_MEM is recorded and the
   *calling function* returns CODING->result, so this macro may only
   be used in functions returning that value.

   NOTE(review): alloca normally gives no failure indication, so the
   retry loop and the failure branch are probably never taken --
   confirm before relying on them.  */
#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    int size = CHARBUF_SIZE;                                            \
                                                                        \
    (coding)->charbuf = NULL;                                           \
    while (size > 1024)                                                 \
      {                                                                 \
        (coding)->charbuf = (int *) alloca (sizeof (int) * size);       \
        if ((coding)->charbuf)                                          \
          break;                                                        \
        size >>= 1;                                                     \
      }                                                                 \
    if (! (coding)->charbuf)                                            \
      {                                                                 \
        record_conversion_result ((coding),                             \
                                  CODING_RESULT_INSUFFICIENT_MEM);      \
        return (coding)->result;                                        \
      }                                                                 \
    (coding)->charbuf_size = size;                                      \
  } while (0)
4ed46869 6962
d46c5b12
KH
6963
6964static void
971de7fb 6965produce_annotation (struct coding_system *coding, EMACS_INT pos)
d46c5b12 6966{
df7492f9
KH
6967 int *charbuf = coding->charbuf;
6968 int *charbuf_end = charbuf + coding->charbuf_used;
d46c5b12 6969
ff0dacd7
KH
6970 if (NILP (coding->dst_object))
6971 return;
d46c5b12 6972
df7492f9 6973 while (charbuf < charbuf_end)
a84f1519 6974 {
df7492f9 6975 if (*charbuf >= 0)
e951386e 6976 pos++, charbuf++;
d46c5b12 6977 else
d46c5b12 6978 {
df7492f9 6979 int len = -*charbuf;
e951386e
KH
6980
6981 if (len > 2)
6982 switch (charbuf[1])
6983 {
6984 case CODING_ANNOTATE_COMPOSITION_MASK:
6985 produce_composition (coding, charbuf, pos);
6986 break;
6987 case CODING_ANNOTATE_CHARSET_MASK:
6988 produce_charset (coding, charbuf, pos);
6989 break;
6990 }
df7492f9 6991 charbuf += len;
d46c5b12 6992 }
a84f1519 6993 }
d46c5b12
KH
6994}
6995
df7492f9
KH
6996/* Decode the data at CODING->src_object into CODING->dst_object.
6997 CODING->src_object is a buffer, a string, or nil.
6998 CODING->dst_object is a buffer.
d46c5b12 6999
df7492f9
KH
7000 If CODING->src_object is a buffer, it must be the current buffer.
7001 In this case, if CODING->src_pos is positive, it is a position of
7002 the source text in the buffer, otherwise, the source text is in the
7003 gap area of the buffer, and CODING->src_pos specifies the offset of
7004 the text from GPT (which must be the same as PT). If this is the
7005 same buffer as CODING->dst_object, CODING->src_pos must be
7006 negative.
d46c5b12 7007
b6828792 7008 If CODING->src_object is a string, CODING->src_pos is an index to
df7492f9 7009 that string.
d46c5b12 7010
df7492f9
KH
7011 If CODING->src_object is nil, CODING->source must already point to
7012 the non-relocatable memory area. In this case, CODING->src_pos is
7013 an offset from CODING->source.
73be902c 7014
df7492f9
KH
7015 The decoded data is inserted at the current point of the buffer
7016 CODING->dst_object.
7017*/
d46c5b12 7018
df7492f9 7019static int
971de7fb 7020decode_coding (struct coding_system *coding)
d46c5b12 7021{
df7492f9 7022 Lisp_Object attrs;
24a73b0a 7023 Lisp_Object undo_list;
7d64c6ad 7024 Lisp_Object translation_table;
d0396581 7025 struct ccl_spec cclspec;
69a80ea3
KH
7026 int carryover;
7027 int i;
d46c5b12 7028
df7492f9
KH
7029 if (BUFFERP (coding->src_object)
7030 && coding->src_pos > 0
7031 && coding->src_pos < GPT
7032 && coding->src_pos + coding->src_chars > GPT)
7033 move_gap_both (coding->src_pos, coding->src_pos_byte);
d46c5b12 7034
24a73b0a 7035 undo_list = Qt;
df7492f9 7036 if (BUFFERP (coding->dst_object))
1c3478b0 7037 {
df7492f9
KH
7038 if (current_buffer != XBUFFER (coding->dst_object))
7039 set_buffer_internal (XBUFFER (coding->dst_object));
7040 if (GPT != PT)
7041 move_gap_both (PT, PT_BYTE);
24a73b0a
KH
7042 undo_list = current_buffer->undo_list;
7043 current_buffer->undo_list = Qt;
1c3478b0
KH
7044 }
7045
df7492f9
KH
7046 coding->consumed = coding->consumed_char = 0;
7047 coding->produced = coding->produced_char = 0;
7048 coding->chars_at_source = 0;
065e3595 7049 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9 7050 coding->errors = 0;
1c3478b0 7051
df7492f9
KH
7052 ALLOC_CONVERSION_WORK_AREA (coding);
7053
7054 attrs = CODING_ID_ATTRS (coding->id);
2170c8f0 7055 translation_table = get_translation_table (attrs, 0, NULL);
df7492f9 7056
69a80ea3 7057 carryover = 0;
d0396581
KH
7058 if (coding->decoder == decode_coding_ccl)
7059 {
7060 coding->spec.ccl = &cclspec;
7061 setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7062 }
df7492f9 7063 do
b73bfc1c 7064 {
69a80ea3
KH
7065 EMACS_INT pos = coding->dst_pos + coding->produced_char;
7066
df7492f9
KH
7067 coding_set_source (coding);
7068 coding->annotated = 0;
69a80ea3 7069 coding->charbuf_used = carryover;
df7492f9 7070 (*(coding->decoder)) (coding);
df7492f9 7071 coding_set_destination (coding);
69a80ea3 7072 carryover = produce_chars (coding, translation_table, 0);
df7492f9 7073 if (coding->annotated)
69a80ea3
KH
7074 produce_annotation (coding, pos);
7075 for (i = 0; i < carryover; i++)
7076 coding->charbuf[i]
7077 = coding->charbuf[coding->charbuf_used - carryover + i];
d46c5b12 7078 }
d0396581
KH
7079 while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7080 || (coding->consumed < coding->src_bytes
7081 && (coding->result == CODING_RESULT_SUCCESS
7082 || coding->result == CODING_RESULT_INVALID_SRC)));
d46c5b12 7083
69a80ea3
KH
7084 if (carryover > 0)
7085 {
7086 coding_set_destination (coding);
7087 coding->charbuf_used = carryover;
7088 produce_chars (coding, translation_table, 1);
7089 }
7090
df7492f9
KH
7091 coding->carryover_bytes = 0;
7092 if (coding->consumed < coding->src_bytes)
d46c5b12 7093 {
df7492f9 7094 int nbytes = coding->src_bytes - coding->consumed;
8f924df7 7095 const unsigned char *src;
df7492f9
KH
7096
7097 coding_set_source (coding);
7098 coding_set_destination (coding);
7099 src = coding->source + coding->consumed;
7100
7101 if (coding->mode & CODING_MODE_LAST_BLOCK)
1c3478b0 7102 {
df7492f9
KH
7103 /* Flush out unprocessed data as binary chars. We are sure
7104 that the number of data is less than the size of
7105 coding->charbuf. */
065e3595 7106 coding->charbuf_used = 0;
b2dab6c8
JR
7107 coding->chars_at_source = 0;
7108
df7492f9 7109 while (nbytes-- > 0)
1c3478b0 7110 {
df7492f9 7111 int c = *src++;
98725083 7112
1c91457d
KH
7113 if (c & 0x80)
7114 c = BYTE8_TO_CHAR (c);
7115 coding->charbuf[coding->charbuf_used++] = c;
1c3478b0 7116 }
f6cbaf43 7117 produce_chars (coding, Qnil, 1);
d46c5b12 7118 }
d46c5b12 7119 else
df7492f9
KH
7120 {
7121 /* Record unprocessed bytes in coding->carryover. We are
7122 sure that the number of data is less than the size of
7123 coding->carryover. */
7124 unsigned char *p = coding->carryover;
7125
f289d375
KH
7126 if (nbytes > sizeof coding->carryover)
7127 nbytes = sizeof coding->carryover;
df7492f9
KH
7128 coding->carryover_bytes = nbytes;
7129 while (nbytes-- > 0)
7130 *p++ = *src++;
1c3478b0 7131 }
df7492f9 7132 coding->consumed = coding->src_bytes;
b73bfc1c 7133 }
69f76525 7134
0a9564cb
EZ
7135 if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7136 && !inhibit_eol_conversion)
4347441b 7137 decode_eol (coding);
24a73b0a
KH
7138 if (BUFFERP (coding->dst_object))
7139 {
7140 current_buffer->undo_list = undo_list;
7141 record_insert (coding->dst_pos, coding->produced_char);
7142 }
73be902c 7143 return coding->result;
4ed46869
KH
7144}
7145
aaaf0b1e 7146
e1c23804 7147/* Extract an annotation datum from a composition starting at POS and
ff0dacd7
KH
7148 ending before LIMIT of CODING->src_object (buffer or string), store
7149 the data in BUF, set *STOP to a starting position of the next
7150 composition (if any) or to LIMIT, and return the address of the
7151 next element of BUF.
7152
7153 If such an annotation is not found, set *STOP to a starting
7154 position of a composition after POS (if any) or to LIMIT, and
7155 return BUF. */
7156
7157static INLINE int *
cf84bb53
JB
7158handle_composition_annotation (EMACS_INT pos, EMACS_INT limit,
7159 struct coding_system *coding, int *buf,
7160 EMACS_INT *stop)
aaaf0b1e 7161{
ff0dacd7
KH
7162 EMACS_INT start, end;
7163 Lisp_Object prop;
aaaf0b1e 7164
ff0dacd7
KH
7165 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7166 || end > limit)
7167 *stop = limit;
7168 else if (start > pos)
7169 *stop = start;
7170 else
aaaf0b1e 7171 {
ff0dacd7 7172 if (start == pos)
aaaf0b1e 7173 {
ff0dacd7
KH
7174 /* We found a composition. Store the corresponding
7175 annotation data in BUF. */
7176 int *head = buf;
7177 enum composition_method method = COMPOSITION_METHOD (prop);
7178 int nchars = COMPOSITION_LENGTH (prop);
7179
e951386e 7180 ADD_COMPOSITION_DATA (buf, nchars, 0, method);
ff0dacd7 7181 if (method != COMPOSITION_RELATIVE)
aaaf0b1e 7182 {
ff0dacd7
KH
7183 Lisp_Object components;
7184 int len, i, i_byte;
7185
7186 components = COMPOSITION_COMPONENTS (prop);
7187 if (VECTORP (components))
aaaf0b1e 7188 {
ff0dacd7
KH
7189 len = XVECTOR (components)->size;
7190 for (i = 0; i < len; i++)
7191 *buf++ = XINT (AREF (components, i));
aaaf0b1e 7192 }
ff0dacd7 7193 else if (STRINGP (components))
aaaf0b1e 7194 {
8f924df7 7195 len = SCHARS (components);
ff0dacd7
KH
7196 i = i_byte = 0;
7197 while (i < len)
7198 {
7199 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7200 buf++;
7201 }
7202 }
7203 else if (INTEGERP (components))
7204 {
7205 len = 1;
7206 *buf++ = XINT (components);
7207 }
7208 else if (CONSP (components))
7209 {
7210 for (len = 0; CONSP (components);
7211 len++, components = XCDR (components))
7212 *buf++ = XINT (XCAR (components));
aaaf0b1e 7213 }
aaaf0b1e 7214 else
ff0dacd7
KH
7215 abort ();
7216 *head -= len;
aaaf0b1e 7217 }
aaaf0b1e 7218 }
ff0dacd7
KH
7219
7220 if (find_composition (end, limit, &start, &end, &prop,
7221 coding->src_object)
7222 && end <= limit)
7223 *stop = start;
7224 else
7225 *stop = limit;
aaaf0b1e 7226 }
ff0dacd7
KH
7227 return buf;
7228}
7229
7230
e1c23804 7231/* Extract an annotation datum from a text property `charset' at POS of
ff0dacd7
KH
7232 CODING->src_object (buffer of string), store the data in BUF, set
7233 *STOP to the position where the value of `charset' property changes
7234 (limiting by LIMIT), and return the address of the next element of
7235 BUF.
7236
7237 If the property value is nil, set *STOP to the position where the
7238 property value is non-nil (limiting by LIMIT), and return BUF. */
7239
7240static INLINE int *
cf84bb53
JB
7241handle_charset_annotation (EMACS_INT pos, EMACS_INT limit,
7242 struct coding_system *coding, int *buf,
7243 EMACS_INT *stop)
ff0dacd7
KH
7244{
7245 Lisp_Object val, next;
7246 int id;
7247
7248 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7249 if (! NILP (val) && CHARSETP (val))
7250 id = XINT (CHARSET_SYMBOL_ID (val));
7251 else
7252 id = -1;
69a80ea3 7253 ADD_CHARSET_DATA (buf, 0, id);
ff0dacd7
KH
7254 next = Fnext_single_property_change (make_number (pos), Qcharset,
7255 coding->src_object,
7256 make_number (limit));
7257 *stop = XINT (next);
7258 return buf;
7259}
7260
7261
df7492f9 7262static void
cf84bb53
JB
7263consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7264 int max_lookup)
df7492f9
KH
7265{
7266 int *buf = coding->charbuf;
ff0dacd7 7267 int *buf_end = coding->charbuf + coding->charbuf_size;
7c78e542 7268 const unsigned char *src = coding->source + coding->consumed;
4776e638 7269 const unsigned char *src_end = coding->source + coding->src_bytes;
ff0dacd7
KH
7270 EMACS_INT pos = coding->src_pos + coding->consumed_char;
7271 EMACS_INT end_pos = coding->src_pos + coding->src_chars;
df7492f9
KH
7272 int multibytep = coding->src_multibyte;
7273 Lisp_Object eol_type;
7274 int c;
ff0dacd7 7275 EMACS_INT stop, stop_composition, stop_charset;
09ee6fdd 7276 int *lookup_buf = NULL;
433f7f87
KH
7277
7278 if (! NILP (translation_table))
09ee6fdd 7279 lookup_buf = alloca (sizeof (int) * max_lookup);
88993dfd 7280
0a9564cb 7281 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
df7492f9
KH
7282 if (VECTORP (eol_type))
7283 eol_type = Qunix;
88993dfd 7284
df7492f9
KH
7285 /* Note: composition handling is not yet implemented. */
7286 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
ec6d2bb8 7287
0b5670c9
KH
7288 if (NILP (coding->src_object))
7289 stop = stop_composition = stop_charset = end_pos;
ff0dacd7 7290 else
0b5670c9
KH
7291 {
7292 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7293 stop = stop_composition = pos;
7294 else
7295 stop = stop_composition = end_pos;
7296 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7297 stop = stop_charset = pos;
7298 else
7299 stop_charset = end_pos;
7300 }
ec6d2bb8 7301
24a73b0a 7302 /* Compensate for CRLF and conversion. */
ff0dacd7 7303 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
df7492f9 7304 while (buf < buf_end)
aaaf0b1e 7305 {
433f7f87
KH
7306 Lisp_Object trans;
7307
df7492f9 7308 if (pos == stop)
ec6d2bb8 7309 {
df7492f9
KH
7310 if (pos == end_pos)
7311 break;
ff0dacd7
KH
7312 if (pos == stop_composition)
7313 buf = handle_composition_annotation (pos, end_pos, coding,
7314 buf, &stop_composition);
7315 if (pos == stop_charset)
7316 buf = handle_charset_annotation (pos, end_pos, coding,
7317 buf, &stop_charset);
7318 stop = (stop_composition < stop_charset
7319 ? stop_composition : stop_charset);
df7492f9
KH
7320 }
7321
7322 if (! multibytep)
4776e638 7323 {
d3e4cb56 7324 EMACS_INT bytes;
aaaf0b1e 7325
4d1e6632
KH
7326 if (coding->encoder == encode_coding_raw_text
7327 || coding->encoder == encode_coding_ccl)
ea29edf2
KH
7328 c = *src++, pos++;
7329 else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
db274c7a 7330 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
4776e638 7331 else
f03caae0 7332 c = BYTE8_TO_CHAR (*src), src++, pos++;
4776e638 7333 }
df7492f9 7334 else
db274c7a 7335 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
df7492f9
KH
7336 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7337 c = '\n';
7338 if (! EQ (eol_type, Qunix))
aaaf0b1e 7339 {
df7492f9 7340 if (c == '\n')
aaaf0b1e 7341 {
df7492f9
KH
7342 if (EQ (eol_type, Qdos))
7343 *buf++ = '\r';
7344 else
7345 c = '\r';
aaaf0b1e
KH
7346 }
7347 }
433f7f87 7348
e6a54062 7349 trans = Qnil;
09ee6fdd 7350 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
e6a54062 7351 if (NILP (trans))
433f7f87
KH
7352 *buf++ = c;
7353 else
7354 {
7355 int from_nchars = 1, to_nchars = 1;
7356 int *lookup_buf_end;
7357 const unsigned char *p = src;
7358 int i;
7359
7360 lookup_buf[0] = c;
7361 for (i = 1; i < max_lookup && p < src_end; i++)
7362 lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7363 lookup_buf_end = lookup_buf + i;
e951386e
KH
7364 trans = get_translation (trans, lookup_buf, lookup_buf_end);
7365 if (INTEGERP (trans))
7366 c = XINT (trans);
7367 else if (CONSP (trans))
7368 {
7369 from_nchars = ASIZE (XCAR (trans));
7370 trans = XCDR (trans);
7371 if (INTEGERP (trans))
7372 c = XINT (trans);
7373 else
7374 {
7375 to_nchars = ASIZE (trans);
7376 if (buf + to_nchars > buf_end)
7377 break;
7378 c = XINT (AREF (trans, 0));
7379 }
7380 }
7381 else
433f7f87 7382 break;
e951386e 7383 *buf++ = c;
433f7f87
KH
7384 for (i = 1; i < to_nchars; i++)
7385 *buf++ = XINT (AREF (trans, i));
7386 for (i = 1; i < from_nchars; i++, pos++)
7387 src += MULTIBYTE_LENGTH_NO_CHECK (src);
7388 }
aaaf0b1e 7389 }
ec6d2bb8 7390
df7492f9
KH
7391 coding->consumed = src - coding->source;
7392 coding->consumed_char = pos - coding->src_pos;
7393 coding->charbuf_used = buf - coding->charbuf;
7394 coding->chars_at_source = 0;
aaaf0b1e
KH
7395}
7396
4ed46869 7397
df7492f9
KH
7398/* Encode the text at CODING->src_object into CODING->dst_object.
7399 CODING->src_object is a buffer or a string.
7400 CODING->dst_object is a buffer or nil.
7401
7402 If CODING->src_object is a buffer, it must be the current buffer.
7403 In this case, if CODING->src_pos is positive, it is a position of
7404 the source text in the buffer, otherwise. the source text is in the
7405 gap area of the buffer, and coding->src_pos specifies the offset of
7406 the text from GPT (which must be the same as PT). If this is the
7407 same buffer as CODING->dst_object, CODING->src_pos must be
7408 negative and CODING should not have `pre-write-conversion'.
7409
7410 If CODING->src_object is a string, CODING should not have
7411 `pre-write-conversion'.
7412
7413 If CODING->dst_object is a buffer, the encoded data is inserted at
7414 the current point of that buffer.
7415
7416 If CODING->dst_object is nil, the encoded data is placed at the
7417 memory area specified by CODING->destination. */
7418
7419static int
971de7fb 7420encode_coding (struct coding_system *coding)
4ed46869 7421{
df7492f9 7422 Lisp_Object attrs;
7d64c6ad 7423 Lisp_Object translation_table;
09ee6fdd 7424 int max_lookup;
fb608df3 7425 struct ccl_spec cclspec;
9861e777 7426
df7492f9 7427 attrs = CODING_ID_ATTRS (coding->id);
ea29edf2
KH
7428 if (coding->encoder == encode_coding_raw_text)
7429 translation_table = Qnil, max_lookup = 0;
7430 else
7431 translation_table = get_translation_table (attrs, 1, &max_lookup);
4ed46869 7432
df7492f9 7433 if (BUFFERP (coding->dst_object))
8844fa83 7434 {
df7492f9
KH
7435 set_buffer_internal (XBUFFER (coding->dst_object));
7436 coding->dst_multibyte
7437 = ! NILP (current_buffer->enable_multibyte_characters);
8844fa83 7438 }
4ed46869 7439
b73bfc1c 7440 coding->consumed = coding->consumed_char = 0;
df7492f9 7441 coding->produced = coding->produced_char = 0;
065e3595 7442 record_conversion_result (coding, CODING_RESULT_SUCCESS);
b73bfc1c 7443 coding->errors = 0;
b73bfc1c 7444
df7492f9 7445 ALLOC_CONVERSION_WORK_AREA (coding);
4ed46869 7446
fb608df3
KH
7447 if (coding->encoder == encode_coding_ccl)
7448 {
7449 coding->spec.ccl = &cclspec;
7450 setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7451 }
df7492f9
KH
7452 do {
7453 coding_set_source (coding);
09ee6fdd 7454 consume_chars (coding, translation_table, max_lookup);
df7492f9
KH
7455 coding_set_destination (coding);
7456 (*(coding->encoder)) (coding);
7457 } while (coding->consumed_char < coding->src_chars);
7458
284201e4 7459 if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
df7492f9
KH
7460 insert_from_gap (coding->produced_char, coding->produced);
7461
7462 return (coding->result);
ec6d2bb8
KH
7463}
7464
fb88bf2d 7465
24a73b0a
KH
7466/* Name (or base name) of work buffer for code conversion. */
7467static Lisp_Object Vcode_conversion_workbuf_name;
d46c5b12 7468
24a73b0a
KH
7469/* A working buffer used by the top level conversion. Once it is
7470 created, it is never destroyed. It has the name
7471 Vcode_conversion_workbuf_name. The other working buffers are
7472 destroyed after the use is finished, and their names are modified
7473 versions of Vcode_conversion_workbuf_name. */
7474static Lisp_Object Vcode_conversion_reused_workbuf;
b73bfc1c 7475
24a73b0a
KH
7476/* 1 iff Vcode_conversion_reused_workbuf is already in use. */
7477static int reused_workbuf_in_use;
4ed46869 7478
24a73b0a 7479
ad1746f5 7480/* Return a working buffer of code conversion. MULTIBYTE specifies the
24a73b0a 7481 multibyteness of returning buffer. */
b73bfc1c 7482
f6cbaf43 7483static Lisp_Object
971de7fb 7484make_conversion_work_buffer (int multibyte)
df7492f9 7485{
24a73b0a
KH
7486 Lisp_Object name, workbuf;
7487 struct buffer *current;
4ed46869 7488
24a73b0a 7489 if (reused_workbuf_in_use++)
065e3595
KH
7490 {
7491 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7492 workbuf = Fget_buffer_create (name);
7493 }
df7492f9 7494 else
065e3595 7495 {
159bd5a2 7496 if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
a993c7a1
KH
7497 Vcode_conversion_reused_workbuf
7498 = Fget_buffer_create (Vcode_conversion_workbuf_name);
7499 workbuf = Vcode_conversion_reused_workbuf;
065e3595 7500 }
24a73b0a
KH
7501 current = current_buffer;
7502 set_buffer_internal (XBUFFER (workbuf));
df36ff1f
CY
7503 /* We can't allow modification hooks to run in the work buffer. For
7504 instance, directory_files_internal assumes that file decoding
7505 doesn't compile new regexps. */
7506 Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
3ed051d4 7507 Ferase_buffer ();
df7492f9 7508 current_buffer->undo_list = Qt;
24a73b0a 7509 current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
df7492f9 7510 set_buffer_internal (current);
24a73b0a 7511 return workbuf;
df7492f9 7512}
d46c5b12 7513
24a73b0a 7514
4776e638 7515static Lisp_Object
971de7fb 7516code_conversion_restore (Lisp_Object arg)
4776e638 7517{
24a73b0a 7518 Lisp_Object current, workbuf;
948bdcf3 7519 struct gcpro gcpro1;
24a73b0a 7520
948bdcf3 7521 GCPRO1 (arg);
24a73b0a
KH
7522 current = XCAR (arg);
7523 workbuf = XCDR (arg);
7524 if (! NILP (workbuf))
7525 {
7526 if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7527 reused_workbuf_in_use = 0;
7528 else if (! NILP (Fbuffer_live_p (workbuf)))
7529 Fkill_buffer (workbuf);
7530 }
7531 set_buffer_internal (XBUFFER (current));
948bdcf3 7532 UNGCPRO;
4776e638
KH
7533 return Qnil;
7534}
b73bfc1c 7535
24a73b0a 7536Lisp_Object
971de7fb 7537code_conversion_save (int with_work_buf, int multibyte)
df7492f9 7538{
24a73b0a 7539 Lisp_Object workbuf = Qnil;
b73bfc1c 7540
4776e638 7541 if (with_work_buf)
24a73b0a
KH
7542 workbuf = make_conversion_work_buffer (multibyte);
7543 record_unwind_protect (code_conversion_restore,
7544 Fcons (Fcurrent_buffer (), workbuf));
4776e638 7545 return workbuf;
df7492f9 7546}
d46c5b12 7547
df7492f9 7548int
cf84bb53
JB
7549decode_coding_gap (struct coding_system *coding,
7550 EMACS_INT chars, EMACS_INT bytes)
df7492f9 7551{
1a4990fb 7552 int count = SPECPDL_INDEX ();
5e5c78be 7553 Lisp_Object attrs;
fb88bf2d 7554
24a73b0a 7555 code_conversion_save (0, 0);
ec6d2bb8 7556
24a73b0a 7557 coding->src_object = Fcurrent_buffer ();
df7492f9
KH
7558 coding->src_chars = chars;
7559 coding->src_bytes = bytes;
7560 coding->src_pos = -chars;
7561 coding->src_pos_byte = -bytes;
7562 coding->src_multibyte = chars < bytes;
24a73b0a 7563 coding->dst_object = coding->src_object;
df7492f9
KH
7564 coding->dst_pos = PT;
7565 coding->dst_pos_byte = PT_BYTE;
71c81426 7566 coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
4ed46869 7567
df7492f9
KH
7568 if (CODING_REQUIRE_DETECTION (coding))
7569 detect_coding (coding);
8f924df7 7570
9286b333 7571 coding->mode |= CODING_MODE_LAST_BLOCK;
287c57d7 7572 current_buffer->text->inhibit_shrinking = 1;
df7492f9 7573 decode_coding (coding);
287c57d7 7574 current_buffer->text->inhibit_shrinking = 0;
d46c5b12 7575
5e5c78be
KH
7576 attrs = CODING_ID_ATTRS (coding->id);
7577 if (! NILP (CODING_ATTR_POST_READ (attrs)))
b73bfc1c 7578 {
5e5c78be
KH
7579 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7580 Lisp_Object val;
7581
7582 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
5e5c78be
KH
7583 val = call1 (CODING_ATTR_POST_READ (attrs),
7584 make_number (coding->produced_char));
5e5c78be
KH
7585 CHECK_NATNUM (val);
7586 coding->produced_char += Z - prev_Z;
7587 coding->produced += Z_BYTE - prev_Z_BYTE;
b73bfc1c 7588 }
4ed46869 7589
df7492f9 7590 unbind_to (count, Qnil);
b73bfc1c
KH
7591 return coding->result;
7592}
52d41803 7593
4ed46869 7594int
cf84bb53
JB
7595encode_coding_gap (struct coding_system *coding,
7596 EMACS_INT chars, EMACS_INT bytes)
4ed46869 7597{
1a4990fb 7598 int count = SPECPDL_INDEX ();
4ed46869 7599
24a73b0a 7600 code_conversion_save (0, 0);
4ed46869 7601
24a73b0a 7602 coding->src_object = Fcurrent_buffer ();
df7492f9
KH
7603 coding->src_chars = chars;
7604 coding->src_bytes = bytes;
7605 coding->src_pos = -chars;
7606 coding->src_pos_byte = -bytes;
7607 coding->src_multibyte = chars < bytes;
7608 coding->dst_object = coding->src_object;
7609 coding->dst_pos = PT;
7610 coding->dst_pos_byte = PT_BYTE;
4ed46869 7611
df7492f9 7612 encode_coding (coding);
b73bfc1c 7613
df7492f9
KH
7614 unbind_to (count, Qnil);
7615 return coding->result;
7616}
4ed46869 7617
d46c5b12 7618
df7492f9
KH
7619/* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7620 SRC_OBJECT into DST_OBJECT by coding context CODING.
b73bfc1c 7621
df7492f9 7622 SRC_OBJECT is a buffer, a string, or Qnil.
b73bfc1c 7623
df7492f9
KH
7624 If it is a buffer, the text is at point of the buffer. FROM and TO
7625 are positions in the buffer.
b73bfc1c 7626
df7492f9
KH
7627 If it is a string, the text is at the beginning of the string.
7628 FROM and TO are indices to the string.
4ed46869 7629
df7492f9
KH
7630 If it is nil, the text is at coding->source. FROM and TO are
7631 indices to coding->source.
bb10be8b 7632
df7492f9 7633 DST_OBJECT is a buffer, Qt, or Qnil.
4ed46869 7634
df7492f9
KH
7635 If it is a buffer, the decoded text is inserted at point of the
7636 buffer. If the buffer is the same as SRC_OBJECT, the source text
7637 is deleted.
4ed46869 7638
df7492f9
KH
7639 If it is Qt, a string is made from the decoded text, and
7640 set in CODING->dst_object.
d46c5b12 7641
df7492f9 7642 If it is Qnil, the decoded text is stored at CODING->destination.
2cb26057 7643 The caller must allocate CODING->dst_bytes bytes at
df7492f9
KH
7644 CODING->destination by xmalloc. If the decoded text is longer than
7645 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7646 */
d46c5b12 7647
df7492f9 7648void
cf84bb53
JB
7649decode_coding_object (struct coding_system *coding,
7650 Lisp_Object src_object,
7651 EMACS_INT from, EMACS_INT from_byte,
7652 EMACS_INT to, EMACS_INT to_byte,
7653 Lisp_Object dst_object)
d46c5b12 7654{
1a4990fb 7655 int count = SPECPDL_INDEX ();
df7492f9
KH
7656 unsigned char *destination;
7657 EMACS_INT dst_bytes;
7658 EMACS_INT chars = to - from;
7659 EMACS_INT bytes = to_byte - from_byte;
7660 Lisp_Object attrs;
4776e638 7661 int saved_pt = -1, saved_pt_byte;
64cedb0c 7662 int need_marker_adjustment = 0;
b3bfad50 7663 Lisp_Object old_deactivate_mark;
d46c5b12 7664
b3bfad50 7665 old_deactivate_mark = Vdeactivate_mark;
93dec019 7666
df7492f9 7667 if (NILP (dst_object))
d46c5b12 7668 {
df7492f9
KH
7669 destination = coding->destination;
7670 dst_bytes = coding->dst_bytes;
d46c5b12 7671 }
93dec019 7672
df7492f9
KH
7673 coding->src_object = src_object;
7674 coding->src_chars = chars;
7675 coding->src_bytes = bytes;
7676 coding->src_multibyte = chars < bytes;
70ad9fc4 7677
df7492f9 7678 if (STRINGP (src_object))
d46c5b12 7679 {
df7492f9
KH
7680 coding->src_pos = from;
7681 coding->src_pos_byte = from_byte;
d46c5b12 7682 }
df7492f9 7683 else if (BUFFERP (src_object))
88993dfd 7684 {
df7492f9
KH
7685 set_buffer_internal (XBUFFER (src_object));
7686 if (from != GPT)
7687 move_gap_both (from, from_byte);
7688 if (EQ (src_object, dst_object))
fb88bf2d 7689 {
64cedb0c
KH
7690 struct Lisp_Marker *tail;
7691
7692 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7693 {
7694 tail->need_adjustment
7695 = tail->charpos == (tail->insertion_type ? from : to);
7696 need_marker_adjustment |= tail->need_adjustment;
7697 }
4776e638 7698 saved_pt = PT, saved_pt_byte = PT_BYTE;
df7492f9 7699 TEMP_SET_PT_BOTH (from, from_byte);
f4a3cc44 7700 current_buffer->text->inhibit_shrinking = 1;
df7492f9
KH
7701 del_range_both (from, from_byte, to, to_byte, 1);
7702 coding->src_pos = -chars;
7703 coding->src_pos_byte = -bytes;
fb88bf2d 7704 }
df7492f9 7705 else
fb88bf2d 7706 {
df7492f9
KH
7707 coding->src_pos = from;
7708 coding->src_pos_byte = from_byte;
fb88bf2d 7709 }
88993dfd
KH
7710 }
7711
df7492f9
KH
7712 if (CODING_REQUIRE_DETECTION (coding))
7713 detect_coding (coding);
7714 attrs = CODING_ID_ATTRS (coding->id);
d46c5b12 7715
2cb26057
KH
7716 if (EQ (dst_object, Qt)
7717 || (! NILP (CODING_ATTR_POST_READ (attrs))
7718 && NILP (dst_object)))
b73bfc1c 7719 {
a1567c45
SM
7720 coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7721 coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
df7492f9
KH
7722 coding->dst_pos = BEG;
7723 coding->dst_pos_byte = BEG_BYTE;
b73bfc1c 7724 }
df7492f9 7725 else if (BUFFERP (dst_object))
d46c5b12 7726 {
24a73b0a 7727 code_conversion_save (0, 0);
df7492f9
KH
7728 coding->dst_object = dst_object;
7729 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7730 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7731 coding->dst_multibyte
7732 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
d46c5b12
KH
7733 }
7734 else
7735 {
24a73b0a 7736 code_conversion_save (0, 0);
df7492f9 7737 coding->dst_object = Qnil;
0154725e
SM
7738 /* Most callers presume this will return a multibyte result, and they
7739 won't use `binary' or `raw-text' anyway, so let's not worry about
7740 CODING_FOR_UNIBYTE. */
bb555731 7741 coding->dst_multibyte = 1;
d46c5b12
KH
7742 }
7743
df7492f9 7744 decode_coding (coding);
fa46990e 7745
df7492f9
KH
7746 if (BUFFERP (coding->dst_object))
7747 set_buffer_internal (XBUFFER (coding->dst_object));
d46c5b12 7748
df7492f9 7749 if (! NILP (CODING_ATTR_POST_READ (attrs)))
d46c5b12 7750 {
b3bfad50 7751 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
df7492f9 7752 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
2b4f9037 7753 Lisp_Object val;
d46c5b12 7754
c0cc7f7f 7755 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
b3bfad50
KH
7756 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7757 old_deactivate_mark);
d4850d67
KH
7758 val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7759 make_number (coding->produced_char));
df7492f9
KH
7760 UNGCPRO;
7761 CHECK_NATNUM (val);
7762 coding->produced_char += Z - prev_Z;
7763 coding->produced += Z_BYTE - prev_Z_BYTE;
d46c5b12 7764 }
de79a6a5 7765
df7492f9 7766 if (EQ (dst_object, Qt))
ec6d2bb8 7767 {
df7492f9
KH
7768 coding->dst_object = Fbuffer_string ();
7769 }
7770 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7771 {
7772 set_buffer_internal (XBUFFER (coding->dst_object));
7773 if (dst_bytes < coding->produced)
7774 {
b3bfad50 7775 destination = xrealloc (destination, coding->produced);
df7492f9
KH
7776 if (! destination)
7777 {
065e3595 7778 record_conversion_result (coding,
ebaf11b6 7779 CODING_RESULT_INSUFFICIENT_MEM);
df7492f9
KH
7780 unbind_to (count, Qnil);
7781 return;
7782 }
7783 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7784 move_gap_both (BEGV, BEGV_BYTE);
72af86bd 7785 memcpy (destination, BEGV_ADDR, coding->produced);
df7492f9 7786 coding->destination = destination;
d46c5b12 7787 }
ec6d2bb8 7788 }
b73bfc1c 7789
4776e638
KH
7790 if (saved_pt >= 0)
7791 {
7792 /* This is the case of:
7793 (BUFFERP (src_object) && EQ (src_object, dst_object))
7794 As we have moved PT while replacing the original buffer
7795 contents, we must recover it now. */
7796 set_buffer_internal (XBUFFER (src_object));
f4a3cc44 7797 current_buffer->text->inhibit_shrinking = 0;
4776e638
KH
7798 if (saved_pt < from)
7799 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7800 else if (saved_pt < from + chars)
7801 TEMP_SET_PT_BOTH (from, from_byte);
7802 else if (! NILP (current_buffer->enable_multibyte_characters))
7803 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7804 saved_pt_byte + (coding->produced - bytes));
7805 else
7806 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7807 saved_pt_byte + (coding->produced - bytes));
64cedb0c
KH
7808
7809 if (need_marker_adjustment)
7810 {
7811 struct Lisp_Marker *tail;
7812
7813 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7814 if (tail->need_adjustment)
7815 {
7816 tail->need_adjustment = 0;
7817 if (tail->insertion_type)
7818 {
7819 tail->bytepos = from_byte;
7820 tail->charpos = from;
7821 }
7822 else
7823 {
7824 tail->bytepos = from_byte + coding->produced;
7825 tail->charpos
7826 = (NILP (current_buffer->enable_multibyte_characters)
7827 ? tail->bytepos : from + coding->produced_char);
7828 }
7829 }
7830 }
d46c5b12 7831 }
4776e638 7832
b3bfad50 7833 Vdeactivate_mark = old_deactivate_mark;
065e3595 7834 unbind_to (count, coding->dst_object);
d46c5b12
KH
7835}
7836
d46c5b12 7837
df7492f9 7838void
cf84bb53
JB
7839encode_coding_object (struct coding_system *coding,
7840 Lisp_Object src_object,
7841 EMACS_INT from, EMACS_INT from_byte,
7842 EMACS_INT to, EMACS_INT to_byte,
7843 Lisp_Object dst_object)
d46c5b12 7844{
1a4990fb 7845 int count = SPECPDL_INDEX ();
df7492f9
KH
7846 EMACS_INT chars = to - from;
7847 EMACS_INT bytes = to_byte - from_byte;
7848 Lisp_Object attrs;
4776e638 7849 int saved_pt = -1, saved_pt_byte;
64cedb0c 7850 int need_marker_adjustment = 0;
c02d943b 7851 int kill_src_buffer = 0;
b3bfad50 7852 Lisp_Object old_deactivate_mark;
df7492f9 7853
b3bfad50 7854 old_deactivate_mark = Vdeactivate_mark;
df7492f9
KH
7855
7856 coding->src_object = src_object;
7857 coding->src_chars = chars;
7858 coding->src_bytes = bytes;
7859 coding->src_multibyte = chars < bytes;
7860
7861 attrs = CODING_ID_ATTRS (coding->id);
7862
64cedb0c
KH
7863 if (EQ (src_object, dst_object))
7864 {
7865 struct Lisp_Marker *tail;
7866
7867 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7868 {
7869 tail->need_adjustment
7870 = tail->charpos == (tail->insertion_type ? from : to);
7871 need_marker_adjustment |= tail->need_adjustment;
7872 }
7873 }
7874
df7492f9 7875 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6bac5b12 7876 {
24a73b0a 7877 coding->src_object = code_conversion_save (1, coding->src_multibyte);
df7492f9
KH
7878 set_buffer_internal (XBUFFER (coding->src_object));
7879 if (STRINGP (src_object))
7880 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7881 else if (BUFFERP (src_object))
7882 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7883 else
7884 insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
d46c5b12 7885
df7492f9
KH
7886 if (EQ (src_object, dst_object))
7887 {
7888 set_buffer_internal (XBUFFER (src_object));
4776e638 7889 saved_pt = PT, saved_pt_byte = PT_BYTE;
df7492f9
KH
7890 del_range_both (from, from_byte, to, to_byte, 1);
7891 set_buffer_internal (XBUFFER (coding->src_object));
7892 }
7893
d4850d67
KH
7894 {
7895 Lisp_Object args[3];
b3bfad50 7896 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
d4850d67 7897
b3bfad50
KH
7898 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7899 old_deactivate_mark);
d4850d67
KH
7900 args[0] = CODING_ATTR_PRE_WRITE (attrs);
7901 args[1] = make_number (BEG);
7902 args[2] = make_number (Z);
7903 safe_call (3, args);
b3bfad50 7904 UNGCPRO;
d4850d67 7905 }
c02d943b
KH
7906 if (XBUFFER (coding->src_object) != current_buffer)
7907 kill_src_buffer = 1;
ac87bbef 7908 coding->src_object = Fcurrent_buffer ();
df7492f9
KH
7909 if (BEG != GPT)
7910 move_gap_both (BEG, BEG_BYTE);
7911 coding->src_chars = Z - BEG;
7912 coding->src_bytes = Z_BYTE - BEG_BYTE;
7913 coding->src_pos = BEG;
7914 coding->src_pos_byte = BEG_BYTE;
7915 coding->src_multibyte = Z < Z_BYTE;
7916 }
7917 else if (STRINGP (src_object))
d46c5b12 7918 {
24a73b0a 7919 code_conversion_save (0, 0);
df7492f9
KH
7920 coding->src_pos = from;
7921 coding->src_pos_byte = from_byte;
b73bfc1c 7922 }
df7492f9 7923 else if (BUFFERP (src_object))
b73bfc1c 7924 {
24a73b0a 7925 code_conversion_save (0, 0);
df7492f9 7926 set_buffer_internal (XBUFFER (src_object));
df7492f9 7927 if (EQ (src_object, dst_object))
d46c5b12 7928 {
4776e638 7929 saved_pt = PT, saved_pt_byte = PT_BYTE;
ff0dacd7
KH
7930 coding->src_object = del_range_1 (from, to, 1, 1);
7931 coding->src_pos = 0;
7932 coding->src_pos_byte = 0;
d46c5b12 7933 }
df7492f9 7934 else
d46c5b12 7935 {
ff0dacd7
KH
7936 if (from < GPT && to >= GPT)
7937 move_gap_both (from, from_byte);
df7492f9
KH
7938 coding->src_pos = from;
7939 coding->src_pos_byte = from_byte;
d46c5b12 7940 }
d46c5b12 7941 }
4776e638 7942 else
24a73b0a 7943 code_conversion_save (0, 0);
d46c5b12 7944
df7492f9 7945 if (BUFFERP (dst_object))
88993dfd 7946 {
df7492f9 7947 coding->dst_object = dst_object;
28f67a95
KH
7948 if (EQ (src_object, dst_object))
7949 {
7950 coding->dst_pos = from;
7951 coding->dst_pos_byte = from_byte;
7952 }
7953 else
7954 {
319a3947
KH
7955 struct buffer *current = current_buffer;
7956
7957 set_buffer_temp (XBUFFER (dst_object));
7958 coding->dst_pos = PT;
7959 coding->dst_pos_byte = PT_BYTE;
7960 move_gap_both (coding->dst_pos, coding->dst_pos_byte);
7961 set_buffer_temp (current);
28f67a95 7962 }
df7492f9
KH
7963 coding->dst_multibyte
7964 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
88993dfd 7965 }
df7492f9 7966 else if (EQ (dst_object, Qt))
d46c5b12 7967 {
df7492f9 7968 coding->dst_object = Qnil;
df7492f9 7969 coding->dst_bytes = coding->src_chars;
ac87bbef
KH
7970 if (coding->dst_bytes == 0)
7971 coding->dst_bytes = 1;
7972 coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
df7492f9 7973 coding->dst_multibyte = 0;
d46c5b12
KH
7974 }
7975 else
7976 {
df7492f9
KH
7977 coding->dst_object = Qnil;
7978 coding->dst_multibyte = 0;
d46c5b12
KH
7979 }
7980
df7492f9 7981 encode_coding (coding);
d46c5b12 7982
df7492f9 7983 if (EQ (dst_object, Qt))
d46c5b12 7984 {
df7492f9
KH
7985 if (BUFFERP (coding->dst_object))
7986 coding->dst_object = Fbuffer_string ();
7987 else
d46c5b12 7988 {
df7492f9
KH
7989 coding->dst_object
7990 = make_unibyte_string ((char *) coding->destination,
7991 coding->produced);
7992 xfree (coding->destination);
d46c5b12 7993 }
4ed46869 7994 }
d46c5b12 7995
4776e638
KH
7996 if (saved_pt >= 0)
7997 {
7998 /* This is the case of:
7999 (BUFFERP (src_object) && EQ (src_object, dst_object))
8000 As we have moved PT while replacing the original buffer
8001 contents, we must recover it now. */
8002 set_buffer_internal (XBUFFER (src_object));
8003 if (saved_pt < from)
8004 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8005 else if (saved_pt < from + chars)
8006 TEMP_SET_PT_BOTH (from, from_byte);
8007 else if (! NILP (current_buffer->enable_multibyte_characters))
8008 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8009 saved_pt_byte + (coding->produced - bytes));
d46c5b12 8010 else
4776e638
KH
8011 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8012 saved_pt_byte + (coding->produced - bytes));
64cedb0c
KH
8013
8014 if (need_marker_adjustment)
8015 {
8016 struct Lisp_Marker *tail;
8017
8018 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8019 if (tail->need_adjustment)
8020 {
8021 tail->need_adjustment = 0;
8022 if (tail->insertion_type)
8023 {
8024 tail->bytepos = from_byte;
8025 tail->charpos = from;
8026 }
8027 else
8028 {
8029 tail->bytepos = from_byte + coding->produced;
8030 tail->charpos
8031 = (NILP (current_buffer->enable_multibyte_characters)
8032 ? tail->bytepos : from + coding->produced_char);
8033 }
8034 }
8035 }
4776e638
KH
8036 }
8037
c02d943b
KH
8038 if (kill_src_buffer)
8039 Fkill_buffer (coding->src_object);
b3bfad50
KH
8040
8041 Vdeactivate_mark = old_deactivate_mark;
df7492f9 8042 unbind_to (count, Qnil);
b73bfc1c
KH
8043}
8044
df7492f9 8045
b73bfc1c 8046Lisp_Object
971de7fb 8047preferred_coding_system (void)
b73bfc1c 8048{
df7492f9 8049 int id = coding_categories[coding_priorities[0]].id;
2391eaa4 8050
df7492f9 8051 return CODING_ID_NAME (id);
4ed46869
KH
8052}
8053
8054\f
8055#ifdef emacs
1397dc18 8056/*** 11. Emacs Lisp library functions ***/
4ed46869 8057
4ed46869 8058DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
48b0f3ae 8059 doc: /* Return t if OBJECT is nil or a coding-system.
df7492f9 8060See the documentation of `define-coding-system' for information
48b0f3ae 8061about coding-system objects. */)
5842a27b 8062 (Lisp_Object object)
4ed46869 8063{
d4a1d553
JB
8064 if (NILP (object)
8065 || CODING_SYSTEM_ID (object) >= 0)
44e8490d 8066 return Qt;
d4a1d553
JB
8067 if (! SYMBOLP (object)
8068 || NILP (Fget (object, Qcoding_system_define_form)))
44e8490d
KH
8069 return Qnil;
8070 return Qt;
4ed46869
KH
8071}
8072
9d991de8
RS
8073DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8074 Sread_non_nil_coding_system, 1, 1, 0,
48b0f3ae 8075 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
5842a27b 8076 (Lisp_Object prompt)
4ed46869 8077{
e0e989f6 8078 Lisp_Object val;
9d991de8
RS
8079 do
8080 {
4608c386
KH
8081 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8082 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
9d991de8 8083 }
8f924df7 8084 while (SCHARS (val) == 0);
e0e989f6 8085 return (Fintern (val, Qnil));
4ed46869
KH
8086}
8087
9b787f3e 8088DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
48b0f3ae 8089 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
c7183fb8
GM
8090If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8091Ignores case when completing coding systems (all Emacs coding systems
8092are lower-case). */)
5842a27b 8093 (Lisp_Object prompt, Lisp_Object default_coding_system)
4ed46869 8094{
f44d27ce 8095 Lisp_Object val;
c7183fb8
GM
8096 int count = SPECPDL_INDEX ();
8097
9b787f3e 8098 if (SYMBOLP (default_coding_system))
57d25e6f 8099 default_coding_system = SYMBOL_NAME (default_coding_system);
c7183fb8 8100 specbind (Qcompletion_ignore_case, Qt);
4608c386 8101 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
9b787f3e
RS
8102 Qt, Qnil, Qcoding_system_history,
8103 default_coding_system, Qnil);
c7183fb8 8104 unbind_to (count, Qnil);
8f924df7 8105 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
4ed46869
KH
8106}
8107
8108DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8109 1, 1, 0,
48b0f3ae 8110 doc: /* Check validity of CODING-SYSTEM.
9ffd559c
KH
8111If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8112It is valid if it is nil or a symbol defined as a coding system by the
8113function `define-coding-system'. */)
5842a27b 8114 (Lisp_Object coding_system)
4ed46869 8115{
44e8490d
KH
8116 Lisp_Object define_form;
8117
8118 define_form = Fget (coding_system, Qcoding_system_define_form);
8119 if (! NILP (define_form))
8120 {
8121 Fput (coding_system, Qcoding_system_define_form, Qnil);
8122 safe_eval (define_form);
8123 }
4ed46869
KH
8124 if (!NILP (Fcoding_system_p (coding_system)))
8125 return coding_system;
fcad4ec4 8126 xsignal1 (Qcoding_system_error, coding_system);
4ed46869 8127}
df7492f9 8128
3a73fa5d 8129\f
89528eb3
KH
8130/* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
8131 HIGHEST is nonzero, return the coding system of the highest
ad1746f5 8132 priority among the detected coding systems. Otherwise return a
89528eb3
KH
8133 list of detected coding systems sorted by their priorities. If
8134 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
8135 multibyte form but contain only ASCII and eight-bit chars.
8136 Otherwise, the bytes are raw bytes.
8137
8138 CODING-SYSTEM controls the detection as below:
8139
8140 If it is nil, detect both text-format and eol-format. If the
8141 text-format part of CODING-SYSTEM is already specified
8142 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
8143 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8144 detect only text-format. */
8145
d46c5b12 8146Lisp_Object
cf84bb53
JB
8147detect_coding_system (const unsigned char *src,
8148 EMACS_INT src_chars, EMACS_INT src_bytes,
8149 int highest, int multibytep,
8150 Lisp_Object coding_system)
4ed46869 8151{
8f924df7 8152 const unsigned char *src_end = src + src_bytes;
df7492f9 8153 Lisp_Object attrs, eol_type;
4533845d 8154 Lisp_Object val = Qnil;
df7492f9 8155 struct coding_system coding;
89528eb3 8156 int id;
ff0dacd7 8157 struct coding_detection_info detect_info;
24a73b0a 8158 enum coding_category base_category;
2f3cbb32 8159 int null_byte_found = 0, eight_bit_found = 0;
b73bfc1c 8160
df7492f9
KH
8161 if (NILP (coding_system))
8162 coding_system = Qundecided;
8163 setup_coding_system (coding_system, &coding);
8164 attrs = CODING_ID_ATTRS (coding.id);
8165 eol_type = CODING_ID_EOL_TYPE (coding.id);
89528eb3 8166 coding_system = CODING_ATTR_BASE_NAME (attrs);
4ed46869 8167
df7492f9 8168 coding.source = src;
24a73b0a 8169 coding.src_chars = src_chars;
df7492f9
KH
8170 coding.src_bytes = src_bytes;
8171 coding.src_multibyte = multibytep;
8172 coding.consumed = 0;
89528eb3 8173 coding.mode |= CODING_MODE_LAST_BLOCK;
c0e16b14 8174 coding.head_ascii = 0;
d46c5b12 8175
ff0dacd7 8176 detect_info.checked = detect_info.found = detect_info.rejected = 0;
d46c5b12 8177
89528eb3 8178 /* At first, detect text-format if necessary. */
24a73b0a
KH
8179 base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8180 if (base_category == coding_category_undecided)
4ed46869 8181 {
ff0dacd7
KH
8182 enum coding_category category;
8183 struct coding_system *this;
8184 int c, i;
88993dfd 8185
24a73b0a 8186 /* Skip all ASCII bytes except for a few ISO2022 controls. */
2f3cbb32 8187 for (; src < src_end; src++)
4ed46869 8188 {
df7492f9 8189 c = *src;
6cb21a4f 8190 if (c & 0x80)
6cb21a4f 8191 {
2f3cbb32 8192 eight_bit_found = 1;
2f3cbb32
KH
8193 if (null_byte_found)
8194 break;
8195 }
c0e16b14 8196 else if (c < 0x20)
2f3cbb32
KH
8197 {
8198 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8199 && ! inhibit_iso_escape_detection
8200 && ! detect_info.checked)
6cb21a4f 8201 {
2f3cbb32
KH
8202 if (detect_coding_iso_2022 (&coding, &detect_info))
8203 {
8204 /* We have scanned the whole data. */
8205 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
c0e16b14
KH
8206 {
8207 /* We didn't find an 8-bit code. We may
8208 have found a null-byte, but it's very
8209 rare that a binary file conforms to
8210 ISO-2022. */
8211 src = src_end;
8212 coding.head_ascii = src - coding.source;
8213 }
8214 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
2f3cbb32
KH
8215 break;
8216 }
8217 }
97b1b294 8218 else if (! c && !inhibit_null_byte_detection)
2f3cbb32
KH
8219 {
8220 null_byte_found = 1;
8221 if (eight_bit_found)
8222 break;
6cb21a4f 8223 }
c006c0c8
KH
8224 if (! eight_bit_found)
8225 coding.head_ascii++;
6cb21a4f 8226 }
c006c0c8 8227 else if (! eight_bit_found)
c0e16b14 8228 coding.head_ascii++;
4ed46869 8229 }
88993dfd 8230
2f3cbb32
KH
8231 if (null_byte_found || eight_bit_found
8232 || coding.head_ascii < coding.src_bytes
6cb21a4f
KH
8233 || detect_info.found)
8234 {
2f3cbb32 8235 if (coding.head_ascii == coding.src_bytes)
6cb21a4f
KH
8236 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
8237 for (i = 0; i < coding_category_raw_text; i++)
ff0dacd7 8238 {
6cb21a4f 8239 category = coding_priorities[i];
c7266f4a 8240 this = coding_categories + category;
6cb21a4f 8241 if (detect_info.found & (1 << category))
ff0dacd7
KH
8242 break;
8243 }
6cb21a4f 8244 else
2f3cbb32
KH
8245 {
8246 if (null_byte_found)
8247 {
8248 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8249 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8250 }
8251 for (i = 0; i < coding_category_raw_text; i++)
8252 {
8253 category = coding_priorities[i];
8254 this = coding_categories + category;
6cb21a4f 8255
2f3cbb32
KH
8256 if (this->id < 0)
8257 {
8258 /* No coding system of this category is defined. */
8259 detect_info.rejected |= (1 << category);
8260 }
8261 else if (category >= coding_category_raw_text)
8262 continue;
8263 else if (detect_info.checked & (1 << category))
8264 {
8265 if (highest
8266 && (detect_info.found & (1 << category)))
6cb21a4f 8267 break;
2f3cbb32
KH
8268 }
8269 else if ((*(this->detector)) (&coding, &detect_info)
8270 && highest
8271 && (detect_info.found & (1 << category)))
8272 {
8273 if (category == coding_category_utf_16_auto)
8274 {
8275 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8276 category = coding_category_utf_16_le;
8277 else
8278 category = coding_category_utf_16_be;
8279 }
8280 break;
8281 }
8282 }
8283 }
6cb21a4f 8284 }
ec6d2bb8 8285
4cddb209
KH
8286 if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8287 || null_byte_found)
ec6d2bb8 8288 {
ff0dacd7 8289 detect_info.found = CATEGORY_MASK_RAW_TEXT;
4cddb209 8290 id = CODING_SYSTEM_ID (Qno_conversion);
89528eb3
KH
8291 val = Fcons (make_number (id), Qnil);
8292 }
ff0dacd7 8293 else if (! detect_info.rejected && ! detect_info.found)
89528eb3 8294 {
ff0dacd7 8295 detect_info.found = CATEGORY_MASK_ANY;
89528eb3
KH
8296 id = coding_categories[coding_category_undecided].id;
8297 val = Fcons (make_number (id), Qnil);
8298 }
8299 else if (highest)
8300 {
ff0dacd7 8301 if (detect_info.found)
ec6d2bb8 8302 {
ff0dacd7
KH
8303 detect_info.found = 1 << category;
8304 val = Fcons (make_number (this->id), Qnil);
8305 }
8306 else
8307 for (i = 0; i < coding_category_raw_text; i++)
8308 if (! (detect_info.rejected & (1 << coding_priorities[i])))
8309 {
8310 detect_info.found = 1 << coding_priorities[i];
8311 id = coding_categories[coding_priorities[i]].id;
8312 val = Fcons (make_number (id), Qnil);
8313 break;
8314 }
8315 }
89528eb3
KH
8316 else
8317 {
ff0dacd7
KH
8318 int mask = detect_info.rejected | detect_info.found;
8319 int found = 0;
ec6d2bb8 8320
89528eb3 8321 for (i = coding_category_raw_text - 1; i >= 0; i--)
ff0dacd7
KH
8322 {
8323 category = coding_priorities[i];
8324 if (! (mask & (1 << category)))
ec6d2bb8 8325 {
ff0dacd7
KH
8326 found |= 1 << category;
8327 id = coding_categories[category].id;
c7266f4a
KH
8328 if (id >= 0)
8329 val = Fcons (make_number (id), val);
ff0dacd7
KH
8330 }
8331 }
8332 for (i = coding_category_raw_text - 1; i >= 0; i--)
8333 {
8334 category = coding_priorities[i];
8335 if (detect_info.found & (1 << category))
8336 {
8337 id = coding_categories[category].id;
8338 val = Fcons (make_number (id), val);
ec6d2bb8 8339 }
ec6d2bb8 8340 }
ff0dacd7 8341 detect_info.found |= found;
ec6d2bb8 8342 }
ec6d2bb8 8343 }
a470d443
KH
8344 else if (base_category == coding_category_utf_8_auto)
8345 {
8346 if (detect_coding_utf_8 (&coding, &detect_info))
8347 {
8348 struct coding_system *this;
8349
8350 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8351 this = coding_categories + coding_category_utf_8_sig;
8352 else
8353 this = coding_categories + coding_category_utf_8_nosig;
8354 val = Fcons (make_number (this->id), Qnil);
8355 }
8356 }
24a73b0a
KH
8357 else if (base_category == coding_category_utf_16_auto)
8358 {
8359 if (detect_coding_utf_16 (&coding, &detect_info))
8360 {
24a73b0a
KH
8361 struct coding_system *this;
8362
8363 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8364 this = coding_categories + coding_category_utf_16_le;
8365 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8366 this = coding_categories + coding_category_utf_16_be;
8367 else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8368 this = coding_categories + coding_category_utf_16_be_nosig;
8369 else
8370 this = coding_categories + coding_category_utf_16_le_nosig;
8371 val = Fcons (make_number (this->id), Qnil);
8372 }
8373 }
df7492f9
KH
8374 else
8375 {
ff0dacd7 8376 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
89528eb3 8377 val = Fcons (make_number (coding.id), Qnil);
4ed46869 8378 }
df7492f9 8379
89528eb3 8380 /* Then, detect eol-format if necessary. */
df7492f9 8381 {
4533845d 8382 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
df7492f9
KH
8383 Lisp_Object tail;
8384
89528eb3
KH
8385 if (VECTORP (eol_type))
8386 {
ff0dacd7 8387 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
2f3cbb32
KH
8388 {
8389 if (null_byte_found)
8390 normal_eol = EOL_SEEN_LF;
8391 else
8392 normal_eol = detect_eol (coding.source, src_bytes,
8393 coding_category_raw_text);
8394 }
ff0dacd7
KH
8395 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8396 | CATEGORY_MASK_UTF_16_BE_NOSIG))
89528eb3
KH
8397 utf_16_be_eol = detect_eol (coding.source, src_bytes,
8398 coding_category_utf_16_be);
ff0dacd7
KH
8399 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8400 | CATEGORY_MASK_UTF_16_LE_NOSIG))
89528eb3
KH
8401 utf_16_le_eol = detect_eol (coding.source, src_bytes,
8402 coding_category_utf_16_le);
8403 }
8404 else
8405 {
8406 if (EQ (eol_type, Qunix))
8407 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8408 else if (EQ (eol_type, Qdos))
8409 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8410 else
8411 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8412 }
8413
df7492f9
KH
8414 for (tail = val; CONSP (tail); tail = XCDR (tail))
8415 {
89528eb3 8416 enum coding_category category;
df7492f9 8417 int this_eol;
89528eb3
KH
8418
8419 id = XINT (XCAR (tail));
8420 attrs = CODING_ID_ATTRS (id);
8421 category = XINT (CODING_ATTR_CATEGORY (attrs));
8422 eol_type = CODING_ID_EOL_TYPE (id);
df7492f9
KH
8423 if (VECTORP (eol_type))
8424 {
89528eb3
KH
8425 if (category == coding_category_utf_16_be
8426 || category == coding_category_utf_16_be_nosig)
8427 this_eol = utf_16_be_eol;
8428 else if (category == coding_category_utf_16_le
8429 || category == coding_category_utf_16_le_nosig)
8430 this_eol = utf_16_le_eol;
df7492f9 8431 else
89528eb3
KH
8432 this_eol = normal_eol;
8433
df7492f9
KH
8434 if (this_eol == EOL_SEEN_LF)
8435 XSETCAR (tail, AREF (eol_type, 0));
8436 else if (this_eol == EOL_SEEN_CRLF)
8437 XSETCAR (tail, AREF (eol_type, 1));
8438 else if (this_eol == EOL_SEEN_CR)
8439 XSETCAR (tail, AREF (eol_type, 2));
89528eb3
KH
8440 else
8441 XSETCAR (tail, CODING_ID_NAME (id));
df7492f9 8442 }
89528eb3
KH
8443 else
8444 XSETCAR (tail, CODING_ID_NAME (id));
df7492f9
KH
8445 }
8446 }
ec6d2bb8 8447
4533845d 8448 return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
ec6d2bb8
KH
8449}
8450
ec6d2bb8 8451
d46c5b12
KH
8452DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8453 2, 3, 0,
48b0f3ae
PJ
8454 doc: /* Detect coding system of the text in the region between START and END.
8455Return a list of possible coding systems ordered by priority.
b811c52b
KH
8456The coding systems to try and their priorities follows what
8457the function `coding-system-priority-list' (which see) returns.
ec6d2bb8 8458
12e0131a 8459If only ASCII characters are found (except for such ISO-2022 control
d4a1d553
JB
8460characters as ESC), it returns a list of single element `undecided'
8461or its subsidiary coding system according to a detected end-of-line
8462format.
ec6d2bb8 8463
48b0f3ae
PJ
8464If optional argument HIGHEST is non-nil, return the coding system of
8465highest priority. */)
5842a27b 8466 (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
d46c5b12
KH
8467{
8468 int from, to;
8469 int from_byte, to_byte;
ec6d2bb8 8470
b7826503
PJ
8471 CHECK_NUMBER_COERCE_MARKER (start);
8472 CHECK_NUMBER_COERCE_MARKER (end);
ec6d2bb8 8473
d46c5b12
KH
8474 validate_region (&start, &end);
8475 from = XINT (start), to = XINT (end);
8476 from_byte = CHAR_TO_BYTE (from);
8477 to_byte = CHAR_TO_BYTE (to);
ec6d2bb8 8478
d46c5b12
KH
8479 if (from < GPT && to >= GPT)
8480 move_gap_both (to, to_byte);
c210f766 8481
d46c5b12 8482 return detect_coding_system (BYTE_POS_ADDR (from_byte),
24a73b0a 8483 to - from, to_byte - from_byte,
0a28aafb
KH
8484 !NILP (highest),
8485 !NILP (current_buffer
df7492f9
KH
8486 ->enable_multibyte_characters),
8487 Qnil);
ec6d2bb8
KH
8488}
8489
d46c5b12
KH
8490DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8491 1, 2, 0,
48b0f3ae
PJ
8492 doc: /* Detect coding system of the text in STRING.
8493Return a list of possible coding systems ordered by priority.
67ceab9d
KH
8494The coding systems to try and their priorities follows what
8495the function `coding-system-priority-list' (which see) returns.
fb88bf2d 8496
12e0131a 8497If only ASCII characters are found (except for such ISO-2022 control
d4a1d553
JB
8498characters as ESC), it returns a list of single element `undecided'
8499or its subsidiary coding system according to a detected end-of-line
8500format.
d46c5b12 8501
48b0f3ae
PJ
8502If optional argument HIGHEST is non-nil, return the coding system of
8503highest priority. */)
5842a27b 8504 (Lisp_Object string, Lisp_Object highest)
d46c5b12 8505{
b7826503 8506 CHECK_STRING (string);
b73bfc1c 8507
24a73b0a
KH
8508 return detect_coding_system (SDATA (string),
8509 SCHARS (string), SBYTES (string),
8f924df7 8510 !NILP (highest), STRING_MULTIBYTE (string),
df7492f9 8511 Qnil);
4ed46869 8512}
4ed46869 8513
b73bfc1c 8514
df7492f9 8515static INLINE int
971de7fb 8516char_encodable_p (int c, Lisp_Object attrs)
05e6f5dc 8517{
df7492f9 8518 Lisp_Object tail;
df7492f9 8519 struct charset *charset;
7d64c6ad 8520 Lisp_Object translation_table;
d46c5b12 8521
7d64c6ad 8522 translation_table = CODING_ATTR_TRANS_TBL (attrs);
a6f87d34 8523 if (! NILP (translation_table))
7d64c6ad 8524 c = translate_char (translation_table, c);
df7492f9
KH
8525 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8526 CONSP (tail); tail = XCDR (tail))
e133c8fa 8527 {
df7492f9
KH
8528 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8529 if (CHAR_CHARSET_P (c, charset))
8530 break;
e133c8fa 8531 }
df7492f9 8532 return (! NILP (tail));
05e6f5dc 8533}
83fa074f 8534
fb88bf2d 8535
df7492f9
KH
8536/* Return a list of coding systems that safely encode the text between
8537 START and END. If EXCLUDE is non-nil, it is a list of coding
8538 systems not to check. The returned list doesn't contain any such
48468dac 8539 coding systems. In any case, if the text contains only ASCII or is
df7492f9 8540 unibyte, return t. */
e077cc80 8541
df7492f9
KH
8542DEFUN ("find-coding-systems-region-internal",
8543 Ffind_coding_systems_region_internal,
8544 Sfind_coding_systems_region_internal, 2, 3, 0,
8545 doc: /* Internal use only. */)
5842a27b 8546 (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
df7492f9
KH
8547{
8548 Lisp_Object coding_attrs_list, safe_codings;
8549 EMACS_INT start_byte, end_byte;
7c78e542 8550 const unsigned char *p, *pbeg, *pend;
df7492f9 8551 int c;
0e727afa 8552 Lisp_Object tail, elt, work_table;
d46c5b12 8553
df7492f9
KH
8554 if (STRINGP (start))
8555 {
8556 if (!STRING_MULTIBYTE (start)
8f924df7 8557 || SCHARS (start) == SBYTES (start))
df7492f9
KH
8558 return Qt;
8559 start_byte = 0;
8f924df7 8560 end_byte = SBYTES (start);
df7492f9
KH
8561 }
8562 else
d46c5b12 8563 {
df7492f9
KH
8564 CHECK_NUMBER_COERCE_MARKER (start);
8565 CHECK_NUMBER_COERCE_MARKER (end);
8566 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8567 args_out_of_range (start, end);
8568 if (NILP (current_buffer->enable_multibyte_characters))
8569 return Qt;
8570 start_byte = CHAR_TO_BYTE (XINT (start));
8571 end_byte = CHAR_TO_BYTE (XINT (end));
8572 if (XINT (end) - XINT (start) == end_byte - start_byte)
8573 return Qt;
d46c5b12 8574
e1c23804 8575 if (XINT (start) < GPT && XINT (end) > GPT)
d46c5b12 8576 {
e1c23804
DL
8577 if ((GPT - XINT (start)) < (XINT (end) - GPT))
8578 move_gap_both (XINT (start), start_byte);
df7492f9 8579 else
e1c23804 8580 move_gap_both (XINT (end), end_byte);
d46c5b12
KH
8581 }
8582 }
8583
df7492f9
KH
8584 coding_attrs_list = Qnil;
8585 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8586 if (NILP (exclude)
8587 || NILP (Fmemq (XCAR (tail), exclude)))
8588 {
8589 Lisp_Object attrs;
d46c5b12 8590
df7492f9
KH
8591 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8592 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8593 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7d64c6ad
KH
8594 {
8595 ASET (attrs, coding_attr_trans_tbl,
2170c8f0 8596 get_translation_table (attrs, 1, NULL));
7d64c6ad
KH
8597 coding_attrs_list = Fcons (attrs, coding_attrs_list);
8598 }
df7492f9 8599 }
d46c5b12 8600
df7492f9 8601 if (STRINGP (start))
8f924df7 8602 p = pbeg = SDATA (start);
df7492f9
KH
8603 else
8604 p = pbeg = BYTE_POS_ADDR (start_byte);
8605 pend = p + (end_byte - start_byte);
b843d1ae 8606
df7492f9
KH
8607 while (p < pend && ASCII_BYTE_P (*p)) p++;
8608 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
d46c5b12 8609
0e727afa 8610 work_table = Fmake_char_table (Qnil, Qnil);
05e6f5dc 8611 while (p < pend)
72d1a715 8612 {
df7492f9
KH
8613 if (ASCII_BYTE_P (*p))
8614 p++;
72d1a715
RS
8615 else
8616 {
df7492f9 8617 c = STRING_CHAR_ADVANCE (p);
0e727afa
YM
8618 if (!NILP (char_table_ref (work_table, c)))
8619 /* This character was already checked. Ignore it. */
8620 continue;
12410ef1 8621
df7492f9
KH
8622 charset_map_loaded = 0;
8623 for (tail = coding_attrs_list; CONSP (tail);)
8624 {
8625 elt = XCAR (tail);
8626 if (NILP (elt))
8627 tail = XCDR (tail);
8628 else if (char_encodable_p (c, elt))
8629 tail = XCDR (tail);
8630 else if (CONSP (XCDR (tail)))
8631 {
8632 XSETCAR (tail, XCAR (XCDR (tail)));
8633 XSETCDR (tail, XCDR (XCDR (tail)));
8634 }
8635 else
8636 {
8637 XSETCAR (tail, Qnil);
8638 tail = XCDR (tail);
8639 }
8640 }
8641 if (charset_map_loaded)
8642 {
8643 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
05e6f5dc 8644
df7492f9 8645 if (STRINGP (start))
8f924df7 8646 pbeg = SDATA (start);
df7492f9
KH
8647 else
8648 pbeg = BYTE_POS_ADDR (start_byte);
8649 p = pbeg + p_offset;
8650 pend = pbeg + pend_offset;
8651 }
0e727afa 8652 char_table_set (work_table, c, Qt);
df7492f9 8653 }
ec6d2bb8 8654 }
fb88bf2d 8655
988b3759 8656 safe_codings = list2 (Qraw_text, Qno_conversion);
df7492f9
KH
8657 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8658 if (! NILP (XCAR (tail)))
8659 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
ec6d2bb8 8660
05e6f5dc
KH
8661 return safe_codings;
8662}
4956c225 8663
d46c5b12 8664
8f924df7
KH
8665DEFUN ("unencodable-char-position", Funencodable_char_position,
8666 Sunencodable_char_position, 3, 5, 0,
8667 doc: /*
8668Return position of first un-encodable character in a region.
d4a1d553 8669START and END specify the region and CODING-SYSTEM specifies the
8f924df7 8670encoding to check. Return nil if CODING-SYSTEM does encode the region.
d46c5b12 8671
8f924df7
KH
8672If optional 4th argument COUNT is non-nil, it specifies at most how
8673many un-encodable characters to search. In this case, the value is a
8674list of positions.
d46c5b12 8675
8f924df7
KH
8676If optional 5th argument STRING is non-nil, it is a string to search
8677for un-encodable characters. In that case, START and END are indexes
8678to the string. */)
5842a27b 8679 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
8f924df7
KH
8680{
8681 int n;
8682 struct coding_system coding;
7d64c6ad 8683 Lisp_Object attrs, charset_list, translation_table;
8f924df7
KH
8684 Lisp_Object positions;
8685 int from, to;
8686 const unsigned char *p, *stop, *pend;
8687 int ascii_compatible;
fb88bf2d 8688
8f924df7
KH
8689 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8690 attrs = CODING_ID_ATTRS (coding.id);
8691 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8692 return Qnil;
8693 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8694 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2170c8f0 8695 translation_table = get_translation_table (attrs, 1, NULL);
fb88bf2d 8696
8f924df7
KH
8697 if (NILP (string))
8698 {
8699 validate_region (&start, &end);
8700 from = XINT (start);
8701 to = XINT (end);
8702 if (NILP (current_buffer->enable_multibyte_characters)
8703 || (ascii_compatible
8704 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8705 return Qnil;
8706 p = CHAR_POS_ADDR (from);
8707 pend = CHAR_POS_ADDR (to);
8708 if (from < GPT && to >= GPT)
8709 stop = GPT_ADDR;
8710 else
8711 stop = pend;
8712 }
8713 else
8714 {
8715 CHECK_STRING (string);
8716 CHECK_NATNUM (start);
8717 CHECK_NATNUM (end);
8718 from = XINT (start);
8719 to = XINT (end);
8720 if (from > to
8721 || to > SCHARS (string))
8722 args_out_of_range_3 (string, start, end);
8723 if (! STRING_MULTIBYTE (string))
8724 return Qnil;
8725 p = SDATA (string) + string_char_to_byte (string, from);
8726 stop = pend = SDATA (string) + string_char_to_byte (string, to);
8727 if (ascii_compatible && (to - from) == (pend - p))
8728 return Qnil;
8729 }
f2558efd 8730
8f924df7
KH
8731 if (NILP (count))
8732 n = 1;
8733 else
b73bfc1c 8734 {
8f924df7
KH
8735 CHECK_NATNUM (count);
8736 n = XINT (count);
b73bfc1c
KH
8737 }
8738
8f924df7
KH
8739 positions = Qnil;
8740 while (1)
d46c5b12 8741 {
8f924df7 8742 int c;
ec6d2bb8 8743
8f924df7
KH
8744 if (ascii_compatible)
8745 while (p < stop && ASCII_BYTE_P (*p))
8746 p++, from++;
8747 if (p >= stop)
0e79d667 8748 {
8f924df7
KH
8749 if (p >= pend)
8750 break;
8751 stop = pend;
8752 p = GAP_END_ADDR;
0e79d667 8753 }
ec6d2bb8 8754
8f924df7
KH
8755 c = STRING_CHAR_ADVANCE (p);
8756 if (! (ASCII_CHAR_P (c) && ascii_compatible)
7d64c6ad
KH
8757 && ! char_charset (translate_char (translation_table, c),
8758 charset_list, NULL))
ec6d2bb8 8759 {
8f924df7
KH
8760 positions = Fcons (make_number (from), positions);
8761 n--;
8762 if (n == 0)
8763 break;
ec6d2bb8
KH
8764 }
8765
8f924df7
KH
8766 from++;
8767 }
d46c5b12 8768
8f924df7
KH
8769 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8770}
d46c5b12 8771
d46c5b12 8772
df7492f9
KH
8773DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8774 Scheck_coding_systems_region, 3, 3, 0,
8775 doc: /* Check if the region is encodable by coding systems.
d46c5b12 8776
df7492f9
KH
8777START and END are buffer positions specifying the region.
8778CODING-SYSTEM-LIST is a list of coding systems to check.
d46c5b12 8779
df7492f9 8780The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
d4a1d553 8781CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
df7492f9
KH
8782whole region, POS0, POS1, ... are buffer positions where non-encodable
8783characters are found.
93dec019 8784
df7492f9
KH
8785If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8786value is nil.
93dec019 8787
df7492f9
KH
8788START may be a string. In that case, check if the string is
8789encodable, and the value contains indices to the string instead of
5704f39a
KH
8790buffer positions. END is ignored.
8791
4c1958f4 8792If the current buffer (or START if it is a string) is unibyte, the value
5704f39a 8793is nil. */)
5842a27b 8794 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
05e6f5dc 8795{
df7492f9
KH
8796 Lisp_Object list;
8797 EMACS_INT start_byte, end_byte;
8798 int pos;
7c78e542 8799 const unsigned char *p, *pbeg, *pend;
df7492f9 8800 int c;
7d64c6ad 8801 Lisp_Object tail, elt, attrs;
70ad9fc4 8802
05e6f5dc
KH
8803 if (STRINGP (start))
8804 {
df7492f9 8805 if (!STRING_MULTIBYTE (start)
4c1958f4 8806 || SCHARS (start) == SBYTES (start))
df7492f9
KH
8807 return Qnil;
8808 start_byte = 0;
8f924df7 8809 end_byte = SBYTES (start);
df7492f9 8810 pos = 0;
d46c5b12 8811 }
05e6f5dc 8812 else
b73bfc1c 8813 {
b7826503
PJ
8814 CHECK_NUMBER_COERCE_MARKER (start);
8815 CHECK_NUMBER_COERCE_MARKER (end);
05e6f5dc
KH
8816 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8817 args_out_of_range (start, end);
8818 if (NILP (current_buffer->enable_multibyte_characters))
df7492f9
KH
8819 return Qnil;
8820 start_byte = CHAR_TO_BYTE (XINT (start));
8821 end_byte = CHAR_TO_BYTE (XINT (end));
8822 if (XINT (end) - XINT (start) == end_byte - start_byte)
5704f39a 8823 return Qnil;
df7492f9 8824
e1c23804 8825 if (XINT (start) < GPT && XINT (end) > GPT)
b73bfc1c 8826 {
e1c23804
DL
8827 if ((GPT - XINT (start)) < (XINT (end) - GPT))
8828 move_gap_both (XINT (start), start_byte);
df7492f9 8829 else
e1c23804 8830 move_gap_both (XINT (end), end_byte);
b73bfc1c 8831 }
e1c23804 8832 pos = XINT (start);
b73bfc1c 8833 }
7553d0e1 8834
df7492f9
KH
8835 list = Qnil;
8836 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
12410ef1 8837 {
df7492f9 8838 elt = XCAR (tail);
7d64c6ad 8839 attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
2170c8f0
KH
8840 ASET (attrs, coding_attr_trans_tbl,
8841 get_translation_table (attrs, 1, NULL));
7d64c6ad 8842 list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
12410ef1
KH
8843 }
8844
df7492f9 8845 if (STRINGP (start))
8f924df7 8846 p = pbeg = SDATA (start);
72d1a715 8847 else
df7492f9
KH
8848 p = pbeg = BYTE_POS_ADDR (start_byte);
8849 pend = p + (end_byte - start_byte);
4ed46869 8850
df7492f9
KH
8851 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8852 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
ec6d2bb8 8853
df7492f9 8854 while (p < pend)
d46c5b12 8855 {
df7492f9
KH
8856 if (ASCII_BYTE_P (*p))
8857 p++;
e133c8fa 8858 else
05e6f5dc 8859 {
df7492f9
KH
8860 c = STRING_CHAR_ADVANCE (p);
8861
8862 charset_map_loaded = 0;
8863 for (tail = list; CONSP (tail); tail = XCDR (tail))
8864 {
8865 elt = XCDR (XCAR (tail));
8866 if (! char_encodable_p (c, XCAR (elt)))
8867 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8868 }
8869 if (charset_map_loaded)
8870 {
8871 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8872
8873 if (STRINGP (start))
8f924df7 8874 pbeg = SDATA (start);
df7492f9
KH
8875 else
8876 pbeg = BYTE_POS_ADDR (start_byte);
8877 p = pbeg + p_offset;
8878 pend = pbeg + pend_offset;
8879 }
05e6f5dc 8880 }
df7492f9 8881 pos++;
d46c5b12 8882 }
4ed46869 8883
df7492f9
KH
8884 tail = list;
8885 list = Qnil;
8886 for (; CONSP (tail); tail = XCDR (tail))
ec6d2bb8 8887 {
df7492f9
KH
8888 elt = XCAR (tail);
8889 if (CONSP (XCDR (XCDR (elt))))
8890 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8891 list);
ec6d2bb8 8892 }
2b4f9037 8893
df7492f9 8894 return list;
d46c5b12
KH
8895}
8896
3fd9494b 8897
b73bfc1c 8898Lisp_Object
cf84bb53
JB
8899code_convert_region (Lisp_Object start, Lisp_Object end,
8900 Lisp_Object coding_system, Lisp_Object dst_object,
8901 int encodep, int norecord)
4ed46869 8902{
3a73fa5d 8903 struct coding_system coding;
df7492f9
KH
8904 EMACS_INT from, from_byte, to, to_byte;
8905 Lisp_Object src_object;
4ed46869 8906
b7826503
PJ
8907 CHECK_NUMBER_COERCE_MARKER (start);
8908 CHECK_NUMBER_COERCE_MARKER (end);
df7492f9
KH
8909 if (NILP (coding_system))
8910 coding_system = Qno_conversion;
8911 else
8912 CHECK_CODING_SYSTEM (coding_system);
8913 src_object = Fcurrent_buffer ();
8914 if (NILP (dst_object))
8915 dst_object = src_object;
8916 else if (! EQ (dst_object, Qt))
8917 CHECK_BUFFER (dst_object);
3a73fa5d 8918
d46c5b12
KH
8919 validate_region (&start, &end);
8920 from = XFASTINT (start);
df7492f9 8921 from_byte = CHAR_TO_BYTE (from);
d46c5b12 8922 to = XFASTINT (end);
df7492f9 8923 to_byte = CHAR_TO_BYTE (to);
764ca8da 8924
df7492f9
KH
8925 setup_coding_system (coding_system, &coding);
8926 coding.mode |= CODING_MODE_LAST_BLOCK;
ec6d2bb8 8927
df7492f9
KH
8928 if (encodep)
8929 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8930 dst_object);
8931 else
8932 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8933 dst_object);
8934 if (! norecord)
8935 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
ec6d2bb8 8936
df7492f9
KH
8937 return (BUFFERP (dst_object)
8938 ? make_number (coding.produced_char)
8939 : coding.dst_object);
4031e2bf 8940}
78108bcd 8941
4ed46869 8942
4031e2bf 8943DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
df7492f9 8944 3, 4, "r\nzCoding system: ",
48b0f3ae 8945 doc: /* Decode the current region from the specified coding system.
df7492f9
KH
8946When called from a program, takes four arguments:
8947 START, END, CODING-SYSTEM, and DESTINATION.
8948START and END are buffer positions.
8844fa83 8949
df7492f9 8950Optional 4th arguments DESTINATION specifies where the decoded text goes.
2354b80b 8951If nil, the region between START and END is replaced by the decoded text.
1560f91a
EZ
8952If buffer, the decoded text is inserted in that buffer after point (point
8953does not move).
446dcd75 8954In those cases, the length of the decoded text is returned.
319a3947 8955If DESTINATION is t, the decoded text is returned.
8844fa83 8956
48b0f3ae
PJ
8957This function sets `last-coding-system-used' to the precise coding system
8958used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
319a3947 8959not fully specified.) */)
5842a27b 8960 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
4031e2bf 8961{
df7492f9 8962 return code_convert_region (start, end, coding_system, destination, 0, 0);
3a73fa5d 8963}
8844fa83 8964
3a73fa5d 8965DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
df7492f9
KH
8966 3, 4, "r\nzCoding system: ",
8967 doc: /* Encode the current region by specified coding system.
d4a1d553
JB
8968When called from a program, takes four arguments:
8969 START, END, CODING-SYSTEM and DESTINATION.
8970START and END are buffer positions.
d46c5b12 8971
df7492f9
KH
8972Optional 4th arguments DESTINATION specifies where the encoded text goes.
8973If nil, the region between START and END is replace by the encoded text.
1560f91a
EZ
8974If buffer, the encoded text is inserted in that buffer after point (point
8975does not move).
446dcd75 8976In those cases, the length of the encoded text is returned.
319a3947 8977If DESTINATION is t, the encoded text is returned.
2391eaa4 8978
48b0f3ae
PJ
8979This function sets `last-coding-system-used' to the precise coding system
8980used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
319a3947 8981not fully specified.) */)
5842a27b 8982 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
3a73fa5d 8983{
df7492f9 8984 return code_convert_region (start, end, coding_system, destination, 1, 0);
b73bfc1c
KH
8985}
8986
8987Lisp_Object
6f704c76
DN
8988code_convert_string (Lisp_Object string, Lisp_Object coding_system,
8989 Lisp_Object dst_object, int encodep, int nocopy, int norecord)
b73bfc1c 8990{
4031e2bf 8991 struct coding_system coding;
df7492f9 8992 EMACS_INT chars, bytes;
ec6d2bb8 8993
b7826503 8994 CHECK_STRING (string);
d46c5b12 8995 if (NILP (coding_system))
4956c225 8996 {
df7492f9
KH
8997 if (! norecord)
8998 Vlast_coding_system_used = Qno_conversion;
8999 if (NILP (dst_object))
9000 return (nocopy ? Fcopy_sequence (string) : string);
4956c225 9001 }
b73bfc1c 9002
df7492f9
KH
9003 if (NILP (coding_system))
9004 coding_system = Qno_conversion;
9005 else
9006 CHECK_CODING_SYSTEM (coding_system);
9007 if (NILP (dst_object))
9008 dst_object = Qt;
9009 else if (! EQ (dst_object, Qt))
9010 CHECK_BUFFER (dst_object);
73be902c 9011
df7492f9 9012 setup_coding_system (coding_system, &coding);
d46c5b12 9013 coding.mode |= CODING_MODE_LAST_BLOCK;
8f924df7
KH
9014 chars = SCHARS (string);
9015 bytes = SBYTES (string);
df7492f9
KH
9016 if (encodep)
9017 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9018 else
9019 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9020 if (! norecord)
9021 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
73be902c 9022
df7492f9
KH
9023 return (BUFFERP (dst_object)
9024 ? make_number (coding.produced_char)
9025 : coding.dst_object);
4ed46869 9026}
73be902c 9027
b73bfc1c 9028
ecec61c1 9029/* Encode or decode STRING according to CODING_SYSTEM.
ec6d2bb8 9030 Do not set Vlast_coding_system_used.
4ed46869 9031
ec6d2bb8
KH
9032 This function is called only from macros DECODE_FILE and
9033 ENCODE_FILE, thus we ignore character composition. */
4ed46869 9034
ecec61c1 9035Lisp_Object
cf84bb53
JB
9036code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9037 int encodep)
4ed46869 9038{
0be8721c 9039 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
4ed46869
KH
9040}
9041
4ed46869 9042
df7492f9
KH
9043DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9044 2, 4, 0,
9045 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9046
9047Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9048if the decoding operation is trivial.
ecec61c1 9049
d4a1d553 9050Optional fourth arg BUFFER non-nil means that the decoded text is
1560f91a
EZ
9051inserted in that buffer after point (point does not move). In this
9052case, the return value is the length of the decoded text.
ecec61c1 9053
df7492f9
KH
9054This function sets `last-coding-system-used' to the precise coding system
9055used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
d4a1d553 9056not fully specified.) */)
5842a27b 9057 (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
4ed46869 9058{
df7492f9
KH
9059 return code_convert_string (string, coding_system, buffer,
9060 0, ! NILP (nocopy), 0);
4ed46869
KH
9061}
9062
df7492f9
KH
9063DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9064 2, 4, 0,
9065 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9066
9067Optional third arg NOCOPY non-nil means it is OK to return STRING
9068itself if the encoding operation is trivial.
9069
d4a1d553 9070Optional fourth arg BUFFER non-nil means that the encoded text is
1560f91a
EZ
9071inserted in that buffer after point (point does not move). In this
9072case, the return value is the length of the encoded text.
df7492f9
KH
9073
9074This function sets `last-coding-system-used' to the precise coding system
9075used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9076not fully specified.) */)
5842a27b 9077 (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
4ed46869 9078{
df7492f9 9079 return code_convert_string (string, coding_system, buffer,
c197f191 9080 1, ! NILP (nocopy), 1);
4ed46869 9081}
df7492f9 9082
3a73fa5d 9083\f
4ed46869 9084DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
48b0f3ae
PJ
9085 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9086Return the corresponding character. */)
5842a27b 9087 (Lisp_Object code)
4ed46869 9088{
df7492f9
KH
9089 Lisp_Object spec, attrs, val;
9090 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9091 int c;
4ed46869 9092
df7492f9
KH
9093 CHECK_NATNUM (code);
9094 c = XFASTINT (code);
9095 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9096 attrs = AREF (spec, 0);
4ed46869 9097
df7492f9
KH
9098 if (ASCII_BYTE_P (c)
9099 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9100 return code;
4ed46869 9101
df7492f9
KH
9102 val = CODING_ATTR_CHARSET_LIST (attrs);
9103 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
004068e4
KH
9104 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9105 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
fa42c37f 9106
df7492f9
KH
9107 if (c <= 0x7F)
9108 charset = charset_roman;
9109 else if (c >= 0xA0 && c < 0xDF)
55ab7be3 9110 {
df7492f9
KH
9111 charset = charset_kana;
9112 c -= 0x80;
4ed46869 9113 }
55ab7be3 9114 else
4ed46869 9115 {
004068e4 9116 int s1 = c >> 8, s2 = c & 0xFF;
df7492f9
KH
9117
9118 if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
9119 || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
9120 error ("Invalid code: %d", code);
9121 SJIS_TO_JIS (c);
9122 charset = charset_kanji;
4ed46869 9123 }
df7492f9
KH
9124 c = DECODE_CHAR (charset, c);
9125 if (c < 0)
9126 error ("Invalid code: %d", code);
9127 return make_number (c);
93dec019 9128}
4ed46869 9129
48b0f3ae 9130
4ed46869 9131DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
8acf0c0e 9132 doc: /* Encode a Japanese character CH to shift_jis encoding.
48b0f3ae 9133Return the corresponding code in SJIS. */)
5842a27b 9134 (Lisp_Object ch)
4ed46869 9135{
df7492f9
KH
9136 Lisp_Object spec, attrs, charset_list;
9137 int c;
9138 struct charset *charset;
9139 unsigned code;
48b0f3ae 9140
df7492f9
KH
9141 CHECK_CHARACTER (ch);
9142 c = XFASTINT (ch);
9143 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9144 attrs = AREF (spec, 0);
9145
9146 if (ASCII_CHAR_P (c)
9147 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9148 return ch;
9149
9150 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9151 charset = char_charset (c, charset_list, &code);
9152 if (code == CHARSET_INVALID_CODE (charset))
9153 error ("Can't encode by shift_jis encoding: %d", c);
9154 JIS_TO_SJIS (code);
9155
9156 return make_number (code);
4ed46869
KH
9157}
9158
9159DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
48b0f3ae
PJ
9160 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9161Return the corresponding character. */)
5842a27b 9162 (Lisp_Object code)
d46c5b12 9163{
df7492f9
KH
9164 Lisp_Object spec, attrs, val;
9165 struct charset *charset_roman, *charset_big5, *charset;
9166 int c;
6289dd10 9167
df7492f9
KH
9168 CHECK_NATNUM (code);
9169 c = XFASTINT (code);
9170 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9171 attrs = AREF (spec, 0);
4ed46869 9172
df7492f9
KH
9173 if (ASCII_BYTE_P (c)
9174 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9175 return code;
6289dd10 9176
df7492f9
KH
9177 val = CODING_ATTR_CHARSET_LIST (attrs);
9178 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9179 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
c210f766 9180
df7492f9
KH
9181 if (c <= 0x7F)
9182 charset = charset_roman;
c28a9453
KH
9183 else
9184 {
df7492f9
KH
9185 int b1 = c >> 8, b2 = c & 0x7F;
9186 if (b1 < 0xA1 || b1 > 0xFE
9187 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9188 error ("Invalid code: %d", code);
9189 charset = charset_big5;
c28a9453 9190 }
df7492f9
KH
9191 c = DECODE_CHAR (charset, (unsigned )c);
9192 if (c < 0)
9193 error ("Invalid code: %d", code);
9194 return make_number (c);
d46c5b12 9195}
6289dd10 9196
4ed46869 9197DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
8acf0c0e 9198 doc: /* Encode the Big5 character CH to BIG5 coding system.
48b0f3ae 9199Return the corresponding character code in Big5. */)
5842a27b 9200 (Lisp_Object ch)
4ed46869 9201{
df7492f9
KH
9202 Lisp_Object spec, attrs, charset_list;
9203 struct charset *charset;
9204 int c;
9205 unsigned code;
9206
9207 CHECK_CHARACTER (ch);
9208 c = XFASTINT (ch);
9209 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9210 attrs = AREF (spec, 0);
9211 if (ASCII_CHAR_P (c)
9212 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9213 return ch;
9214
9215 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9216 charset = char_charset (c, charset_list, &code);
9217 if (code == CHARSET_INVALID_CODE (charset))
9218 error ("Can't encode by Big5 encoding: %d", c);
9219
9220 return make_number (code);
4ed46869 9221}
48b0f3ae 9222
3a73fa5d 9223\f
002fdb44 9224DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
68bba4e4 9225 Sset_terminal_coding_system_internal, 1, 2, 0,
48b0f3ae 9226 doc: /* Internal use only. */)
5842a27b 9227 (Lisp_Object coding_system, Lisp_Object terminal)
4ed46869 9228{
b18fad6d
KH
9229 struct terminal *term = get_terminal (terminal, 1);
9230 struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
b7826503 9231 CHECK_SYMBOL (coding_system);
b8299c66 9232 setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
70c22245 9233 /* We had better not send unsafe characters to terminal. */
c73bd236 9234 terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
ad1746f5 9235 /* Character composition should be disabled. */
c73bd236 9236 terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
b8299c66
KL
9237 terminal_coding->src_multibyte = 1;
9238 terminal_coding->dst_multibyte = 0;
b18fad6d
KH
9239 if (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK)
9240 term->charset_list = coding_charset_list (terminal_coding);
9241 else
6b4bb703 9242 term->charset_list = Fcons (make_number (charset_ascii), Qnil);
4ed46869
KH
9243 return Qnil;
9244}
9245
c4825358
KH
9246DEFUN ("set-safe-terminal-coding-system-internal",
9247 Fset_safe_terminal_coding_system_internal,
48b0f3ae 9248 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
ddb67bdc 9249 doc: /* Internal use only. */)
5842a27b 9250 (Lisp_Object coding_system)
d46c5b12 9251{
b7826503 9252 CHECK_SYMBOL (coding_system);
c4825358
KH
9253 setup_coding_system (Fcheck_coding_system (coding_system),
9254 &safe_terminal_coding);
ad1746f5 9255 /* Character composition should be disabled. */
df7492f9 9256 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
b73bfc1c
KH
9257 safe_terminal_coding.src_multibyte = 1;
9258 safe_terminal_coding.dst_multibyte = 0;
c4825358
KH
9259 return Qnil;
9260}
4ed46869 9261
002fdb44 9262DEFUN ("terminal-coding-system", Fterminal_coding_system,
68bba4e4 9263 Sterminal_coding_system, 0, 1, 0,
6ed8eeff 9264 doc: /* Return coding system specified for terminal output on the given terminal.
708e05dc 9265TERMINAL may be a terminal object, a frame, or nil for the selected
6ed8eeff 9266frame's terminal device. */)
5842a27b 9267 (Lisp_Object terminal)
4ed46869 9268{
985773c9
MB
9269 struct coding_system *terminal_coding
9270 = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9271 Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
ae6f73fa 9272
ae6f73fa 9273 /* For backward compatibility, return nil if it is `undecided'. */
f75c90a9 9274 return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
4ed46869
KH
9275}
9276
002fdb44 9277DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
68bba4e4 9278 Sset_keyboard_coding_system_internal, 1, 2, 0,
48b0f3ae 9279 doc: /* Internal use only. */)
5842a27b 9280 (Lisp_Object coding_system, Lisp_Object terminal)
4ed46869 9281{
6ed8eeff 9282 struct terminal *t = get_terminal (terminal, 1);
b7826503 9283 CHECK_SYMBOL (coding_system);
624bda09
KH
9284 if (NILP (coding_system))
9285 coding_system = Qno_conversion;
9286 else
9287 Fcheck_coding_system (coding_system);
9288 setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
ad1746f5 9289 /* Character composition should be disabled. */
c73bd236
MB
9290 TERMINAL_KEYBOARD_CODING (t)->common_flags
9291 &= ~CODING_ANNOTATE_COMPOSITION_MASK;
4ed46869
KH
9292 return Qnil;
9293}
9294
9295DEFUN ("keyboard-coding-system",
985773c9 9296 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
48b0f3ae 9297 doc: /* Return coding system specified for decoding keyboard input. */)
5842a27b 9298 (Lisp_Object terminal)
4ed46869 9299{
985773c9
MB
9300 return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9301 (get_terminal (terminal, 1))->id);
4ed46869
KH
9302}
9303
4ed46869 9304\f
a5d301df
KH
9305DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9306 Sfind_operation_coding_system, 1, MANY, 0,
48b0f3ae
PJ
9307 doc: /* Choose a coding system for an operation based on the target name.
9308The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9309DECODING-SYSTEM is the coding system to use for decoding
9310\(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9311for encoding (in case OPERATION does encoding).
05e6f5dc 9312
48b0f3ae
PJ
9313The first argument OPERATION specifies an I/O primitive:
9314 For file I/O, `insert-file-contents' or `write-region'.
9315 For process I/O, `call-process', `call-process-region', or `start-process'.
9316 For network I/O, `open-network-stream'.
05e6f5dc 9317
48b0f3ae
PJ
9318The remaining arguments should be the same arguments that were passed
9319to the primitive. Depending on which primitive, one of those arguments
9320is selected as the TARGET. For example, if OPERATION does file I/O,
9321whichever argument specifies the file name is TARGET.
05e6f5dc 9322
48b0f3ae 9323TARGET has a meaning which depends on OPERATION:
b883cdb2 9324 For file I/O, TARGET is a file name (except for the special case below).
48b0f3ae 9325 For process I/O, TARGET is a process name.
d4a1d553 9326 For network I/O, TARGET is a service name or a port number.
05e6f5dc 9327
d4a1d553 9328This function looks up what is specified for TARGET in
48b0f3ae
PJ
9329`file-coding-system-alist', `process-coding-system-alist',
9330or `network-coding-system-alist' depending on OPERATION.
9331They may specify a coding system, a cons of coding systems,
9332or a function symbol to call.
9333In the last case, we call the function with one argument,
9334which is a list of all the arguments given to this function.
1011c487
MB
9335If the function can't decide a coding system, it can return
9336`undecided' so that the normal code-detection is performed.
48b0f3ae 9337
b883cdb2
MB
9338If OPERATION is `insert-file-contents', the argument corresponding to
9339TARGET may be a cons (FILENAME . BUFFER). In that case, FILENAME is a
9340file name to look up, and BUFFER is a buffer that contains the file's
9341contents (not yet decoded). If `file-coding-system-alist' specifies a
9342function to call for FILENAME, that function should examine the
9343contents of BUFFER instead of reading the file.
9344
d918f936 9345usage: (find-operation-coding-system OPERATION ARGUMENTS...) */)
5842a27b 9346 (int nargs, Lisp_Object *args)
6b89e3aa 9347{
4ed46869
KH
9348 Lisp_Object operation, target_idx, target, val;
9349 register Lisp_Object chain;
177c0ea7 9350
4ed46869
KH
9351 if (nargs < 2)
9352 error ("Too few arguments");
9353 operation = args[0];
9354 if (!SYMBOLP (operation)
9355 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3ed051d4 9356 error ("Invalid first argument");
4ed46869
KH
9357 if (nargs < 1 + XINT (target_idx))
9358 error ("Too few arguments for operation: %s",
8f924df7 9359 SDATA (SYMBOL_NAME (operation)));
4ed46869
KH
9360 target = args[XINT (target_idx) + 1];
9361 if (!(STRINGP (target)
091a0ff0
KH
9362 || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9363 && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
4ed46869 9364 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
df7492f9 9365 error ("Invalid %dth argument", XINT (target_idx) + 1);
091a0ff0
KH
9366 if (CONSP (target))
9367 target = XCAR (target);
4ed46869 9368
2e34157c
RS
9369 chain = ((EQ (operation, Qinsert_file_contents)
9370 || EQ (operation, Qwrite_region))
02ba4723 9371 ? Vfile_coding_system_alist
2e34157c 9372 : (EQ (operation, Qopen_network_stream)
02ba4723
KH
9373 ? Vnetwork_coding_system_alist
9374 : Vprocess_coding_system_alist));
4ed46869
KH
9375 if (NILP (chain))
9376 return Qnil;
9377
03699b14 9378 for (; CONSP (chain); chain = XCDR (chain))
6b89e3aa 9379 {
f44d27ce 9380 Lisp_Object elt;
6b89e3aa 9381
df7492f9 9382 elt = XCAR (chain);
4ed46869
KH
9383 if (CONSP (elt)
9384 && ((STRINGP (target)
03699b14
KR
9385 && STRINGP (XCAR (elt))
9386 && fast_string_match (XCAR (elt), target) >= 0)
9387 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
6b89e3aa 9388 {
03699b14 9389 val = XCDR (elt);
b19fd4c5
KH
9390 /* Here, if VAL is both a valid coding system and a valid
9391 function symbol, we return VAL as a coding system. */
02ba4723
KH
9392 if (CONSP (val))
9393 return val;
9394 if (! SYMBOLP (val))
9395 return Qnil;
9396 if (! NILP (Fcoding_system_p (val)))
9397 return Fcons (val, val);
b19fd4c5 9398 if (! NILP (Ffboundp (val)))
6b89e3aa 9399 {
e2b97060
MB
9400 /* We use call1 rather than safe_call1
9401 so as to get bug reports about functions called here
9402 which don't handle the current interface. */
9403 val = call1 (val, Flist (nargs, args));
b19fd4c5
KH
9404 if (CONSP (val))
9405 return val;
9406 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9407 return Fcons (val, val);
6b89e3aa 9408 }
02ba4723 9409 return Qnil;
6b89e3aa
KH
9410 }
9411 }
4ed46869 9412 return Qnil;
6b89e3aa
KH
9413}
9414
df7492f9 9415DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
a3f6ee6d 9416 Sset_coding_system_priority, 0, MANY, 0,
da7db224 9417 doc: /* Assign higher priority to the coding systems given as arguments.
d4a1d553 9418If multiple coding systems belong to the same category,
a3181084
DL
9419all but the first one are ignored.
9420
d4a1d553 9421usage: (set-coding-system-priority &rest coding-systems) */)
5842a27b 9422 (int nargs, Lisp_Object *args)
df7492f9
KH
9423{
9424 int i, j;
9425 int changed[coding_category_max];
9426 enum coding_category priorities[coding_category_max];
9427
72af86bd 9428 memset (changed, 0, sizeof changed);
6b89e3aa 9429
df7492f9 9430 for (i = j = 0; i < nargs; i++)
6b89e3aa 9431 {
df7492f9
KH
9432 enum coding_category category;
9433 Lisp_Object spec, attrs;
6b89e3aa 9434
df7492f9
KH
9435 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9436 attrs = AREF (spec, 0);
9437 category = XINT (CODING_ATTR_CATEGORY (attrs));
9438 if (changed[category])
9439 /* Ignore this coding system because a coding system of the
9440 same category already had a higher priority. */
9441 continue;
9442 changed[category] = 1;
9443 priorities[j++] = category;
9444 if (coding_categories[category].id >= 0
9445 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9446 setup_coding_system (args[i], &coding_categories[category]);
ff563fce 9447 Fset (AREF (Vcoding_category_table, category), args[i]);
df7492f9 9448 }
6b89e3aa 9449
df7492f9
KH
9450 /* Now we have decided top J priorities. Reflect the order of the
9451 original priorities to the remaining priorities. */
6b89e3aa 9452
df7492f9 9453 for (i = j, j = 0; i < coding_category_max; i++, j++)
6b89e3aa 9454 {
df7492f9
KH
9455 while (j < coding_category_max
9456 && changed[coding_priorities[j]])
9457 j++;
9458 if (j == coding_category_max)
9459 abort ();
9460 priorities[i] = coding_priorities[j];
9461 }
6b89e3aa 9462
72af86bd 9463 memcpy (coding_priorities, priorities, sizeof priorities);
177c0ea7 9464
ff563fce
KH
9465 /* Update `coding-category-list'. */
9466 Vcoding_category_list = Qnil;
9467 for (i = coding_category_max - 1; i >= 0; i--)
9468 Vcoding_category_list
9469 = Fcons (AREF (Vcoding_category_table, priorities[i]),
9470 Vcoding_category_list);
6b89e3aa 9471
df7492f9 9472 return Qnil;
6b89e3aa
KH
9473}
9474
df7492f9
KH
9475DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9476 Scoding_system_priority_list, 0, 1, 0,
da7db224 9477 doc: /* Return a list of coding systems ordered by their priorities.
b811c52b
KH
9478The list contains a subset of coding systems; i.e. coding systems
9479assigned to each coding category (see `coding-category-list').
9480
da7db224 9481HIGHESTP non-nil means just return the highest priority one. */)
5842a27b 9482 (Lisp_Object highestp)
d46c5b12
KH
9483{
9484 int i;
df7492f9 9485 Lisp_Object val;
6b89e3aa 9486
df7492f9 9487 for (i = 0, val = Qnil; i < coding_category_max; i++)
d46c5b12 9488 {
df7492f9
KH
9489 enum coding_category category = coding_priorities[i];
9490 int id = coding_categories[category].id;
9491 Lisp_Object attrs;
068a9dbd 9492
df7492f9
KH
9493 if (id < 0)
9494 continue;
9495 attrs = CODING_ID_ATTRS (id);
9496 if (! NILP (highestp))
9497 return CODING_ATTR_BASE_NAME (attrs);
9498 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9499 }
9500 return Fnreverse (val);
9501}
068a9dbd 9502
91433552 9503static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
068a9dbd
KH
9504
9505static Lisp_Object
971de7fb 9506make_subsidiaries (Lisp_Object base)
068a9dbd 9507{
df7492f9 9508 Lisp_Object subsidiaries;
8f924df7 9509 int base_name_len = SBYTES (SYMBOL_NAME (base));
df7492f9
KH
9510 char *buf = (char *) alloca (base_name_len + 6);
9511 int i;
068a9dbd 9512
72af86bd 9513 memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
df7492f9
KH
9514 subsidiaries = Fmake_vector (make_number (3), Qnil);
9515 for (i = 0; i < 3; i++)
068a9dbd 9516 {
72af86bd 9517 memcpy (buf + base_name_len, suffixes[i], strlen (suffixes[i]) + 1);
df7492f9 9518 ASET (subsidiaries, i, intern (buf));
068a9dbd 9519 }
df7492f9 9520 return subsidiaries;
068a9dbd
KH
9521}
9522
9523
df7492f9
KH
9524DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9525 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
1fcd6c8b
DL
9526 doc: /* For internal use only.
9527usage: (define-coding-system-internal ...) */)
5842a27b 9528 (int nargs, Lisp_Object *args)
068a9dbd 9529{
df7492f9
KH
9530 Lisp_Object name;
9531 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
9532 Lisp_Object attrs; /* Vector of attributes. */
9533 Lisp_Object eol_type;
9534 Lisp_Object aliases;
9535 Lisp_Object coding_type, charset_list, safe_charsets;
9536 enum coding_category category;
9537 Lisp_Object tail, val;
9538 int max_charset_id = 0;
9539 int i;
068a9dbd 9540
df7492f9
KH
9541 if (nargs < coding_arg_max)
9542 goto short_args;
068a9dbd 9543
df7492f9 9544 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
068a9dbd 9545
df7492f9
KH
9546 name = args[coding_arg_name];
9547 CHECK_SYMBOL (name);
9548 CODING_ATTR_BASE_NAME (attrs) = name;
068a9dbd 9549
df7492f9
KH
9550 val = args[coding_arg_mnemonic];
9551 if (! STRINGP (val))
9552 CHECK_CHARACTER (val);
9553 CODING_ATTR_MNEMONIC (attrs) = val;
068a9dbd 9554
df7492f9
KH
9555 coding_type = args[coding_arg_coding_type];
9556 CHECK_SYMBOL (coding_type);
9557 CODING_ATTR_TYPE (attrs) = coding_type;
068a9dbd 9558
df7492f9
KH
9559 charset_list = args[coding_arg_charset_list];
9560 if (SYMBOLP (charset_list))
9561 {
9562 if (EQ (charset_list, Qiso_2022))
9563 {
9564 if (! EQ (coding_type, Qiso_2022))
9565 error ("Invalid charset-list");
9566 charset_list = Viso_2022_charset_list;
9567 }
9568 else if (EQ (charset_list, Qemacs_mule))
9569 {
9570 if (! EQ (coding_type, Qemacs_mule))
9571 error ("Invalid charset-list");
9572 charset_list = Vemacs_mule_charset_list;
9573 }
9574 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9575 if (max_charset_id < XFASTINT (XCAR (tail)))
9576 max_charset_id = XFASTINT (XCAR (tail));
9577 }
068a9dbd
KH
9578 else
9579 {
df7492f9 9580 charset_list = Fcopy_sequence (charset_list);
985773c9 9581 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
068a9dbd 9582 {
df7492f9
KH
9583 struct charset *charset;
9584
985773c9 9585 val = XCAR (tail);
df7492f9
KH
9586 CHECK_CHARSET_GET_CHARSET (val, charset);
9587 if (EQ (coding_type, Qiso_2022)
9588 ? CHARSET_ISO_FINAL (charset) < 0
9589 : EQ (coding_type, Qemacs_mule)
9590 ? CHARSET_EMACS_MULE_ID (charset) < 0
9591 : 0)
9592 error ("Can't handle charset `%s'",
8f924df7 9593 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
df7492f9 9594
8f924df7 9595 XSETCAR (tail, make_number (charset->id));
df7492f9
KH
9596 if (max_charset_id < charset->id)
9597 max_charset_id = charset->id;
068a9dbd
KH
9598 }
9599 }
df7492f9 9600 CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
068a9dbd 9601
1b3b981b
AS
9602 safe_charsets = make_uninit_string (max_charset_id + 1);
9603 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
df7492f9 9604 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8f924df7 9605 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
df7492f9 9606 CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
068a9dbd 9607
584948ac 9608 CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
3a73fa5d 9609
df7492f9 9610 val = args[coding_arg_decode_translation_table];
a6f87d34 9611 if (! CHAR_TABLE_P (val) && ! CONSP (val))
7d64c6ad 9612 CHECK_SYMBOL (val);
df7492f9 9613 CODING_ATTR_DECODE_TBL (attrs) = val;
3a73fa5d 9614
df7492f9 9615 val = args[coding_arg_encode_translation_table];
a6f87d34 9616 if (! CHAR_TABLE_P (val) && ! CONSP (val))
7d64c6ad 9617 CHECK_SYMBOL (val);
df7492f9 9618 CODING_ATTR_ENCODE_TBL (attrs) = val;
d46c5b12 9619
df7492f9
KH
9620 val = args[coding_arg_post_read_conversion];
9621 CHECK_SYMBOL (val);
9622 CODING_ATTR_POST_READ (attrs) = val;
d46c5b12 9623
df7492f9
KH
9624 val = args[coding_arg_pre_write_conversion];
9625 CHECK_SYMBOL (val);
9626 CODING_ATTR_PRE_WRITE (attrs) = val;
3a73fa5d 9627
df7492f9
KH
9628 val = args[coding_arg_default_char];
9629 if (NILP (val))
9630 CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
9631 else
9632 {
8f924df7 9633 CHECK_CHARACTER (val);
df7492f9
KH
9634 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9635 }
4031e2bf 9636
8f924df7
KH
9637 val = args[coding_arg_for_unibyte];
9638 CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
3a73fa5d 9639
df7492f9
KH
9640 val = args[coding_arg_plist];
9641 CHECK_LIST (val);
9642 CODING_ATTR_PLIST (attrs) = val;
3a73fa5d 9643
df7492f9
KH
9644 if (EQ (coding_type, Qcharset))
9645 {
c7c66a95
KH
9646 /* Generate a lisp vector of 256 elements. Each element is nil,
9647 integer, or a list of charset IDs.
3a73fa5d 9648
c7c66a95
KH
9649 If Nth element is nil, the byte code N is invalid in this
9650 coding system.
4ed46869 9651
c7c66a95
KH
9652 If Nth element is a number NUM, N is the first byte of a
9653 charset whose ID is NUM.
4ed46869 9654
c7c66a95
KH
9655 If Nth element is a list of charset IDs, N is the first byte
9656 of one of them. The list is sorted by dimensions of the
ad1746f5 9657 charsets. A charset of smaller dimension comes first. */
df7492f9 9658 val = Fmake_vector (make_number (256), Qnil);
4ed46869 9659
5c99c2e6 9660 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
df7492f9 9661 {
c7c66a95
KH
9662 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9663 int dim = CHARSET_DIMENSION (charset);
9664 int idx = (dim - 1) * 4;
4ed46869 9665
5c99c2e6 9666 if (CHARSET_ASCII_COMPATIBLE_P (charset))
584948ac 9667 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
4031e2bf 9668
15d143f7
KH
9669 for (i = charset->code_space[idx];
9670 i <= charset->code_space[idx + 1]; i++)
9671 {
c7c66a95
KH
9672 Lisp_Object tmp, tmp2;
9673 int dim2;
ec6d2bb8 9674
c7c66a95
KH
9675 tmp = AREF (val, i);
9676 if (NILP (tmp))
9677 tmp = XCAR (tail);
9678 else if (NUMBERP (tmp))
9679 {
9680 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9681 if (dim < dim2)
c7c66a95 9682 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
f9d71dcd
KH
9683 else
9684 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
c7c66a95 9685 }
15d143f7 9686 else
c7c66a95
KH
9687 {
9688 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9689 {
9690 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9691 if (dim < dim2)
9692 break;
9693 }
9694 if (NILP (tmp2))
9695 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9696 else
9697 {
9698 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9699 XSETCAR (tmp2, XCAR (tail));
9700 }
9701 }
9702 ASET (val, i, tmp);
15d143f7 9703 }
df7492f9
KH
9704 }
9705 ASET (attrs, coding_attr_charset_valids, val);
9706 category = coding_category_charset;
9707 }
9708 else if (EQ (coding_type, Qccl))
9709 {
9710 Lisp_Object valids;
ecec61c1 9711
df7492f9
KH
9712 if (nargs < coding_arg_ccl_max)
9713 goto short_args;
ecec61c1 9714
df7492f9
KH
9715 val = args[coding_arg_ccl_decoder];
9716 CHECK_CCL_PROGRAM (val);
9717 if (VECTORP (val))
9718 val = Fcopy_sequence (val);
9719 ASET (attrs, coding_attr_ccl_decoder, val);
ecec61c1 9720
df7492f9
KH
9721 val = args[coding_arg_ccl_encoder];
9722 CHECK_CCL_PROGRAM (val);
9723 if (VECTORP (val))
9724 val = Fcopy_sequence (val);
9725 ASET (attrs, coding_attr_ccl_encoder, val);
ecec61c1 9726
df7492f9
KH
9727 val = args[coding_arg_ccl_valids];
9728 valids = Fmake_string (make_number (256), make_number (0));
9729 for (tail = val; !NILP (tail); tail = Fcdr (tail))
9730 {
8dcbea82 9731 int from, to;
ecec61c1 9732
df7492f9
KH
9733 val = Fcar (tail);
9734 if (INTEGERP (val))
8dcbea82
KH
9735 {
9736 from = to = XINT (val);
9737 if (from < 0 || from > 255)
9738 args_out_of_range_3 (val, make_number (0), make_number (255));
9739 }
df7492f9
KH
9740 else
9741 {
df7492f9 9742 CHECK_CONS (val);
8f924df7
KH
9743 CHECK_NATNUM_CAR (val);
9744 CHECK_NATNUM_CDR (val);
df7492f9 9745 from = XINT (XCAR (val));
8f924df7 9746 if (from > 255)
8dcbea82
KH
9747 args_out_of_range_3 (XCAR (val),
9748 make_number (0), make_number (255));
df7492f9 9749 to = XINT (XCDR (val));
8dcbea82
KH
9750 if (to < from || to > 255)
9751 args_out_of_range_3 (XCDR (val),
9752 XCAR (val), make_number (255));
df7492f9 9753 }
8dcbea82 9754 for (i = from; i <= to; i++)
8f924df7 9755 SSET (valids, i, 1);
df7492f9
KH
9756 }
9757 ASET (attrs, coding_attr_ccl_valids, valids);
4ed46869 9758
df7492f9 9759 category = coding_category_ccl;
55ab7be3 9760 }
df7492f9 9761 else if (EQ (coding_type, Qutf_16))
55ab7be3 9762 {
df7492f9 9763 Lisp_Object bom, endian;
4ed46869 9764
584948ac 9765 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
4ed46869 9766
df7492f9
KH
9767 if (nargs < coding_arg_utf16_max)
9768 goto short_args;
4ed46869 9769
df7492f9
KH
9770 bom = args[coding_arg_utf16_bom];
9771 if (! NILP (bom) && ! EQ (bom, Qt))
9772 {
9773 CHECK_CONS (bom);
8f924df7
KH
9774 val = XCAR (bom);
9775 CHECK_CODING_SYSTEM (val);
9776 val = XCDR (bom);
9777 CHECK_CODING_SYSTEM (val);
df7492f9 9778 }
a470d443 9779 ASET (attrs, coding_attr_utf_bom, bom);
df7492f9
KH
9780
9781 endian = args[coding_arg_utf16_endian];
b49a1807
KH
9782 CHECK_SYMBOL (endian);
9783 if (NILP (endian))
9784 endian = Qbig;
9785 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
8f924df7 9786 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
df7492f9
KH
9787 ASET (attrs, coding_attr_utf_16_endian, endian);
9788
9789 category = (CONSP (bom)
9790 ? coding_category_utf_16_auto
9791 : NILP (bom)
b49a1807 9792 ? (EQ (endian, Qbig)
df7492f9
KH
9793 ? coding_category_utf_16_be_nosig
9794 : coding_category_utf_16_le_nosig)
b49a1807 9795 : (EQ (endian, Qbig)
df7492f9
KH
9796 ? coding_category_utf_16_be
9797 : coding_category_utf_16_le));
9798 }
9799 else if (EQ (coding_type, Qiso_2022))
9800 {
9801 Lisp_Object initial, reg_usage, request, flags;
4776e638 9802 int i;
1397dc18 9803
df7492f9
KH
9804 if (nargs < coding_arg_iso2022_max)
9805 goto short_args;
9806
9807 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9808 CHECK_VECTOR (initial);
9809 for (i = 0; i < 4; i++)
9810 {
9811 val = Faref (initial, make_number (i));
9812 if (! NILP (val))
9813 {
584948ac
KH
9814 struct charset *charset;
9815
9816 CHECK_CHARSET_GET_CHARSET (val, charset);
9817 ASET (initial, i, make_number (CHARSET_ID (charset)));
9818 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9819 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
df7492f9
KH
9820 }
9821 else
9822 ASET (initial, i, make_number (-1));
9823 }
9824
9825 reg_usage = args[coding_arg_iso2022_reg_usage];
9826 CHECK_CONS (reg_usage);
8f924df7
KH
9827 CHECK_NUMBER_CAR (reg_usage);
9828 CHECK_NUMBER_CDR (reg_usage);
df7492f9
KH
9829
9830 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9831 for (tail = request; ! NILP (tail); tail = Fcdr (tail))
1397dc18 9832 {
df7492f9 9833 int id;
8f924df7 9834 Lisp_Object tmp;
df7492f9
KH
9835
9836 val = Fcar (tail);
9837 CHECK_CONS (val);
8f924df7
KH
9838 tmp = XCAR (val);
9839 CHECK_CHARSET_GET_ID (tmp, id);
9840 CHECK_NATNUM_CDR (val);
df7492f9
KH
9841 if (XINT (XCDR (val)) >= 4)
9842 error ("Invalid graphic register number: %d", XINT (XCDR (val)));
8f924df7 9843 XSETCAR (val, make_number (id));
1397dc18 9844 }
4ed46869 9845
df7492f9
KH
9846 flags = args[coding_arg_iso2022_flags];
9847 CHECK_NATNUM (flags);
9848 i = XINT (flags);
9849 if (EQ (args[coding_arg_charset_list], Qiso_2022))
9850 flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
9851
9852 ASET (attrs, coding_attr_iso_initial, initial);
9853 ASET (attrs, coding_attr_iso_usage, reg_usage);
9854 ASET (attrs, coding_attr_iso_request, request);
9855 ASET (attrs, coding_attr_iso_flags, flags);
9856 setup_iso_safe_charsets (attrs);
9857
9858 if (i & CODING_ISO_FLAG_SEVEN_BITS)
9859 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9860 | CODING_ISO_FLAG_SINGLE_SHIFT))
9861 ? coding_category_iso_7_else
9862 : EQ (args[coding_arg_charset_list], Qiso_2022)
9863 ? coding_category_iso_7
9864 : coding_category_iso_7_tight);
9865 else
9866 {
9867 int id = XINT (AREF (initial, 1));
9868
c6fb6e98 9869 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
df7492f9
KH
9870 || EQ (args[coding_arg_charset_list], Qiso_2022)
9871 || id < 0)
9872 ? coding_category_iso_8_else
9873 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9874 ? coding_category_iso_8_1
9875 : coding_category_iso_8_2);
9876 }
0ce7886f
KH
9877 if (category != coding_category_iso_8_1
9878 && category != coding_category_iso_8_2)
9879 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
df7492f9
KH
9880 }
9881 else if (EQ (coding_type, Qemacs_mule))
c28a9453 9882 {
df7492f9
KH
9883 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9884 ASET (attrs, coding_attr_emacs_mule_full, Qt);
584948ac 9885 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
df7492f9 9886 category = coding_category_emacs_mule;
c28a9453 9887 }
df7492f9 9888 else if (EQ (coding_type, Qshift_jis))
c28a9453 9889 {
df7492f9
KH
9890
9891 struct charset *charset;
9892
7d64c6ad 9893 if (XINT (Flength (charset_list)) != 3
6e07c25f 9894 && XINT (Flength (charset_list)) != 4)
7d64c6ad 9895 error ("There should be three or four charsets");
df7492f9
KH
9896
9897 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9898 if (CHARSET_DIMENSION (charset) != 1)
9899 error ("Dimension of charset %s is not one",
8f924df7 9900 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
584948ac
KH
9901 if (CHARSET_ASCII_COMPATIBLE_P (charset))
9902 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
df7492f9
KH
9903
9904 charset_list = XCDR (charset_list);
9905 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9906 if (CHARSET_DIMENSION (charset) != 1)
9907 error ("Dimension of charset %s is not one",
8f924df7 9908 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
df7492f9
KH
9909
9910 charset_list = XCDR (charset_list);
9911 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9912 if (CHARSET_DIMENSION (charset) != 2)
7d64c6ad
KH
9913 error ("Dimension of charset %s is not two",
9914 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9915
9916 charset_list = XCDR (charset_list);
2b917a06
KH
9917 if (! NILP (charset_list))
9918 {
9919 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9920 if (CHARSET_DIMENSION (charset) != 2)
9921 error ("Dimension of charset %s is not two",
9922 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9923 }
df7492f9
KH
9924
9925 category = coding_category_sjis;
9926 Vsjis_coding_system = name;
c28a9453 9927 }
df7492f9
KH
9928 else if (EQ (coding_type, Qbig5))
9929 {
9930 struct charset *charset;
4ed46869 9931
df7492f9
KH
9932 if (XINT (Flength (charset_list)) != 2)
9933 error ("There should be just two charsets");
9934
9935 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9936 if (CHARSET_DIMENSION (charset) != 1)
9937 error ("Dimension of charset %s is not one",
8f924df7 9938 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
584948ac
KH
9939 if (CHARSET_ASCII_COMPATIBLE_P (charset))
9940 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
df7492f9
KH
9941
9942 charset_list = XCDR (charset_list);
9943 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9944 if (CHARSET_DIMENSION (charset) != 2)
9945 error ("Dimension of charset %s is not two",
8f924df7 9946 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
4ed46869 9947
df7492f9
KH
9948 category = coding_category_big5;
9949 Vbig5_coding_system = name;
9950 }
9951 else if (EQ (coding_type, Qraw_text))
c28a9453 9952 {
584948ac
KH
9953 category = coding_category_raw_text;
9954 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
c28a9453 9955 }
df7492f9 9956 else if (EQ (coding_type, Qutf_8))
4ed46869 9957 {
a470d443
KH
9958 Lisp_Object bom;
9959
584948ac 9960 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
a470d443
KH
9961
9962 if (nargs < coding_arg_utf8_max)
9963 goto short_args;
9964
9965 bom = args[coding_arg_utf8_bom];
9966 if (! NILP (bom) && ! EQ (bom, Qt))
9967 {
9968 CHECK_CONS (bom);
9969 val = XCAR (bom);
9970 CHECK_CODING_SYSTEM (val);
9971 val = XCDR (bom);
9972 CHECK_CODING_SYSTEM (val);
9973 }
9974 ASET (attrs, coding_attr_utf_bom, bom);
9975
9976 category = (CONSP (bom) ? coding_category_utf_8_auto
9977 : NILP (bom) ? coding_category_utf_8_nosig
9978 : coding_category_utf_8_sig);
4ed46869 9979 }
df7492f9
KH
9980 else if (EQ (coding_type, Qundecided))
9981 category = coding_category_undecided;
4ed46869 9982 else
df7492f9 9983 error ("Invalid coding system type: %s",
8f924df7 9984 SDATA (SYMBOL_NAME (coding_type)));
4ed46869 9985
df7492f9 9986 CODING_ATTR_CATEGORY (attrs) = make_number (category);
01378f49
KH
9987 CODING_ATTR_PLIST (attrs)
9988 = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
9989 CODING_ATTR_PLIST (attrs)));
35befdaa 9990 CODING_ATTR_PLIST (attrs)
3ed051d4 9991 = Fcons (QCascii_compatible_p,
35befdaa
KH
9992 Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
9993 CODING_ATTR_PLIST (attrs)));
c4825358 9994
df7492f9
KH
9995 eol_type = args[coding_arg_eol_type];
9996 if (! NILP (eol_type)
9997 && ! EQ (eol_type, Qunix)
9998 && ! EQ (eol_type, Qdos)
9999 && ! EQ (eol_type, Qmac))
10000 error ("Invalid eol-type");
4ed46869 10001
df7492f9 10002 aliases = Fcons (name, Qnil);
4ed46869 10003
df7492f9
KH
10004 if (NILP (eol_type))
10005 {
10006 eol_type = make_subsidiaries (name);
10007 for (i = 0; i < 3; i++)
1397dc18 10008 {
df7492f9
KH
10009 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10010
10011 this_name = AREF (eol_type, i);
10012 this_aliases = Fcons (this_name, Qnil);
10013 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10014 this_spec = Fmake_vector (make_number (3), attrs);
10015 ASET (this_spec, 1, this_aliases);
10016 ASET (this_spec, 2, this_eol_type);
10017 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10018 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
583f71ca
KH
10019 val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10020 if (NILP (val))
10021 Vcoding_system_alist
10022 = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10023 Vcoding_system_alist);
1397dc18 10024 }
d46c5b12 10025 }
4ed46869 10026
df7492f9
KH
10027 spec_vec = Fmake_vector (make_number (3), attrs);
10028 ASET (spec_vec, 1, aliases);
10029 ASET (spec_vec, 2, eol_type);
48b0f3ae 10030
df7492f9
KH
10031 Fputhash (name, spec_vec, Vcoding_system_hash_table);
10032 Vcoding_system_list = Fcons (name, Vcoding_system_list);
583f71ca
KH
10033 val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10034 if (NILP (val))
10035 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10036 Vcoding_system_alist);
48b0f3ae 10037
df7492f9
KH
10038 {
10039 int id = coding_categories[category].id;
48b0f3ae 10040
df7492f9
KH
10041 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10042 setup_coding_system (name, &coding_categories[category]);
10043 }
48b0f3ae 10044
d46c5b12 10045 return Qnil;
48b0f3ae 10046
df7492f9
KH
10047 short_args:
10048 return Fsignal (Qwrong_number_of_arguments,
10049 Fcons (intern ("define-coding-system-internal"),
10050 make_number (nargs)));
d46c5b12 10051}
4ed46869 10052
d6925f38 10053
a6f87d34
KH
10054DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10055 3, 3, 0,
10056 doc: /* Change value in CODING-SYSTEM's property list PROP to VAL. */)
5842a27b 10057 (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
a6f87d34 10058{
3dbe7859 10059 Lisp_Object spec, attrs;
a6f87d34
KH
10060
10061 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10062 attrs = AREF (spec, 0);
10063 if (EQ (prop, QCmnemonic))
10064 {
10065 if (! STRINGP (val))
10066 CHECK_CHARACTER (val);
10067 CODING_ATTR_MNEMONIC (attrs) = val;
10068 }
2133e2d1 10069 else if (EQ (prop, QCdefault_char))
a6f87d34
KH
10070 {
10071 if (NILP (val))
10072 val = make_number (' ');
10073 else
10074 CHECK_CHARACTER (val);
10075 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
10076 }
10077 else if (EQ (prop, QCdecode_translation_table))
10078 {
10079 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10080 CHECK_SYMBOL (val);
10081 CODING_ATTR_DECODE_TBL (attrs) = val;
10082 }
10083 else if (EQ (prop, QCencode_translation_table))
10084 {
10085 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10086 CHECK_SYMBOL (val);
10087 CODING_ATTR_ENCODE_TBL (attrs) = val;
10088 }
10089 else if (EQ (prop, QCpost_read_conversion))
10090 {
10091 CHECK_SYMBOL (val);
10092 CODING_ATTR_POST_READ (attrs) = val;
10093 }
10094 else if (EQ (prop, QCpre_write_conversion))
10095 {
10096 CHECK_SYMBOL (val);
10097 CODING_ATTR_PRE_WRITE (attrs) = val;
10098 }
35befdaa
KH
10099 else if (EQ (prop, QCascii_compatible_p))
10100 {
10101 CODING_ATTR_ASCII_COMPAT (attrs) = val;
10102 }
a6f87d34
KH
10103
10104 CODING_ATTR_PLIST (attrs)
10105 = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
10106 return val;
10107}
10108
10109
df7492f9
KH
10110DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10111 Sdefine_coding_system_alias, 2, 2, 0,
10112 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
5842a27b 10113 (Lisp_Object alias, Lisp_Object coding_system)
66cfb530 10114{
583f71ca 10115 Lisp_Object spec, aliases, eol_type, val;
4ed46869 10116
df7492f9
KH
10117 CHECK_SYMBOL (alias);
10118 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10119 aliases = AREF (spec, 1);
d4a1d553 10120 /* ALIASES should be a list of length more than zero, and the first
d6925f38
KH
10121 element is a base coding system. Append ALIAS at the tail of the
10122 list. */
df7492f9
KH
10123 while (!NILP (XCDR (aliases)))
10124 aliases = XCDR (aliases);
8f924df7 10125 XSETCDR (aliases, Fcons (alias, Qnil));
4ed46869 10126
df7492f9
KH
10127 eol_type = AREF (spec, 2);
10128 if (VECTORP (eol_type))
4ed46869 10129 {
df7492f9
KH
10130 Lisp_Object subsidiaries;
10131 int i;
4ed46869 10132
df7492f9
KH
10133 subsidiaries = make_subsidiaries (alias);
10134 for (i = 0; i < 3; i++)
10135 Fdefine_coding_system_alias (AREF (subsidiaries, i),
10136 AREF (eol_type, i));
4ed46869 10137 }
df7492f9
KH
10138
10139 Fputhash (alias, spec, Vcoding_system_hash_table);
d6925f38 10140 Vcoding_system_list = Fcons (alias, Vcoding_system_list);
583f71ca
KH
10141 val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10142 if (NILP (val))
10143 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10144 Vcoding_system_alist);
66cfb530 10145
4ed46869
KH
10146 return Qnil;
10147}
10148
df7492f9
KH
10149DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10150 1, 1, 0,
10151 doc: /* Return the base of CODING-SYSTEM.
da7db224 10152Any alias or subsidiary coding system is not a base coding system. */)
5842a27b 10153 (Lisp_Object coding_system)
d46c5b12 10154{
df7492f9 10155 Lisp_Object spec, attrs;
d46c5b12 10156
df7492f9
KH
10157 if (NILP (coding_system))
10158 return (Qno_conversion);
10159 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10160 attrs = AREF (spec, 0);
10161 return CODING_ATTR_BASE_NAME (attrs);
10162}
1397dc18 10163
df7492f9
KH
10164DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10165 1, 1, 0,
10166 doc: "Return the property list of CODING-SYSTEM.")
5842a27b 10167 (Lisp_Object coding_system)
df7492f9
KH
10168{
10169 Lisp_Object spec, attrs;
1397dc18 10170
df7492f9
KH
10171 if (NILP (coding_system))
10172 coding_system = Qno_conversion;
10173 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10174 attrs = AREF (spec, 0);
10175 return CODING_ATTR_PLIST (attrs);
d46c5b12
KH
10176}
10177
df7492f9
KH
10178
10179DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10180 1, 1, 0,
da7db224 10181 doc: /* Return the list of aliases of CODING-SYSTEM. */)
5842a27b 10182 (Lisp_Object coding_system)
66cfb530 10183{
df7492f9 10184 Lisp_Object spec;
84d60297 10185
df7492f9
KH
10186 if (NILP (coding_system))
10187 coding_system = Qno_conversion;
10188 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
da7db224 10189 return AREF (spec, 1);
df7492f9 10190}
66cfb530 10191
df7492f9
KH
10192DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10193 Scoding_system_eol_type, 1, 1, 0,
10194 doc: /* Return eol-type of CODING-SYSTEM.
d4a1d553 10195An eol-type is an integer 0, 1, 2, or a vector of coding systems.
66cfb530 10196
df7492f9
KH
10197Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10198and CR respectively.
66cfb530 10199
df7492f9
KH
10200A vector value indicates that a format of end-of-line should be
10201detected automatically. Nth element of the vector is the subsidiary
10202coding system whose eol-type is N. */)
5842a27b 10203 (Lisp_Object coding_system)
6b89e3aa 10204{
df7492f9
KH
10205 Lisp_Object spec, eol_type;
10206 int n;
6b89e3aa 10207
df7492f9
KH
10208 if (NILP (coding_system))
10209 coding_system = Qno_conversion;
10210 if (! CODING_SYSTEM_P (coding_system))
10211 return Qnil;
10212 spec = CODING_SYSTEM_SPEC (coding_system);
10213 eol_type = AREF (spec, 2);
10214 if (VECTORP (eol_type))
10215 return Fcopy_sequence (eol_type);
10216 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10217 return make_number (n);
6b89e3aa
KH
10218}
10219
4ed46869
KH
10220#endif /* emacs */
10221
10222\f
1397dc18 10223/*** 12. Postamble ***/
4ed46869 10224
dfcf069d 10225void
971de7fb 10226init_coding_once (void)
4ed46869
KH
10227{
10228 int i;
10229
df7492f9
KH
10230 for (i = 0; i < coding_category_max; i++)
10231 {
10232 coding_categories[i].id = -1;
10233 coding_priorities[i] = i;
10234 }
4ed46869
KH
10235
10236 /* ISO2022 specific initialize routine. */
10237 for (i = 0; i < 0x20; i++)
b73bfc1c 10238 iso_code_class[i] = ISO_control_0;
4ed46869
KH
10239 for (i = 0x21; i < 0x7F; i++)
10240 iso_code_class[i] = ISO_graphic_plane_0;
10241 for (i = 0x80; i < 0xA0; i++)
b73bfc1c 10242 iso_code_class[i] = ISO_control_1;
4ed46869
KH
10243 for (i = 0xA1; i < 0xFF; i++)
10244 iso_code_class[i] = ISO_graphic_plane_1;
10245 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10246 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
4ed46869
KH
10247 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10248 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10249 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10250 iso_code_class[ISO_CODE_ESC] = ISO_escape;
10251 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10252 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10253 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10254
df7492f9
KH
10255 for (i = 0; i < 256; i++)
10256 {
10257 emacs_mule_bytes[i] = 1;
10258 }
7c78e542
KH
10259 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10260 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10261 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10262 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
e0e989f6
KH
10263}
10264
10265#ifdef emacs
10266
dfcf069d 10267void
971de7fb 10268syms_of_coding (void)
e0e989f6 10269{
df7492f9 10270 staticpro (&Vcoding_system_hash_table);
8f924df7
KH
10271 {
10272 Lisp_Object args[2];
10273 args[0] = QCtest;
10274 args[1] = Qeq;
10275 Vcoding_system_hash_table = Fmake_hash_table (2, args);
10276 }
df7492f9
KH
10277
10278 staticpro (&Vsjis_coding_system);
10279 Vsjis_coding_system = Qnil;
e0e989f6 10280
df7492f9
KH
10281 staticpro (&Vbig5_coding_system);
10282 Vbig5_coding_system = Qnil;
10283
24a73b0a
KH
10284 staticpro (&Vcode_conversion_reused_workbuf);
10285 Vcode_conversion_reused_workbuf = Qnil;
10286
10287 staticpro (&Vcode_conversion_workbuf_name);
d67b4f80 10288 Vcode_conversion_workbuf_name = make_pure_c_string (" *code-conversion-work*");
e0e989f6 10289
24a73b0a 10290 reused_workbuf_in_use = 0;
df7492f9
KH
10291
10292 DEFSYM (Qcharset, "charset");
10293 DEFSYM (Qtarget_idx, "target-idx");
10294 DEFSYM (Qcoding_system_history, "coding-system-history");
bb0115a2
RS
10295 Fset (Qcoding_system_history, Qnil);
10296
9ce27fde 10297 /* Target FILENAME is the first argument. */
e0e989f6 10298 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9ce27fde 10299 /* Target FILENAME is the third argument. */
e0e989f6
KH
10300 Fput (Qwrite_region, Qtarget_idx, make_number (2));
10301
df7492f9 10302 DEFSYM (Qcall_process, "call-process");
9ce27fde 10303 /* Target PROGRAM is the first argument. */
e0e989f6
KH
10304 Fput (Qcall_process, Qtarget_idx, make_number (0));
10305
df7492f9 10306 DEFSYM (Qcall_process_region, "call-process-region");
9ce27fde 10307 /* Target PROGRAM is the third argument. */
e0e989f6
KH
10308 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10309
df7492f9 10310 DEFSYM (Qstart_process, "start-process");
9ce27fde 10311 /* Target PROGRAM is the third argument. */
e0e989f6
KH
10312 Fput (Qstart_process, Qtarget_idx, make_number (2));
10313
df7492f9 10314 DEFSYM (Qopen_network_stream, "open-network-stream");
9ce27fde 10315 /* Target SERVICE is the fourth argument. */
e0e989f6
KH
10316 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10317
df7492f9
KH
10318 DEFSYM (Qcoding_system, "coding-system");
10319 DEFSYM (Qcoding_aliases, "coding-aliases");
4ed46869 10320
df7492f9
KH
10321 DEFSYM (Qeol_type, "eol-type");
10322 DEFSYM (Qunix, "unix");
10323 DEFSYM (Qdos, "dos");
4ed46869 10324
df7492f9
KH
10325 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10326 DEFSYM (Qpost_read_conversion, "post-read-conversion");
10327 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10328 DEFSYM (Qdefault_char, "default-char");
10329 DEFSYM (Qundecided, "undecided");
10330 DEFSYM (Qno_conversion, "no-conversion");
10331 DEFSYM (Qraw_text, "raw-text");
4ed46869 10332
df7492f9 10333 DEFSYM (Qiso_2022, "iso-2022");
4ed46869 10334
df7492f9 10335 DEFSYM (Qutf_8, "utf-8");
8f924df7 10336 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
27901516 10337
df7492f9 10338 DEFSYM (Qutf_16, "utf-16");
df7492f9
KH
10339 DEFSYM (Qbig, "big");
10340 DEFSYM (Qlittle, "little");
27901516 10341
df7492f9
KH
10342 DEFSYM (Qshift_jis, "shift-jis");
10343 DEFSYM (Qbig5, "big5");
4ed46869 10344
df7492f9 10345 DEFSYM (Qcoding_system_p, "coding-system-p");
4ed46869 10346
df7492f9 10347 DEFSYM (Qcoding_system_error, "coding-system-error");
4ed46869 10348 Fput (Qcoding_system_error, Qerror_conditions,
d67b4f80 10349 pure_cons (Qcoding_system_error, pure_cons (Qerror, Qnil)));
4ed46869 10350 Fput (Qcoding_system_error, Qerror_message,
d67b4f80 10351 make_pure_c_string ("Invalid coding system"));
4ed46869 10352
05e6f5dc
KH
10353 /* Intern this now in case it isn't already done.
10354 Setting this variable twice is harmless.
10355 But don't staticpro it here--that is done in alloc.c. */
d67b4f80 10356 Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
70c22245 10357
df7492f9 10358 DEFSYM (Qtranslation_table, "translation-table");
433f7f87 10359 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
df7492f9
KH
10360 DEFSYM (Qtranslation_table_id, "translation-table-id");
10361 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10362 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
1397dc18 10363
df7492f9 10364 DEFSYM (Qvalid_codes, "valid-codes");
9ce27fde 10365
df7492f9 10366 DEFSYM (Qemacs_mule, "emacs-mule");
d46c5b12 10367
01378f49 10368 DEFSYM (QCcategory, ":category");
a6f87d34 10369 DEFSYM (QCmnemonic, ":mnemonic");
2133e2d1 10370 DEFSYM (QCdefault_char, ":default-char");
a6f87d34
KH
10371 DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10372 DEFSYM (QCencode_translation_table, ":encode-translation-table");
10373 DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10374 DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
35befdaa 10375 DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
01378f49 10376
df7492f9
KH
10377 Vcoding_category_table
10378 = Fmake_vector (make_number (coding_category_max), Qnil);
10379 staticpro (&Vcoding_category_table);
10380 /* Followings are target of code detection. */
10381 ASET (Vcoding_category_table, coding_category_iso_7,
d67b4f80 10382 intern_c_string ("coding-category-iso-7"));
df7492f9 10383 ASET (Vcoding_category_table, coding_category_iso_7_tight,
d67b4f80 10384 intern_c_string ("coding-category-iso-7-tight"));
df7492f9 10385 ASET (Vcoding_category_table, coding_category_iso_8_1,
d67b4f80 10386 intern_c_string ("coding-category-iso-8-1"));
df7492f9 10387 ASET (Vcoding_category_table, coding_category_iso_8_2,
d67b4f80 10388 intern_c_string ("coding-category-iso-8-2"));
df7492f9 10389 ASET (Vcoding_category_table, coding_category_iso_7_else,
d67b4f80 10390 intern_c_string ("coding-category-iso-7-else"));
df7492f9 10391 ASET (Vcoding_category_table, coding_category_iso_8_else,
d67b4f80 10392 intern_c_string ("coding-category-iso-8-else"));
a470d443 10393 ASET (Vcoding_category_table, coding_category_utf_8_auto,
d67b4f80 10394 intern_c_string ("coding-category-utf-8-auto"));
a470d443 10395 ASET (Vcoding_category_table, coding_category_utf_8_nosig,
d67b4f80 10396 intern_c_string ("coding-category-utf-8"));
a470d443 10397 ASET (Vcoding_category_table, coding_category_utf_8_sig,
d67b4f80 10398 intern_c_string ("coding-category-utf-8-sig"));
df7492f9 10399 ASET (Vcoding_category_table, coding_category_utf_16_be,
d67b4f80 10400 intern_c_string ("coding-category-utf-16-be"));
ff563fce 10401 ASET (Vcoding_category_table, coding_category_utf_16_auto,
d67b4f80 10402 intern_c_string ("coding-category-utf-16-auto"));
df7492f9 10403 ASET (Vcoding_category_table, coding_category_utf_16_le,
d67b4f80 10404 intern_c_string ("coding-category-utf-16-le"));
df7492f9 10405 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
d67b4f80 10406 intern_c_string ("coding-category-utf-16-be-nosig"));
df7492f9 10407 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
d67b4f80 10408 intern_c_string ("coding-category-utf-16-le-nosig"));
df7492f9 10409 ASET (Vcoding_category_table, coding_category_charset,
d67b4f80 10410 intern_c_string ("coding-category-charset"));
df7492f9 10411 ASET (Vcoding_category_table, coding_category_sjis,
d67b4f80 10412 intern_c_string ("coding-category-sjis"));
df7492f9 10413 ASET (Vcoding_category_table, coding_category_big5,
d67b4f80 10414 intern_c_string ("coding-category-big5"));
df7492f9 10415 ASET (Vcoding_category_table, coding_category_ccl,
d67b4f80 10416 intern_c_string ("coding-category-ccl"));
df7492f9 10417 ASET (Vcoding_category_table, coding_category_emacs_mule,
d67b4f80 10418 intern_c_string ("coding-category-emacs-mule"));
df7492f9
KH
10419 /* Followings are NOT target of code detection. */
10420 ASET (Vcoding_category_table, coding_category_raw_text,
d67b4f80 10421 intern_c_string ("coding-category-raw-text"));
df7492f9 10422 ASET (Vcoding_category_table, coding_category_undecided,
d67b4f80 10423 intern_c_string ("coding-category-undecided"));
ecf488bc 10424
065e3595
KH
10425 DEFSYM (Qinsufficient_source, "insufficient-source");
10426 DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10427 DEFSYM (Qinvalid_source, "invalid-source");
10428 DEFSYM (Qinterrupted, "interrupted");
10429 DEFSYM (Qinsufficient_memory, "insufficient-memory");
44e8490d 10430 DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
065e3595 10431
4ed46869
KH
10432 defsubr (&Scoding_system_p);
10433 defsubr (&Sread_coding_system);
10434 defsubr (&Sread_non_nil_coding_system);
10435 defsubr (&Scheck_coding_system);
10436 defsubr (&Sdetect_coding_region);
d46c5b12 10437 defsubr (&Sdetect_coding_string);
05e6f5dc 10438 defsubr (&Sfind_coding_systems_region_internal);
068a9dbd 10439 defsubr (&Sunencodable_char_position);
df7492f9 10440 defsubr (&Scheck_coding_systems_region);
4ed46869
KH
10441 defsubr (&Sdecode_coding_region);
10442 defsubr (&Sencode_coding_region);
10443 defsubr (&Sdecode_coding_string);
10444 defsubr (&Sencode_coding_string);
10445 defsubr (&Sdecode_sjis_char);
10446 defsubr (&Sencode_sjis_char);
10447 defsubr (&Sdecode_big5_char);
10448 defsubr (&Sencode_big5_char);
1ba9e4ab 10449 defsubr (&Sset_terminal_coding_system_internal);
c4825358 10450 defsubr (&Sset_safe_terminal_coding_system_internal);
4ed46869 10451 defsubr (&Sterminal_coding_system);
1ba9e4ab 10452 defsubr (&Sset_keyboard_coding_system_internal);
4ed46869 10453 defsubr (&Skeyboard_coding_system);
a5d301df 10454 defsubr (&Sfind_operation_coding_system);
df7492f9 10455 defsubr (&Sset_coding_system_priority);
6b89e3aa 10456 defsubr (&Sdefine_coding_system_internal);
df7492f9 10457 defsubr (&Sdefine_coding_system_alias);
a6f87d34 10458 defsubr (&Scoding_system_put);
df7492f9
KH
10459 defsubr (&Scoding_system_base);
10460 defsubr (&Scoding_system_plist);
10461 defsubr (&Scoding_system_aliases);
10462 defsubr (&Scoding_system_eol_type);
10463 defsubr (&Scoding_system_priority_list);
4ed46869 10464
29208e82 10465 DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
48b0f3ae
PJ
10466 doc: /* List of coding systems.
10467
10468Do not alter the value of this variable manually. This variable should be
df7492f9 10469updated by the functions `define-coding-system' and
48b0f3ae 10470`define-coding-system-alias'. */);
4608c386
KH
10471 Vcoding_system_list = Qnil;
10472
29208e82 10473 DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
48b0f3ae
PJ
10474 doc: /* Alist of coding system names.
10475Each element is one element list of coding system name.
446dcd75 10476This variable is given to `completing-read' as COLLECTION argument.
48b0f3ae
PJ
10477
10478Do not alter the value of this variable manually. This variable should be
10479updated by the functions `make-coding-system' and
10480`define-coding-system-alias'. */);
4608c386
KH
10481 Vcoding_system_alist = Qnil;
10482
29208e82 10483 DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
48b0f3ae
PJ
10484 doc: /* List of coding-categories (symbols) ordered by priority.
10485
10486On detecting a coding system, Emacs tries code detection algorithms
10487associated with each coding-category one by one in this order. When
10488one algorithm agrees with a byte sequence of source text, the coding
0ec31faf
KH
10489system bound to the corresponding coding-category is selected.
10490
448e17d6 10491Don't modify this variable directly, but use `set-coding-system-priority'. */);
4ed46869
KH
10492 {
10493 int i;
10494
10495 Vcoding_category_list = Qnil;
df7492f9 10496 for (i = coding_category_max - 1; i >= 0; i--)
4ed46869 10497 Vcoding_category_list
d46c5b12
KH
10498 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
10499 Vcoding_category_list);
4ed46869
KH
10500 }
10501
29208e82 10502 DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
48b0f3ae
PJ
10503 doc: /* Specify the coding system for read operations.
10504It is useful to bind this variable with `let', but do not set it globally.
10505If the value is a coding system, it is used for decoding on read operation.
446dcd75
JB
10506If not, an appropriate element is used from one of the coding system alists.
10507There are three such tables: `file-coding-system-alist',
48b0f3ae 10508`process-coding-system-alist', and `network-coding-system-alist'. */);
4ed46869
KH
10509 Vcoding_system_for_read = Qnil;
10510
29208e82 10511 DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
48b0f3ae
PJ
10512 doc: /* Specify the coding system for write operations.
10513Programs bind this variable with `let', but you should not set it globally.
10514If the value is a coding system, it is used for encoding of output,
10515when writing it to a file and when sending it to a file or subprocess.
10516
10517If this does not specify a coding system, an appropriate element
446dcd75
JB
10518is used from one of the coding system alists.
10519There are three such tables: `file-coding-system-alist',
48b0f3ae
PJ
10520`process-coding-system-alist', and `network-coding-system-alist'.
10521For output to files, if the above procedure does not specify a coding system,
10522the value of `buffer-file-coding-system' is used. */);
4ed46869
KH
10523 Vcoding_system_for_write = Qnil;
10524
29208e82 10525 DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
df7492f9
KH
10526 doc: /*
10527Coding system used in the latest file or process I/O. */);
4ed46869
KH
10528 Vlast_coding_system_used = Qnil;
10529
29208e82 10530 DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
065e3595
KH
10531 doc: /*
10532Error status of the last code conversion.
10533
10534When an error was detected in the last code conversion, this variable
10535is set to one of the following symbols.
10536 `insufficient-source'
10537 `inconsistent-eol'
10538 `invalid-source'
10539 `interrupted'
10540 `insufficient-memory'
10541When no error was detected, the value doesn't change. So, to check
10542the error status of a code conversion by this variable, you must
10543explicitly set this variable to nil before performing code
10544conversion. */);
10545 Vlast_code_conversion_error = Qnil;
10546
29208e82 10547 DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
df7492f9
KH
10548 doc: /*
10549*Non-nil means always inhibit code conversion of end-of-line format.
48b0f3ae
PJ
10550See info node `Coding Systems' and info node `Text and Binary' concerning
10551such conversion. */);
9ce27fde
KH
10552 inhibit_eol_conversion = 0;
10553
29208e82 10554 DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
df7492f9
KH
10555 doc: /*
10556Non-nil means process buffer inherits coding system of process output.
48b0f3ae
PJ
10557Bind it to t if the process output is to be treated as if it were a file
10558read from some filesystem. */);
ed29121d
EZ
10559 inherit_process_coding_system = 0;
10560
29208e82 10561 DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
df7492f9
KH
10562 doc: /*
10563Alist to decide a coding system to use for a file I/O operation.
48b0f3ae
PJ
10564The format is ((PATTERN . VAL) ...),
10565where PATTERN is a regular expression matching a file name,
10566VAL is a coding system, a cons of coding systems, or a function symbol.
10567If VAL is a coding system, it is used for both decoding and encoding
10568the file contents.
10569If VAL is a cons of coding systems, the car part is used for decoding,
10570and the cdr part is used for encoding.
10571If VAL is a function symbol, the function must return a coding system
2c53e699
KH
10572or a cons of coding systems which are used as above. The function is
10573called with an argument that is a list of the arguments with which
5a0bbd9a
KH
10574`find-operation-coding-system' was called. If the function can't decide
10575a coding system, it can return `undecided' so that the normal
10576code-detection is performed.
48b0f3ae
PJ
10577
10578See also the function `find-operation-coding-system'
10579and the variable `auto-coding-alist'. */);
02ba4723
KH
10580 Vfile_coding_system_alist = Qnil;
10581
29208e82 10582 DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
df7492f9
KH
10583 doc: /*
10584Alist to decide a coding system to use for a process I/O operation.
48b0f3ae
PJ
10585The format is ((PATTERN . VAL) ...),
10586where PATTERN is a regular expression matching a program name,
10587VAL is a coding system, a cons of coding systems, or a function symbol.
10588If VAL is a coding system, it is used for both decoding what received
10589from the program and encoding what sent to the program.
10590If VAL is a cons of coding systems, the car part is used for decoding,
10591and the cdr part is used for encoding.
10592If VAL is a function symbol, the function must return a coding system
10593or a cons of coding systems which are used as above.
10594
10595See also the function `find-operation-coding-system'. */);
02ba4723
KH
10596 Vprocess_coding_system_alist = Qnil;
10597
29208e82 10598 DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
df7492f9
KH
10599 doc: /*
10600Alist to decide a coding system to use for a network I/O operation.
48b0f3ae
PJ
10601The format is ((PATTERN . VAL) ...),
10602where PATTERN is a regular expression matching a network service name
10603or is a port number to connect to,
10604VAL is a coding system, a cons of coding systems, or a function symbol.
10605If VAL is a coding system, it is used for both decoding what received
10606from the network stream and encoding what sent to the network stream.
10607If VAL is a cons of coding systems, the car part is used for decoding,
10608and the cdr part is used for encoding.
10609If VAL is a function symbol, the function must return a coding system
10610or a cons of coding systems which are used as above.
10611
10612See also the function `find-operation-coding-system'. */);
02ba4723 10613 Vnetwork_coding_system_alist = Qnil;
4ed46869 10614
29208e82 10615 DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
75205970
RS
10616 doc: /* Coding system to use with system messages.
10617Also used for decoding keyboard input on X Window system. */);
68c45bf0
PE
10618 Vlocale_coding_system = Qnil;
10619
005f0d35 10620 /* The eol mnemonics are reset in startup.el system-dependently. */
29208e82 10621 DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
df7492f9
KH
10622 doc: /*
10623*String displayed in mode line for UNIX-like (LF) end-of-line format. */);
d67b4f80 10624 eol_mnemonic_unix = make_pure_c_string (":");
4ed46869 10625
29208e82 10626 DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
df7492f9
KH
10627 doc: /*
10628*String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
d67b4f80 10629 eol_mnemonic_dos = make_pure_c_string ("\\");
4ed46869 10630
29208e82 10631 DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
df7492f9
KH
10632 doc: /*
10633*String displayed in mode line for MAC-like (CR) end-of-line format. */);
d67b4f80 10634 eol_mnemonic_mac = make_pure_c_string ("/");
4ed46869 10635
29208e82 10636 DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
df7492f9
KH
10637 doc: /*
10638*String displayed in mode line when end-of-line format is not yet determined. */);
d67b4f80 10639 eol_mnemonic_undecided = make_pure_c_string (":");
4ed46869 10640
29208e82 10641 DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
df7492f9
KH
10642 doc: /*
10643*Non-nil enables character translation while encoding and decoding. */);
84fbb8a0 10644 Venable_character_translation = Qt;
bdd9fb48 10645
f967223b 10646 DEFVAR_LISP ("standard-translation-table-for-decode",
29208e82 10647 Vstandard_translation_table_for_decode,
48b0f3ae 10648 doc: /* Table for translating characters while decoding. */);
f967223b 10649 Vstandard_translation_table_for_decode = Qnil;
bdd9fb48 10650
f967223b 10651 DEFVAR_LISP ("standard-translation-table-for-encode",
29208e82 10652 Vstandard_translation_table_for_encode,
48b0f3ae 10653 doc: /* Table for translating characters while encoding. */);
f967223b 10654 Vstandard_translation_table_for_encode = Qnil;
4ed46869 10655
29208e82 10656 DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
48b0f3ae
PJ
10657 doc: /* Alist of charsets vs revision numbers.
10658While encoding, if a charset (car part of an element) is found,
df7492f9
KH
10659designate it with the escape sequence identifying revision (cdr part
10660of the element). */);
10661 Vcharset_revision_table = Qnil;
02ba4723
KH
10662
10663 DEFVAR_LISP ("default-process-coding-system",
29208e82 10664 Vdefault_process_coding_system,
48b0f3ae
PJ
10665 doc: /* Cons of coding systems used for process I/O by default.
10666The car part is used for decoding a process output,
10667the cdr part is used for encoding a text to be sent to a process. */);
02ba4723 10668 Vdefault_process_coding_system = Qnil;
c4825358 10669
29208e82 10670 DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
df7492f9
KH
10671 doc: /*
10672Table of extra Latin codes in the range 128..159 (inclusive).
48b0f3ae
PJ
10673This is a vector of length 256.
10674If Nth element is non-nil, the existence of code N in a file
10675\(or output of subprocess) doesn't prevent it from being detected as
10676a coding system of ISO 2022 variant which has a flag
10677`accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10678or reading output of a subprocess.
446dcd75 10679Only 128th through 159th elements have a meaning. */);
3f003981 10680 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
d46c5b12
KH
10681
10682 DEFVAR_LISP ("select-safe-coding-system-function",
29208e82 10683 Vselect_safe_coding_system_function,
df7492f9
KH
10684 doc: /*
10685Function to call to select safe coding system for encoding a text.
48b0f3ae
PJ
10686
10687If set, this function is called to force a user to select a proper
10688coding system which can encode the text in the case that a default
fdecf907
GM
10689coding system used in each operation can't encode the text. The
10690function should take care that the buffer is not modified while
10691the coding system is being selected.
48b0f3ae
PJ
10692
10693The default value is `select-safe-coding-system' (which see). */);
d46c5b12
KH
10694 Vselect_safe_coding_system_function = Qnil;
10695
5d5bf4d8 10696 DEFVAR_BOOL ("coding-system-require-warning",
29208e82 10697 coding_system_require_warning,
5d5bf4d8 10698 doc: /* Internal use only.
6b89e3aa
KH
10699If non-nil, on writing a file, `select-safe-coding-system-function' is
10700called even if `coding-system-for-write' is non-nil. The command
10701`universal-coding-system-argument' binds this variable to t temporarily. */);
5d5bf4d8
KH
10702 coding_system_require_warning = 0;
10703
10704
22ab2303 10705 DEFVAR_BOOL ("inhibit-iso-escape-detection",
29208e82 10706 inhibit_iso_escape_detection,
df7492f9 10707 doc: /*
97b1b294 10708If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
48b0f3ae 10709
97b1b294
EZ
10710When Emacs reads text, it tries to detect how the text is encoded.
10711This code detection is sensitive to escape sequences. If Emacs sees
10712a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10713of the ISO2022 encodings, and decodes text by the corresponding coding
10714system (e.g. `iso-2022-7bit').
48b0f3ae
PJ
10715
10716However, there may be a case that you want to read escape sequences in
10717a file as is. In such a case, you can set this variable to non-nil.
97b1b294
EZ
10718Then the code detection will ignore any escape sequences, and no text is
10719detected as encoded in some ISO-2022 encoding. The result is that all
48b0f3ae
PJ
10720escape sequences become visible in a buffer.
10721
10722The default value is nil, and it is strongly recommended not to change
10723it. That is because many Emacs Lisp source files that contain
10724non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10725in Emacs's distribution, and they won't be decoded correctly on
10726reading if you suppress escape sequence detection.
10727
10728The other way to read escape sequences in a file without decoding is
97b1b294 10729to explicitly specify some coding system that doesn't use ISO-2022
48b0f3ae 10730escape sequence (e.g. `latin-1') on reading by \\[universal-coding-system-argument]. */);
74383408 10731 inhibit_iso_escape_detection = 0;
002fdb44 10732
97b1b294 10733 DEFVAR_BOOL ("inhibit-null-byte-detection",
29208e82 10734 inhibit_null_byte_detection,
97b1b294
EZ
10735 doc: /* If non-nil, Emacs ignores null bytes on code detection.
10736By default, Emacs treats text containing null bytes as binary data,
10737and does not attempt to decode it. The effect is as if you specified
10738`no-conversion' for reading that text.
10739
10740Set this to non-nil when a regular text happens to include null bytes.
10741Examples are Index nodes of Info files and null-byte delimited output
10742from GNU Find and GNU Grep. Emacs will then ignore the null bytes and
10743decode text as usual. */);
10744 inhibit_null_byte_detection = 0;
10745
29208e82 10746 DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
15c8f9d1 10747 doc: /* Char table for translating self-inserting characters.
446dcd75 10748This is applied to the result of input methods, not their input.
8434d0b8
EZ
10749See also `keyboard-translate-table'.
10750
10751Use of this variable for character code unification was rendered
10752obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10753internal character representation. */);
002fdb44 10754 Vtranslation_table_for_input = Qnil;
8f924df7 10755
2c78b7e1
KH
10756 {
10757 Lisp_Object args[coding_arg_max];
8f924df7 10758 Lisp_Object plist[16];
2c78b7e1
KH
10759 int i;
10760
10761 for (i = 0; i < coding_arg_max; i++)
10762 args[i] = Qnil;
10763
d67b4f80 10764 plist[0] = intern_c_string (":name");
2c78b7e1 10765 plist[1] = args[coding_arg_name] = Qno_conversion;
d67b4f80 10766 plist[2] = intern_c_string (":mnemonic");
2c78b7e1 10767 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
d67b4f80 10768 plist[4] = intern_c_string (":coding-type");
2c78b7e1 10769 plist[5] = args[coding_arg_coding_type] = Qraw_text;
d67b4f80 10770 plist[6] = intern_c_string (":ascii-compatible-p");
2c78b7e1 10771 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
d67b4f80 10772 plist[8] = intern_c_string (":default-char");
2c78b7e1 10773 plist[9] = args[coding_arg_default_char] = make_number (0);
d67b4f80 10774 plist[10] = intern_c_string (":for-unibyte");
8f924df7 10775 plist[11] = args[coding_arg_for_unibyte] = Qt;
d67b4f80
DN
10776 plist[12] = intern_c_string (":docstring");
10777 plist[13] = make_pure_c_string ("Do no conversion.\n\
2c78b7e1
KH
10778\n\
10779When you visit a file with this coding, the file is read into a\n\
10780unibyte buffer as is, thus each byte of a file is treated as a\n\
10781character.");
d67b4f80 10782 plist[14] = intern_c_string (":eol-type");
8f924df7
KH
10783 plist[15] = args[coding_arg_eol_type] = Qunix;
10784 args[coding_arg_plist] = Flist (16, plist);
2c78b7e1 10785 Fdefine_coding_system_internal (coding_arg_max, args);
ae6f73fa
KH
10786
10787 plist[1] = args[coding_arg_name] = Qundecided;
10788 plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10789 plist[5] = args[coding_arg_coding_type] = Qundecided;
10790 /* This is already set.
35befdaa 10791 plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
d67b4f80 10792 plist[8] = intern_c_string (":charset-list");
ae6f73fa
KH
10793 plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10794 plist[11] = args[coding_arg_for_unibyte] = Qnil;
d67b4f80 10795 plist[13] = make_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
ae6f73fa
KH
10796 plist[15] = args[coding_arg_eol_type] = Qnil;
10797 args[coding_arg_plist] = Flist (16, plist);
10798 Fdefine_coding_system_internal (coding_arg_max, args);
2c78b7e1
KH
10799 }
10800
2c78b7e1 10801 setup_coding_system (Qno_conversion, &safe_terminal_coding);
ff563fce
KH
10802
10803 {
10804 int i;
10805
10806 for (i = 0; i < coding_category_max; i++)
10807 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10808 }
1a4990fb 10809#if defined (DOS_NT)
fcbcfb64
KH
10810 system_eol_type = Qdos;
10811#else
10812 system_eol_type = Qunix;
10813#endif
10814 staticpro (&system_eol_type);
4ed46869
KH
10815}
10816
68c45bf0 10817char *
971de7fb 10818emacs_strerror (int error_number)
68c45bf0
PE
10819{
10820 char *str;
10821
ca9c0567 10822 synchronize_system_messages_locale ();
68c45bf0
PE
10823 str = strerror (error_number);
10824
10825 if (! NILP (Vlocale_coding_system))
10826 {
10827 Lisp_Object dec = code_convert_string_norecord (build_string (str),
10828 Vlocale_coding_system,
10829 0);
51b59d79 10830 str = SSDATA (dec);
68c45bf0
PE
10831 }
10832
10833 return str;
10834}
10835
4ed46869 10836#endif /* emacs */